The ITjuzi (IT桔子) website requires you to log in before any data can be scraped.
Find the ITjuzi login endpoint:
- URL: https://www.itjuzi.com/api/authorizations
- Request method: POST
Inspect the parameters and headers sent with the POST:
- The parameters are sent as a JSON Request Payload containing the account name, password, and so on.
- The login code is shown below (remember to replace the username and password):
import scrapy
import json


class JuziSpider(scrapy.Spider):
    name = 'juzi'
    allowed_domains = ['itjuzi.com']

    def start_requests(self):
        """
        Log in to ITjuzi with a simulated POST request.
        """
        header = {
            "Content-Type": "application/json",
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
            "Host": "www.itjuzi.com",
            "Referer": "https://www.itjuzi.com/investevent",
        }
        url = "https://www.itjuzi.com/api/authorizations"
        payload = {"account": "your username", "password": "your password", "type": "pswd"}
        yield scrapy.Request(url=url,
                             method="POST",
                             body=json.dumps(payload),
                             headers=header,
                             callback=self.parse)
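Before wiring the login into the spider, the endpoint can be checked by hand with a plain requests call. This is a minimal sketch under the same assumptions as the code above (same URL, payload, and headers); the token it prints is the value the spider later reads from the login response.

import json
import requests

url = "https://www.itjuzi.com/api/authorizations"
header = {
    "Content-Type": "application/json",
    "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Referer": "https://www.itjuzi.com/investevent",
}
payload = {"account": "your username", "password": "your password", "type": "pswd"}

resp = requests.post(url, data=json.dumps(payload), headers=header)
print(resp.status_code)
# on a successful login the response JSON carries the token that the spider
# later sends as the Authorization header
print(resp.json()["data"]["token"])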
The goal of this crawler is to collect the data in ITjuzi's investment-event (事件库) module.
Analysis before scraping:
- Find the URL that the event-library page requests.
- Analyze the parameters in the Request Headers.
- Analyze the parameters in the Request Payload.
Results of the analysis:
- Authorization is a required Request Header parameter. It is only available after logging in and can be read from the login response.
- In the Request Payload, pagetotal is the total number of records, per_page is the number of records per page, and page is the current page number; the total number of pages can therefore be computed (as in the short sketch right after this list) and used to loop over all pages.
- The scraped data is stored in a MongoDB database.
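The page-count calculation is plain ceiling division; a small sketch (with an illustrative total) of what parse_info() does below:

import math

total = 4321        # illustrative value of data['data']['page']['total']
per_page = 20       # records per request, as in the payload
pages = math.ceil(total / per_page)   # -> 217 pages to request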
The complete implementation is shown below:
- juzi.py
# -*- coding: utf-8 -*-
import scrapy
import json
import time
import random
from ITjuzi.items import ItjuziItem
# USER_AGENT_LIST and random are intended for User-Agent rotation in a
# downloader middleware (see the User-Agent pool note at the end of the article).
from ITjuzi.settings import USER_AGENT_LIST


class JuziSpider(scrapy.Spider):
    name = 'juzi'
    allowed_domains = ['itjuzi.com']

    def start_requests(self):
        """
        Log in to ITjuzi with a simulated POST request.
        """
        header = {
            "Content-Type": "application/json",
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
            "Host": "www.itjuzi.com",
            "Referer": "https://www.itjuzi.com/investevent",
        }
        url = "https://www.itjuzi.com/api/authorizations"
        payload = {"account": "your username", "password": "your password", "type": "pswd"}
        yield scrapy.Request(url=url,
                             method="POST",
                             body=json.dumps(payload),
                             headers=header,
                             callback=self.parse)

    def parse(self, response):
        url = "https://www.itjuzi.com/api/investevents"
        # the token returned by the login API is sent later as the Authorization header
        token = json.loads(response.text)['data']['token']
        header = {
            "Content-Type": "application/json",
            "Authorization": token,
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
            "Host": "www.itjuzi.com",
            "Referer": "https://www.itjuzi.com/investevent",
        }
        payload = {
            "pagetotal": 0, "total": 0, "per_page": 20, "page": 1, "type": 1, "scope": "", "sub_scope": "",
            "round": [], "valuation": [], "valuations": "", "ipo_platform": "", "equity_ratio": [""],
            "status": "", "prov": "", "city": [], "time": [], "selected": "", "location": "", "currency": [],
            "keyword": ""
        }
        yield scrapy.Request(url=url,
                             method="POST",
                             body=json.dumps(payload),
                             meta={'token': token},
                             headers=header,
                             callback=self.parse_info)

    # Compute the total number of pages and request every page.
    def parse_info(self, response):
        # token passed along from parse() via meta
        token = response.meta["token"]
        data = json.loads(response.text)
        # total number of records
        total_number = int(data['data']['page']['total'])
        # total number of pages (ceiling of total_number / 20)
        if total_number % 20:
            page = total_number // 20 + 1
        else:
            page = total_number // 20
        url = "https://www.itjuzi.com/api/investevents"
        header = {
            "Content-Type": "application/json",
            "Authorization": token,
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
            "Host": "www.itjuzi.com",
            "Referer": "https://www.itjuzi.com/investevent",
        }
        for i in range(1, page + 1):
            time.sleep(1)
            payload = {
                "pagetotal": 0, "total": 0, "per_page": 20, "page": i, "type": 1, "scope": "", "sub_scope": "",
                "round": [], "valuation": [], "valuations": "", "ipo_platform": "", "equity_ratio": [""],
                "status": "", "prov": "", "city": [], "time": [], "selected": "", "location": "", "currency": [],
                "keyword": ""
            }
            # Scrapy filters duplicate requests by default. Because these POSTs go to
            # the same URL that was already requested in parse(), dont_filter=True is
            # needed so they are not dropped.
            yield scrapy.Request(dont_filter=True,
                                 url=url,
                                 method="POST",
                                 body=json.dumps(payload),
                                 headers=header,
                                 callback=self.parse_detail)

    def parse_detail(self, response):
        infos = json.loads(response.text)["data"]["data"]
        for info in infos:
            item = ItjuziItem()
            item["id"] = info.get("id") or ""
            item["com_id"] = info.get("com_id") or ""
            item["name"] = info.get("name") or ""
            item["com_scope"] = info.get("com_scope") or ""
            item["money"] = info.get("money") or ""
            item["money_num"] = info.get("money_num") or ""
            item["valuation"] = info.get("valuation") or ""
            item["city"] = info.get("city") or ""
            item["agg_time"] = info.get("agg_time") or ""
            item["invse_des"] = info.get("invse_des") or ""
            yield item
- items.py
import scrapy


class ItjuziItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    id = scrapy.Field()
    com_id = scrapy.Field()
    name = scrapy.Field()
    com_scope = scrapy.Field()
    money = scrapy.Field()
    money_num = scrapy.Field()
    valuation = scrapy.Field()
    city = scrapy.Field()
    agg_time = scrapy.Field()
    invse_des = scrapy.Field()
- pipelines.py: store the scraped data in MongoDB
# MongoDB pipeline
import copy

import pymongo


class ItjuziMongoPipeline:
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        """
        Problem: a large number of duplicate records appeared after the data was written to the database.
        Approach: before the insert in process_item(), deep-copy the item and operate only on
        the copy, so the shared variable cannot be modified while the write is pending.
        Fix: call copy.deepcopy(item) in process_item() (requires importing the copy module).
        """
        asynItem = copy.deepcopy(item)
        infos = {'id': asynItem['id'],
                 'com_id': asynItem['com_id'],
                 'name': asynItem['name'],
                 'com_scope': asynItem['com_scope'],
                 'money': asynItem['money'],
                 'money_num': asynItem['money_num'],
                 'valuation': asynItem['valuation'],
                 'city': asynItem['city'],
                 'agg_time': asynItem['agg_time'],
                 'invse_des': asynItem['invse_des'],
                 }
        # insert_one() replaces the deprecated Collection.insert()
        self.db.ITjuzi.insert_one(infos)
        return item
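As a side note on the duplicate-data problem described in the docstring: if re-running the spider should not create duplicate documents either, one hedged alternative to the plain insert is an upsert keyed on the ITjuzi event id. This is a sketch, not part of the original pipeline:

    def process_item(self, item, spider):
        asynItem = copy.deepcopy(item)
        infos = dict(asynItem)
        # overwrite the document with the same event id instead of inserting a duplicate
        self.db.ITjuzi.update_one({'id': infos['id']}, {'$set': infos}, upsert=True)
        return item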
- Configure the related settings in settings.py:
ITEM_PIPELINES = {
    'ITjuzi.pipelines.ItjuziMongoPipeline': 310,
}
FEED_EXPORT_ENCODING = 'utf-8'
MONGO_URI = 'localhost'
MONGO_DB = 'scrapy_IT_juzi'
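With the spider, items, pipeline, and settings in place, the crawl is started from the project root with `scrapy crawl juzi`. FEED_EXPORT_ENCODING = 'utf-8' only takes effect when a feed export is also requested (for example `scrapy crawl juzi -o juzi.json`); it keeps the Chinese text readable in the exported file.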
Summary and additional notes
- ITjuzi has fairly strong anti-scraping measures; you can set up your own User-Agent pool and IP proxy pool (a sketch of a User-Agent middleware follows below).
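The spider file imports USER_AGENT_LIST from the settings, but the middleware that consumes it is not shown in the article. Below is a minimal sketch of such a random User-Agent downloader middleware; the class name, the USER_AGENT_LIST entries, and the DOWNLOADER_MIDDLEWARES priority are assumptions for illustration only.

# middlewares.py (sketch): pick a random User-Agent for every request from the
# USER_AGENT_LIST defined in settings.py
import random


class RandomUserAgentMiddleware:
    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # USER_AGENT_LIST is assumed to be a list of UA strings in settings.py
        return cls(crawler.settings.getlist('USER_AGENT_LIST'))

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agents)


# settings.py additions (sketch):
# USER_AGENT_LIST = [
#     "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
#     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
# ]
# DOWNLOADER_MIDDLEWARES = {
#     'ITjuzi.middlewares.RandomUserAgentMiddleware': 400,
# }

An IP proxy pool can be handled the same way with a downloader middleware that sets request.meta['proxy'] for each request.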