- scrapy默认发送GET请求；当目标页面需要POST请求时，需要重写start_requests方法来重构起始URL请求，请求所需的表单参数可以在浏览器开发者工具的请求数据中查看
- meta方法
meta={'form_data': form_data}
当我们在获取数据的时候,当某个页面无法获取我们所需的数据,需要在另一个页面获取其他数据的时候,这时候我们可以在发起一个新的请求的时候把数据传递过去:meta={'tag':tag_item['tagName']}
# NOTE: on Windows, printing scraped text may raise the following error:
"""UnicodeEncodeError: 'gbk' codec can't encode character '\u2764' in position 261: illegal multibyte sequence"""
# Windows' default stdout encoding (gbk) cannot represent some characters.
# Fix: re-wrap stdout with a wider encoding (gb18030 is a superset of gbk).
import sys,io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')
# Selector 可以把 HTML/XML 字符串包装成支持 xpath 查询的选择器对象
from scrapy.selector import Selector
# 移除文本中的标签
from w3lib.html import remove_tags
class SjjySpider(scrapy.Spider):
    """Spider for the jiayuan.com profile-search endpoint.

    The search endpoint only answers POST requests, so ``start_requests``
    is overridden to send a ``FormRequest`` instead of Scrapy's default GET.
    The form payload is carried along in ``meta`` so ``parse`` can rebuild
    it for the next-page request.
    """
    name = 'sjjy'
    allowed_domains = ['jiayuan.com']
    start_urls = ['http://search.jiayuan.com/v2/search_v2.php']

    def start_requests(self):
        """Issue the initial POST search request with the form payload."""
        form_data = {
            'sex': 'f',
            'key': '',
            'stc': '1: 11, 2: 20.28, 23: 1',
            'sn': 'default',
            'sv': '1',
            'p': '2',              # starting page number
            'f': 'search',
            'listStyle': 'bigPhoto',
            'pri_uid': '0',
            'jsversion': 'v5'
        }
        for url in self.start_urls:
            # meta carries the form data forward so parse() can compute
            # the next page; dont_filter because the URL repeats.
            yield scrapy.FormRequest(
                url,
                formdata=form_data,
                meta={'form_data': form_data},
                dont_filter=True)

    def parse(self, response):
        """Extract one item per user from the response and paginate.

        The body wraps a JSON document between ``##jiayser##`` markers;
        it is extracted with a regex and decoded with ``json.loads``.
        """
        pattern = re.compile(r'##jiayser##(.*?)##jiayser##', re.S)
        matches = re.findall(pattern, response.text)
        if not matches:
            # Robustness fix: the original indexed [0] unconditionally and
            # raised IndexError on an unexpected (e.g. blocked) response.
            self.logger.warning('no ##jiayser## payload found in %s', response.url)
            return
        data = json.loads(matches[0])
        for userinfo in data['userInfo']:
            item = ShijijiayuanItem()
            item['uid'] = userinfo['uid']                       # user id
            item['header_img'] = userinfo['image']              # avatar
            item['sex'] = userinfo['sex']                       # gender
            # remove_tags strips HTML tags from the tag snippet
            item['randTag'] = remove_tags(userinfo['randTag'])
            item['age'] = userinfo['age']                       # age
            item['height'] = userinfo['height']                 # height
            item['shortnote'] = userinfo['shortnote']           # signature
            item['workAddress'] = userinfo['work_location']     # work place
            item['mathCtion'] = userinfo['matchCondition']      # partner criteria
            item['nickname'] = userinfo['nickname']             # nickname
            yield item

        # Pagination: rebuild the form with the next page number.
        # Copy fix: the dict in meta is shared with the previous request;
        # mutating it in place would also rewrite that request's meta.
        form_data = dict(response.meta['form_data'])
        next_page = int(form_data['p']) + 1
        # Off-by-one fix: the original used '<', which skipped the last page.
        if next_page <= int(data['pageTotal']):
            form_data['p'] = str(next_page)
            yield scrapy.FormRequest(
                'http://search.jiayuan.com/v2/search_v2.php',
                formdata=form_data,
                meta={'form_data': form_data},
                callback=self.parse)