Scrapy 通过 yield 发起网络请求后,网页被重定向到登录页面,导致 request.url 变成了登录页的 URL。
怎么解决?
重写 middlewares.py 中的 process_response():
class Spider1688DownloaderMiddleware(object):
    """Downloader middleware that retries requests bounced to a verification page.

    When a response body contains the verification marker and the response
    URL embeds the originally requested shop URL (percent-encoded in its query
    string), the shop's ``offerlist`` URL is rebuilt and a new request for it
    is returned instead of the response. Every other response is passed
    through unchanged.
    """

    # Shop sub-domain as it appears inside the redirect URL when the original
    # "https://<shop>.1688.com/..." target was percent-encoded once
    # ("://" -> "%3A%2F%2F") or twice ("%253A%252F%252F").
    _ENCODED_HOST = re.compile(r'%3A%2F%2F(.*?)\.1688')
    _DOUBLE_ENCODED_HOST = re.compile(r'%253A%252F%252F(.*?)\.1688')
    # "pageNum=<n>" percent-encoded once ("%3D") or twice ("%253D").
    _ENCODED_PAGE_NUM = re.compile(r'pageNum%(?:25)?3D(\d+)')
    _OFFERLIST_URL = 'https://{}.1688.com/page/offerlist.htm{}'

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and hook it to the ``spider_opened`` signal."""
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def spider_opened(self, spider):
        """Signal handler connected in ``from_crawler``; logs the spider name."""
        spider.logger.info('Spider opened: %s', spider.name)

    def process_request(self, request, spider):
        """Let every request continue through the middleware chain untouched."""
        return None

    def process_response(self, request, response, spider):
        """Return a retry request for redirected pages, else the response.

        Args:
            request: the request that produced ``response``.
            response: the downloaded response.
            spider: the running spider (unused).

        Returns:
            A copy of ``request`` aimed at the rebuilt offerlist URL when the
            response is a verification/login page whose URL still carries the
            original shop address; otherwise ``response`` itself. ``None`` is
            never returned, because Scrapy requires a Response or Request
            from this hook.
        """
        if '验证' not in response.text:
            return response

        url = response.url
        retry_url = None
        if 'offerlist' in url:
            mall_id = self._extract_mall_id(url, allow_double_encoded=False)
            if mall_id:
                retry_url = self._OFFERLIST_URL.format(mall_id, '')
        elif 'login' in url and '%3A%2F%2F' in url:
            mall_id = self._extract_mall_id(url, allow_double_encoded=True)
            if mall_id:
                retry_url = self._OFFERLIST_URL.format(
                    mall_id, self._extract_page_suffix(url))

        if retry_url is None:
            # Nothing usable could be recovered from the URL; hand the
            # response on rather than crash or return None.
            return response

        # Use the public Request.replace API instead of the private
        # _set_url. dont_filter=True keeps the scheduler's duplicate filter
        # from discarding the retry, since the rebuilt URL has normally
        # been requested once already.
        return request.replace(url=retry_url, dont_filter=True)

    def _extract_mall_id(self, url, allow_double_encoded):
        """Return the shop sub-domain embedded in ``url``, or None if absent."""
        match = self._ENCODED_HOST.search(url)
        if match is None:
            return None
        mall_id = match.group(1)
        if allow_double_encoded and 'login' in mall_id:
            # The first encoded host is the login host itself; the real shop
            # address is nested one encoding level deeper.
            match = self._DOUBLE_ENCODED_HOST.search(url)
            if match is None:
                return None
            mall_id = match.group(1)
        return mall_id

    def _extract_page_suffix(self, url):
        """Return '?pageNum=<n>' if ``url`` carries a page number, else ''."""
        match = self._ENCODED_PAGE_NUM.search(url)
        if match is None:
            return ''
        return '?pageNum={}'.format(match.group(1))
根据 response.text 中的字段或 response.url 中的特殊字符,判断网页是否被重定向;
从 response.url 中获取字符串,截取构建新 URL 所需的参数;
使用 request._set_url 重新设置 request 要访问的 URL。
在 spider 中加上以下配置:
class Al1688MallSpiderSpider(scrapy.Spider):
    """Spider whose per-spider settings enable the 1688 downloader middleware."""

    name = 'al_1688_mall_spider'
    allowed_domains = ['re.1688.com']
    # start_urls = ['http://re.1688.com/']
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            # Enable the redirect-handling downloader middleware.
            'spider_1688.middlewares.Spider1688DownloaderMiddleware': 120,
        },
        # Maximum number of retries per failed request. The previous key,
        # 'RETRY_HTTP_CODECS', is not a Scrapy setting and was ignored.
        'RETRY_TIMES': 20,
        # Do not send cookies with requests.
        'COOKIES_ENABLED': False,
    }