Replace the default downloader with selenium + webdriver
jd.py
# -*- coding: utf-8 -*-
import scrapy


class JdSpider(scrapy.Spider):
    name = 'jd'
    allowed_domains = ['jd.com', 'vip.com']
    start_urls = ['http://www.vip.com/']

    def parse(self, response):
        print('_________________________')
        print(response.text)
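Because the middleware defined next returns the selenium-rendered HTML, the usual Scrapy selectors work on the fully rendered DOM inside parse. A minimal sketch of what that could look like (the XPath and the item fields here are hypothetical placeholders, not part of the original spider):

    def parse(self, response):
        # response.body is the page source produced by selenium, so content
        # that is injected by JavaScript is already present in the DOM
        for link in response.xpath('//a[@title]'):
            yield {
                'title': link.xpath('./@title').get(),
                'href': link.xpath('./@href').get(),
            }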
middlewares.py
from selenium import webdriver
from scrapy.http import HtmlResponse
from time import sleep


class SevenJingdongDownloaderMiddleware:
    def process_request(self, request, spider):
        # called when the downloader is scheduled for a request;
        # here we replace the default downloader with selenium + webdriver
        driver = webdriver.Chrome()
        driver.get(request.url)
        sleep(3)  # give the page's JavaScript time to render
        res = driver.page_source
        url = driver.current_url
        print("downloader middleware is downloading the content")
        print("currently visiting: " + url)
        driver.quit()  # close the browser so a window is not left open per request
        # returning an HtmlResponse here bypasses Scrapy's default downloader;
        # the body must be the HTML string itself, not an lxml tree
        return HtmlResponse(url=url, body=res, encoding='utf-8', request=request)
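Launching a fresh Chrome instance for every request is slow. A common refinement is to create one driver when the middleware is built and quit it when the spider closes. The following is only a sketch of that pattern (the headless option and the from_crawler / spider_closed wiring are my additions, not part of the original post); it keeps the same class name so settings.py needs no change:

from scrapy import signals
from selenium import webdriver
from scrapy.http import HtmlResponse
from time import sleep


class SevenJingdongDownloaderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        # connect to the spider_closed signal so we can shut the browser down
        mw = cls()
        crawler.signals.connect(mw.spider_closed, signal=signals.spider_closed)
        return mw

    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')  # run Chrome without opening a window
        self.driver = webdriver.Chrome(options=options)

    def process_request(self, request, spider):
        self.driver.get(request.url)
        sleep(3)  # crude fixed wait; an explicit WebDriverWait would be more robust
        return HtmlResponse(url=self.driver.current_url,
                            body=self.driver.page_source,
                            encoding='utf-8', request=request)

    def spider_closed(self, spider):
        self.driver.quit()  # release the shared browser when the crawl ends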
settings.py
# around line 55 of the generated settings.py
DOWNLOADER_MIDDLEWARES = {
    # enable our downloader middleware
    'seven_jingdong.middlewares.SevenJingdongDownloaderMiddleware': 543,
    # the original download component needs to be disabled
    "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": None,
}
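The number 543 is the middleware's priority in the downloader chain, and setting a middleware to None disables it. With both files in place, running scrapy crawl jd from the project root should open Chrome, load http://www.vip.com/, and print the selenium-rendered page source from the parse callback.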