requests 中间件
import requests
from scrapy.http import HtmlResponse
from selenium import webdriver
class RequestsMiddleware(object):
    """Downloader middleware that performs the download with ``requests``.

    Each request handed to ``process_request`` is sent as an HTTP POST through
    the ``requests`` library, and the reply is returned wrapped in an
    ``HtmlResponse``.
    """

    # Fallback (seconds) used when the request carries no ``download_timeout``
    # meta value; 180 matches Scrapy's documented default DOWNLOAD_TIMEOUT.
    default_timeout = 180

    def process_request(self, request, spider):
        """POST ``request`` via ``requests`` and wrap the reply for Scrapy.

        Args:
            request: The Scrapy request being downloaded; its URL, headers,
                body and ``download_timeout`` meta value are used.
            spider: The spider that issued the request (unused).

        Returns:
            An ``HtmlResponse`` built from the final URL, status code and
            decoded text of the ``requests`` reply.

        Raises:
            requests.exceptions.RequestException: If the HTTP call fails or
                exceeds the timeout.
        """
        # Header names and values arrive as bytes and each value is a list;
        # only the first value of each header is forwarded. Headers with an
        # empty value list are skipped instead of raising IndexError.
        headers = {
            name.decode(): values[0].decode()
            for name, values in request.headers.items()
            if values
        }
        # NOTE(review): the HTTP verb is hard-coded to POST regardless of
        # ``request.method`` -- confirm this is intentional.
        res = requests.post(
            url=request.url,
            headers=headers,
            # Forward the raw bytes: decoding first fails on any body that is
            # not valid UTF-8 and gains nothing for bodies that are.
            data=request.body,
            # Without a timeout a stalled server would block the crawl forever.
            timeout=request.meta.get('download_timeout', self.default_timeout),
        )
        return HtmlResponse(
            url=res.url,
            status=res.status_code,
            body=res.text,
            encoding='utf-8',
        )
selenium 中间件
import time

import requests
from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
class SeleniumMiddleware(object):
    """Downloader middleware that loads flagged requests in headless Chrome.

    Only requests whose ``meta['is_selenium']`` is truthy are handled; every
    other request is left to the rest of the download chain.
    """

    def __init__(self):
        """Build the Chrome options and start the shared WebDriver instance."""
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('-headless')
        self.options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # Ignore certificate errors.
        self.options.add_argument('--ignore-certificate-errors')
        # Raw string so the backslash is not parsed as an (invalid) escape.
        # NOTE(review): newer Selenium releases expect a ``Service`` object
        # instead of ``executable_path`` -- confirm the pinned version.
        self.driver = webdriver.Chrome(executable_path=r"C:\chromedriver.exe", options=self.options)

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and register its ``spider_closed`` handler.

        Args:
            crawler: The crawler whose signal manager the handler is
                connected to.

        Returns:
            The new middleware instance.
        """
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self):
        """Shut the browser down when the spider closes."""
        # quit() ends the whole WebDriver session; close() would only close
        # the current window and leave the session open.
        self.driver.quit()

    def process_request(self, request, spider):
        """Load the request in the browser when it is flagged for Selenium.

        Args:
            request: The Scrapy request; handled only when
                ``request.meta['is_selenium']`` is truthy.
            spider: The spider that issued the request (unused).

        Returns:
            ``None`` for unflagged requests, an ``HtmlResponse`` with status
            666 when the page load times out, otherwise an ``HtmlResponse``
            with status 200 holding the browser's current URL and page source.
        """
        if not request.meta.get('is_selenium', False):
            return None

        try:
            self.driver.get(request.url)
        except TimeoutException:
            # Stop the pending load and report the timeout with status 666.
            self.driver.execute_script('window.stop()')
            return HtmlResponse(url=request.url, status=666, encoding='utf-8')

        # Fixed one-second pause before reading the page back.
        time.sleep(1)
        return HtmlResponse(
            url=self.driver.current_url,
            body=self.driver.page_source,
            status=200,
            encoding='utf-8',
        )
excel 管道
import pandas as pd
class ExcelPipeline:
    """Item pipeline that buffers two fields per item and writes them to Excel.

    Rows are collected in memory while the spider runs and written in a single
    ``to_excel`` call when the spider closes.
    """

    def open_spider(self, spider):
        """Reset the column headers and the row buffer.

        Args:
            spider: The spider being opened (unused).
        """
        self.headers = ['列1', '列2']
        # Despite the name, this is a plain list of rows, not a DataFrame.
        self.df = []

    def process_item(self, item, spider):
        """Buffer the item's ``area`` and ``name`` values as one row.

        Args:
            item: Mapping-like item; missing keys are stored as ``''``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, unchanged.
        """
        self.df.append([item.get('area', ''), item.get('name', '')])
        return item

    def close_spider(self, spider):
        """Write every buffered row to the Excel file.

        Args:
            spider: The spider being closed (unused).
        """
        # Naming the columns up front keeps the frame two columns wide even
        # when no item was scraped; a 0-column frame would make the
        # two-entry ``header`` below raise ValueError.
        frame = pd.DataFrame(self.df, columns=self.headers)
        # NOTE(review): ``drug_file`` is not defined in this class; it must be
        # provided at module level -- confirm.
        # NOTE(review): an empty ``sheet_name`` is passed through unchanged;
        # confirm the Excel writer in use accepts it.
        frame.to_excel(drug_file, sheet_name='', startcol=0, index=False, header=self.headers)