中间件

requests 中间件

import requests
from scrapy.http import HtmlResponse
from selenium import webdriver

class RequestsMiddleware(object):
    """Downloader middleware that fetches pages with the ``requests``
    library instead of Scrapy's built-in downloader.

    Returning an ``HtmlResponse`` from ``process_request`` short-circuits
    the download: Scrapy hands the response straight to the spider.
    """

    def process_request(self, request, spider):
        # Scrapy stores header names/values as bytes; requests wants str.
        headers = {k.decode(): v[0].decode() for k, v in request.headers.items()}
        # Bug fix: the original unconditionally issued a POST, which breaks
        # every GET request.  Dispatch on the method recorded on the request.
        if request.method == 'POST':
            res = requests.post(
                url=request.url,
                headers=headers,
                data=request.body.decode(),
            )
        else:
            res = requests.get(url=request.url, headers=headers)
        # Wrap the result so the rest of the Scrapy pipeline sees a normal
        # response object (str bodies are allowed when encoding is given).
        return HtmlResponse(
            status=res.status_code,
            url=res.url,
            body=res.text,
            encoding='utf-8',
        )

selenium中间件

import time

import requests
from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import TimeoutException

 class SeleniumMiddleware(object):
    """selenium模拟渲染"""
    def __init__(self):
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('-headless')
        self.options.add_experimental_option('excludeSwitches', ['enable-automation'])  # 忽略证书错误
        self.options.add_argument('--ignore-certificate-errors')
        self.driver = webdriver.Chrome(executable_path="C:\chromedriver.exe", options=self.options)

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def spider_closed(self):
        self.driver.close()

    def process_request(self, request, spider):
        is_selenium = request.meta.get('is_selenium', False)
        if is_selenium:
            try:
                self.driver.get(request.url)
            except TimeoutException:
                self.driver.execute_script('window.stop()')
                return HtmlResponse(url=request.url, status=666, encoding='utf-8')
            time.sleep(1)
            url = self.driver.current_url
            body = self.driver.page_source
            return HtmlResponse(url=url, body=body, status=200, encoding='utf-8')

excel 管道(item pipeline,并非中间件)

import pandas as pd


class ExcelPipeline:
    """Item pipeline that buffers scraped rows in memory and writes them
    to an Excel file when the spider closes."""

    # Fallback output path.  The original close_spider referenced an
    # undefined global ``drug_file``, which raised NameError at shutdown.
    DEFAULT_FILE = 'output.xlsx'

    def open_spider(self, spider):
        # Column headers for the output sheet.
        self.headers = ['列1', '列2']
        # Buffered rows (plain list — the original misleadingly named it
        # ``df`` even though it is not a DataFrame until close_spider).
        self.rows = []
        # Let the spider choose the output path, else use the default.
        self.output_file = getattr(spider, 'excel_file', self.DEFAULT_FILE)

    def process_item(self, item, spider):
        # Missing keys become empty cells rather than raising.
        self.rows.append([
            item.get('area', ''),
            item.get('name', ''),
        ])
        return item

    def close_spider(self, spider):
        df = pd.DataFrame(self.rows)
        # sheet_name='' is rejected by the Excel writers; use a real name.
        df.to_excel(self.output_file, sheet_name='Sheet1', startcol=0,
                    index=False, header=self.headers)
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容