day08 - Overriding Scrapy Middlewares and Connecting to Databases

  • Detailed diagram of Scrapy's execution flow



1. Downloader middleware

(figure: spider_scrapy_zhujian.png, Scrapy components and middleware diagram)

1.1 Downloader middleware

  • settings configuration
DOWNLOADER_MIDDLEWARES = {
   # 'TestSpider.middlewares.TestspiderDownloaderMiddleware': 543,
   #  'TestSpider.middlewares.Test1Middleware': 543,
    'TestSpider.middlewares.Test2Middleware': 543,

}
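Lower numbers in this dict sit closer to the engine, so their process_request runs earlier (and their process_response later). Mapping an entry to None disables that middleware entirely; for example, Scrapy's built-in User-Agent middleware can be switched off when a custom middleware manages the header itself:

DOWNLOADER_MIDDLEWARES = {
    'TestSpider.middlewares.Test2Middleware': 543,
    # None disables a middleware, here the built-in UserAgentMiddleware
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}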
  • Overriding the downloader middleware methods
from scrapy import Request
from scrapy.http import Response


class Test1Middleware():

    def process_request(self, request, spider):
        # Returning None lets the remaining middlewares' process_request
        # methods run; if the last one also returns None, the request is
        # handed to the downloader to be downloaded.
        return None

        # Returning a Response skips the downloader and passes the response
        # straight on to the parse callback, parse(response).
        # return Response(url='http://www.baidu.com', body='12345')

        # Returning a Request sends a new request back to the scheduler
        # instead of downloading this one; usually not recommended.
        # return Request(url='http://www.baidu.com',
        #                callback=self.parse,
        #                dont_filter=True)
        # if request.url != 'http://www.baidu.com':
        #     return Request(url='http://www.baidu.com')

    def process_response(self, request, response, spider):
        # Modify the response before it reaches the spider
        response.status = 201
        return response

    def process_exception(self, request, exception, spider):
        print('exception handling')
        # return None
        # switch to a different proxy IP and retry the request
        # request.meta['proxy'] = 'http://'
        return request
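As a concrete illustration of the "return a Response" branch above, here is a minimal sketch (not part of the original project) that short-circuits the download with scrapy.http.HtmlResponse, which accepts a str body together with an encoding:

from scrapy.http import HtmlResponse


class FakeResponseMiddleware():

    def process_request(self, request, spider):
        # Skip the downloader entirely: this response goes through the
        # process_response chain and then to the spider's callback.
        return HtmlResponse(url=request.url,
                            body='<html><body>12345</body></html>',
                            encoding='utf-8',
                            request=request)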
  • Changing the User-Agent and IP

from fake_useragent import UserAgent


class Test2Middleware():

    def process_request(self, request, spider):
        # Set a proxy IP
        # request.meta['proxy'] = 'http://122.117.65.107:52851'
        # Set a random User-Agent header (fake_useragent)
        ua = UserAgent()
        request.headers['User-Agent'] = ua.random
        return None

    def process_response(self, request, response, spider):

        return response
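The commented-out proxy line can be turned into a simple proxy rotation; a minimal sketch, where PROXY_LIST is an assumed custom setting holding 'http://ip:port' strings:

import random


class RandomProxyMiddleware():

    def __init__(self, proxies):
        self.proxies = proxies

    @classmethod
    def from_crawler(cls, crawler):
        # PROXY_LIST is a hypothetical setting,
        # e.g. PROXY_LIST = ['http://122.117.65.107:52851', ...]
        return cls(crawler.settings.getlist('PROXY_LIST'))

    def process_request(self, request, spider):
        if self.proxies:
            # Scrapy's built-in HttpProxyMiddleware picks up request.meta['proxy']
            request.meta['proxy'] = random.choice(self.proxies)
        return None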

1.2 Spider middleware

  • settings configuration
SPIDER_MIDDLEWARES = {
   # 'TestSpider.middlewares.TestspiderSpiderMiddleware': 543,
    'TestSpider.middlewares.BiqugeSpiderMiddleware': 543,
}
  • The method called whenever the spider yields a Request or an Item

from scrapy import Request

from TestSpider.items import BiqugeSpiderItem


class BiqugeSpiderMiddleware():

    def process_spider_output(self, response, result, spider):
        # Called whenever the spider yields a Request or an Item
        for i in result:
            # The spider yielded a Request object
            if isinstance(i, Request):
                yield i
            # The spider yielded an Item object
            if isinstance(i, BiqugeSpiderItem):
                # TODO: clean up the content of i
                # count = 0
                i['content'] = str(i['content']).replace('\\xa0', '')
                i['content'] = i['content'].replace('\\r', '').replace("', '',", '').replace('"', '')
                temp = i['content']
                i['content'] = ''
                for x in temp:
                    if x != '[' and x != ']' and x != "'":
                        i['content'] += x
                print(i['content'])
                # print(count)
                yield i
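The character-by-character loop above strips the list syntax that str() introduced. A shorter alternative, sketched below, is to join the extracted list first so that no '[', ']' or quote characters appear in the first place (the stripped characters are the same as in the original):

def clean_content(parts):
    # parts is the list produced by .extract() in parse_detail
    text = ''.join(parts)
    for ch in ('\xa0', '\r', '"'):
        text = text.replace(ch, '')
    return text

# inside process_spider_output:
# i['content'] = clean_content(i['content'])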

2. Connecting to a database

Database connection preparation

  • Define the item model in the items file
class BiqugeSpiderItem(scrapy.Item):
    content = scrapy.Field()
    name = scrapy.Field()
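Item instances support dict-style access, which is what the spider and the pipelines rely on later (for example dict(item) in MongoDBPipeline); a quick illustration:

from TestSpider.items import BiqugeSpiderItem

item = BiqugeSpiderItem()
item['name'] = '第一章'
item['content'] = ['chapter text']
print(dict(item))  # {'name': '第一章', 'content': ['chapter text']}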
  • Create the item object in the spider file
    def parse_detail(self, response):

        sel = Selector(response)
        item = BiqugeSpiderItem()
        # Parse callback: the content can be cleaned here before the item is returned
        item['content'] = sel.xpath('//*[@id="content"]/text()').extract()
        item['name'] = sel.xpath('//*[@class="content"]/h1/text()').extract_first()
        yield item
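parse_detail is only reached through a callback from the spider's main parse method; a minimal sketch of such a method, assuming the chapter list sits under a hypothetical //div[@id="list"] XPath:

    def parse(self, response):
        # The XPath is an assumption about the chapter-list page;
        # adjust it to the real page structure.
        for href in response.xpath('//div[@id="list"]//a/@href').extract():
            # response.follow resolves relative links and builds the Request
            yield response.follow(href, callback=self.parse_detail)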

2.1 Connecting to MongoDB

  • Configure the pipeline in settings
ITEM_PIPELINES = {
   # 'TestSpider.pipelines.TestspiderPipeline': 300,
     'TestSpider.pipelines.MongoDBPipeline': 300,
   # 'TestSpider.pipelines.MysqlPipeline': 300,
}
  • Add the settings parameters
# MongoDB configuration
MongoDB_HOST = '127.0.0.1'
MongoDB_PORT = 27017
MongoDB_PASSWORD = '123456'
MongoDB_DB = 'spider'
  • Database connection in pipelines

import pymongo

from TestSpider.settings import MongoDB_HOST, MongoDB_PORT, MongoDB_PASSWORD, MongoDB_DB


class MongoDBPipeline():
    # Persist the scraped data
    def __init__(self, mongo_host, mongo_port, mongo_password, mongo_db):
        self.mongo_host = mongo_host
        self.mongo_port = mongo_port
        self.mongo_password = mongo_password
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Return a MongoDBPipeline instance
        return cls(
            mongo_host=MongoDB_HOST,
            mongo_port=MongoDB_PORT,
            mongo_password=MongoDB_PASSWORD,
            mongo_db=MongoDB_DB
        )

    def open_spider(self, spider):
        # Connect to MongoDB
        # (password is usually paired with a username; if the local MongoDB
        #  has authentication disabled, both can be left out)
        self.client = pymongo.MongoClient(host=self.mongo_host,
                                          port=self.mongo_port,
                                          password=self.mongo_password)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        # Close the connection
        self.client.close()

    def process_item(self, item, spider):
        # Save the item in MongoDB
        # type(item) shows that item is an object:
        # <class 'TestSpider.items.BiqugeSpiderItem'>
        self.db['biquge'].insert_one(dict(item))
        return item
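After a crawl, the saved documents can be checked directly with pymongo, using the same settings; a minimal sketch:

import pymongo

client = pymongo.MongoClient(host='127.0.0.1', port=27017)
db = client['spider']
# Print one stored chapter to confirm that the pipeline wrote data
print(db['biquge'].find_one())
client.close()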

2.2 Connecting to MySQL

  • Configure the pipeline in settings
ITEM_PIPELINES = {
   # 'TestSpider.pipelines.TestspiderPipeline': 300,
   #  'TestSpider.pipelines.MongoDBPipeline': 300,
    'TestSpider.pipelines.MysqlPipeline': 300,
}
  • Add the settings parameters
# MySQL configuration
MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = 3306
MYSQL_PASSWORD = '960218'
MYSQL_USER = 'root'
MYSQL_DB = 'spider'
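The MysqlPipeline below inserts into a biquge table that has to exist beforehand; a minimal sketch that creates it (the column types are assumptions, not from the original project):

import pymysql

db = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                     password='960218', db='spider', charset='utf8')
cursor = db.cursor()
# LONGTEXT because chapter bodies can be long; name stores the chapter title
cursor.execute("""
    CREATE TABLE IF NOT EXISTS biquge (
        id INT PRIMARY KEY AUTO_INCREMENT,
        content LONGTEXT,
        name VARCHAR(255)
    ) DEFAULT CHARSET=utf8
""")
db.close()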
  • Database connection in pipelines

import pymysql


class MysqlPipeline():

    def __init__(self, host, port, user, password, database):
        self.host = host
        self.port = port
        self.user = user
        self.password = password
        self.database = database

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get("MYSQL_HOST"),
            port=crawler.settings.get("MYSQL_PORT"),
            user=crawler.settings.get("MYSQL_USER"),
            password=crawler.settings.get("MYSQL_PASSWORD"),
            database=crawler.settings.get("MYSQL_DB"),
        )

    def open_spider(self, spider):
        # Connect to the database
        self.db = pymysql.connect(host=self.host,
                                  port=self.port,
                                  user=self.user,
                                  password=self.password,
                                  db=self.database,
                                  charset='utf8')
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        # Use a parameterized query so quotes in the content do not break the SQL
        sql = "insert into biquge(content, name) values(%s, %s)"
        print(sql)
        self.cursor.execute(sql, (item['content'], item['name']))
        self.db.commit()
        return item

Note: two ways of reading the settings in from_crawler are shown below: importing the constants from settings.py directly, or fetching them with crawler.settings.get().

from TestSpider.settings import MongoDB_HOST, MongoDB_PORT, MongoDB_PASSWORD, MongoDB_DB

    @classmethod
    def from_crawler(cls, crawler):
        # Return a MongoDBPipeline instance built from the imported constants
        return cls(
            mongo_host=MongoDB_HOST,
            mongo_port=MongoDB_PORT,
            mongo_password=MongoDB_PASSWORD,
            mongo_db=MongoDB_DB
        )

    @classmethod
    def from_crawler(cls, crawler):
        # Read the values through the crawler's settings object
        return cls(
            host=crawler.settings.get("MYSQL_HOST"),
            port=crawler.settings.get("MYSQL_PORT"),
            user=crawler.settings.get("MYSQL_USER"),
            password=crawler.settings.get("MYSQL_PASSWORD"),
            database=crawler.settings.get("MYSQL_DB"),
        )

    @classmethod
    def from_crawler(cls, crawler):
        # Or pass the imported constants positionally
        return cls(
            MYSQL_HOST,
            MYSQL_PORT,
            MYSQL_USER,
            MYSQL_PASSWORD,
            MYSQL_DB,
        )
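The crawler.settings.get() style works for the MongoDB pipeline too and avoids importing settings.py into the pipeline module. Note, however, that Scrapy only copies all-uppercase names from settings.py into crawler.settings, so the Mongo keys would have to be renamed (for example MONGODB_HOST instead of MongoDB_HOST); a minimal sketch under that assumption:

    @classmethod
    def from_crawler(cls, crawler):
        # Assumes the settings were renamed to all-uppercase keys,
        # e.g. MONGODB_HOST = '127.0.0.1' in settings.py
        return cls(
            mongo_host=crawler.settings.get('MONGODB_HOST'),
            mongo_port=crawler.settings.get('MONGODB_PORT'),
            mongo_password=crawler.settings.get('MONGODB_PASSWORD'),
            mongo_db=crawler.settings.get('MONGODB_DB'),
        )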
    