day09 - Distributed Scrapy

1. Master - crawling the "I want to buy a car" listing pages on guazi.com

  • scrapy startproject dbspider
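  • For reference, the command generates the standard Scrapy project layout; the files edited in the steps below are items.py, middlewares.py, pipelines.py, settings.py and the spiders/ directory (abridged, __init__.py files omitted):

dbspider/
    scrapy.cfg
    dbspider/
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            guazi.py        # written by hand in the next step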

1.1 Spider code

  • guazi.py under spiders/
"""__author__= 雍新有"""
from scrapy import Selector, Spider, Request

from dbspider.items import VehicleItem


class GuaZiSpider(Spider):
    name = 'guazi'
    guazi_urls = 'https://www.guazi.com/cd/buy/o{page}/#bread'

    def start_requests(self):
        for i in range(1, 51):
            print(self.guazi_urls.format(page=i))
            yield Request(url=self.guazi_urls.format(page=i),
                          callback=self.parse_guazi)

    def parse_guazi(self, response):
        sel = Selector(response)
        vehicles = sel.xpath('/html/body/div[6]/ul/li')
        for car in vehicles:
            item = VehicleItem()
            url_prefix = 'https://www.guazi.com'
            item['href'] = url_prefix + car.xpath('./a/@href').extract_first()
            item['name'] = car.xpath('./a/@title').extract_first()
            yield item
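
  • With the middleware and pipeline described below in place, the master spider is started from the project root in the usual way:

scrapy crawl guazi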

1.2 Rewriting the downloader middleware

  • response.text from a plain download does not contain the rendered page source, so Selenium is used to fetch it
  • Update settings
DOWNLOADER_MIDDLEWARES = {
   # 'dbspider.middlewares.DbspiderDownloaderMiddleware': 543,
    'dbspider.middlewares.SeleniumMiddleware': 543,
}
  • Write SeleniumMiddleware in middlewares.py

import time

from scrapy.http import HtmlResponse
from selenium import webdriver


class SeleniumMiddleware():

    def __init__(self):
        # Launch the browser
        self.browser = webdriver.Chrome()

    def __del__(self):
        # Quit the browser automatically when the middleware is destroyed
        self.browser.quit()

    def process_request(self, request, spider):
        # request.url is the requested address, e.g. https://www.guazi.com/cd/buy/
        self.browser.get(request.url)
        # TODO: if the vehicle info in the fetched source is incomplete, run JS here
        # Scroll the page step by step with JS
        # window.scrollTo(0, 3 * document.body.scrollHeight)
        for i in range(1, 9):
            js = 'window.scrollTo(0, {} * document.body.scrollHeight/8)'.format(i)
            self.browser.execute_script(js)
            time.sleep(1)
        html = self.browser.page_source
        # page_source is a str; HtmlResponse encodes it with the given encoding
        # Returning a Response here tells Scrapy to skip the real download
        # and hand this response straight to the parse callback
        return HtmlResponse(url=request.url,
                            body=html,
                            encoding='utf-8')
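
  • If the crawl has to run on a machine without a display, Chrome can usually be started headless instead. A minimal sketch, assuming a Chrome/chromedriver new enough to support --headless (the exact option and keyword names depend on the installed Selenium version):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')      # run Chrome without opening a window
options.add_argument('--disable-gpu')
browser = webdriver.Chrome(options=options)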

1.3 Storing the data in Redis - only the href

  • Define the item model
  • The spider code above already creates a VehicleItem object and yields it
import scrapy


class VehicleItem(scrapy.Item):
    name = scrapy.Field()
    href = scrapy.Field()
  • Update settings
ITEM_PIPELINES = {
   # 'dbspider.pipelines.DbspiderPipeline': 300,
   #  'dbspider.pipelines.MySQLPipeline': 300,
    'dbspider.pipelines.RedisPipeline': 300,
}
  • Create RedisPipeline in pipelines.py - it writes the data into Redis,
    just like the MySQL and MongoDB pipelines, but to keep it simple we only implement process_item

import redis


class RedisPipeline():

    def process_item(self, item, spider):
        # Create the Redis connection (one per item, for simplicity)
        db = redis.Redis(host='127.0.0.1', port=6379)
        db.lpush('guazi:start_urls', item['href'])
        return item
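
  • To check that the hrefs really arrive, the list can be inspected with a few lines of redis-py (same connection parameters as in the pipeline):

import redis

db = redis.Redis(host='127.0.0.1', port=6379)
print(db.llen('guazi:start_urls'))           # number of queued detail-page URLs
print(db.lrange('guazi:start_urls', 0, 4))   # first few entries, returned as bytes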

2. Slave - crawling the detail pages of Guazi used cars

  • scrapy startproject slavedbspider
  • pip install scrapy_redis - provides the distributed pieces

2.1 Spider code

  • guazi.py under spiders/ - pulls the used-car URLs that the master stored in Redis
"""__author__= 雍新有"""
from scrapy import Selector
from scrapy_redis.spiders import RedisSpider

from slavedbspider.items import SlavedbspiderItem


class GuaziSpider(RedisSpider):

    name = 'guazi'
    # The slave pulls URLs from a Redis list instead of from start_urls
    # start_urls = []
    redis_key = 'guazi:start_urls'

    def parse(self, response):
        sel = Selector(response)
        item = SlavedbspiderItem()
        item['info1'] = sel.xpath('//*[@class="basic-eleven clearfix"]/li[1]/div/text()').extract_first()
        item['info2'] = sel.xpath('//*[@class="basic-eleven clearfix"]/li[2]/div/text()').extract_first()
        item['info3'] = sel.xpath('//*[@class="basic-eleven clearfix"]/li[3]/div/text()').extract_first()
        item['info4'] = sel.xpath('//*[@class="basic-eleven clearfix"]/li[4]/div/text()').extract_first()
        item['info5'] = sel.xpath('//*[@class="basic-eleven clearfix"]/li[5]/div/text()').extract_first()
        item['info6'] = sel.xpath('//*[@class="basic-eleven clearfix"]/li[6]/div/text()').extract_first()
        item['info7'] = sel.xpath('//*[@class="basic-eleven clearfix"]/li[7]/div/text()').extract_first()
        item['info8'] = sel.xpath('//*[@class="basic-eleven clearfix"]/li[8]/div/text()').extract_first()
        item['info9'] = sel.xpath('//*[@class="basic-eleven clearfix"]/li[9]/div/text()').extract_first()
        yield item
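
  • When started with scrapy crawl guazi, the slave does not stop when idle: it keeps reading guazi:start_urls and issues a request for every URL the master pushes. For a quick test, a single URL can be pushed by hand - the detail-page URL below is only a placeholder:

import redis

db = redis.Redis(host='127.0.0.1', port=6379)
# placeholder - replace with a real detail-page URL pushed by the master
db.lpush('guazi:start_urls', 'https://www.guazi.com/cd/xxxxxx.htm')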

2.2 Rewriting the downloader middleware - same as on the master

  • response.text from a plain download does not contain the rendered page source, so Selenium is used to fetch it
  • Update settings
DOWNLOADER_MIDDLEWARES = {
   # 'slavedbspider.middlewares.SlavedbspiderDownloaderMiddleware': 543,
    'slavedbspider.middlewares.SeleniumMiddleware': 543,
}
  • Write SeleniumMiddleware in middlewares.py

import time

from scrapy.http import HtmlResponse
from selenium import webdriver


class SeleniumMiddleware():

    def __init__(self):
        # Launch the browser
        self.browser = webdriver.Chrome()

    def __del__(self):
        # Quit the browser automatically when the middleware is destroyed
        self.browser.quit()

    def process_request(self, request, spider):
        # request.url is the requested address, e.g. https://www.guazi.com/cd/buy/
        self.browser.get(request.url)
        # TODO: if the vehicle info in the fetched source is incomplete, run JS here
        # Scroll the page step by step with JS
        # window.scrollTo(0, 3 * document.body.scrollHeight)
        for i in range(1, 9):
            js = 'window.scrollTo(0, {} * document.body.scrollHeight/8)'.format(i)
            self.browser.execute_script(js)
            time.sleep(1)
        html = self.browser.page_source
        # page_source is a str; HtmlResponse encodes it with the given encoding
        # Returning a Response here tells Scrapy to skip the real download
        # and hand this response straight to the parse callback
        return HtmlResponse(url=request.url,
                            body=html,
                            encoding='utf-8')

2.3 Storing the used-car details in Redis

  • Define the item model - the spider creates item = SlavedbspiderItem()

import scrapy


class SlavedbspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    info1 = scrapy.Field()
    info2 = scrapy.Field()
    info3 = scrapy.Field()
    info4 = scrapy.Field()
    info5 = scrapy.Field()
    info6 = scrapy.Field()
    info7 = scrapy.Field()
    info8 = scrapy.Field()
    info9 = scrapy.Field()
  • Update the settings
  • This is the pipeline that ships with scrapy_redis
ITEM_PIPELINES = {
   # 'slavedbspider.pipelines.SlavedbspiderPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 300
}
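  • With this pipeline, scrapy_redis serialises every item to JSON and pushes it into a Redis list; by default the key is '%(spider)s:items', i.e. guazi:items for this spider. A small sketch for reading the results back, assuming that default key:

import json
import redis

db = redis.Redis(host='127.0.0.1', port=6379)
for raw in db.lrange('guazi:items', 0, 9):
    print(json.loads(raw))   # each entry is one crawled item as JSON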
  • At the end of settings.py

# scrapy_redis configuration
# Use scrapy_redis's Scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Deduplication handled by scrapy_redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
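
  • Two points that often belong here as well: SCHEDULER_PERSIST (a standard scrapy_redis option) keeps the request queue and dedup set in Redis after the spider closes, and REDIS_HOST must point at the master's Redis instance when the slave runs on another machine. Typical run order: start Redis, start the slave (scrapy crawl guazi - it waits for URLs), then start the master so it begins pushing hrefs into guazi:start_urls.

# Optional: keep the Redis request queue and dedup set across runs
SCHEDULER_PERSIST = True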