1. Master: crawl the Guazi "buy a car" listing pages
- scrapy startproject dbspider
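- The scaffold generated by startproject is the standard Scrapy layout; roughly (the spider file name guazi.py is only created later by hand):
dbspider/
    scrapy.cfg
    dbspider/
        items.py        # item models (VehicleItem below)
        middlewares.py  # SeleniumMiddleware below
        pipelines.py    # RedisPipeline below
        settings.py
        spiders/
            guazi.py    # the spider below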
1.1 Spider code
"""__author__= 雍新有"""
from scrapy import Selector, Spider, Request
from dbspider.items import VehicleItem
class GuaZiSpider(Spider):
name = 'guazi'
guazi_urls = 'https://www.guazi.com/cd/buy/o{page}/#bread'
def start_requests(self):
for i in range(1, 51):
print(self.guazi_urls.format(page=i))
yield Request(url=self.guazi_urls.format(page=i),
callback=self.parse_guazi)
def parse_guazi(self, response):
sel = Selector(response)
vehicles = sel.xpath('/html/body/div[6]/ul/li')
for car in vehicles:
item = VehicleItem()
url_prefix = 'https://www.guazi.com'
item['href'] = url_prefix + car.xpath('./a/@href').extract_first()
item['name'] = car.xpath('./a/@title').extract_first()
yield item
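- Once the middleware (1.2) and pipeline (1.3) below are in place, the master is started with scrapy crawl guazi from the dbspider project root; Redis should already be running so the pipeline can store the hrefs.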
1.2 Rewrite the downloader middleware
- response.text does not contain the JavaScript-rendered listing, so Selenium is used to fetch the full page source
- Update settings
DOWNLOADER_MIDDLEWARES = {
    # 'dbspider.middlewares.DbspiderDownloaderMiddleware': 543,
    'dbspider.middlewares.SeleniumMiddleware': 543,
}
- Write SeleniumMiddleware in middlewares.py
import time

from scrapy.http import HtmlResponse
from selenium import webdriver


class SeleniumMiddleware():

    def __init__(self):
        # Launch the browser
        self.browser = webdriver.Chrome()

    def __del__(self):
        # Close the browser automatically when the middleware is destroyed
        self.browser.close()

    def process_request(self, request, spider):
        # request.url is the address being requested, e.g. https://www.guazi.com/cd/buy/
        self.browser.get(request.url)
        # TODO: if the vehicle info in the page source is still incomplete, run more JS here
        # Scroll the page in steps so lazily loaded content gets rendered
        # window.scrollTo(0, 3 * document.body.scrollHeight)
        for i in range(1, 9):
            js = 'window.scrollTo(0, {} * document.body.scrollHeight / 8)'.format(i)
            self.browser.execute_script(js)
            time.sleep(1)
        html = self.browser.page_source
        # page_source is a str; HtmlResponse encodes it with the given encoding
        # Returning a Response here skips the downloader and hands this
        # response straight to the parse callback, parse(response)
        return HtmlResponse(url=request.url,
                            body=html,
                            encoding='utf-8')
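- Relying on __del__ to close the browser is not guaranteed; one common alternative (a sketch, not from the original notes) hooks Scrapy's spider_closed signal instead:
from scrapy import signals
from selenium import webdriver


class SeleniumMiddleware():

    def __init__(self):
        self.browser = webdriver.Chrome()

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        # Call spider_closed() when the spider finishes
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        # quit() shuts down the whole browser/driver process, not just the window
        self.browser.quit()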
1.3 Store the data in Redis - only the href
- Write the item model
- The spider code above already creates a VehicleItem object and yields it
import scrapy


class VehicleItem(scrapy.Item):
    name = scrapy.Field()
    href = scrapy.Field()
- Update settings
ITEM_PIPELINES = {
    # 'dbspider.pipelines.DbspiderPipeline': 300,
    # 'dbspider.pipelines.MySQLPipeline': 300,
    'dbspider.pipelines.RedisPipeline': 300,
}
- Create RedisPipeline in pipelines.py - it stores the data in Redis;
  the idea is the same as for MySQL and MongoDB, just with fewer methods written out for convenience
import redis


class RedisPipeline():

    def process_item(self, item, spider):
        # Connect to Redis
        db = redis.Redis(host='127.0.0.1', port=6379)
        # Push the detail-page URL onto the list the slave will consume
        db.lpush('guazi:start_urls', item['href'])
        return item
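- To confirm the master is actually feeding Redis, a quick check sketch (assumes the same local Redis as the pipeline above):
import redis

db = redis.Redis(host='127.0.0.1', port=6379)
print(db.llen('guazi:start_urls'))          # how many detail URLs are queued
print(db.lrange('guazi:start_urls', 0, 4))  # peek at the first few entries (bytes)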
2. Slave: crawl the detailed information of each Guazi used car
- scrapy startproject slavedbspider
- pip install scrapy_redis - enables distributed crawling
2.1 Spider code
- guazi.py under spiders/ - pulls the used-car detail URLs that the master stored in Redis
"""__author__= 雍新有"""
from scrapy import Selector, Request
from scrapy_redis.spiders import RedisSpider, RedisCrawlSpider
from slavedbspider.items import SlavedbspiderItem
class GuaziSpider(RedisSpider):
name = 'guazi'
# 从机需要从redis的list类型中爬取地址,而不是从start_urls中取地址
# start_urls = []
redis_key = 'guazi:start_urls'
def parse(self, response):
sel = Selector(response)
item = SlavedbspiderItem()
item['info1'] = sel.xpath('//*[@class="basic-eleven clearfix"]/li[1]/div/text()').extract_first()
item['info2'] = sel.xpath('//*[@class="basic-eleven clearfix"]/li[2]/div/text()').extract_first()
item['info3'] = sel.xpath('//*[@class="basic-eleven clearfix"]/li[3]/div/text()').extract_first()
item['info4'] = sel.xpath('//*[@class="basic-eleven clearfix"]/li[4]/div/text()').extract_first()
item['info5'] = sel.xpath('//*[@class="basic-eleven clearfix"]/li[5]/div/text()').extract_first()
item['info6'] = sel.xpath('//*[@class="basic-eleven clearfix"]/li[6]/div/text()').extract_first()
item['info7'] = sel.xpath('//*[@class="basic-eleven clearfix"]/li[7]/div/text()').extract_first()
item['info8'] = sel.xpath('//*[@class="basic-eleven clearfix"]/li[8]/div/text()').extract_first()
item['info9'] = sel.xpath('//*[@class="basic-eleven clearfix"]/li[9]/div/text()').extract_first()
yield item
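- The nine assignments above differ only in the li index, so they can be collapsed into a loop; a behavior-equivalent sketch of the same parse method:
def parse(self, response):
    sel = Selector(response)
    item = SlavedbspiderItem()
    for i in range(1, 10):
        xpath = '//*[@class="basic-eleven clearfix"]/li[{}]/div/text()'.format(i)
        item['info{}'.format(i)] = sel.xpath(xpath).extract_first()
    yield item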
2.2 Rewrite the downloader middleware - same as on the master
- response.text does not contain the rendered detail page, so Selenium is used
- Update settings
DOWNLOADER_MIDDLEWARES = {
    # 'slavedbspider.middlewares.SlavedbspiderDownloaderMiddleware': 543,
    'slavedbspider.middlewares.SeleniumMiddleware': 543,
}
- Write SeleniumMiddleware in middlewares.py
import time

from scrapy.http import HtmlResponse
from selenium import webdriver


class SeleniumMiddleware():

    def __init__(self):
        # Launch the browser
        self.browser = webdriver.Chrome()

    def __del__(self):
        # Close the browser automatically when the middleware is destroyed
        self.browser.close()

    def process_request(self, request, spider):
        # request.url is the detail-page URL taken from Redis
        self.browser.get(request.url)
        # TODO: if the detail info in the page source is still incomplete, run more JS here
        # Scroll the page in steps so lazily loaded content gets rendered
        for i in range(1, 9):
            js = 'window.scrollTo(0, {} * document.body.scrollHeight / 8)'.format(i)
            self.browser.execute_script(js)
            time.sleep(1)
        html = self.browser.page_source
        # Returning a Response here skips the downloader and hands this
        # response straight to the parse callback
        return HtmlResponse(url=request.url,
                            body=html,
                            encoding='utf-8')
2.3 Store the used-car detail information in Redis
- Write the item model - the spider creates item = SlavedbspiderItem()
import scrapy


class SlavedbspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    info1 = scrapy.Field()
    info2 = scrapy.Field()
    info3 = scrapy.Field()
    info4 = scrapy.Field()
    info5 = scrapy.Field()
    info6 = scrapy.Field()
    info7 = scrapy.Field()
    info8 = scrapy.Field()
    info9 = scrapy.Field()
- Update the settings configuration
- The pipeline below ships with scrapy_redis
ITEM_PIPELINES = {
    # 'slavedbspider.pipelines.SlavedbspiderPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 300,
}
# scrapy_redis configuration
# Use the scheduler provided by scrapy_redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Deduplication filter
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
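- With this configuration, scrapy_redis's RedisPipeline serializes each item to JSON and pushes it onto a Redis list (by default the key pattern is %(spider)s:items, i.e. guazi:items here). A sketch for reading the stored details back, assuming those defaults:
import json

import redis

db = redis.Redis(host='127.0.0.1', port=6379)
# scrapy_redis stores serialized items under "<spider name>:items" by default
for raw in db.lrange('guazi:items', 0, -1):
    print(json.loads(raw))
- Typical run order: start Redis, run the master (scrapy crawl guazi inside dbspider) so guazi:start_urls gets filled, then run the slave (scrapy crawl guazi inside slavedbspider); whenever the list is empty the RedisSpider simply idles and waits for new URLs.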