1 Nested Crawling
Nested crawling is the pattern in which the spider, while scraping data, extracts the URLs of new detail pages and hands them back to the engine, and the engine then schedules those detail pages for crawling as well. This example crawls the listing pages of the U17 (有妖气) comic site, follows each comic to its detail page, and then extracts the information for every chapter.
import json
import re

import scrapy

# item classes defined in items.py below (the project package name u17 is an assumption)
from u17.items import U17Item, U17DetailItem


class ComicSpider(scrapy.Spider):
    name = 'comic'
    allowed_domains = ['www.u17.com']
    start_urls = ['http://www.u17.com/']

    def get_headers(self):
        headers = {
            'Referer': 'http://www.u17.com/comic/ajax.php?mod=comic_list&act=comic_list_new_fun&a=get_comic_list',
            'User-Agent': "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            'Host': 'www.u17.com',
            'Accept': 'application/json, text/javascript, */*;',  # Accept value copied from the request headers seen in the browser
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
            'Connection': 'keep-alive',
            'X-Requested-With': 'XMLHttpRequest',
        }
        return headers

    def start_requests(self):
        headers = self.get_headers()
        url = 'http://www.u17.com/comic/ajax.php?mod=comic_list&act=comic_list_new_fun&a=get_comic_list'
        data = {'data[group_id]': 'no', 'data[theme_id]': 'no', 'data[is_vip]': 'no', 'data[accredit]': 'no',
                'data[color]': 'no', 'data[comic_type]': 'no', 'data[series_status]': '1', 'data[order]': '1',
                'data[page_num]': '1', 'data[read_mode]': 'no'
                }
        for page in range(200):
            data['data[page_num]'] = str(page)
            print(page)
            yield scrapy.FormRequest(url=url,
                                     headers=headers,
                                     method='POST',
                                     formdata=data,
                                     callback=self.parse,
                                     )

    def parse(self, response):
        result = json.loads(response.text)
        result_list = result['comic_list']
        pattern = re.compile('<font.*?>(.*?)<.*?', re.S)
        for item in result_list:
            u17_item = U17Item()
            u17_item['comic_id'] = item['comic_id']
            u17_item['name'] = item['name']
            u17_item['cover'] = item['cover']
            u17_item['category'] = re.findall(pattern, item['line2'])[0]
            yield u17_item
            detail_url = 'http://www.u17.com/comic/%s.html' % item['comic_id']
            yield scrapy.Request(url=detail_url,
                                 headers=self.get_headers(),
                                 method='GET',
                                 callback=self.parse_detail,  # callback that handles the detail page
                                 )

    def parse_detail(self, response):
        chapter_list = response.selector.css('#chapter').xpath('.//a')
        comic_id = response.url.split('/')[-1].split('.')[0]
        for chapter in chapter_list:
            detail_item = U17DetailItem()
            detail_item['comic_id'] = comic_id
            detail_item['title'] = chapter.xpath('./@title').extract_first()
            detail_item['link'] = chapter.xpath('./@href').extract_first()
            yield detail_item
The spider file defines two kinds of requests and two methods for processing the crawled data, handling the listing page and the detail pages respectively.
import scrapy


class U17Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    collection_name = 'u17'  # used by the pipelines to tell which item class this is
    comic_id = scrapy.Field()
    name = scrapy.Field()
    cover = scrapy.Field()
    category = scrapy.Field()


class U17DetailItem(scrapy.Item):
    collection_name = 'u17_detail'  # used by the pipelines to tell which item class this is
    comic_id = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
items.py defines two classes for holding the data. collection_name is a plain class attribute used to tell the two classes apart; it is not a Field, so it is never written to the database and does not need to be set in the spider.
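A quick sketch of how the two kinds of attributes behave (the values are made up for illustration, and the import assumes the project package is named u17):

from u17.items import U17Item  # package name is an assumption

item = U17Item(comic_id='123', name='demo', cover='http://example.com/cover.jpg', category='demo')
print(item.collection_name)       # 'u17'  -- class attribute, read with dot notation
print('collection_name' in item)  # False  -- not a Field, so it never appears in the item's data
print(dict(item))                 # only the four Field values would reach the database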
import pymysql
import pymongo
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class U17MysqlPipeline(object):
    def __init__(self, host, port, username, password, database):
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        self.database = database

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            port=crawler.settings.get('MYSQL_PORT'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            username=crawler.settings.get('MYSQL_USERNAME'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
        )

    def open_spider(self, spider):
        self.db = pymysql.connect(host=self.host, user=self.username, password=self.password,
                                  database=self.database, charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        if item.collection_name == 'u17':
            sql = 'insert into manhua (comic_id, name, cover, category) values (%s,%s,%s,%s)'
            self.cursor.execute(sql, (item['comic_id'], item['name'], item['cover'], item['category']))
        else:
            sql = 'insert into detail_manhua (comic_id, title, link) values (%s,%s,%s)'
            self.cursor.execute(sql, (item['comic_id'], item['title'], item['link']))
        self.db.commit()
        return item


class U17MongoPipeline(object):
    def __init__(self, uri, database):
        self.uri = uri
        self.database = database

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            uri=crawler.settings.get('MONGO_URI'),
            database=crawler.settings.get('MONGO_DB'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.database]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db[item.collection_name].insert_one(dict(item))
        return item


class U17ImagePipeline(ImagesPipeline):
    # build the file name the cover image will be saved under
    def file_path(self, request, response=None, info=None):
        url = request.url
        file_name = url.split('/')[-1]
        return file_name

    # drop items whose cover image failed to download
    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Downloaded Failed')
        return item

    # turn the image URL into a download Request
    def get_media_requests(self, item, info):
        if item.collection_name == 'u17':
            yield Request(item['cover'])
        else:
            pass
pipelines.py defines three pipelines that process the two kinds of items.
if item.collection_name == 'u17': lets the MySQL pipeline check whether the item is the u17 class defined in items.py; because collection_name is a class attribute rather than a Field, it is accessed with dot notation.
self.db[item.collection_name].insert_one(dict(item)) tells the MongoDB pipeline which collection to write to; the value of collection_name must match the name of the target collection.
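These pipelines also need to be enabled in settings.py, together with the connection settings that their from_crawler methods read. A possible sketch (the module path u17.pipelines, the priorities and all concrete values are assumptions; only the MYSQL_* / MONGO_* names come from the code above, and ITEM_PIPELINES / IMAGES_STORE are standard Scrapy settings):

ITEM_PIPELINES = {
    'u17.pipelines.U17ImagePipeline': 300,
    'u17.pipelines.U17MysqlPipeline': 301,
    'u17.pipelines.U17MongoPipeline': 302,
}
IMAGES_STORE = './images'  # where ImagesPipeline stores the downloaded covers

MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_DATABASE = 'u17'
MYSQL_USERNAME = 'root'
MYSQL_PASSWORD = 'your_password'

MONGO_URI = 'mongodb://localhost:27017'
MONGO_DB = 'u17'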
2 Middleware
This middleware example crawls JD (京东). Before crawling, add or change the following settings:
ROBOTSTXT_OBEY = False
DOWNLOADER_MIDDLEWARES = {
    'jd.middlewares.SeleniumMiddleware': 543,
}
FEED_EXPORT_ENCODING = 'utf-8'
KEYWORDS = ['鞋子', '帽子', '电脑']
SELENIUM_TIMEOUT = 10
KEYWORDS lists the search terms to crawl; SELENIUM_TIMEOUT is the explicit-wait timeout used by the Selenium middleware below.
2.1 Spider Requests
import scrapy
from urllib.parse import urlencode
from jd.items import JdItem


class QiangSpider(scrapy.Spider):
    name = 'qiang'
    allowed_domains = ['search.jd.com']
    start_urls = 'https://search.jd.com/Search?'

    def start_requests(self):
        # loop over the products to search for
        for keyword in self.settings.get('KEYWORDS'):
            data = {'keyword': keyword, 'wq': keyword, 'enc': 'utf-8'}
            param_str = urlencode(data)
            url = self.start_urls + param_str
            # loop over the number of result pages per product
            for page in range(1, 10):
                yield scrapy.Request(url=url, callback=self.parse,
                                     meta={'page': page + 1}, dont_filter=True)

    def parse(self, response):
        gl_items = response.selector.xpath('//div[@id="J_goodsList"]//li[@class="gl-item"]')
        for gl_item in gl_items:
            jd_item = JdItem()
            img_src = ''.join(gl_item.xpath('.//div[@class="p-img"]/a/img/@src').extract_first())
            title = ''.join(gl_item.xpath('.//div[@class="p-name p-name-type-2"]//em//text()').extract())
            jd_item['title'] = title
            jd_item['img_src'] = img_src
            yield jd_item
callback=self.parse names the callback, i.e. the method that will process the downloaded data.
meta={'page': page + 1} attaches the page number to the request; meta is how extra data is passed along with a request.
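The meta dict travels with the request and is reachable again from the response built for it, which is how the Selenium middleware below knows which result page is wanted. A minimal, self-contained sketch of that round trip (URL and values are placeholders):

from scrapy.http import Request, HtmlResponse

# the spider attaches the page number to the outgoing request
request = Request(url='https://search.jd.com/Search?keyword=x', meta={'page': 2})
print(request.meta['page'])   # 2 -- this is what the downloader middleware reads

# once a response is built for that request, meta is reachable from it as well
response = HtmlResponse(url=request.url, body=b'<html></html>', request=request)
print(response.meta['page'])  # 2 -- this is what the callback sees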
2.2 Middleware
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.http import HtmlResponse
from scrapy import signals
import time


class JdSpiderMiddleware(object):
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class SeleniumMiddleware(object):
    def __init__(self, timeout=None, service_args=[]):
        self.timeout = timeout
        chromeOptions = webdriver.ChromeOptions()
        self.browser = webdriver.Chrome(options=chromeOptions)
        self.browser.set_window_size(1400, 700)
        self.wait = WebDriverWait(self.browser, self.timeout)

    def __del__(self):
        self.browser.close()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'))

    def process_request(self, request, spider):
        page = request.meta.get('page', 1)
        if page == 1:
            self.browser.get(request.url)
        # scroll through the page a fixed number of times so the lazily loaded goods appear
        str_js = 'var scrollHeight = document.body.scrollHeight;window.scrollTo(0, scrollHeight);'
        self.browser.execute_script(str_js)
        for i in range(16, 0, -1):
            str_js = 'var scrollHeight = document.body.scrollHeight;window.scrollTo(0, (%d * scrollHeight) / 16);' % i
            time.sleep(2)
            self.browser.execute_script(str_js)
        html = self.browser.page_source
        # prepare to click through to the next page:
        # scroll the pagination controls into view
        input = self.browser.find_element(By.CSS_SELECTOR, '#J_bottomPage input.input-txt')
        str_js = 'var scrollHeight = document.body.scrollHeight;window.scrollTo(0, %d);' % (input.location['y'] - 50)
        self.browser.execute_script(str_js)
        time.sleep(1)
        # type in the page number
        input = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#J_bottomPage input.input-txt')))
        input.clear()
        input.send_keys(page + 1)
        # click the "next page" button
        submit = self.wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_bottomPage .btn.btn-default')))
        submit.click()
        return HtmlResponse(url=request.url, body=html, request=request,
                            encoding='utf-8', status=200)

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
2.3 Items for Saving the Data
import scrapy


class JdItem(scrapy.Item):
    title = scrapy.Field()
    img_src = scrapy.Field()
2.4 Pipeline Data Processing
class JdPipeline(object):
    def process_item(self, item, spider):
        return item
When scraping with Scrapy and a custom downloader middleware is needed, the class to modify is usually the generated class JdDownloaderMiddleware(object): rename it to your own class, here SeleniumMiddleware (the name conventionally ends in Middleware). Then define __init__ (which opens the browser and accepts the timeout parameter that controls how long the browser waits), define __del__ (called when the SeleniumMiddleware instance is no longer needed, so the browser gets closed), and make small changes to the from_crawler classmethod. The main work goes into process_request, which controls the page scrolling and selects which result page to crawl. Finally, register the middleware in settings (DOWNLOADER_MIDDLEWARES, as shown at the start of section 2).
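Stripped of the Selenium details, the skeleton that paragraph describes looks roughly like this; it is a sketch rather than the project's actual code, reusing the class and setting names from the example above:

from scrapy.http import HtmlResponse


class SeleniumMiddleware(object):
    def __init__(self, timeout=None):
        # open the browser or other expensive resources here
        self.timeout = timeout

    def __del__(self):
        # release those resources (e.g. close the browser) when the instance is discarded
        pass

    @classmethod
    def from_crawler(cls, crawler):
        # pull configuration out of settings.py
        return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'))

    def process_request(self, request, spider):
        # render / scroll / paginate here, then short-circuit the normal download
        # by returning a response built from the rendered page source
        return HtmlResponse(url=request.url, body='<html></html>',
                            encoding='utf-8', request=request, status=200)

Because process_request returns a Response object, Scrapy skips its own downloader for that request, and the spider's callback still receives an ordinary HtmlResponse.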