Environment:
- Windows 10
- python=3.7.2
- scrapy=2.4.1
Purpose:
- Crawl all images on a given page
- Specify the XPath of the element that contains the images
- Download according to the image type (jpg/png/gif)
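Usage: Scrapy passes -a key=value command-line arguments to the spider's __init__, so the page URL and the file-name prefix can be supplied when launching the spider shown below (run from the project directory), for example:

scrapy crawl scrapy_spider -a web_url=<article URL> -a new_name=<file name prefix>

If the arguments are omitted, the spider asks for them interactively via input().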
Spider (scrapy_spider.py):
# -*- coding: utf-8 -*-
import re
import scrapy
from ..items import OneItem
'''
Generic spider: crawl the images on a page and download them to the "OUT" folder.
XPath for sohu.com articles:
//*[@class="article"]
XPath for mp.weixin.qq.com articles:
//*[@id="img-content"]
'''
class ScrapySpiderSpider(scrapy.Spider):
name = 'scrapy_spider'
web_url = None
new_name = None
allowed_domains = None
start_urls = None
xpath_url = None
def __init__(self, web_url='', new_name='', *args, **kwargs):
        super().__init__(*args, **kwargs)
self.web_url = web_url
self.new_name = new_name
if self.web_url == '':
            print('Enter the page URL:')
            self.web_url = input()
            print('URL received.')
        # The URL to crawl
self.start_urls = [self.web_url]
try:
yuming = re.match(r'https://(.*?)/', self.start_urls[0]).group(1)
except Exception:
yuming = re.match(r'http://(.*?)/', self.start_urls[0]).group(1)
        # Extract the domain name for allowed_domains
self.allowed_domains = [yuming]
self.allowed_domains.append('*.*.*')
if self.web_url.find('www.sohu.com') != -1:
self.xpath_url = '//*[@class="article"]'
elif self.web_url.find('mp.weixin.qq.com') != -1:
self.xpath_url = '//*[@id="img-content"]'
else:
            print('Enter the XPath of the element that contains the images:')
self.xpath_url = input()
        if self.new_name == '':
            print('Enter a file name prefix for the saved images:')
            self.new_name = input()
def parse(self, response):
        # Collect the image URLs
        if self.start_urls[0].find('mp.weixin.qq.com') != -1:
            # WeChat articles lazy-load images; the real URL is in data-src
            img_list = response.xpath('{}//img/@data-src'.format(self.xpath_url)).extract()
else:
img_list = response.xpath('{}//img/@src'.format(self.xpath_url)).extract()
        for i in range(len(img_list)):
            if img_list[i].find('https://') != -1 or img_list[i].find('http://') != -1:
                item = OneItem(src=img_list[i], sx=self.new_name + str(i))
            else:
                # Protocol-relative URLs start with "//"
                if img_list[i][:2] == '//':
                    img_list[i] = 'http:' + img_list[i]
                else:
                    img_list[i] = 'https://' + self.allowed_domains[0] + img_list[i]
                item = OneItem(src=img_list[i], sx=self.new_name + str(i))
            yield item
Spider (scrapy_spider.py)
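The spider imports OneItem from items.py, which is not listed in this post. A minimal definition matching the two fields the code uses (src for the image URL, sx for the target file name) would look like this sketch (assumed, since the original items.py is not shown):

# items.py (assumed minimal version; the original file is not included here)
import scrapy

class OneItem(scrapy.Item):
    # URL of the image to download
    src = scrapy.Field()
    # file name prefix + index; the pipeline reads it from request.meta['name']
    sx = scrapy.Field()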
Pipelines (pipelines.py):
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.misc import md5sum
import os
class OnePipeline(object):
def process_item(self, item, spider):
return item
# Rename the downloaded files by overriding ImagesPipeline methods
class Imaspipline(ImagesPipeline):
    def get_media_requests(self, item, info):
yield scrapy.Request(item['src'], meta={'name': item['sx']}, dont_filter=True)
    # Copy of Scrapy's default MediaPipeline._process_request, overridden here
def _process_request(self, request, info, item):
        import logging
        from twisted.internet.defer import Deferred
        from scrapy.utils.defer import mustbe_deferred, defer_result
        from scrapy.utils.request import request_fingerprint
        from scrapy.utils.log import failure_to_exc_info
        logger = logging.getLogger(__name__)
fp = request_fingerprint(request)
cb = request.callback or (lambda _: _)
eb = request.errback
request.callback = None
request.errback = None
        # The duplicate check below is commented out so that the same image URL
        # can be downloaded again instead of being served from the cache.
        # if fp in info.downloaded:
        #     return defer_result(info.downloaded[fp]).addCallbacks(cb, eb)
# Otherwise, wait for result
wad = Deferred().addCallbacks(cb, eb)
info.waiting[fp].append(wad)
        # Also commented out: the check that skips a URL that is already downloading.
        # if fp in info.downloading:
        #     return wad
# Download request checking media_to_download hook output first
info.downloading.add(fp)
dfd = mustbe_deferred(self.media_to_download, request, info)
dfd.addCallback(self._check_media_to_download, request, info, item)
dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)
dfd.addErrback(lambda f: logger.error(
f.value, exc_info=failure_to_exc_info(f), extra={'spider': info.spider})
)
return dfd.addBoth(lambda _: wad) # it must return wad at last
def file_path(self, request, response=None, info=None):
img_name = str(request.meta['name'])
img_name_hz = request.url.split('/')[-1]
        # WeChat image URLs end with the format name (e.g. ...=jpeg)
if img_name_hz[-3:] == 'peg':
# file_name=img_name+'.'+img_name_hz[-4:]
file_name = img_name + '.jpg'
elif img_name_hz[-3:] == 'jpg' or img_name_hz[-3:] == 'gif':
file_name = img_name + '.' + img_name_hz[-3:]
elif img_name_hz[-3:] == 'png':
file_name = img_name + '.png'
else:
            # Other sites: check whether the URL contains .gif
if img_name_hz.find('.gif') != -1:
file_name = img_name + '.gif'
            # Save as png
elif img_name_hz.find('.png') != -1:
file_name = img_name + '.png'
else:
file_name = img_name + '.jpg'
return file_name
def item_completed(self, results, item, info):
# print(results)
return item
    # Save GIF bytes to disk unmodified
    # (based on https://www.cnblogs.com/lxy-jdsy/p/10389103.html)
def persist_gif(self, key, data, info):
root, ext = os.path.splitext(key)
absolute_path = self.store._get_filesystem_path(key)
self.store._mkdir(os.path.dirname(absolute_path), info)
with open(absolute_path, 'wb') as f:
f.write(data)
    # Decide whether the file is a regular image or a GIF
def check_gif(self, image, response):
        # For anything that is not a jpg, image.format is None here
if image.format is None:
            # For WeChat links, check whether the URL ends with gif
if str(response.url)[-3:] == 'gif':
return True
else:
                # WeChat image that is not a gif
if str(response.url).find('mmbiz.qpic.cn') != -1:
return False
                # Non-WeChat image: download gif or png as-is
else:
return True
else:
return False
def image_downloaded(self, response, request, info):
checksum = None
for path, image, buf in self.get_images(response, request, info):
if checksum is None:
buf.seek(0)
checksum = md5sum(buf)
width, height = image.size
if self.check_gif(image, response):
                # GIF: save the raw response body
self.persist_gif(path, response.body, info)
else:
                # Default ImagesPipeline persistence
self.store.persist_file(path, buf, info, meta={'width': width, 'height': height}, headers={'Content-Type': 'image/jpeg'})
return checksum
Pipelines (pipelines.py)
Middlewares (middlewares.py):
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class OneSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
    def __init__(self, agents=None):
        self.agent = agents
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
# return cls(agents=crawler.settings.get('CUSTOM_USER_AGENT'))
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class OneDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
        # Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
from fake_useragent import UserAgent
# Add a random User-Agent header to every request
class RandomUserAgentMiddleware(object):
def __init__(self):
self.agent = UserAgent()
@classmethod
def from_crawler(cls, crawler):
return cls()
    def process_request(self, request, spider):
        request.headers.setdefault('User-Agent', self.agent.random)
Middlewares (middlewares.py)
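One caveat with fake_useragent: UserAgent() builds its User-Agent pool from an online source, so it can raise at start-up when that source is unreachable. Depending on the installed fake-useragent version a fallback string can be supplied so the crawl still starts; a sketch of a more defensive constructor (not part of the original code; the fallback keyword is the one used by fake-useragent 0.1.x, other versions may differ):

# Sketch: RandomUserAgentMiddleware constructor with a fallback User-Agent
from fake_useragent import UserAgent

class RandomUserAgentMiddleware(object):
    def __init__(self):
        # Use a fixed UA string if the online UA database cannot be fetched
        self.agent = UserAgent(fallback='Mozilla/5.0 (Windows NT 10.0; Win64; x64)')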
Settings (settings.py):
# -*- coding: utf-8 -*-
# Scrapy settings for one project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'one'
SPIDER_MODULES = ['one.spiders']
NEWSPIDER_MODULE = 'one.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
from .spiders.scrapy_spider import ScrapySpiderSpider
DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
'Referer': ScrapySpiderSpider.web_url,
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'one.middlewares.OneSpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
'one.middlewares.RandomUserAgentMiddleware': 10,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
# 'one.pipelines.OnePipeline': 300,
'one.pipelines.Imaspipline': 300,
}
# Where downloaded images are saved
IMAGES_STORE = 'D:\\untitled1\\out\\IMG\\'
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 1
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Settings (settings.py)
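A note on the Referer header above: DEFAULT_REQUEST_HEADERS is read when the settings module is imported, at which point ScrapySpiderSpider.web_url is still the class-level None, and Scrapy ignores None-valued default headers, so no Referer is actually sent. If a target site checks the Referer, a more reliable option is to set it on the image requests themselves, for example in Imaspipline.get_media_requests (a sketch, not the original code):

# Sketch: set the Referer per image request instead of in settings.py
def get_media_requests(self, item, info):
    # info.spider is the running spider instance, so web_url is the URL given at runtime
    yield scrapy.Request(
        item['src'],
        meta={'name': item['sx']},
        headers={'Referer': info.spider.web_url},
        dont_filter=True,
    )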