Scrapy: Crawl All Images from a Website (with a Configurable XPath)

Environment:

  1. Windows 10
  2. python=3.7.2
  3. scrapy=2.4.1

Features:

  1. Crawl every image under a given URL
  2. Restrict the crawl to a specified XPath container
  3. Save files according to the image type (jpg / png / gif)

Spider:

# -*- coding: utf-8 -*-
import re

import scrapy

from ..items import OneItem

'''
Generic spider: crawl the images on a page and download them to the "OUT" folder.

XPath for Sohu articles:
//*[@class="article"]
XPath for WeChat (mp.weixin.qq.com) articles:
//*[@id="img-content"]
'''
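# Tip (not from the original post): for any other site, inspect the element that wraps the
# images in the browser and copy its XPath (e.g. Chrome DevTools: right-click the node ->
# Copy -> Copy XPath), then paste it when the spider prompts for one.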

class ScrapySpiderSpider(scrapy.Spider):
    name = 'scrapy_spider'
    web_url = None
    new_name = None
    allowed_domains = None
    start_urls = None
    xpath_url = None

    def __init__(self, web_url='', new_name='', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.web_url = web_url
        self.new_name = new_name

        if self.web_url == '':
            print('Enter the URL to crawl:')
            self.web_url = input()
            print('URL accepted.')
        # Seed the crawl with the given URL
        self.start_urls = [self.web_url]
        # Extract the domain for allowed_domains (handles http/https, with or without a path)
        yuming = re.match(r'https?://([^/]+)', self.start_urls[0]).group(1)
        self.allowed_domains = [yuming]
        self.allowed_domains.append('*.*.*')
        if self.web_url.find('www.sohu.com') != -1:
            self.xpath_url = '//*[@class="article"]'
        elif self.web_url.find('mp.weixin.qq.com') != -1:
            self.xpath_url = '//*[@id="img-content"]'
        else:
            print('Enter the XPath of the container (e.g. a DIV) that holds the images:')
            self.xpath_url = input()
        if not self.new_name:
            print('Enter a file name prefix:')
            self.new_name = input()

    def parse(self, response):
        # Collect image URLs under the configured XPath
        if self.start_urls[0].find('mp.weixin.qq.com') != -1:
            # WeChat lazy-loads images, so the real URL is in @data-src
            img_list = response.xpath('{}//img/@data-src'.format(self.xpath_url)).extract()
        else:
            img_list = response.xpath('{}//img/@src'.format(self.xpath_url)).extract()
        for i, img_url in enumerate(img_list):
            if not (img_url.startswith('http://') or img_url.startswith('https://')):
                if img_url.startswith('//'):
                    # Protocol-relative URL
                    img_url = 'http:' + img_url
                else:
                    # Site-relative path
                    img_url = 'https://' + self.allowed_domains[0] + img_url
            yield OneItem(src=img_url, sx=self.new_name + str(i))
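
The spider imports OneItem from items.py, which the post does not show. A minimal sketch, assuming the item only needs the two fields used above (src for the image URL, sx for the target file name):

# items.py (sketch)
import scrapy

class OneItem(scrapy.Item):
    src = scrapy.Field()  # absolute image URL
    sx = scrapy.Field()   # file name (prefix + index) used by the pipeline for renaming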

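The spider takes web_url and new_name as arguments, so it can be started with
scrapy crawl scrapy_spider -a web_url=<article URL> -a new_name=<file name prefix>
(if either argument is missing, it prompts on stdin). A minimal sketch of running it programmatically instead, from the project root next to scrapy.cfg; the URL and prefix are placeholders:

# run.py (sketch)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from one.spiders.scrapy_spider import ScrapySpiderSpider

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    # keyword arguments are forwarded to the spider's __init__
    process.crawl(ScrapySpiderSpider, web_url='https://www.sohu.com/a/placeholder', new_name='demo')
    process.start()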

Pipelines:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.misc import md5sum
import os
from urllib.parse import urlparse
from scrapy import Request

class OnePipeline(object):
    def process_item(self, item, spider):
        return item

# Rename downloaded files by overriding ImagesPipeline methods
class Imaspipline(ImagesPipeline):
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['src'], meta={'name': item['sx']}, dont_filter=True)

    # Copied from MediaPipeline._process_request so that the duplicate-download
    # checks below can be disabled; only the imports this copy actually uses are kept.
    def _process_request(self, request, info, item):
        import logging
        from twisted.internet.defer import Deferred

        from scrapy.utils.defer import mustbe_deferred, defer_result
        from scrapy.utils.request import request_fingerprint
        from scrapy.utils.log import failure_to_exc_info

        logger = logging.getLogger(__name__)

        fp = request_fingerprint(request)
        cb = request.callback or (lambda _: _)
        eb = request.errback
        request.callback = None
        request.errback = None

        # The check below is commented out so the same URL can be downloaded again
        # Return cached result if request was already seen
        # if fp in info.downloaded:
        #     return defer_result(info.downloaded[fp]).addCallbacks(cb, eb)

        # Otherwise, wait for result
        wad = Deferred().addCallbacks(cb, eb)
        info.waiting[fp].append(wad)

        # Also commented out so in-flight duplicates are not skipped
        # Check if request is downloading right now to avoid doing it twice
        # if fp in info.downloading:
        #     return wad

        # Download request checking media_to_download hook output first
        info.downloading.add(fp)
        dfd = mustbe_deferred(self.media_to_download, request, info)
        dfd.addCallback(self._check_media_to_download, request, info, item)
        dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)
        dfd.addErrback(lambda f: logger.error(
            f.value, exc_info=failure_to_exc_info(f), extra={'spider': info.spider})
                       )
        return dfd.addBoth(lambda _: wad)  # it must return wad at last

    def file_path(self, request, response=None, info=None):
        img_name = str(request.meta['name'])
        img_name_hz = request.url.split('/')[-1]
        # WeChat image URLs usually end in '...wx_fmt=jpeg', so the last three characters are 'peg'
        if img_name_hz[-3:] == 'peg':
            # file_name=img_name+'.'+img_name_hz[-4:]
            file_name = img_name + '.jpg'
        elif img_name_hz[-3:] == 'jpg' or img_name_hz[-3:] == 'gif':
            file_name = img_name + '.' + img_name_hz[-3:]
        elif img_name_hz[-3:] == 'png':
            file_name = img_name + '.png'
        else:
            # For other sites, check whether the URL contains '.gif'
            if img_name_hz.find('.gif') != -1:
                file_name = img_name + '.gif'
            # Save as png
            elif img_name_hz.find('.png') != -1:
                file_name = img_name + '.png'
            else:
                file_name = img_name + '.jpg'
        return file_name
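
    # Illustrative example (hypothetical values): with sx='cat3' and a WeChat image URL
    # ending in '.../640?wx_fmt=jpeg', the last path segment ends with 'peg', so
    # file_path() returns 'cat3.jpg' and the image is stored under IMAGES_STORE.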

    def item_completed(self, results, item, info):
        # print(results)
        return item

    # https://www.cnblogs.com/lxy-jdsy/p/10389103.html
    # Save the raw image bytes to disk as-is (so GIFs are not re-encoded)
    def persist_gif(self, key, data, info):
        root, ext = os.path.splitext(key)
        absolute_path = self.store._get_filesystem_path(key)
        self.store._mkdir(os.path.dirname(absolute_path), info)
        with open(absolute_path, 'wb') as f:
            f.write(data)

    # Decide whether to keep the original bytes (GIF) or the converted JPEG
    def check_gif(self, image, response):
        # After ImagesPipeline conversion, non-jpg images have format None
        if image.format is None:
            # For WeChat links, check whether the URL ends in 'gif'
            if str(response.url)[-3:] == 'gif':
                return True
            else:
                # WeChat non-gif image: use the default (converted) save
                if str(response.url).find('mmbiz.qpic.cn') != -1:
                    return False
                # Non-WeChat images: save gif/png bytes as-is
                else:
                    return True
        else:
            return False

    def image_downloaded(self, response, request, info):
        checksum = None
        for path, image, buf in self.get_images(response, request, info):
            if checksum is None:
                buf.seek(0)
                checksum = md5sum(buf)
            width, height = image.size
            if self.check_gif(image, response):
                # Save the raw response body (keeps animated GIFs intact)
                self.persist_gif(path, response.body, info)
            else:
                # Default ImagesPipeline behaviour: store the converted JPEG buffer
                self.store.persist_file(path, buf, info, meta={'width': width, 'height': height}, headers={'Content-Type': 'image/jpeg'})
        return checksum


Middlewares:

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

class OneSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.
    def __init__(self, agents=None):
        self.agent = agents

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s
        # return cls(agents=crawler.settings.get('CUSTOM_USER_AGENT'))

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

class OneDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

from fake_useragent import UserAgent
# Add a random User-Agent header to every request
class RandomUserAgentMiddleware(object):

    def __init__(self):
        self.agent = UserAgent()

    @classmethod
    def from_crawler(cls, crawler):
        return cls()

    def process_request(self, request, spider):
        request.headers.setdefault('User-Agent', self.agent.random)
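
fake_useragent fetches its browser data when UserAgent() is constructed, and that lookup can fail if the data source is unreachable. A minimal, hypothetical variant (the class name and fallback string are illustrative, not part of the original project) that degrades to a fixed User-Agent in that case:

from fake_useragent import UserAgent

class SafeRandomUserAgentMiddleware(object):
    # Fallback UA, used only when fake_useragent cannot initialise (illustrative value)
    FALLBACK_UA = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0 Safari/537.36')

    def __init__(self):
        try:
            self.agent = UserAgent()
        except Exception:
            self.agent = None

    def process_request(self, request, spider):
        ua = self.agent.random if self.agent else self.FALLBACK_UA
        request.headers.setdefault('User-Agent', ua)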

Settings:

# -*- coding: utf-8 -*-

# Scrapy settings for one project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'one'

SPIDER_MODULES = ['one.spiders']
NEWSPIDER_MODULE = 'one.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
from .spiders.scrapy_spider import ScrapySpiderSpider

DEFAULT_REQUEST_HEADERS = {
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
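    # Note: this reads the class attribute, which is still None when settings are loaded
    # (the spider's __init__ only sets an instance attribute), and Scrapy's
    # DefaultHeadersMiddleware drops None-valued headers, so in practice no Referer is sent here.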
    'Referer': ScrapySpiderSpider.web_url,
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'one.middlewares.OneSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'one.middlewares.RandomUserAgentMiddleware': 10,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'one.pipelines.OnePipeline': 300,
    'one.pipelines.Imaspipline': 300,
}
# Directory where downloaded images are saved
IMAGES_STORE = 'D:\\untitled1\\out\\IMG\\'

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 1
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
