Target URL: http://www.xfdown.com/class/155_1.html
The goal is to crawl the site category by category (the category list sits on the right-hand side of the page) and collect the app details from each detail page: name, size, publisher, and so on. The finished run gathered over 27,000 records and took about 20 hours; the project was written in PyCharm. Without further ado, here is the code.
Problem encountered: the company and official-site fields are marked up differently from page to page, so two XPath cases are handled for each of them (plain text on some pages, an <a> tag on others).
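A minimal sketch of that fallback, using the company field as an example (the XPaths and regex are the same ones used in data() below):

# Some pages put the value as plain text inside the <li>, others wrap it in
# an <a> tag, so fall back to the second XPath when the first one fails.
try:
    app_company = re.search(r'(.*?):(.*?\w$)',
                            response.xpath('/html/body/div[4]/div[2]/div[2]/ul/li[9]/text()').get()).group(2)
except (TypeError, AttributeError):
    app_company = response.xpath('/html/body/div[4]/div[2]/div[2]/ul/li[9]/a/text()').get()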
Note on storage: two storage backends are provided, one writing to CSV and the other to MongoDB. Pick whichever suits you and switch between them in settings.py; if that is unfamiliar, see my earlier article: https://www.jianshu.com/p/5501827c55fc
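Switching backends only means changing which pipeline class is registered in ITEM_PIPELINES, roughly like this (MongoPipeline refers to the commented-out class at the end of pipelines.py):

# settings.py -- enable exactly one of the two pipelines
ITEM_PIPELINES = {
    'scrapy_app.pipelines.ScrapyAppPipeline': 300,   # CSV storage
    # 'scrapy_app.pipelines.MongoPipeline': 300,     # MongoDB storage
}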
app.py
import re

import scrapy

from ..items import ScrapyAppItem


class GetHome(scrapy.Spider):
    name = 'app'
    start_urls = ['http://www.xfdown.com/class/155_1.html']
    # allowed_domains = ['www.xfdown.com']

    def parse(self, response):
        # Category list on the right-hand side of the page: each <a> holds
        # the category name and a relative link to its first listing page.
        for a in response.xpath('//*[@id="dlist"]/div[4]/div[3]/div[1]/ul/a'):
            link = 'http://www.xfdown.com' + a.xpath('./@href').get()
            name = a.xpath('./text()').get()
            yield scrapy.Request(link, callback=self.content,
                                 meta={'app_catalogue': name})

    def content(self, response):
        name = response.meta['app_catalogue']
        # Links to the individual app detail pages on this listing page.
        links = response.xpath('//*[@id="dlist"]/div[4]/div[2]/div[2]/ul/li/div[1]/div[2]/h3/a/@href')
        for link in links:
            app_link = 'http://www.xfdown.com' + link.get()
            yield scrapy.Request(url=app_link, callback=self.data,
                                 meta={'app_catalogue': name})
        # Follow the "next page" link until this category is exhausted.
        next_url = response.xpath('//*[@id="dlist"]/div[4]/div[2]/div[2]/div/div/div[2]/a[@class="tsp_next"]/@href').get()
        if next_url is not None:
            print('http://www.xfdown.com' + next_url)
            yield scrapy.Request('http://www.xfdown.com' + next_url,
                                 callback=self.content,
                                 meta={'app_catalogue': name})
        else:
            print('Finished all pages of this category!')

    def data(self, response):
        name = response.meta['app_catalogue']
        app_name = response.xpath('/html/body/div[4]/div[2]/div[1]/div/h1/text()').get()
        # Each <li> is a "label: value" string; keep only the value part.
        app_size = re.match(r'^(.*?): (.*?\w$)', response.xpath('/html/body/div[4]/div[2]/div[2]/ul/li[1]/text()').get()).group(2)
        app_style = re.match(r'^(.*?): (.*?\w$)', response.xpath('/html/body/div[4]/div[2]/div[2]/ul/li[2]/text()').get()).group(2)
        app_language = re.match(r'^(.*?): (.*?\w$)', response.xpath('/html/body/div[4]/div[2]/div[2]/ul/li[3]/text()').get()).group(2)
        app_updatetime = re.match(r'^(.*?): (.*?\w$)', response.xpath('/html/body/div[4]/div[2]/div[2]/ul/li[6]/text()').get()).group(2)
        # Company and official site are plain text on some pages and an <a>
        # tag on others, hence the two-XPath fallback described above.
        try:
            app_company = re.search(r'(.*?):(.*?\w$)',
                                    response.xpath('/html/body/div[4]/div[2]/div[2]/ul/li[9]/text()').get()).group(2)
        except (TypeError, AttributeError):
            app_company = response.xpath('/html/body/div[4]/div[2]/div[2]/ul/li[9]/a/text()').get()
        try:
            app_navigateto = re.search(r'(.*?):(.*?\w$)',
                                       response.xpath('/html/body/div[4]/div[2]/div[2]/ul/li[8]/text()').get()).group(2)
        except (TypeError, AttributeError):
            app_navigateto = response.xpath('/html/body/div[4]/div[2]/div[2]/ul/li[8]/a/text()').get()
        app_platform = re.match(r'^(.*?): (.*?\w$)', response.xpath('/html/body/div[4]/div[2]/div[2]/ul/li[7]/text()').get()).group(2)
        app_abstract = response.xpath('/html/body/div[4]/div[3]/div[2]').get()
        yield ScrapyAppItem(app_name=app_name, app_size=app_size, app_style=app_style,
                            app_platform=app_platform, app_language=app_language,
                            app_company=app_company, app_abstract=app_abstract,
                            app_navigateto=app_navigateto, app_catalogue=name,
                            app_updatetime=app_updatetime)
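items.py is not shown in the post; a minimal version inferred from the fields passed to ScrapyAppItem in data() would look roughly like this:

# items.py -- sketch, one Field per attribute the spider fills in
import scrapy

class ScrapyAppItem(scrapy.Item):
    app_catalogue = scrapy.Field()
    app_name = scrapy.Field()
    app_size = scrapy.Field()
    app_style = scrapy.Field()
    app_language = scrapy.Field()
    app_updatetime = scrapy.Field()
    app_platform = scrapy.Field()
    app_company = scrapy.Field()
    app_navigateto = scrapy.Field()
    app_abstract = scrapy.Field()

With items.py in place, the spider can be started from the project root with scrapy crawl app.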
middlewares.py
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class ScrapyAppSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ScrapyAppDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
import random


class UserAgent(object):
    USER_AGENTS = [
        'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729)',
        'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser; GTB5; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SV1; Acoo Browser; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; Avant Browser)',
    ]

    def process_request(self, request, spider):
        # This hook must be named process_request (it is the built-in downloader
        # middleware hook); otherwise the random User-Agent is never applied.
        useragent = random.choice(self.USER_AGENTS)
        request.headers['User-Agent'] = useragent
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for scrapy_app project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'scrapy_app'
SPIDER_MODULES = ['scrapy_app.spiders']
NEWSPIDER_MODULE = 'scrapy_app.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapy_app (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'scrapy_app.middlewares.ScrapyAppSpiderMiddleware': 543,
}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'scrapy_app.middlewares.UserAgent': 543,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'scrapy_app.pipelines.ScrapyAppPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import csv
import pymongo
class ScrapyAppPipeline:
    def __init__(self):
        print('Spider started!')
        # 'gbk' is presumably chosen so the CSV opens cleanly in Excel on a
        # Chinese-locale Windows; switch to 'utf-8-sig' if you hit encoding errors.
        self.f = open('app.csv', 'a', newline='', encoding='gbk')
        self.write = csv.writer(self.f)
        self.write.writerow(['分类', '名称', '大小', '类型', '语言', '最近更新时间',
                             '使用平台', '公司', '官网', '描述'])

    def process_item(self, item, spider):
        data_list = [item['app_catalogue'], item['app_name'], item['app_size'], item['app_style'],
                     item['app_language'], item['app_updatetime'], item['app_platform'], item['app_company'],
                     item['app_navigateto'], item['app_abstract']]
        self.write.writerow(data_list)
        return item

    def close_spider(self, spider):
        self.f.close()


# from pymongo import MongoClient
# class MongoPipeline(object):
#     def __init__(self, databaseIp='127.0.0.1', databasePort=27017,
#                  mongodbName='dbtry'):
#         client = MongoClient(databaseIp, databasePort)
#         self.db = client[mongodbName]
#
#     def process_item(self, item, spider):
#         postItem = dict(item)              # convert the item to a dict
#         self.db.test.insert_one(postItem)  # insert one record into the collection
#         return item                        # returning the item also echoes it in the console log; optional
If you run into problems, leave a comment and we can discuss them together!