本文详尽介绍了通过Scrapy框架爬取Unsplash图库的过程:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
import scrapy
class UnsplashItem(scrapy.Item):
    """Container for one photo scraped from the Unsplash feed."""

    # Display name of the photo's uploader (spider: photo['user']['name']).
    author = scrapy.Field()
    # Uploader's biography text (spider: photo['user']['bio']).
    author_bio = scrapy.Field()
    # Identifier of the photo (spider: photo['id']).
    image_id = scrapy.Field()
    # List of URLs to download; the spider stores the single "full" URL here.
    image_urls = scrapy.Field()
    # Not populated by the code in this file; presumably reserved for the
    # images pipeline's default result field -- TODO confirm.
    images = scrapy.Field()
    # Storage paths of the successfully downloaded files, filled in by the
    # images pipeline once the downloads complete.
    image_paths = scrapy.Field()
# -*- coding: utf-8 -*-
import scrapy
import json
import urllib
from picture.items import UnsplashItem
class UnsplashSpider(scrapy.Spider):
    """Walk the Unsplash home feed and emit one ``UnsplashItem`` per photo.

    The spider starts at the ``napi/feeds/home`` JSON endpoint and keeps
    following the ``next_page`` link found in each response until the feed
    stops providing one.
    """

    name = "unsplash"
    allowed_domains = ["unsplash.com"]

    # Per-spider settings; these override the project-wide settings.
    custom_settings = {
        # Default headers sent with every request made by this spider.
        'DEFAULT_REQUEST_HEADERS': {
            'Upgrade-Insecure-Requests': '1',
            'x-unsplash-client': 'web',
        },
        "ITEM_PIPELINES": {
            'picture.pipelines.UnsplashImagesPipeline': 300,
        },
        "IMAGES_STORE": './images',
        "LOG_FILE": 'unsplash.log',
    }

    start_urls = (
        'https://unsplash.com/napi/feeds/home',
    )

    # Number of leading characters dropped from the feed's ``next_page`` URL
    # before the remainder is appended to the ``napi`` base URL.
    # NOTE(review): 25 is presumably len('https://api.unsplash.com/') --
    # confirm against a live response.
    _NEXT_PAGE_PREFIX_LEN = 25

    def parse(self, response):
        """Turn one feed page into items and schedule the following page.

        Args:
            response: Response whose body is a JSON object with a ``photos``
                list and, except on the last page, a ``next_page`` URL.

        Yields:
            An ``UnsplashItem`` for every photo on the page, followed by a
            ``scrapy.Request`` for the next page when the feed names one.
        """
        # Decode the body once and reuse it for both photos and pagination.
        payload = json.loads(response.body)

        for photo in payload['photos']:
            item = UnsplashItem()
            item['author'] = photo['user']['name']
            item['author_bio'] = photo['user']['bio']
            item['image_id'] = photo['id']
            item['image_urls'] = [photo['urls']['full']]
            yield item

        # Test the raw value *before* building the URL: a missing or null
        # ``next_page`` marks the end of the feed and must stop the crawl
        # instead of raising or requesting a bogus URL.
        next_page = payload.get('next_page')
        if next_page:
            yield scrapy.Request(
                'https://unsplash.com/napi/'
                + next_page[self._NEXT_PAGE_PREFIX_LEN:],
                callback=self.parse,
            )
# -*- coding: utf-8 -*-
# Define your item pipelines here
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
class UnsplashImagesPipeline(ImagesPipeline):
    """Download each item's images and file them under the author's name.

    Files are stored as ``full/<author>/<image_id>.jpg`` relative to the
    configured images store.
    """

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['image_urls']``.

        The item travels along in ``request.meta['item']`` so that
        ``file_path`` can still reach it when it is not passed an ``item``
        argument directly.
        """
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url, meta={'item': item})

    def item_completed(self, results, item, info):
        """Record the stored paths on the item once all downloads finish.

        Args:
            results: Iterable of ``(ok, data)`` pairs, one per request;
                ``data['path']`` is read only for pairs where ``ok`` is true.
            item: The item the downloads belong to.
            info: Unused here.

        Returns:
            The item with ``image_paths`` set.

        Raises:
            DropItem: If none of the downloads succeeded.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item

    def file_path(self, request, response=None, info=None, item=None):
        """Return the storage path for the image fetched by ``request``.

        Args:
            request: The download request created by ``get_media_requests``.
            response: Unused.
            info: Unused.
            item: The item being processed, when the caller supplies it;
                otherwise the item is taken from ``request.meta['item']``.

        Returns:
            ``'full/<author>/<image_id>.jpg'``, with both variable parts
            reduced to a single safe path component.
        """
        if item is None:
            item = request.meta['item']
        return 'full/{0}/{1}.jpg'.format(
            self._safe_segment(item['author']),
            self._safe_segment(item['image_id']),
        )

    @staticmethod
    def _safe_segment(value):
        """Make scraped text usable as exactly one path component.

        Author names and ids come from the remote site, so path separators
        are replaced and the special names ``''``, ``'.'`` and ``'..'`` are
        mapped to ``'_'``; this keeps every file inside ``full/<author>/``.
        Values without such characters are returned unchanged.
        """
        segment = '{0}'.format(value).replace('/', '_').replace('\\', '_')
        if segment in ('', '.', '..'):
            return '_'
        return segment