# python3.7
import json
import os
from multiprocessing import Pool

import pymongo
import requests
class ImgSpider(object):
    """Crawl Unsplash search results, download the photos and store their info in MongoDB.

    For every photo on a search-result page the full-size image is written to
    ``imgs/<photo id>.jpg`` and the photo's ``exif`` record (plus its id and
    full-size URL) is inserted into the ``unsplash_info_01`` collection.
    """

    # NOTE(review): the client is created when the class body runs, so with a
    # fork-based multiprocessing start method the worker processes inherit it.
    # PyMongo documents MongoClient as not fork-safe -- confirm this is OK.
    client = pymongo.MongoClient('localhost')  # client for the MongoDB server on localhost
    db = client['unsplash']  # handle to the 'unsplash' database

    # Seconds to wait on the server before giving up. Without a timeout,
    # requests blocks indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Set the HTTP headers sent with every request."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
        }

    def _fetch_text(self, api_url):
        """Return the response body of ``api_url`` as text, or None on any failure.

        Failures (an exception, or a status code other than 200) are printed
        rather than raised so that one bad request does not abort the crawl.
        """
        try:
            response = requests.get(api_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        except Exception as e:  # best-effort boundary: report the error and carry on
            print('请求异常:url={}, error={}'.format(api_url, e))
            return None
        if response.status_code != 200:
            print('请求异常:url={}, status_code={}'.format(api_url, response.status_code))
            return None
        return response.text

    def get_list_json(self, page):
        """Fetch one page of search results.

        Args:
            page: Page number substituted into the search API URL.

        Returns:
            The raw JSON text of the response, or None if the request failed.
        """
        # The query string selects the source; change it to crawl another search.
        api_url = 'https://unsplash.com/napi/search/photos?query=street%20snap&xp=&per_page=20&page={}'.format(page)
        return self._fetch_text(api_url)

    def get_info_json(self, info_url):
        """Fetch the info document of a single photo.

        Args:
            info_url: URL of the photo's ``/info`` endpoint.

        Returns:
            The raw JSON text of the response, or None if the request failed.
        """
        return self._fetch_text(info_url)

    def parse_info(self, info_str):
        """Extract the EXIF record from a photo info document and store it in MongoDB.

        Args:
            info_str: JSON text as returned by ``get_info_json``.

        Raises:
            ValueError: If ``info_str`` is not valid JSON.
        """
        json_dict = json.loads(info_str)
        # 'exif' or 'urls' may be missing or null; fall back to an empty dict so
        # the photo is still recorded with whatever fields are available.
        data = dict(json_dict.get('exif') or {})
        data['id'] = json_dict.get('id')
        data['url'] = (json_dict.get('urls') or {}).get('full')
        # insert_one creates the collection on first use.
        self.db['unsplash_info_01'].insert_one(data)

    def parse_list_json(self, json_str):
        """Download every photo listed in a search-result page and store its info.

        Args:
            json_str: JSON text as returned by ``get_list_json``. None or an
                empty string (a failed request) is ignored.
        """
        if not json_str:
            return
        json_dict = json.loads(json_str)
        for item in json_dict.get('results') or []:
            photo_id = item['id']
            self.download_image(item['urls']['full'], photo_id)
            info_url = 'https://unsplash.com/napi/photos/' + photo_id + '/info'
            info_str = self.get_info_json(info_url)
            if info_str is not None:  # skip photos whose info request failed
                self.parse_info(info_str)

    def download_image(self, img_url, id):
        """Download ``img_url`` and save it as ``imgs/<id>.jpg``.

        Args:
            img_url: URL of the image to download.
            id: Photo id, used as the file name.
        """
        try:
            response = requests.get(img_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print('图片url请求失败:{}, error={}'.format(img_url, e))
            return
        if response.status_code != 200:
            print('图片url请求失败:{}'.format(img_url))
            return
        # exist_ok avoids a race when several workers create the folder at once.
        os.makedirs('imgs', exist_ok=True)
        # Images are binary data: use response.content and open the file in 'wb' mode.
        with open('imgs/{}.jpg'.format(id), 'wb') as f:
            f.write(response.content)
        print('正在下载图片...')

    def start_spider(self, num):
        """Crawl a single search-result page.

        Args:
            num: Page number to crawl.
        """
        json_str = self.get_list_json(num)
        self.parse_list_json(json_str)
def main():
    """Prompt for a page range and crawl those pages through a process pool."""
    # Read and validate the input before any worker process is started, so a
    # bad number does not leave an unused pool behind.
    start_num = int(input('请输入你的起始爬取页数:'))
    end_num = int(input('请输入你的结束爬取页数:'))
    obj = ImgSpider()
    # NOTE(review): range() stops before end_num, so the end page itself is not
    # crawled -- confirm whether the end page is meant to be inclusive.
    pages = range(start_num, end_num)
    pool = Pool(1)  # one worker process; raise the count to crawl pages in parallel
    try:
        pool.map(obj.start_spider, pages)
    finally:
        # Always release the workers, even when a page raised.
        pool.close()
        pool.join()
    print('程序运行结束')


if __name__ == '__main__':
    main()
多进程爬取 unsplash 网站照片及参数,并将相关信息保存到MongoDB
最后编辑于 :
©著作权归作者所有,转载或内容合作请联系作者
- 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
- 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
- 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...