from multiprocessing.pool import Pool
from urllib.parse import urlencode
import requests
import os
from hashlib import md5
# Request headers passed to requests.get() by get_page.
# NOTE(review): the User-Agent value contains stray spaces ("Mozilla / 5.0",
# "Chrome / 67.0...") which look like a paste artifact; it is kept exactly
# as-is here -- confirm against a real browser string before changing it.
headers = {
    'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
    'accept': 'application/json, text/javascript',
    'content-type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla / 5.0(Windows NT 10.0;WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome / 67.0.3396.99 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
}

# Endpoint prefix; the url-encoded query string is appended directly after
# the trailing '?'.
base_url = 'https://www.toutiao.com/search_content/?'
def get_page(offest, keyword='斯嘉丽·约翰逊', timeout=10):
    """Fetch one page of search results and return the decoded JSON body.

    Args:
        offest: Value sent as the ``offset`` query parameter (the parameter
            name keeps its original spelling so existing callers still work).
        keyword: Search term sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block a pool worker forever.

    Returns:
        The decoded JSON response when the server answers with status 200.
        ``None`` when the request fails, the status is not 200, or the body
        is not valid JSON.
    """
    params = {
        # The only parameter that changes between successive ajax requests.
        'offset': offest,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1,
        'from': 'search_tab',
    }
    url = base_url + urlencode(params)

    # Catch only network-level failures; KeyboardInterrupt / SystemExit must
    # still propagate so the crawl can be stopped.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        return None

    if response.status_code != 200:
        return None

    # response.json() raises a ValueError subclass on an undecodable body.
    try:
        return response.json()
    except ValueError:
        return None
def parse_page(results):
    """Yield a ``{'title': ..., 'images': ...}`` dict per usable result.

    Args:
        results: Decoded response mapping whose ``'data'`` entry holds the
            result records. ``None``, a missing ``'data'`` key, or a
            ``None``/empty ``'data'`` value all produce no items.

    Yields:
        A dict with ``'title'`` taken from ``result['title']`` and
        ``'images'`` taken from ``result['image_list']``. Records lacking
        either key (or that are not subscriptable) are skipped.
    """
    if not results:
        return
    for result in results.get('data') or []:
        # Build the item first and yield outside the try: a catch-all around
        # the yield would swallow GeneratorExit and make closing the
        # generator early raise RuntimeError.
        try:
            item = {
                'title': result['title'],
                'images': result['image_list'],
            }
        except (KeyError, TypeError):
            continue
        yield item
# Characters that cannot appear in a Windows directory name.
_INVALID_DIR_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})


def _safe_dirname(title):
    """Return *title* with path-hostile characters replaced by underscores."""
    cleaned = str(title).translate(_INVALID_DIR_CHARS).strip().rstrip('.')
    return cleaned or '_'


def save_img(item: dict, base_dir: str = 'C:\\Users\\13194\\Pictures\\'):
    """Download every image of *item* into a folder named after its title.

    Args:
        item: Mapping with a ``'title'`` string and an ``'images'`` iterable
            of dicts that each carry a ``'url'`` entry.
        base_dir: Directory under which the per-title folder is created.
            Defaults to the path the script originally hard-coded.

    Each image is stored as ``<md5 of content>.jpg`` so identical downloads
    are written only once. Items without a title, images without a URL,
    failed requests, and non-200 responses are skipped.
    """
    title = item.get('title')
    if not title:
        return

    folder = os.path.join(base_dir, _safe_dirname(title))
    # exist_ok avoids the check-then-create race between pool workers.
    os.makedirs(folder, exist_ok=True)

    for image in item.get('images') or []:
        img = image.get('url') if isinstance(image, dict) else None
        if not img:
            continue
        # Scheme-less URLs need a protocol, otherwise requests rejects them;
        # URLs that already carry one are used untouched.
        if img.startswith(('http://', 'https://')):
            url = img
        else:
            url = 'https:' + img

        try:
            resp = requests.get(url, timeout=10)
        except requests.RequestException:
            print('Faleld to Save Image')
            continue
        if resp.status_code != 200:
            continue

        # Image payload is taken as raw bytes.
        content = resp.content
        file_path = os.path.join(
            folder, '{0}.{1}'.format(md5(content).hexdigest(), 'jpg'))
        if os.path.exists(file_path):
            print('Already Downloaded', file_path)
            continue
        with open(file_path, 'wb') as file:
            file.write(content)
def main(offest):
    """Crawl one result page: fetch it, parse it, and save its images.

    Args:
        offest: Page offset forwarded to ``get_page``.

    Does nothing when the page could not be fetched or has no ``'data'``.
    """
    results = get_page(offest)
    # get_page returns None on failure, and a response may lack result data.
    if not results or not results.get('data'):
        return
    # One pass is enough: looping over len(results) would repeat the whole
    # page once per top-level key of the response dict.
    for item in parse_page(results):
        save_img(item)
# First and last page index (inclusive); each is multiplied by 20 to get
# the offset handed to main().
# NOTE(review): with START = 1 the first offset requested is 20, so offset 0
# is never fetched -- confirm that is intended.
START = 1
END = 20

if __name__ == '__main__':
    # Offsets START*20, (START+1)*20, ..., END*20.
    groups = list(range(START * 20, (END + 1) * 20, 20))
    pool = Pool()
    # map() blocks until every offset has been processed.
    pool.map(main, groups)
    pool.close()
    pool.join()
# 今日头条--寡姐
# 最后编辑于 :
# ©著作权归作者所有,转载或内容合作请联系作者
# - 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
# - 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
# - 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...