My very first project was supposed to be scraping Toutiao street-snap (街拍) images, but my skills were too weak back then: a few attempts failed, I gave up, and went off to scrape the 妹子图 site instead. Today I'm circling back to take another run at the Toutiao street-snap images.
First, open the Toutiao homepage and search for 街拍 in the search box. The results page is fairly messy these days: some entries aren't image galleries at all but miscellaneous comment-style mini-articles, so we skip those and grab only the images.
Now analyze the index page. Open the developer tools and you'll see the search results are loaded through an Ajax request that returns JSON; the URLs of the target pages we want to scrape are hidden inside that response.
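As a rough probe (a minimal sketch; in practice the full cookie used by the script below may be required for a non-empty response, and the response shape isn't guaranteed), each gallery-style entry in the returned data array carries a title and an article_url:

import requests
from urllib.parse import urlencode

# Minimal probe of the search API with a bare User-Agent; the real script
# below sends a full browser User-Agent plus a session cookie.
headers = {'User-Agent': 'Mozilla/5.0'}
params = {'aid': '24', 'app_name': 'web_search', 'offset': 0, 'format': 'json',
          'keyword': '街拍', 'autoload': 'true', 'count': '20'}
resp = requests.get('https://www.toutiao.com/api/search/content/?' + urlencode(params),
                    headers=headers)
for item in (resp.json().get('data') or []):
    # Gallery-style results carry both fields; other result types may not.
    print(item.get('title'), item.get('article_url'))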
With one of those URLs in hand, we can open the detail page. As I mentioned above, some of these pages are not the kind we're after, so we simply ignore them; the pages that are actual photo galleries are our targets.
Analyzing the target page: the title is easy to spot, and the gallery data can be extracted too. The image info sits in the page source inside a gallery: JSON.parse("...") blob, so we grab that string with a regular expression, parse it as JSON, and read the image URLs out of it.
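For illustration, the extraction step boils down to a helper like this (a minimal sketch; extract_gallery is my own name for it, and json.loads will raise on malformed input, which the full code below guards against):

import json
import re

def extract_gallery(detail_html):
    # The page embeds the data as: gallery: JSON.parse("...escaped JSON...")
    match = re.search(r'gallery: JSON\.parse\("(.*)"\)', detail_html, re.S)
    if not match:
        return []
    # Drop the backslash escaping, then parse what's left as JSON.
    data = json.loads(match.group(1).replace('\\', ''))
    return [img.get('url') for img in data.get('sub_images', [])]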
Then it's just a matter of saving the images. The whole code is below; it's still pretty rough, so I have a lot to learn and would welcome pointers from the pros:
import json
import os
from json import JSONDecodeError
from urllib.parse import urlencode
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import re
from multiprocessing import Pool
from hashlib import md5

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
    # Session-specific cookie; replace it with one copied from your own browser.
    'cookie': 'tt_webid=6689770852458677768; WEATHER_CITY=%E5%8C%97%E4%BA%AC; UM_distinctid=16aa738630e890-02daeed22a3bba-f353163-1fa400-16aa738630f86c; tt_webid=6689770852458677768; csrftoken=85f646aac651995d0f0d477d9e1000b1; s_v_web_id=a9c80360734866fb880cd48244e4d2e3; CNZZDATA1259612802=1806456525-1557579889-https%253A%252F%252Fwww.baidu.com%252F%7C1557585289; __tasessionId=b9cl244be1557590374034; passport_auth_status=be3918f20b2cefdeae2de53331bb068a; sso_uid_tt=9364aef949fb1422b0d8445d87b9adb1; toutiao_sso_user=08d9d869618fc319637f6e2e11d3981e; login_flag=a7eae77d77aac773723e3468446bfa02; sessionid=7bf2d480f660db06fd489845e59608a1; uid_tt=555598b2d1b7b155f07da91a6fd45192; sid_tt=7bf2d480f660db06fd489845e59608a1; sid_guard="7bf2d480f660db06fd489845e59608a1|1557590405|15552000|Thu\054 07-Nov-2019 16:00:05 GMT'
}

def get_page(offset):
    """Request one page of search results and return the raw JSON text."""
    data = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
    }
    url = 'https://www.toutiao.com/api/search/content/?' + urlencode(data)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            response.encoding = 'utf-8'
            return response.text
        return None
    except RequestException:
        print('failed to fetch the index page')
        return None

def parse_page(html):
    """Yield the article_url of every result on one index page that has a title."""
    if not html:
        return
    data = json.loads(html)
    if data and 'data' in data:
        for item in data.get('data'):
            title = item.get('title')
            if title:  # skip result types that carry no title
                yield item.get('article_url')

def get_detail_page(detail_url):
    """Request a detail page and return its HTML."""
    try:
        response = requests.get(detail_url, headers=headers)
        if response.status_code == 200:
            print('fetched the detail page')
            return response.text
        return None
    except RequestException:
        return None

def parse_detail_page(detail_html, detail_url):
    """Extract the title and image URLs from a gallery page and download each image."""
    soup = BeautifulSoup(detail_html, 'lxml')
    title = soup.select('title')[0].get_text()
    # The image data is embedded as: gallery: JSON.parse("...escaped JSON...")
    image_pattern = re.compile(r'gallery: JSON\.parse\("(.*)"\)', re.S)
    result = image_pattern.search(detail_html)
    if result:
        try:
            # Strip the backslash escaping before parsing the JSON.
            data = json.loads(result.group(1).replace('\\', ''))
            if data and 'sub_images' in data:
                sub_images = data.get('sub_images')
                images = [item.get('url') for item in sub_images]
                for image in images:
                    download_image(image)
                return {
                    'title': title,
                    'url': detail_url,
                    'images': images,
                }
        except JSONDecodeError:
            pass

def download_image(image):
    """Download one image and hand its bytes to save_image()."""
    try:
        response = requests.get(image, headers=headers)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except RequestException:
        print('failed to request the image')

def save_image(data):
    """Write the image bytes to disk, named by the MD5 of the content."""
    print('saving image...')
    save_dir = r'D:\bilibili大学\头条街拍\image'
    os.makedirs(save_dir, exist_ok=True)  # make sure the target folder exists
    # Naming by the MD5 of the bytes means identical images are stored only once.
    file_path = '{}/{}.{}'.format(save_dir, md5(data).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(data)

def main(offset):
    html = get_page(offset)
    for detail_url in parse_page(html):
        detail_html = get_detail_page(detail_url)
        if detail_html:
            parse_detail_page(detail_html, detail_url)

if __name__ == '__main__':
    # Each page returns 20 results, so step the offsets by 20 to avoid overlap.
    groups = [x * 20 for x in range(20)]
    pool = Pool()
    pool.map(main, groups)
    pool.close()
    pool.join()
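One design note on the code above: each image is saved under the MD5 of its own bytes, so identical photos collapse to the same filename and re-running the script won't pile up duplicates; the os.path.exists check just skips rewriting files that are already on disk.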