代码
百度图片应该是所有图库里最好爬的网站了,先上代码
import requests
import re
def main(url, header, number_photo, photo_path):
    """
    Crawl Baidu image search result pages and save the thumbnails to disk.

    The ``pn=<number>`` query parameter of ``url`` is rewritten for every
    page, stepping by 30 from 30 up to (but excluding) ``number_photo``.
    Every ``"thumbURL"`` value found in the responses is collected, and each
    collected URL ending in ``jpg`` is downloaded to
    ``photo_path + <index> + '.jpg'``, where ``<index>`` is the position of
    the URL in the returned list.

    :param url: search URL containing a ``pn=<number>`` query parameter
    :param header: browser User-Agent string sent with the search requests
    :param number_photo: approximate number of images to crawl
    :param photo_path: path prefix under which the images are saved
    :return: list of all thumbnail URLs found in the result pages
    """
    # Imported locally so this function does not depend on a file-level import.
    import os

    # The HTTP header is spelled "User-Agent"; an underscore variant is just
    # an unknown custom header and the default client UA would be sent.
    headers = {"User-Agent": header}

    # Compiled once; the pattern does not change between pages.
    pattern = re.compile('"thumbURL":"(.*?)"', re.S)

    result = []
    for offset in range(30, number_photo, 30):
        # Build each page URL from the ORIGINAL url. Overwriting `url` with
        # the substituted value would freeze the offset at the first page.
        page_url = re.sub(r'pn=\d+', 'pn=%d' % offset, url)
        response = requests.get(page_url, headers=headers)
        result.extend(pattern.findall(response.text))

    # photo_path is a prefix (e.g. './lyf/'); make sure its directory exists
    # so that open() below does not fail on a fresh checkout.
    directory = os.path.dirname(photo_path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Save every jpg thumbnail, named after its index in `result`.
    for index, photo_url in enumerate(result):
        if not photo_url.endswith('jpg'):
            continue
        # Download before opening the file so a failed request does not
        # leave an empty file behind.
        photo = requests.get(photo_url)
        with open(photo_path + str(index) + '.jpg', 'wb') as f:
            f.write(photo.content)

    return result
if __name__ == '__main__':
    # Example query: the search word is the URL-encoded name "刘亦菲".
    # The "latest=&copyright=" segment had been garbled into "latest=©right="
    # (HTML entity "&copy" rendered as a symbol), which fused two query
    # parameters into one; it is restored here.
    url = (
        'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&query'
        'Word=%E5%88%98%E4%BA%A6%E8%8F%B2&cl=2&lm=-1&hd=&latest=&copyright=&ie=utf-8&oe=utf-8&adpicid=&st'
        '=-1&z=&ic=0&word=%E5%88%98%E4%BA%A6%E8%8F%B2&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&pn=30'
        '&rn=30&gsm=1e&1594548720776='
    )
    # User-Agent string copied from a desktop Chrome browser.
    header = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
    # Crawl roughly 300 images and save them under ./lyf/.
    main(url, header, 300, './lyf/')
参数
- header:浏览器的 User-Agent
  header.png
- url:要抓取的网址
  - 打开百度图片 https://image.baidu.com/
  url获取.png
结果
result.png