Web-scraping lesson one for otaku! Here comes a treat~
Without further ado, straight to the code! The script below walks a list of mzitu.com galleries: for each one it reads the page count and gallery name, visits every page, extracts the full-size image URL and title, and saves each picture to a local folder.
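One assumption before running it: the third-party packages the script imports are installed. If not, `pip install requests beautifulsoup4 lxml` should cover them (BeautifulSoup is used here with the lxml parser, so lxml is needed too).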
```python
# -*- encoding: utf-8 -*-
# FUNCTION: Capture beauty pictures

import os
import time

import requests
from bs4 import BeautifulSoup

url_list = ['http://www.mzitu.com/201024', 'http://www.mzitu.com/169782']  # galleries of interest

headers = {
    'referer': 'https://www.mzitu.com/201024',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 '
                  'Safari/537.36'
}


def get_page_num(url):
    """Return the page count and gallery name of one gallery."""
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    page_num = soup.find(class_='pagenavi').find_all('a')[-2].text
    name = soup.find(class_='currentpath').text.split()[-1]
    return page_num, name  # page_num is a string


def parse_page(url):
    """
    Get the picture on one page.
    :param url: page URL
    :return: picture URL, picture name
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    pic_url = soup.find(class_='main-image').find('img')['src']
    pic_name = soup.find(class_='main-title').text
    return pic_url, pic_name


def get_pic(pic_url, pic_name, name):
    """Download and save one picture."""
    response = requests.get(pic_url, headers=headers, allow_redirects=False)
    filepath = '/home/f/crawler/Beauty/photo/' + name + '/' + pic_name + '.jpg'
    with open(filepath, 'wb') as f:
        f.write(response.content)


def main():
    for url in url_list:
        page_num, name = get_page_num(url)
        try:
            # os.mkdir creates one level only; the parent path must already exist
            os.mkdir('/home/f/crawler/Beauty/photo/' + name)
        except FileExistsError:
            pass
        for page in range(1, int(page_num) + 1):  # iterate over every page of the gallery
            page_url = url + '/' + str(page)
            print(page_url)
            pic_url, pic_name = parse_page(page_url)
            get_pic(pic_url, pic_name, name)
            time.sleep(2)  # be polite: pause between requests


if __name__ == '__main__':
    main()
```
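Two caveats before you adapt this to other galleries. First, `headers` pins the Referer to the first gallery's URL; if the host validates the Referer against the page that embeds each image (a common anti-hotlinking measure), downloads from the other galleries may be rejected. Second, `pic_name` goes straight into a file path, so a title containing `/` would break `open()`. Below is a minimal sketch of a drop-in replacement for `get_pic` that handles both; `get_pic_safe` and `safe_filename` are hypothetical helpers, not part of the original script, and the Referer behavior is an assumption rather than something verified against the live site.

```python
import re


def safe_filename(text):
    """Hypothetical helper: replace characters that are invalid in file names."""
    return re.sub(r'[\\/:*?"<>|]', '_', text).strip()


def get_pic_safe(pic_url, pic_name, name, page_url):
    """Sketch of get_pic that sends the current page as the Referer.

    Assumption: the image host checks that the Referer matches the page
    embedding the image; 'headers' is the dict defined in the script above.
    """
    per_request_headers = dict(headers)        # copy the shared headers
    per_request_headers['referer'] = page_url  # Referer = the page being scraped
    response = requests.get(pic_url, headers=per_request_headers,
                            allow_redirects=False, timeout=10)
    filepath = ('/home/f/crawler/Beauty/photo/' + name + '/'
                + safe_filename(pic_name) + '.jpg')
    with open(filepath, 'wb') as f:
        f.write(response.content)
```

With this variant, the call in `main()` would change from `get_pic(pic_url, pic_name, name)` to `get_pic_safe(pic_url, pic_name, name, page_url)`.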
Feel free to bookmark this and work through it at your own pace!