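# This crawler is single-threaded; adding multithreading is left to interested readers.
# Scrapy would actually crawl more efficiently with less code. This script requires the third-party libraries requests and BeautifulSoup4.
# The script has not been optimized or thoroughly tested.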
import requests
from bs4 import BeautifulSoup
import os
# Listing page used as the default starting point of the crawl.
url: str = "http://www.netbian.com/meinv/"
# Directory prefix that downloaded wallpapers are written under.
path: str = "./美女壁纸/"
def get_html(url, timeout=10):
    """Download a page and return its decoded text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole crawl.

    Returns:
        The response body as text, or None when the request fails or the
        server answers with an error status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}
    try:
        res = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Catch only request failures: a bare ``except`` would also swallow
        # KeyboardInterrupt and make the crawler impossible to stop.
        print('访问出错!!')
        return None
    if not res.ok:
        return None
    # Let requests guess the charset from the body instead of trusting the
    # HTTP header.
    res.encoding = res.apparent_encoding
    return res.text
def get_img_url(url=url):
    """Crawl a wallpaper listing page and every "next page" reachable from it.

    For each wallpaper link on a listing page the address of its 1920x1080
    detail page is built and handed to ``get_wallpage``. Pagination is
    followed iteratively with an explicit work list, so the call stack does
    not grow by one frame per page.

    Args:
        url: Listing page to start from; defaults to the module-level ``url``.
    """
    pending = [url]
    # Guards against pagination links that point back to a page already
    # crawled, which would otherwise loop forever.
    visited = set()
    while pending:
        page_url = pending.pop()
        if page_url in visited:
            continue
        visited.add(page_url)

        html = get_html(page_url)
        if not html:
            continue
        soup = BeautifulSoup(html, 'html.parser')

        # The attribute value is quoted because ".htm" is not a valid CSS
        # identifier and selector parsers reject it when left unquoted.
        for link in soup.select('div.list b a[href*=".htm"]'):
            detail_url = (link.get('href')
                          .replace('.htm', '-1920x1080.htm')
                          .replace('/desk', 'http://www.netbian.com/desk'))
            get_wallpage(detail_url)

        next_pages = []
        for a in soup.select('div.page > a.prev'):
            # Only anchors whose text contains "下一页" (next page) are followed.
            if "下一页" in a.text:
                next_page = 'http://www.netbian.com' + a.get('href')
                print('获取下一页' + next_page)
                next_pages.append(next_page)
        # Reverse before pushing so pages are popped in document order.
        pending.extend(reversed(next_pages))
def get_wallpage(url):
    """Fetch a wallpaper detail page and save the first titled image on it.

    The first ``<img>`` element carrying a ``title`` attribute is used: its
    title becomes the file name and its ``src`` the download address.
    Nothing happens when the page cannot be fetched, contains no such image,
    or the image has no ``src``.

    Args:
        url: Address of the wallpaper detail page.
    """
    html = get_html(url)
    if not html:
        return
    soup = BeautifulSoup(html, 'html.parser')
    # Query once; select_one returns the first match or None.
    img = soup.select_one("img[title]")
    if img is None:
        return
    src = img.get('src')
    if not src:
        # Without a source address there is nothing to download; skipping
        # avoids handing None to the downloader and aborting the crawl.
        return
    save_wallpage(img.get('title'), src)
def save_wallpage(name, src):
    """Download an image and store it as ``<path><name>.jpg``.

    Args:
        name: Title of the wallpaper, used as the file name.
        src: Address of the image to download.

    The module-level ``path`` directory is created on demand. Nothing is
    written when the download fails or the server returns an error status.
    """
    try:
        image = requests.get(src, timeout=10)
    except requests.RequestException:
        # One failed download should not abort the whole crawl.
        print('访问出错!!')
        return
    if not image.ok:
        return
    print('正在保存...' + name)
    # The target directory may not exist yet on the first save.
    os.makedirs(path, exist_ok=True)
    # Path separators in the title would point outside the target directory
    # or at a non-existent sub-directory.
    safe_name = name.replace('/', '_').replace(os.sep, '_')
    with open(path + safe_name + '.jpg', 'wb') as f:
        f.write(image.content)
# Start the crawl from the default listing page as soon as this file is executed.
get_img_url()