# 代码亲自试过,速度取决于网速;这里简单贴一下代码,有什么问题可以留言。思路可以参考视频介绍:https://b23.tv/h508v3
# -*- coding: utf-8 -*-
import os
import bs4
import requests as req
def get_content(page_url, timeout=10):  # fetch the listing pages
    """Download every URL in ``page_url`` and return the responses in order.

    Args:
        page_url: Iterable of URL strings to request.
        timeout: Seconds handed to ``requests`` for each request, so a
            stalled connection raises instead of blocking forever.

    Returns:
        A list with one ``requests`` response per URL, in input order.
    """
    # NOTE(review): the cookie below is hard-coded and appears to carry a
    # personal logged-in session (see the ``dbcl2`` / ``ck`` entries).
    # Consider loading it from configuration instead of keeping it in source.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
                      'AppleWebKit/537.36 (KHTML, like Gecko)'
                      ' Chrome/63.0.3239.132 Mobile Safari/537.36',
        'cookie': 'bid=QB1DolHTrHk; douban-fav-remind=1; ll="108288"; yadk_uid=VonAsOI0yj6dLjA0uGeBA3Bm2eNCNOTw; _vwo_uuid_v2=D458B58B2BDC40664ADD985D2BBBD6465|baa92a0224e8ed5529abe508121bfb94; gads=ID=db538611950a59d9:T=1580548123:S=ALNI_MYFBTvsfFyou-MeX4br3oJ1CSSX1A; utmz=223695111.1582988839.6.6.utmcsr=baidu|utmccn=(organic)|utmcmd=organic|utmctr=%E6%9C%B1%E8%BF%AA; utmz=30149280.1583041381.8.6.utmcsr=baidu|utmccn=(organic)|utmcmd=organic|utmctr=microsoft%20remote%20desktop%20for%20mac; push_noty_num=0; push_doumail_num=0; utmv=30149280.8277; ct=y; dbcl2="82771268:DgQVL/PAHOM"; ck=NIH4; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1583303993%2C%22https%3A%2F%2Fwww.baidu.com%2Fs%3Fie%3Dutf-8%26f%3D8%26rsv_bp%3D1%26ch%3D%26tn%3Dbaidu%26bar%3D%26wd%3D%25E6%259C%25B1%25E8%25BF%25AA%26oq%3DJudy.2019.%26rsv_pq%3Dc7ca6b9b000c927d%26rsv_t%3Ddf18ju54V9KyOQDdTUOYo4FWELLR2X6OQoXHcJhqO5e5WCUi5CPibeNGq%252Fc%26rqlang%3Dcn%26rsv_enter%3D1%26rsv_dl%3Dtb%26inputT%3D6494%22%5D; _pk_ses.100001.4cf6=*; utma=30149280.363329858.1579408383.1583285998.1583303993.15; utmb=30149280.0.10.1583303993; utmc=30149280; utma=223695111.1275774826.1580548095.1583285998.1583303993.13; utmb=223695111.0.10.1583303993; __utmc=223695111; _pk_id.100001.4cf6=3174f2cb4a0368b8.1580548095.11.1583304000.1583286152.'
    }
    return [
        req.get(each, headers=headers, timeout=timeout)
        for each in page_url
    ]
def how_many_pages(pages=None):  # crawl depth
    """Build the list-page URLs for the requested number of pages.

    Args:
        pages: Number of pages to cover, as an int or a numeric string.
            When omitted, the user is prompted on stdin (the original
            behaviour), so existing zero-argument callers keep working.

    Returns:
        A list of URLs, one per page; page ``i`` starts at offset ``25 * i``.

    Raises:
        ValueError: If the supplied or typed value is not an integer.
    """
    if pages is None:
        pages = input('How many pages do you want to scrap?(total:10):')
    return [
        'https://movie.douban.com/top250?start=' + str(25 * i) + '&filter='
        for i in range(int(pages))
    ]
def analysis_content(html):  # parse the listing pages
    """Extract a ``{image alt text: subject id}`` mapping from listing pages.

    Args:
        html: Iterable of response objects exposing a ``.text`` attribute
            (one per downloaded page).

    Returns:
        A dict keyed by the ``alt`` of each ``div.pic`` image, whose value is
        the second-to-last ``/``-separated segment of the sibling link's
        ``href``. Later entries overwrite earlier ones with the same key.
    """
    home_ids = {}  # ids collected from the listing pages
    for response in html:
        # Parse the decoded body (``.text``), not the response object itself.
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        for pic in soup.find_all('div', class_='pic'):
            link, image = pic.a, pic.img
            if link is None or image is None:
                # Not a regular entry; skip it instead of crashing.
                continue
            href, movie_name = link.get('href'), image.get('alt')
            if href is None or movie_name is None:
                continue
            segments = str(href).split('/')
            if len(segments) < 2:
                continue
            # With a trailing '/', the last segment is empty and the id is
            # the one before it.
            home_ids[movie_name] = segments[-2]
    return home_ids
def movice_detail(ids, timeout=10):  # fetch each movie's poster page
    """Request the poster listing page for every subject id in ``ids``.

    Args:
        ids: Mapping whose values are subject id strings; the keys are not
            used here.
        timeout: Seconds handed to ``requests`` for each request, so a
            stalled connection raises instead of blocking forever.

    Returns:
        A list of ``requests`` responses in the mapping's iteration order.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/79.0.3945.117 Safari/537.36'
    }
    return [
        req.get(
            "https://movie.douban.com/subject/" + value + "/photos?type=R",
            headers=headers,
            timeout=timeout,
        )
        for value in ids.values()
    ]
def analysis_movice_detail(html, min_size=0, max_posters=5):  # parse each poster page
    """Collect ids of posters whose dimensions exceed ``min_size``.

    The result is also stored in the module-level ``conformIds`` (reset on
    every call), which ``image_save`` reads.

    Args:
        html: Iterable of response objects exposing ``.text`` (one poster
            page per movie).
        min_size: Both numbers parsed from the ``div.prop`` text must be
            strictly greater than this. The default of 0 keeps the existing
            behaviour; pass e.g. 800 to keep only posters larger than 800 in
            both dimensions.
        max_posters: Upper bound on posters kept per page (default 5).

    Returns:
        A dict mapping ``<first word of the page's h1><running number>`` to
        the poster's ``data-id``.
    """
    global conformIds
    conformIds = {}  # ids of the posters that meet the size requirement
    for response in html:
        # Parse the decoded body (``.text``), not the response object itself.
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        poster_list = soup.find('ul', class_='poster-col3 clearfix')
        heading = soup.find('h1')
        if poster_list is None or heading is None:
            # ``find`` returns None when the element is absent; skip the page
            # rather than aborting the whole run.
            continue
        title_words = heading.text.split()
        if not title_words:
            continue
        title = title_words[0]

        kept = 0
        for item in poster_list.find_all('li'):
            if kept >= max_posters:
                break
            real_id = item.get('data-id')
            prop = item.find('div', class_='prop')
            if real_id is None or prop is None:
                continue
            try:
                # The text is split on 'x'; the first whitespace-delimited
                # token on each side is the number.
                first, second = prop.text.split('x')[:2]
                width = int(first.split()[0])
                height = int(second.split()[0])
            except (ValueError, IndexError):
                # Unparseable size text: ignore this poster.
                continue
            if width > min_size and height > min_size:
                kept += 1
                conformIds[title + str(kept)] = real_id
    return conformIds
def image_save(timeout=10):
    """Download every poster listed in the global ``conformIds``.

    Creates a fresh output folder, changes the working directory into it and
    writes one ``<key>.jpg`` file per entry.

    Args:
        timeout: Seconds handed to ``requests`` for each image request.

    Raises:
        OSError: If the output folder cannot be created for a reason other
            than the name already existing.
    """
    folder = '豆瓣top250'
    g = 1
    # Avoid clashing with an existing folder: keep trying "name(1)",
    # "name(2)", ... until one can be created.
    while True:
        try:
            os.mkdir(folder)
            break
        except FileExistsError:
            folder = '豆瓣top250(' + str(g) + ')'
            g += 1
    os.chdir(folder)

    for name, photo_id in conformIds.items():
        headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/79.0.3945.117 Safari/537.36',
            'referer': 'https://movie.douban.com/photos/photo/' + photo_id
        }
        # Download first so a failed request does not leave an empty or
        # bogus .jpg behind.
        img_html = req.get(
            "https://img3.doubanio.com/view/photo/raw/public/p" + photo_id,
            headers=headers,
            timeout=timeout,
        )
        if not img_html.ok:
            print('Skipping ' + name + ': HTTP ' + str(img_html.status_code))
            continue
        with open(name + '.jpg', 'wb') as f:
            f.write(img_html.content)
def main():
    """Run the whole pipeline: listing pages -> poster pages -> image files."""
    page = how_many_pages()
    html = get_content(page)
    movies = analysis_content(html)
    a = movice_detail(movies)  # fetch each movie's poster page
    # Fills the global ``conformIds`` that ``image_save`` reads.
    analysis_movice_detail(a)
    image_save()
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()