# Douban Top 250 movie chart crawler
import os

import pandas as pd
import requests
from lxml import etree
def _parse_movie_item(li):
    """Extract one movie's fields from an ``<li>`` node of the Top 250 list.

    Args:
        li: lxml element for a single ``<li>`` entry of the ranking list.

    Returns:
        dict with the keys ``serial_num``, ``movie_name``, ``movie_inroduce``,
        ``star``, ``evaluate`` and ``movie_img_url``.

    Raises:
        IndexError: if one of the expected nodes is missing from the entry.
        ValueError: if the vote-count text is not an integer once the
            "人评价" suffix has been removed.
    """
    pic = './div[@class="item"]/div[@class="pic"]'
    info = './div[@class="item"]/div[@class="info"]'
    bd = info + '/div[@class="bd"]'

    # Movie serial number (its position in the ranking).
    serial_num = li.xpath(pic + '/em/text()')[0]
    # Movie name: first title span inside the header link.
    movie_name = li.xpath(info + '/div[@class="hd"]/a/span[1]/text()')[0]
    # First text node of the first paragraph in the "bd" block.
    movie_inroduce = li.xpath(bd + '/p[1]/text()')[0].strip()
    # Second span of the "star" block.
    star = li.xpath(bd + '/div[@class="star"]/span[2]/text()')[0]
    # Fourth span of the "star" block, e.g. "<number>人评价"; keep the number.
    evaluate = li.xpath(bd + '/div[@class="star"]/span[4]/text()')[0]
    evaluate = int(evaluate.replace("人评价", ""))
    movie_img_url = li.xpath(pic + '/a/img/@src')[0]

    return {
        'serial_num': serial_num,
        'movie_name': movie_name,
        # NOTE: the misspelled key is kept on purpose; it is the CSV column name.
        'movie_inroduce': movie_inroduce,
        'star': star,
        'evaluate': evaluate,
        'movie_img_url': movie_img_url,
    }


def spider_douban_top250():
    """Scrape the Douban Top 250 list, export it to CSV and download posters.

    Requests the ten list pages (``start`` = 0, 25, ..., 225), parses every
    ``<li>`` entry, prints each parsed movie, writes all records to
    ``top250.csv`` and finally saves each movie's image under
    ``./Include/pachong2/`` as ``0000000<serial_num>.jpg``.

    Returns:
        None. Results are delivered via stdout, the CSV file and image files.

    Raises:
        requests.RequestException: if a request fails or times out.
        IndexError, ValueError: propagated from ``_parse_movie_item`` when a
            list entry does not have the expected structure.
    """
    movie_list_info = []
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
    # Without a timeout a stalled connection would block the script forever.
    timeout = 10

    for i in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start={}&filter='.format(i)
        data = requests.get(url, headers=headers, timeout=timeout).content
        html = etree.HTML(data)
        ol_list = html.xpath('//div[@id="content"]//div[@class="article"]/ol/li')
        for li in ol_list:
            movie = _parse_movie_item(li)
            print(movie['movie_inroduce'])
            movie_list_info.append(movie)

    for movie in movie_list_info:
        print(movie)

    # Export the collected records. The original referenced an undefined
    # ``book_list`` here, which raised NameError after all pages were scraped.
    df = pd.DataFrame(movie_list_info)
    df.to_csv('top250.csv')

    # Download the images.
    img_dir = './Include/pachong2'
    # Create the target directory up front so open() does not fail on a
    # fresh checkout.
    os.makedirs(img_dir, exist_ok=True)
    for movie in movie_list_info:
        # Reuse the same headers as the page requests for consistency.
        resp = requests.get(movie['movie_img_url'], headers=headers, timeout=timeout)
        # Images that do not come back with HTTP 200 are skipped.
        if resp.status_code == 200:
            img_name = '0000000{}.jpg'.format(movie['serial_num'])
            with open('{}/{}'.format(img_dir, img_name), 'wb') as f:
                f.write(resp.content)
# Run the crawler. There is no ``__main__`` guard, so this call also executes
# whenever the module is imported.
spider_douban_top250()