import os

import pandas as pd
import requests
from lxml import html
def spider(isbn):
    """Scrape Douban's "coming soon" movie list for a city, then report and save it.

    The function fetches the listing page, prints the movies in page order and
    again sorted by "want to see" count (descending), writes the sorted list to
    ``豆瓣电影信息.csv`` in the working directory, and downloads every cover
    image into ``../image``.

    Args:
        isbn: Despite the name, this is the city identifier that is inserted
            into the listing URL (for example ``'shenyang'``).

    Returns:
        None. All results are printed or written to disk.

    Raises:
        requests.RequestException: If the listing page itself cannot be fetched
            (including a timeout). Failures of individual cover downloads are
            reported and skipped instead.
    """
    url = 'https://movie.douban.com/cinema/later/{}/'.format(isbn)
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"}
    # Without a timeout a stalled connection would block the script forever.
    html_data = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT).text
    selector = html.fromstring(html_data)
    div_list = selector.xpath('//div[@id="showing-soon"]/div')
    print('有{}部电影即将上映'.format(len(div_list)))
    movie_info_list = [_parse_movie(div) for div in div_list]

    _print_movies('##################排序前####################', movie_info_list)
    # Most-wanted movies first; list.sort is stable, so ties keep page order.
    movie_info_list.sort(key=_want_count, reverse=True)
    _print_movies('##################排序后####################', movie_info_list)

    # Persist the sorted records to a local CSV file.
    df = pd.DataFrame(movie_info_list)
    df.to_csv('豆瓣电影信息.csv')

    _download_posters(movie_info_list)


# Seconds to wait for any single HTTP request before giving up.
_REQUEST_TIMEOUT = 10

_BANNER_RULE = '############################################'


def _first(values, default=''):
    """Return the first item of *values*, or *default* when it is empty."""
    return values[0] if values else default


def _parse_movie(div):
    """Extract one movie's fields from its listing ``<div>`` into a dict."""
    # The first three <li> text nodes are read as release date, genre and
    # country, in that order. Query once and pad so a listing that lacks a
    # field yields '' instead of raising IndexError.
    li_texts = div.xpath('div/ul/li/text()')
    movie_days, movie_types, movie_countries = (list(li_texts) + [''] * 3)[:3]
    # Keep only the digits of the "N人想看" label.
    movie_numbers = _first(div.xpath('div/ul/li/span/text()')).replace('人想看', '')
    return {
        "movie_name": _first(div.xpath('div[@class="intro"]/h3/a/text()')),
        "movie_days": movie_days,
        "movie_types": movie_types,
        "movie_countries": movie_countries,
        "movie_numbers": movie_numbers,
        # Stored as the raw list of matching URLs (possibly empty).
        "movie_photos": div.xpath('a/img/@src'),
    }


def _want_count(movie):
    """Return the movie's "want to see" count as an int, or 0 if unparsable."""
    try:
        return int(movie['movie_numbers'])
    except ValueError:
        return 0


def _print_banner(title_line):
    """Print *title_line* framed by a rule of ``#`` above and below."""
    print(_BANNER_RULE)
    print(title_line)
    print(_BANNER_RULE)


def _print_movies(title_line, movies):
    """Print *movies* as a whole list, then one record per line, with banners."""
    _print_banner(title_line)
    print(movies)
    _print_banner('##################具体如下###################')
    for movie in movies:
        print(movie)


def _download_posters(movies, directory='../image'):
    """Download each movie's first cover image to ``<directory>/<name>.jpg``."""
    # open() does not create missing parent directories.
    os.makedirs(directory, exist_ok=True)
    for movie in movies:
        photos = movie['movie_photos']
        if not photos:
            # Listing without a cover image: nothing to download.
            continue
        movie_link = photos[0]
        try:
            res = requests.get(movie_link, timeout=_REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # One broken image must not abort the rest of the batch.
            print('封面下载失败: {} ({})'.format(movie_link, exc))
            continue
        if res.status_code == 200:
            # A path separator inside the title would point at a non-existent
            # sub-directory, so neutralise it before building the file name.
            file_stem = movie['movie_name']
            for separator in {'/', os.sep}:
                file_stem = file_stem.replace(separator, '_')
            with open('{}/{}.jpg'.format(directory, file_stem), 'wb') as f:
                f.write(res.content)
# Script entry: ask for a city and scrape its upcoming movies.
# Despite its name, ``isbn`` holds the city typed by the user; it is inserted
# into the listing URL as a path segment. Surrounding whitespace is stripped
# so that a stray space cannot end up inside the request URL.
isbn = input('请输入要搜索的城市名(中英文均可),例如:沈阳或shenyang').strip()
spider(isbn)
请输入要搜索的城市名(中英文均可),例如:沈阳或shenyang沈阳
有25部电影即将上映
############################################
##################排序前####################
############################################
[{'movie_name': '爱在零纬度', 'movie_days': '10月29日', 'movie_types': '爱情 / 家庭', 'movie_countries': '中国大陆', 'movie_numbers': '188', 'movie_photos': ['https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2572231307.jpg']}...]
############################################
##################具体如下###################
############################################
{'movie_name': '爱在零纬度', 'movie_days': '10月29日', 'movie_types': '爱情 / 家庭', 'movie_countries': '中国大陆', 'movie_numbers': '188', 'movie_photos': ['https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2572231307.jpg']}
{'movie_name': '小巷管家', 'movie_days': '10月29日', 'movie_types': '剧情', 'movie_countries': '中国大陆', 'movie_numbers': '122', 'movie_photos': ['https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2572339805.jpg']}
...
############################################
##################排序后####################
############################################
[{'movie_name': '天气之子', 'movie_days': '11月01日', 'movie_types': '爱情 / 动画 / 奇幻', 'movie_countries': '日本', 'movie_numbers': '67442', 'movie_photos': ['https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2570055890.jpg']},
...]
############################################
##################具体如下###################
############################################
{'movie_name': '天气之子', 'movie_days': '11月01日', 'movie_types': '爱情 / 动画 / 奇幻', 'movie_countries': '日本', 'movie_numbers': '67442', 'movie_photos': ['https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2570055890.jpg']}
{'movie_name': '终结者:黑暗命运', 'movie_days': '11月01日', 'movie_types': '动作 / 科幻 / 冒险', 'movie_countries': '美国', 'movie_numbers': '21889', 'movie_photos': ['https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2571762536.jpg']}
...
保存到本地csv文件,效果如下:
豆瓣电影信息.csv.png
批量下载图片到本地,效果如下:
图片保存到本地.png