"""
导入相应的库
import requests
import csv
from bs4 import BeautifulSoup
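import time  # assumed addition: only needed for the optional per-page delay inside the crawl loop below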
def get_douban_top250():
    # 1. Define the base URL and a list to store the results
    base_url = "https://movie.douban.com/top250?start={}&filter="
    movies = []
    # 2. Loop over 10 pages, 25 movies per page, 250 in total
    for page in range(10):
        start = page * 25  # compute the start parameter: 0, 25, 50, ... 225
        url = base_url.format(start)
        # 3. Send the HTTP request with browser-like headers (to avoid basic anti-scraping
        #    blocks); the headers are passed as a dict of key-value pairs
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36 Edg/142.0.0.0"
        }
        response = requests.get(url, headers=headers)  # send the GET request
        # 4. Parse the HTML page
        soup = BeautifulSoup(response.text, "html.parser")  # parse the response body with BeautifulSoup
        # Locate all movie entries (each movie sits in a <div class="item"> within an <li>)
        movie_list = soup.find_all("div", class_="item")
        # 5. Extract the information for each movie
        for movie in movie_list:
            # Extract the title (the first <span class="title"> inside the entry)
            title_tag = movie.find("span", class_="title")
            title = title_tag.get_text()  # e.g. "肖申克的救赎" (The Shawshank Redemption)
            # Extract the rating (the <span class="rating_num"> tag)
            score_tag = movie.find("span", class_="rating_num")
            score = score_tag.get_text()
            # Append the extracted fields to the result list
            movies.append({
                "电影名": title,  # movie title
                "评分": score     # rating
            })
        # Print progress
        print(f"Scraped page {page + 1}; {len(movies)} movies collected so far")
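        # Optional politeness delay (an addition, not part of the original script): pausing
        # briefly between page requests reduces the chance of being rate-limited;
        # the 1-second value is only an assumption.
        time.sleep(1)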
    # 6. Save the data to a CSV file
    with open("douban_top250.csv", "w", encoding="utf-8", newline="") as f:
        fieldnames = ["电影名", "评分"]  # CSV column headers: movie title (电影名) and rating (评分)
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for movie in movies:
            writer.writerow(movie)
print("爬取已完成,数据已保存到douban_250.csv")