# 1. 豆瓣电影的爬取(无反爬)
## Standard library: directory creation and path handling
import os

## Third-party: HTTP requests
import requests
## Third-party: HTML parsing
from lxml import etree
## Third-party: tabular data and CSV export
import pandas as pd
# Module-level accumulators filled by the script entry point: the movie
# records and the poster image URLs gathered across all crawled pages.
MOVIES = list()
IMGURLS = list()
## Fetch the source of a web page
def get_html(url):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page source. An empty string is returned when the
        request itself fails (connection error, timeout, invalid URL).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        print("获取源代码失败:%s" % e)
        # Return early: there is no response object to read text from.
        return ""
    # Re-decode the body using the encoding detected from its content.
    response.encoding = response.apparent_encoding
    if response.status_code == 200:
        print("成功获得源代码")
    else:
        # The body is still returned, but make the unexpected status visible.
        print("获取源代码失败:%s" % response.status_code)
    # response.text holds the page source.
    return response.text
## Parse the page source into movie records
def _pick(nodes, index=0, default=""):
    """Return ``nodes[index]``, or *default* when the XPath result is too short."""
    return nodes[index] if len(nodes) > index else default


def parse_html(html):
    """Extract movie records and poster URLs from a list page.

    Args:
        html: Page source as text.

    Returns:
        A ``(movies, imgurls)`` tuple. ``movies`` is a list of dicts with the
        keys ``name``, ``director_actor``, ``info``, ``rating_score`` and
        ``rating_num``; ``imgurls`` holds the matching poster URL for each
        record, in the same order. Both lists are empty when *html* is empty
        or cannot be parsed.
    """
    movies = []
    imgurls = []
    # Nothing to parse (e.g. the download failed): return empty results
    # instead of letting the parser fail on an empty document.
    if not html or not html.strip():
        return movies, imgurls
    # Build the element tree for the page.
    tree = etree.HTML(html)
    if tree is None:
        return movies, imgurls
    # XPath returns the matching <li> entries as a list.
    items = tree.xpath("//ol[@class='grid_view']/li")
    print(len(items))
    for li in items:
        name = _pick(li.xpath(".//a/span[@class='title']/text()"))
        imgurl = _pick(li.xpath(".//img/@src"))
        if not name or not imgurl:
            # Without a title or poster URL the entry cannot be saved later,
            # so skip it rather than abort the whole page with an IndexError.
            print("跳过缺少标题或图片的条目")
            continue
        # Evaluate the description text nodes once: the first feeds
        # director_actor, the second feeds info.
        description = li.xpath(".//div[@class='bd']/p/text()")
        director_actor = _pick(description, 0).strip()
        info = _pick(description, 1).strip()
        rating_score = _pick(li.xpath(".//div[@class='star']/span[2]/text()"))
        rating_num = _pick(li.xpath(".//div[@class='star']/span[4]/text()"))
        # Bundle the fields of one movie into a single record.
        movie = {'name': name, 'director_actor': director_actor, 'info': info, 'rating_score': rating_score,
                 'rating_num': rating_num}
        movies.append(movie)
        imgurls.append(imgurl)
        print(name)
        print(director_actor)
        print(info)
        print(rating_score)
        print(rating_num)
        print(imgurl)
    return movies, imgurls
## Download the poster image of one movie
def downloading(url, movie, save_dir=r"D:\spider\movieposter"):
    """Download the image at *url* and save it as ``<movie name>.jpg``.

    Args:
        url: Address of the poster image.
        movie: Movie record; only its ``'name'`` entry is read, to build the
            file name.
        save_dir: Folder the image is written to. It is created, together
            with any missing parent folders, when it does not exist yet.

    A failed download is reported on stdout and skipped; no file is written.
    """
    # makedirs creates the folder at its full path, so the result no longer
    # depends on what the current working directory happens to be.
    os.makedirs(save_dir, exist_ok=True)
    # Write through a full path instead of os.chdir, leaving the process
    # working directory untouched.
    target = os.path.join(save_dir, movie['name'] + '.jpg')
    try:
        response = requests.get(url, timeout=10)
        # Avoid saving an HTTP error page under a .jpg name.
        response.raise_for_status()
    except requests.RequestException as e:
        print('下载失败: %s (%s)' % (url, e))
        return
    with open(target, 'wb') as f:
        print('正在下载: %s' % url)
        f.write(response.content)
if __name__ == '__main__':
    # Crawl the 10 list pages; the start offset advances by 25 per page.
    for page in range(10):
        url = "http://movie.douban.com/top250?start=" + str(page * 25) + "&filter="
        html = get_html(url)
        # Parse each page once and unpack both results, instead of calling
        # parse_html twice on the same source.
        movies, imgurls = parse_html(html)
        # Merge the results of all pages.
        MOVIES.extend(movies)
        IMGURLS.extend(imgurls)
    # Pair URLs and records by position; a fixed range(250) would raise
    # IndexError whenever fewer entries were collected.
    for imgurl, movie in zip(IMGURLS, MOVIES):
        downloading(imgurl, movie)
    # Raw string: "\s" is not a valid escape sequence in a normal literal.
    output_dir = r"D:\spider"
    os.makedirs(output_dir, exist_ok=True)
    # Turn the list of dicts into a table that can be written as CSV.
    moviedata = pd.DataFrame(MOVIES)
    print(moviedata)
    # utf-8-sig writes a BOM so that Excel detects the encoding and shows
    # the Chinese text correctly.
    moviedata.to_csv(os.path.join(output_dir, "movie1.csv"), encoding="utf-8-sig")
    print("电影信息保存到本地")