# XPath 语法 (XPath syntax)
# Query a local HTML file with XPath.
# Requires lxml: pip install lxml
from lxml import html

# Parse the page straight from the file handle's contents.
with open('index.html', mode='r', encoding='utf-8') as page_file:
    selector = html.fromstring(page_file.read())

# Each query returns a list of matches; only the first match is printed.
#   .../text()  selects the text content of a tag
#   .../@name   selects the value of the attribute called `name`
for expression in (
    '/html/body/h1/text()',
    '/html/body/a/@href',
    '/html/body/img/@src',
):
    print(selector.xpath(expression)[0])
# 豆瓣 Top250 (Douban Top 250)
# Scrape the Douban Top 250 listing page by page. For every movie entry the
# script records the title, score, the text of the fourth star-block span and
# the poster URL, saves the poster under ./imgs, then plots how often each
# score occurs and exports all records to doubantop250.csv.
import os
from collections import Counter

import requests
from lxml import html
from matplotlib import pyplot as plt
import pandas as pd

# Browser-like User-Agent so the requests are presented as coming from a browser.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}

# Seconds to wait for a response. requests applies no timeout unless one is
# given, so without this a stalled connection would block the script forever.
REQUEST_TIMEOUT = 10

# One dict per movie, accumulated across all pages.
movie_ls = []

# Posters are written into ./imgs; create it up front so the first write
# does not fail on a fresh checkout.
os.makedirs('./imgs', exist_ok=True)

# The listing is paginated 25 entries at a time: start = 0, 25, ..., 225.
for start in range(0, 250, 25):
    url = "https://movie.douban.com/top250?start={}&filter=".format(start)
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Show the detected encoding and the HTTP status for debugging.
    print(response.encoding)
    print(response.status_code)
    if response.status_code != 200:
        # An error response carries no movie entries; report it and move on
        # instead of trying to parse it as a listing page.
        print('skipping {}: HTTP {}'.format(url, response.status_code))
        continue
    data = response.text
    selector = html.fromstring(data)
    # XPath notes:
    #   `//` starts matching from anywhere in the document.
    #   Text of a tag:      //tag1[@attr="value"]/tag2[@attr="value"]/text()
    #   Attribute of a tag: //tag1[@attr="value"]/tag2[@attr="value"]/@attr_name
    ol_list = selector.xpath('//div[@id="content"]//ol/li')
    print(len(ol_list))
    for movie in ol_list:
        # All paths below are relative to the current <li> element.
        movie_name = movie.xpath(
            'div[@class="item"]/div[@class="info"]/div[@class="hd"]/a/span[1]/text()'
        )[0]
        print(movie_name)
        movie_score = movie.xpath(
            'div[@class="item"]/div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[2]/text()'
        )[0]
        print(movie_score)
        # Presumably the "number of ratings" label -- confirm against the page markup.
        movie_evals = movie.xpath(
            'div[@class="item"]/div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[4]/text()'
        )[0]
        print(movie_evals)
        movie_img_link = movie.xpath('div[@class="item"]/div[@class="pic"]/a/img/@src')[0]
        print(movie_img_link)
        # .content is the raw response body (bytes). Reuse the same headers
        # as the page request so both requests look alike to the server.
        img_data = requests.get(
            movie_img_link, headers=headers, timeout=REQUEST_TIMEOUT
        ).content
        # 'wb' = write binary.
        with open('./imgs/{}.jpg'.format(movie_name), mode='wb') as f:
            f.write(img_data)
        movie_ls.append({
            "movie_name": movie_name,
            "movie_score": movie_score,
            "movie_evals": movie_evals,
            "movie_img_link": movie_img_link,
        })

# Share of each score among ALL collected movies (e.g. what fraction is 9.1).
# The tally is built from the accumulated records rather than from a
# per-page dict, so it can never be reset between pages.
counts = Counter(entry["movie_score"] for entry in movie_ls)
num_ls = list(counts.values())
score_ls = list(counts.keys())
plt.pie(num_ls, labels=score_ls)
plt.show()

df = pd.DataFrame(movie_ls)
df.to_csv('doubantop250.csv')