一、XPath
- 定义:XPath即为XML路径语言(XML Path Language),它是一种用来确定XML文档中某部分位置的计算机语言。XPath基于XML的树状结构,提供在数据结构树中找寻节点的能力。
- XPath解析匹配规则
/ 代表选取直接子节点
// 代表选择任意子孙节点
. 代表选取当前节点
.. 代表选取当前节点的父节点
@ 表示属性的限定,选取匹配属性的特定节点
- 解析例子
index.html:
<!DOCTYPE html>
<!-- Sample page queried by the XPath examples in the Python script that follows. -->
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
<!-- Page heading; the script reads its text with /html/body/h1/text(). -->
<h1>欢迎来到王者荣耀</h1>
<!-- Unordered list of six items; each item is a link wrapping a thumbnail image and a name. -->
<ul>
<li><a href="herodetail/506.shtml" target="_blank"><img src="//game.gtimg.cn/images/yxzj/img201606/heroimg/506/506.jpg" width="91" height="91" alt="云中君">云中君</a></li>
<li><a href="herodetail/505.shtml" target="_blank"><img src="//game.gtimg.cn/images/yxzj/img201606/heroimg/505/505.jpg" width="91" height="91" alt="瑶">瑶</a></li>
<li><a href="herodetail/529.shtml" target="_blank"><img src="//game.gtimg.cn/images/yxzj/img201606/heroimg/529/529.jpg" width="91" height="91" alt="盘古">盘古</a></li>
<li><a href="herodetail/511.shtml" target="_blank"><img src="//game.gtimg.cn/images/yxzj/img201606/heroimg/511/511.jpg" width="91" height="91" alt="猪八戒">猪八戒</a></li>
<li><a href="herodetail/515.shtml" target="_blank"><img src="//game.gtimg.cn/images/yxzj/img201606/heroimg/515/515.jpg" width="91" height="91" alt="嫦娥">嫦娥</a></li>
<li><a href="herodetail/513.shtml" target="_blank"><img src="//game.gtimg.cn/images/yxzj/img201606/heroimg/513/513.jpg" width="91" height="91" alt="上官婉儿">上官婉儿</a></li>
</ul>
<!-- Ordered list with three text-only items. -->
<ol>
<li>坦克</li>
<li>战士</li>
<li>刺客</li>
</ol>
<!-- Three div elements without any attributes. -->
<div>这是第1个div标签</div>
<div>这是第2个div标签</div>
<div>这是第3个div标签</div>
<!-- The only div carrying an id; the script selects it with //div[@id='container'] to read the p text and the a href. -->
<div id="container">
<p>欢迎拉到百度</p>
<a href="www.baidu.com">百度</a>
</div>
</body>
</html>
python文件:
from lxml import html  # third-party parser; provides html.fromstring()

# Read the sample page from disk as text.
with open("./index.html", "r", encoding="UTF-8") as f:
    html_data = f.read()

# Parse the markup into an element tree that can be queried with XPath.
selector = html.fromstring(html_data)

# A leading "/" starts at the document root; a trailing "/text()" yields the
# text inside the matched tag. The result is a list, hence the [0] below.
h1 = selector.xpath("/html/body/h1/text()")
print(h1[0])

# "//" matches at any depth; "[@id='container']" filters on an attribute value.
p = selector.xpath("//div[@id='container']/p/text()")
# "/@href" yields the attribute value rather than the element itself.
phref = selector.xpath("//div[@id='container']/a/@href")
print(p[0])
print(phref[0])
输出结果
欢迎来到王者荣耀
欢迎拉到百度
www.baidu.com
二、使用requests获取网站响应
import requests

url = "http://www.baidu.com"
# Bound the wait: without a timeout, requests may block indefinitely on an
# unresponsive server.
response = requests.get(url, timeout=10)
print(response)
print(response.status_code)  # HTTP status code
print(response.headers)  # response headers
print(response.content)  # response body as bytes
print(response.text)  # response body decoded to str
有些网站需要带上请求头headers才能得到网站响应,如知乎,否则会返回状态码 400 Bad Request。
import requests

# Defining the request headers as a dict (here a browser-like User-Agent)
# lets the request obtain a correct response.
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
# Bound the wait so an unresponsive server cannot hang the script.
resp = requests.get('https://www.zhihu.com/', headers=headers, timeout=10)
print(resp.status_code)
三、爬取网站信息
爬取当当网某图书的信息
import requests
from lxml import html
import pandas as pd
from matplotlib import pyplot as plt
# Global matplotlib settings for the charts below: select the SimHei font
# (presumably so the Chinese labels render) and turn off the Unicode minus
# glyph for axis ticks.
plt.rcParams.update({
    "font.sans-serif": ['SimHei'],
    'axes.unicode_minus': False,
})
def spider_dangdang(isbn):
    """Scrape Dangdang search results for ``isbn`` and compare listing prices.

    Prints every listing sorted by ascending price, draws a horizontal bar
    chart of the ten cheapest listings, and writes all listings to
    ``dangdang.csv`` in the current working directory.

    Args:
        isbn: Search keyword substituted into the Dangdang search URL; the
            call below passes an ISBN string.

    Returns:
        None.

    Raises:
        IndexError: If a result item has no title, price or link node.
        ValueError: If a price text cannot be converted to ``float``.
    """
    book_list = []
    url = 'http://search.dangdang.com/?key={}&act=input'.format(isbn)
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
    # Bound the wait so an unresponsive server cannot hang the script.
    resp = requests.get(url, headers=headers, timeout=10)
    html_data = resp.text
    # Uncomment to save the raw page when debugging the selectors:
    # with open('dangdang.html', 'w', encoding='utf-8') as f:
    #     f.write(html_data)

    # Extract the result items from the target page.
    selector = html.fromstring(html_data)
    ul_list = selector.xpath('//div[@id="search_nature_rg"]/ul/li')
    print('共有{}家店铺售卖此图书'.format(len(ul_list)))

    for li in ul_list:
        title = li.xpath('./a/@title')[0].strip()  # book title
        price_text = li.xpath('./p[@class="price"]/span[@class="search_now_price"]/text()')[0]  # price text
        price = float(price_text.replace("¥", ""))  # drop the currency sign
        href = li.xpath('./a/@href')[0]  # purchase link
        sellers = li.xpath('./p[@class="search_shangjia"]/a/text()')  # seller name
        # Items without a seller node get the fallback store label.
        shop = sellers[0] if sellers else '当当自营'
        book_list.append({
            'title': title,
            'price': price,
            'href': href,
            'shop': shop,
        })

    # Cheapest first.
    book_list.sort(key=lambda x: x['price'])
    for book in book_list:
        print(book)

    # Ten cheapest listings. Slicing from index 0 keeps the cheapest entry
    # (range(1, 11) skipped it) and tolerates fewer than ten results.
    top10_shop = book_list[:10]
    print(top10_shop)
    shops = [book['shop'] for book in top10_shop]
    print(shops)
    prices = [book['price'] for book in top10_shop]
    print(prices)
    plt.barh(shops, prices)
    plt.show()

    # Persist every listing, not only the ten that were charted.
    df = pd.DataFrame(book_list)
    df.to_csv('dangdang.csv')


spider_dangdang('9787115428028')
某图书当当价格最低的十家
爬取豆瓣电影上映信息
import requests
from lxml import html
from matplotlib import pyplot as plt
# Select the SimHei font (presumably so Chinese labels render) and turn off
# the Unicode minus glyph for axis ticks.
plt.rcParams["font.sans-serif"] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

url = 'https://movie.douban.com/cinema/later/chongqing/'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
# Bound the wait so an unresponsive server cannot hang the script.
resp = requests.get(url, headers=headers, timeout=10)
html_data = resp.text
selector = html.fromstring(html_data)

# One dict per child div of the "showing-soon" container.
movie_list = []
div_list = selector.xpath('//div[@id="showing-soon"]/div')
for div in div_list:
    name = div.xpath('./div/h3/a/text()')[0]
    date = div.xpath('./div/ul/li[1]/text()')[0]
    # Named movie_type so the built-in type() is not shadowed.
    movie_type = div.xpath('./div/ul/li[2]/text()')[0]
    country = div.xpath('./div/ul/li[3]/text()')[0]
    number = div.xpath('./div/ul/li[4]/span/text()')[0]
    # Strip the suffix so the remaining digits parse as an int.
    number = int(number.replace("人想看", ""))
    movie_list.append({
        "name": name,
        "date": date,
        "type": movie_type,
        "country": country,
        "number": number,
    })

# Highest "number" first.
movie_list.sort(key=lambda x: x['number'], reverse=True)
print(movie_list)

# Count movies per country for the pie chart.
counts = {}
for movie in movie_list:
    counts[movie['country']] = counts.get(movie['country'], 0) + 1
# Keys and values of one dict iterate in the same order, so labels and
# counts stay aligned.
countrys = list(counts)
countrys_count = list(counts.values())
print(countrys)
print(countrys_count)
plt.pie(countrys_count, labels=countrys, autopct='%1.1f%%')
plt.show()

# Bar chart of the five movies with the highest "number". Slicing tolerates
# fewer than five entries, where indexing with range(5) raised IndexError.
top5_movie = movie_list[:5]
x = [movie['name'] for movie in top5_movie]
y = [movie['number'] for movie in top5_movie]
plt.barh(x, y)
plt.show()
电影国家占比
想看人数前五