爬虫学习

# -*- coding: utf-8 -*-
# @Time    : 2019/7/31 11:28
# @Author  : Eric Lee
# @Email   : li.yan_li@neusoft.com
# @File    : spider_dangdang.py
# @Software: PyCharm
import requests
 from lxml import html
def spider_dangdang(isbn):
    """Search dangdang.com for ``isbn`` and print every listing found.

    First prints the number of result entries on the page, then, for each
    entry that could be parsed, its title, purchase link, price (as a float)
    and seller name, one per line.

    Args:
        isbn: ISBN (or any other search keyword); sent as the ``key`` query
            parameter of the search page.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Let requests build the query string so the keyword is URL-encoded.
    url = 'http://search.dangdang.com/'
    params = {'key': isbn, 'act': 'input'}
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}

    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.get(url, params=params, headers=headers, timeout=10)
    resp.raise_for_status()
    html_data = resp.text

    # Every <li> under the result container is one listing.
    selector = html.fromstring(html_data)
    ul_list = selector.xpath('//div[@id="search_nature_rg"]/ul/li')
    print('您好，共有{}家店铺售卖此图书'.format(len(ul_list)))

    for li in ul_list:
        titles = li.xpath('./a/@title')
        links = li.xpath('./a/@href')
        prices = li.xpath('./p[@class="price"]/span[@class="search_now_price"]/text()')
        # Entries missing any of these fields cannot be reported meaningfully;
        # skip them instead of crashing on an empty XPath result.
        if not (titles and links and prices):
            continue

        # Accept both the half-width and the full-width yen sign as prefix.
        try:
            price = float(prices[0].strip().lstrip('¥￥'))
        except ValueError:
            continue

        # Listings without an explicit seller are sold by dangdang itself.
        sellers = li.xpath('./p[@class="search_shangjia"]/a/text()')
        store = sellers[0] if sellers else '当当自营'

        print(titles[0].strip())
        print(links[0])
        print(price)
        print(store)

XPath

XPath 节点

节点
在 XPath 中，有七种类型的节点：元素、属性、文本、命名空间、处理指令、注释以及文档（根）节点。XML 文档是被作为节点树来对待的。树的根被称为文档节点或者根节点。

请看下面这个 XML 文档：

    <?xml version="1.0" encoding="UTF-8"?>
    <bookstore>
      <book>
        <title lang="en">Harry Potter</title>
        <author>J K. Rowling</author>
        <year>2005</year>
        <price>29.99</price>
      </book>
    </bookstore>

上面的XML文档中的节点例子：
<bookstore> (根元素节点；文档节点是整个文档的根，位于 <bookstore> 之上)
<author>J K. Rowling</author> (元素节点)
lang="en" (属性节点)

基本值（或称原子值，Atomic value）
基本值是没有父节点、也没有子节点的值（它本身不是节点）。
基本值的例子：
J K. Rowling
"en"
项目（Item）
项目是基本值或者节点。

节点关系
父（Parent）
每个元素以及属性都有一个父。
在下面的例子中，book 元素是 title、author、year 以及 price 元素的父：

    <book>
      <title>Harry Potter</title>
      <author>J K. Rowling</author>
      <year>2005</year>
      <price>29.99</price>
    </book>

子（Children）
元素节点可有零个、一个或多个子。
在下面的例子中，title、author、year 以及 price 元素都是 book 元素的子：

    <book>
      <title>Harry Potter</title>
      <author>J K. Rowling</author>
      <year>2005</year>
      <price>29.99</price>
    </book>

同胞（Sibling）
拥有相同的父的节点
在下面的例子中，title、author、year 以及 price 元素都是同胞：

    <book>
      <title>Harry Potter</title>
      <author>J K. Rowling</author>
      <year>2005</year>
      <price>29.99</price>
    </book>

先辈（Ancestor）
某节点的父、父的父，等等。
在下面的例子中，title 元素的先辈是 book 元素和 bookstore 元素：

    <bookstore>
      <book>
        <title>Harry Potter</title>
        <author>J K. Rowling</author>
        <year>2005</year>
        <price>29.99</price>
      </book>
    </bookstore>

后代（Descendant）
某个节点的子，子的子，等等。
在下面的例子中，bookstore 的后代是 book、title、author、year 以及 price 元素：

    <bookstore>
      <book>
        <title>Harry Potter</title>
        <author>J K. Rowling</author>
        <year>2005</year>
        <price>29.99</price>
      </book>
    </bookstore>

选取节点

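XPath 使用路径表达式在 XML 文档中选取节点。最常用的路径表达式如下：

    nodename    选取当前节点下名为 nodename 的子元素
    /           从根节点开始选取
    //          从当前节点开始选取文档中所有匹配的节点，而不考虑它们的位置
    .           选取当前节点
    ..          选取当前节点的父节点
    @           选取属性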

电影top5

  import requests
  from lxml import html
  import pandas as pd
 import jieba
 from matplotlib import pyplot as plt
# Use the SimHei font for sans-serif text so the Chinese labels in the charts
# can be rendered (assumes SimHei is installed on the machine — TODO confirm).
plt.rcParams["font.sans-serif"] = ['SimHei']
# Draw minus signs with the ASCII hyphen rather than the Unicode minus sign.
plt.rcParams['axes.unicode_minus'] = False
def Film():
    """Scrape Douban's upcoming films for Chongqing and chart them.

    Prints every scraped film, the list sorted by "want to watch" count, and
    the intermediate lists used for plotting. Then shows two pie charts: the
    share of "want to watch" counts among the (up to) five most anticipated
    films, and the share of films per release country.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = 'https://movie.douban.com/cinema/later/chongqing/'
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.get(url, headers=header, timeout=10)
    resp.raise_for_status()
    html_data = resp.text

    # Each direct child <div> of the "showing-soon" container is one film.
    selector = html.fromstring(html_data)
    film = selector.xpath('//div[@id="showing-soon"]/div')
    print(film)

    div_list = []
    for film_list in film:
        # Title, release date, genre, country, "want to watch" text.
        fields = [
            film_list.xpath('./div/h3/a/text()'),
            film_list.xpath('./div/ul/li[1]/text()'),
            film_list.xpath('./div/ul/li[2]/text()'),
            film_list.xpath('./div/ul/li[3]/text()'),
            film_list.xpath('./div/ul/li[4]/span/text()'),
        ]
        # Skip entries missing any field instead of crashing on an empty
        # XPath result.
        if not all(fields):
            continue
        title_list, time_list, type_list, con_list, number_list = (
            values[0] for values in fields
        )
        print(title_list)
        print(time_list)
        print(type_list)
        print(con_list)
        print(number_list)

        # Drop the "人想看" suffix to get the numeric count.
        try:
            number_list = int(number_list.replace('人想看', ''))
        except ValueError:
            continue

        div_list.append({
            'title': title_list,
            'time': time_list,
            'type': type_list,
            'con': con_list,
            'number': number_list
        })

    # Most anticipated films first.
    div_list.sort(key=lambda item: item['number'], reverse=True)
    print(div_list)
    for items_list in div_list:
        print(items_list)

    # Nothing was scraped, so there is nothing to plot.
    if not div_list:
        return

    # Pie chart 1: share of "want to watch" counts among the top films.
    # Slicing copes with pages that list fewer than five films.
    top5_store = div_list[:5]
    x = [item['title'] for item in top5_store]
    print(x)
    y = [item['number'] for item in top5_store]
    print(y)
    # Pull out the first wedge; plt.pie requires one offset per wedge.
    explode = [0.1] + [0] * (len(y) - 1)
    plt.pie(y, explode=explode, labels=x, shadow=True, autopct='%1.1f%%')
    plt.axis('equal')
    plt.legend(loc=2)
    plt.show()

    # Pie chart 2: number of upcoming films per release country.
    counts = {}
    s = [item['con'] for item in div_list]
    print(s)
    for word in s:
        counts[word] = counts.get(word, 0) + 1
    print(counts)
    name = counts.keys()
    print(name)
    number = counts.values()
    print(number)
    # Size the offsets from the data rather than assuming four countries.
    explode1 = [0.1] + [0] * (len(counts) - 1)
    # Pass real lists: plt.pie converts its data to an array, which a
    # dict view does not support.
    plt.pie(list(number), explode=explode1, labels=list(name), shadow=True, autopct='%1.1f%%')
    plt.axis('equal')
    plt.legend(loc=2)
    plt.show()
# Script entry point: run the scraper and show both charts.
Film()

学习Python的第四天