# Crawler example 1 (爬虫实例1)
import requests
from lxml import html
import pandas as pd
from matplotlib import pyplot as plt
# Global matplotlib settings: select the SimHei sans-serif font and turn off
# the Unicode minus glyph (presumably so Chinese labels and negative tick
# values render correctly with that font).
plt.rcParams.update({
    "font.sans-serif": ['SimHei'],
    'axes.unicode_minus': False,
})
def splider_dangdang(isbn):
    """Scrape Dangdang search results for a book and report the offers.

    Fetches the search page for ``isbn`` from search.dangdang.com, extracts
    the title, price, purchase link and seller name of every listed offer,
    prints the offers sorted by ascending price, shows a horizontal bar
    chart of the (up to) ten cheapest offers, and writes all offers to
    ``dangdang.csv`` in the current working directory.

    Args:
        isbn: Search keyword inserted into the query URL (an ISBN string
            in the example call).

    Returns:
        None. Results are printed, plotted and saved as side effects.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    # Target search page.
    url = 'http://search.dangdang.com/?key={}&act=input'.format(isbn)
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
    # ``headers`` must be passed by keyword: the second positional parameter
    # of requests.get is ``params`` (query string), not the HTTP headers.
    # The timeout keeps the script from hanging forever on a stalled server.
    resp = requests.get(url, headers=headers, timeout=10)
    html_data = resp.text

    # Extract one <li> per offer from the result list.
    selector = html.fromstring(html_data)
    ul_list = selector.xpath('//div[@id="search_nature_rg"]/ul/li')
    print('你好,共有{}家店铺售卖此图书'.format(len(ul_list)))

    book_list = []
    for li in ul_list:
        titles = li.xpath('./a/@title')
        prices = li.xpath('./p[@class = "price"]/span[@class="search_now_price"]/text()')
        links = li.xpath('a/@href')
        # Skip entries that lack a required field instead of crashing the
        # whole run on a single malformed listing.
        if not (titles and prices and links):
            continue
        try:
            price = float(prices[0].replace('¥', ''))
        except ValueError:
            # Price text is not a plain number; ignore this listing.
            continue

        # Listings without a seller link are labelled as Dangdang's own store.
        sellers = li.xpath('./p[@class="search_shangjia"]/a/text()')
        store = sellers[0] if sellers else '当当自营'

        book_list.append({
            'title': titles[0].strip(),
            'price': price,
            'link': links[0],
            'store': store,
        })

    # Cheapest offers first.
    book_list.sort(key=lambda book: book['price'])
    for book in book_list:
        print(book)

    # Bar chart of the cheapest offers; slicing tolerates fewer than ten.
    top10_store = book_list[:10]
    x = [book['store'] for book in top10_store]
    y = [book['price'] for book in top10_store]
    plt.barh(x, y)
    plt.show()

    # Persist every offer as CSV.
    df = pd.DataFrame(book_list)
    df.to_csv('dangdang.csv')
# Run the Dangdang crawler for one hard-coded ISBN.
splider_dangdang(isbn='9787115428028')
# Crawler example 2 (爬虫实例2)
import requests
from lxml import html
import pandas as pd
from matplotlib import pyplot as plt
import jieba
import string
# Global matplotlib settings: select the SimHei sans-serif font and turn off
# the Unicode minus glyph (presumably so Chinese labels and negative tick
# values render correctly with that font).
plt.rcParams.update({
    "font.sans-serif": ['SimHei'],
    'axes.unicode_minus': False,
})
def spider_douban():
    """Scrape Douban's upcoming-movies page for Chongqing and chart it.

    Fetches https://movie.douban.com/cinema/later/chongqing/, extracts the
    title, release date, genre, country and "want to see" count of every
    listed movie, prints the movies sorted by descending "want to see"
    count, shows a horizontal bar chart of the (up to) five most wanted
    movies, and shows a pie chart of how many movies come from each country.

    Returns:
        None. Results are printed and plotted as side effects.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    url = 'https://movie.douban.com/cinema/later/chongqing/'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
    # ``headers`` must be passed by keyword: the second positional parameter
    # of requests.get is ``params`` (query string), not the HTTP headers.
    # The timeout keeps the script from hanging forever on a stalled server.
    resp = requests.get(url, headers=headers, timeout=10)
    html_data = resp.text

    selector = html.fromstring(html_data)
    div_list = selector.xpath('//div[@id="showing-soon"]/div')
    print('你好,共有{}部电影即将上映'.format(len(div_list)))

    movie_list = []
    for div in div_list:
        titles = div.xpath('./div/h3/a/text()')
        dates = div.xpath('./div/ul/li[1]/text()')
        genres = div.xpath('./div/ul/li[2]/text()')
        countries = div.xpath('./div/ul/li[3]/text()')
        wishes = div.xpath('./div/ul/li/span/text()')
        # Skip entries that lack a required field instead of crashing the
        # whole run on a single malformed listing.
        if not (titles and dates and genres and countries and wishes):
            continue
        try:
            number = float(wishes[0].replace('人想看', ''))
        except ValueError:
            # "Want to see" text is not a plain number; ignore this entry.
            continue

        # The dictionary keys (including the 'conuntry' spelling) are kept
        # as-is because they appear in the printed output.
        movie_list.append({
            'title': titles[0].strip(),
            'date': dates[0],
            'type': genres[0],
            'conuntry': countries[0],
            'number': number,
        })

    # Most wanted movies first.
    movie_list.sort(key=lambda movie: movie['number'], reverse=True)
    for movie in movie_list:
        print(movie)

    # Bar chart of the most wanted movies; slicing tolerates fewer than five.
    top5_movie = movie_list[:5]
    x = [movie['title'] for movie in top5_movie]
    y = [movie['number'] for movie in top5_movie]
    plt.barh(x, y)
    plt.show()

    # Pie chart of the share of upcoming movies per country.
    # The countries come from the parsed records (movie_list); the raw
    # lxml elements in div_list cannot be indexed by a string key.
    s = [movie['conuntry'] for movie in movie_list]
    print(s)
    counts = {}
    for word in s:
        counts[word] = counts.get(word, 0) + 1
    print(counts)
    # Materialize the dict views as lists so matplotlib receives sequences.
    name = list(counts)
    print(name)
    counts_num = list(counts.values())
    print(counts_num)
    if counts_num:
        # ``explode`` must have exactly one entry per wedge; pull out only
        # the first wedge, whatever the number of countries turns out to be.
        explode1 = [0.1] + [0] * (len(counts_num) - 1)
        plt.pie(counts_num, explode=explode1, labels=name, shadow=True, autopct='%1.1f%%')
        plt.axis('equal')
        plt.legend(loc=2)
        plt.show()
# Run the Douban upcoming-movies crawler.
spider_douban()