Web Scraping
1. Scraping Dangdang book listings by ISBN
from lxml import html
import requests
import pandas as pd
from matplotlib import pyplot as plt
plt.rcParams["font.sans-serif"] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
def spider_dangdang(isbn):
    book_list = []
    # Target site: Dangdang's search page, keyed by ISBN
    url = 'http://search.dangdang.com/?key={}&act=input'.format(isbn)
    # print(url)
    # Fetch the page as a str response; a browser User-Agent avoids the simplest bot blocking
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
    }
    resp = requests.get(url, headers=headers)
    html_data = resp.text
    # Optionally write the HTML page to a local file for offline inspection
    # with open('dangdang.html', 'w', encoding='utf-8') as f:
    #     f.write(html_data)

    # Extract the target information from the page
    selector = html.fromstring(html_data)
    ul_list = selector.xpath('//div[@id="search_nature_rg"]/ul/li')
    print('{} stores are currently selling this book'.format(len(ul_list)))

    # Walk the result list, one <li> per listing
    for li in ul_list:
        # Book title
        title = li.xpath('./a/@title')[0].strip()
        print(title)
        # Purchase link
        link = li.xpath('./a/@href')[0]
        print(link)
        # Price: strip the currency symbol and convert to float so the sort below is numeric
        price = li.xpath('./p[@class="price"]/span[@class="search_now_price"]/text()')[0]
        price = float(price.replace('¥', ''))
        print(price)
        # Seller name; listings without one are sold by Dangdang itself (当当自营)
        store = li.xpath('./p[@class="search_shangjia"]/a/text()')
        store = '当当自营' if len(store) == 0 else store[0]
        print(store)
        book_list.append({
            'title': title,
            'price': price,
            'link': link,
            'store': store
        })

    # Sort by price, ascending
    book_list.sort(key=lambda x: x['price'])
    for book in book_list:
        print(book)

    # Horizontal bar chart of the 10 cheapest listings, labelled by store;
    # slicing avoids an IndexError when fewer than 10 stores are found
    top10_store = book_list[:10]
    stores = [book['store'] for book in top10_store]
    prices = [book['price'] for book in top10_store]
    plt.barh(stores, prices)
    plt.show()

    # Save the full result set as a CSV file
    df = pd.DataFrame(book_list)
    df.to_csv('dangdang.csv', index=False)
spider_dangdang('9787115428028')
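One fragility worth noting: each li.xpath(...)[0] assumes the field is present, so a listing missing a title, link, or price span raises IndexError and aborts the whole scrape. A minimal hardening sketch, assuming the same element structure as above (first_or is a hypothetical helper, not part of Dangdang's markup or any library):

# Hypothetical helper: return the first XPath match, or a default
# instead of raising IndexError when the match list is empty.
def first_or(matches, default=''):
    return matches[0] if matches else default

# Inside the loop above, price extraction could then be hedged like this:
#   raw = first_or(li.xpath('./p[@class="price"]/span[@class="search_now_price"]/text()'), '¥0')
#   price = float(raw.replace('¥', ''))

On the output side, df.to_csv('dangdang.csv', index=False, encoding='utf-8-sig') would write a UTF-8 BOM, which helps Excel on Windows detect the encoding of the Chinese titles and store names.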
2. Ranking upcoming movies on Douban by popularity
from lxml import html
import requests
import pandas as pd
from matplotlib import pyplot as plt
plt.rcParams["font.sans-serif"] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
def spider_douban():
    movie_list = []
    # Douban's "coming soon" page for Chongqing
    url = 'https://movie.douban.com/cinema/later/chongqing/'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
    }
    resp = requests.get(url, headers=headers)
    html_data = resp.text
    # print(html_data)

    selector = html.fromstring(html_data)
    # One child <div> per upcoming movie
    div_list = selector.xpath('//div[@id="showing-soon"]/div')
    print('{} movies are about to be released'.format(len(div_list)))

    for item in div_list:
        # Movie title
        name = item.xpath('./div[@class="intro"]/h3/a/text()')[0]
        # Release date, genre, and country are the first three <li> texts;
        # movie_type avoids shadowing the built-in type()
        date = item.xpath('./div[@class="intro"]/ul/li/text()')[0]
        movie_type = item.xpath('./div[@class="intro"]/ul/li/text()')[1]
        country = item.xpath('./div[@class="intro"]/ul/li/text()')[2]
        # "Want to watch" count, e.g. "1234人想看"; strip the suffix, keep the number
        user = item.xpath('./div[@class="intro"]/ul/li/span/text()')[0]
        user = int(user.replace("人想看", ""))
        movie_list.append({
            'name': name,
            'date': date,
            'type': movie_type,
            'country': country,
            'user': user
        })

    # Sort by "want to watch" count, descending
    movie_list.sort(key=lambda x: x['user'], reverse=True)
    for movie in movie_list:
        print(movie)

    # Bar chart of the 5 most anticipated movies
    top5 = movie_list[:5]
    names = [movie['name'] for movie in top5]
    users = [movie['user'] for movie in top5]
    plt.title("最想看的电影")  # "Most anticipated movies"
    plt.bar(names, users)
    plt.show()
    # Tally how many upcoming movies come from each country
    country_counts = {}
    for movie in movie_list:
        country_counts[movie['country']] = country_counts.get(movie['country'], 0) + 1
    print(country_counts)

    # Pie chart of the country distribution
    counts = list(country_counts.values())
    countries = list(country_counts.keys())
    plt.pie(counts, labels=countries, autopct='%1.1f%%')
    plt.show()

    # Save the full result set as a CSV file
    df = pd.DataFrame(movie_list)
    df.to_csv("douban.csv", index=False)
spider_douban()
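The manual dict tally in the country section can also be written with collections.Counter from the standard library; a small self-contained sketch, with sample data standing in for the scraped movie_list:

from collections import Counter

# Sample records with the same shape the scraper produces
movie_list = [
    {'name': 'A', 'country': '中国大陆'},
    {'name': 'B', 'country': '美国'},
    {'name': 'C', 'country': '中国大陆'},
]

# Counter tallies hashable items; most_common() returns (item, count)
# pairs sorted by count, descending.
country_counts = Counter(movie['country'] for movie in movie_list)
print(country_counts.most_common())  # [('中国大陆', 2), ('美国', 1)]

The pie chart can consume this tally directly via list(country_counts.values()) and list(country_counts.keys()), exactly as in the script above.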