import requests
import re
import parsel
import json
import csv
import time
import random
# HTTP headers sent with every bestseller-page request: a cookie string and a
# desktop-Chrome User-Agent. The cookie is assembled from its individual
# "name=value" pairs so each one can be read (and replaced) on its own line.
headers = {
    'Cookie': '; '.join([
        'ddscreen=2',
        'ddscreen=2',
        'ddscreen=2',
        'dest_area=country_id%3D9000%26province_id%3D111%26city_id%20%3D0%26district_id%3D0%26town_id%3D0',
        '__permanent_id=20220529131201545170054263945468705',
        '__visit_id=20220529131201628993540613163952339',
        '__out_refer=1653801122%7C!%7Cwww.baidu.com%7C!%7C',
        'MDD_channelId=70000',
        'MDD_fromPlatform=307',
        '__rpm=%7C...1653802581239',
        '__trace_id=20220529133621962340066219653765986',
    ]),
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux aarch64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/88.0.4324.188 Safari/537.36 CrKey/1.54.250320'
    ),
}
# Scrape pages 1-10 of the Dangdang 24-hour bestseller list and append one CSV
# row per book to 当当书籍.csv.
# The `with` block guarantees the file is flushed and closed even if a request
# or a selector fails part-way through.
with open('当当书籍.csv', encoding='utf-8', newline='', mode='a') as f:
    csv_writer = csv.DictWriter(f, fieldnames=[
        '标题',
        '推荐',
        '评论',
        '作者',
        '出版日期',
        '出版社',
        '原价',
        '售价',
        '折扣',
        '电子书',
    ])
    # The file is opened in append mode, so only write the header when the
    # file is still empty; otherwise every run would add a header row in the
    # middle of the data.
    if f.tell() == 0:
        csv_writer.writeheader()

    for page in range(1, 11):
        # Random 1-3 second pause between page requests.
        time.sleep(random.randint(1, 3))
        print(f'=================正在爬取第{page} 页')
        url = f'http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-24hours-0-0-1-{page}'
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url=url, headers=headers, timeout=10)
        print(response)
        selector = parsel.Selector(response.text)
        lis = selector.css('ul.bang_list li')
        print(lis)
        for li in lis:
            title = li.css('.name a::attr(title)').get()
            comment = li.css('.star a::text').get()      # link text next to the star rating
            recommend = li.css('.tuijian::text').get()   # recommendation text
            author = li.css('div:nth-child(5) a:nth-child(1)::attr(title)').get()  # author
            date = li.css('div:nth-child(6) span::text').get()   # publication date
            press = li.css('div:nth-child(6) a::text').get()     # publisher
            price_n = li.css('.price p:nth-child(1) .price_n::text').get()  # selling price
            price_r = li.css('.price p:nth-child(1) .price_r::text').get()  # list (original) price
            price_s = li.css('.price p:nth-child(1) .price_s::text').get()  # discount
            ebook = li.css('.price_e .price_n::text').get()                 # e-book price
            # Each value goes under the column that names it: the recommendation
            # text under 推荐, the comment text under 评论, the list price
            # (price_r) under 原价 and the selling price (price_n) under 售价.
            row = {
                '标题': title,
                '推荐': recommend,
                '评论': comment,
                '作者': author,
                '出版日期': date,
                '出版社': press,
                '原价': price_r,
                '售价': price_n,
                '折扣': price_s,
                '电子书': ebook,
            }
            csv_writer.writerow(row)
https://blog.csdn.net/qq_52691614/article/details/124001680
python pandas利用str.extract()方法处理标签
extract
比如说输入闪订 ,然后看每一行的 全部标签 数据里有没有 闪订,
如果有的话 闪订 的值计为1,没有的话计为0,总共有9个标签,最后实现了这样的效果
import numpy as np
import pandas as pd
import re
def add_tag_flag(data, tag, column='所有标签'):
    """Add a 0/1 indicator column named *tag* to *data* and return it.

    A row gets 1 when the text in *column* contains *tag* as a literal
    substring, otherwise 0. Rows whose *column* value is missing get 0.

    The tag is matched literally (not as a regular expression), so tags that
    contain characters such as ``(`` or ``.`` neither raise nor match
    unrelated text.
    """
    data[tag] = data[column].str.contains(tag, regex=False, na=False).astype(int)
    return data[tag]


if __name__ == '__main__':
    data = pd.read_csv(r'D:\BaiduNetdiskDownload\shanghai_all_districts(1).csv')
    while True:
        x = input('请输入要匹配的标签')
        # Entering "q" leaves the loop.
        if x == 'q':
            break
        add_tag_flag(data, x)
        # Show the freshly computed indicator column.
        print(data[x])
————————————————
版权声明:本文为CSDN博主「鈴音.」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/qq_52691614/article/details/124001680
=========================================================
import pandas as pd
from pyecharts.charts import *
from pyecharts.globals import ThemeType#设定主题
from pyecharts.commons.utils import JsCode
import pyecharts.options as opts
2.导入数据
# Load the book records from 书籍信息.csv as UTF-8, using pandas' Python
# parsing engine.
df = pd.read_csv('书籍信息.csv', encoding='utf-8', engine='python')
# Preview the first rows (a bare expression: only rendered by a notebook).
df.head()
3.数据处理
# Book name = the part of the title before the first "(".
# The vectorised .str accessor leaves a missing title as NaN instead of
# raising on a non-string value.
df['书名'] = df['标题'].str.split('(', n=1).str[0]
df.head()
# Book blurb = the text inside the first "(...)" of the title. The pattern
# has exactly one capture group and expand=False, so the result is a single
# Series that can be assigned to one column.
df['书籍简介'] = df['标题'].str.extract(r'.*?\((.*?)\)', expand=False)
# Titles without a parenthesised part get the placeholder 无. Assigning the
# result back avoids an in-place fillna on a column selection.
df['书籍简介'] = df['书籍简介'].fillna('无')
df.head(1)
提取评论数
# Strip the "条评论" suffix from the review text, then store what is left as
# a 64-bit integer comment count.
comment_text = df['评价'].str.replace('条评论', '')
df['评论数'] = comment_text.astype('int64')
df.head(1)
原价、售价、电子书价格 数值化
# Step 1: drop the currency sign from all three price columns.
for price_col in ('原价', '售价', '电子书价格'):
    df[price_col] = df[price_col].str.replace('¥', '')
df.head(1)
df.info()
# Step 2: remove the comma separators from the list price and selling price
# and convert both to float64.
for price_col in ('原价', '售价'):
    df[price_col] = df[price_col].str.replace(',', '').astype('float64')
选择需要用到的列
# Keep only the columns used by the analysis below. The explicit .copy()
# makes the result an independent frame, so the column assignments that
# follow modify it directly rather than a selection of the original frame
# (which pandas flags with SettingWithCopyWarning).
df = df[['书名', '书籍简介', '评论数', '作者', '出版日期', '出版社', '原价', '售价', '电子书价格']].copy()
df.head(1)
缺失值
# Count missing values per column before filling.
df.isnull().sum()
# Replace missing author / publisher with the placeholder 未知. The result is
# assigned back to the column: calling fillna(inplace=True) on a column
# selection is deprecated in pandas 2.x and, under Copy-on-Write, leaves the
# frame unchanged.
df['作者'] = df['作者'].fillna('未知')
df['出版社'] = df['出版社'].fillna('未知')
# Count again after filling.
df.isnull().sum()
电子书价格列额外处理
# Convert the e-book price to float, then mark books without an e-book.
ebook_price = df['电子书价格'].str.replace(',', '').astype('float64')
# The placeholder is a string, so the column is switched to object dtype
# explicitly before filling; the result is assigned back instead of using an
# in-place fillna on a column selection.
df['电子书价格'] = ebook_price.astype(object).fillna('无电子书版本')
重复值
# Number of fully duplicated rows (the rows themselves are not removed here).
df.duplicated().sum()
# Print column dtypes and non-null counts.
df.info()
# Summary statistics of the frame's columns.
df.describe()
4.数据可视化
书籍总体价格区间
def tranform_price(x):
    """Return the price-range label for price *x* (in yuan).

    Upper bounds are inclusive: 50 -> '0~50元', 100 -> '51~100元',
    500 -> '101~500元', 1000 -> '501~1000元'; anything above 1000 is
    '1000以上'.

    A missing price (NaN or None) returns '未知'. Without this check a NaN
    fails every ``<=`` comparison and would be counted as '1000以上'.
    """
    if pd.isna(x):
        return '未知'
    for upper_bound, label in (
        (50.0, '0~50元'),
        (100.0, '51~100元'),
        (500.0, '101~500元'),
        (1000.0, '501~1000元'),
    ):
        if x <= upper_bound:
            return label
    return '1000以上'
# Bucket the list price, count the books per bucket, and turn the counts
# into (label, count) pairs with plain Python ints for the chart.
df['价格分级'] = df['原价'].apply(tranform_price)
price_1 = df['价格分级'].value_counts()
datas_pair_1 = [(label, int(count)) for label, count in price_1.items()]
# Same steps for the selling price.
df['售价价格分级'] = df['售价'].apply(tranform_price)
price_2 = df['售价价格分级'].value_counts()
datas_pair_2 = [(label, int(count)) for label, count in price_2.items()]
def _price_range_pie(data_pairs, title):
    """Build a dark-themed ring pie chart from (label, count) pairs.

    The ring uses an inner radius of 35% and an outer radius of 60%; *title*
    is placed in the centre of the chart and each slice is labelled
    "name:percent%".
    """
    return (
        Pie(init_opts=opts.InitOpts(theme='dark', width='1000px', height='600px'))
        .add('', data_pairs, radius=['35%', '60%'])
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}:{d}%"))
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title=title,
                pos_left='center',
                pos_top='center',
                title_textstyle_opts=opts.TextStyleOpts(
                    color='#F0F8FF',
                    font_size=20,
                    font_weight='bold'
                ),
            )
        )
        .set_colors(['#EF9050', '#3B7BA9', '#6FB27C', '#FFAF34', '#D8BFD8', '#00BFFF', '#7FFFAA'])
    )


# Distribution of list-price ranges.
pie1 = _price_range_pie(datas_pair_1, "当当网书籍\n\n原价价格区间")
pie1.render_notebook()
# Distribution of selling-price ranges (rebinds pie1, as before).
pie1 = _price_range_pie(datas_pair_2, "当当网书籍\n\n售价价格区间")
pie1.render_notebook()
各个出版社书籍数量柱状图
# Number of books per publisher, keeping the 20 publishers with the most books.
counts = df.groupby('出版社')['书名'].count().sort_values(ascending=False).head(20)
# Upper bound of the y axis, taken from the data so that no bar is clipped
# when the counts change; None lets the chart pick its own range when there
# is nothing to plot.
counts_axis_max = int(counts.max()) if len(counts) else None
bar=(
    Bar(init_opts=opts.InitOpts(height='500px',width='1000px',theme='dark'))
    .add_xaxis(counts.index.tolist())
    .add_yaxis(
        '出版社书籍数量',
        counts.values.tolist(),
        label_opts=opts.LabelOpts(is_show=True,position='top'),
        itemstyle_opts=opts.ItemStyleOpts(
            # Vertical colour gradient for the bars, evaluated by ECharts in the browser.
            color=JsCode("""new echarts.graphic.LinearGradient(
0, 0, 0, 1,[{offset: 0,color: 'rgb(255,99,71)'}, {offset: 1,color: 'rgb(32,178,170)'}])
"""
            )
        )
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title='各个出版社书籍数量柱状图'),
        # The categories on this axis are publishers, so label it accordingly.
        xaxis_opts=opts.AxisOpts(name='出版社',
            type_='category',
            axislabel_opts=opts.LabelOpts(rotate=90),
        ),
        yaxis_opts=opts.AxisOpts(
            name='数量',
            min_=0,
            max_=counts_axis_max,
            splitline_opts=opts.SplitLineOpts(is_show=True,linestyle_opts=opts.LineStyleOpts(type_='dash'))
        ),
        tooltip_opts=opts.TooltipOpts(trigger='axis',axis_pointer_type='cross')
    )
    .set_series_opts(
        # Reference lines for the average, maximum and minimum of the series.
        markline_opts=opts.MarkLineOpts(
            data=[
                opts.MarkLineItem(type_='average',name='均值'),
                opts.MarkLineItem(type_='max',name='最大值'),
                opts.MarkLineItem(type_='min',name='最小值'),
            ]
        )
    )
)
bar.render_notebook()
电子书版本占比
# Share of books WITHOUT an e-book edition. Comparing the column to the
# placeholder and averaging the booleans gives the same ratio as
# count / len(df), but yields 0.0 instead of raising KeyError when every book
# has an e-book edition.
per = float((df['电子书价格'] == '无电子书版本').mean())
c = (
    Liquid()
    # The gauge shows the complementary share: books that DO have an e-book.
    .add("lq", [1 - per], is_outline_show=False)
    .set_global_opts(title_opts=opts.TitleOpts(title="电子书版本占比"))
)
c.render_notebook()
书籍单价最高Top20
# The 20 book names with the highest list price. When a name occurs in
# several rows, its highest list price is used; summing the rows would report
# a value that is not the price of any single book.
price_top = df.groupby('书名')['原价'].max().sort_values(ascending=False).head(20)
price_top
# Upper bound of the y axis, taken from the data so that no bar is clipped
# when the prices change; None lets the chart pick its own range when there
# is nothing to plot.
price_axis_max = float(price_top.max()) if len(price_top) else None
bar=(
    Bar(init_opts=opts.InitOpts(height='500px',width='1000px',theme='dark'))
    .add_xaxis(price_top.index.tolist())
    .add_yaxis(
        '书籍单价',
        price_top.values.tolist(),
        label_opts=opts.LabelOpts(is_show=True,position='top'),
        itemstyle_opts=opts.ItemStyleOpts(
            # Vertical colour gradient for the bars, evaluated by ECharts in the browser.
            color=JsCode("""new echarts.graphic.LinearGradient(
0, 0, 0, 1,[{offset: 0,color: 'rgb(255,99,71)'}, {offset: 1,color: 'rgb(32,178,170)'}])
"""
            )
        )
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title='单价最高的书籍详细柱状图'),
        xaxis_opts=opts.AxisOpts(name='书籍名称',
            type_='category',
            axislabel_opts=opts.LabelOpts(rotate=90),
        ),
        yaxis_opts=opts.AxisOpts(
            name='单价/元',
            min_=0,
            max_=price_axis_max,
            splitline_opts=opts.SplitLineOpts(is_show=True,linestyle_opts=opts.LineStyleOpts(type_='dash'))
        ),
        tooltip_opts=opts.TooltipOpts(trigger='axis',axis_pointer_type='cross')
    )
    .set_series_opts(
        # Reference lines for the average, maximum and minimum of the series.
        markline_opts=opts.MarkLineOpts(
            data=[
                opts.MarkLineItem(type_='average',name='均值'),
                opts.MarkLineItem(type_='max',name='最大值'),
                opts.MarkLineItem(type_='min',name='最小值'),
            ]
        )
    )
)
bar.render_notebook()