# --*--coding:utf-8--*--
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import re
# One accumulator list per scraped field, filled by the crawl loop:
# mc=title, lj=detail link, xq=community, hx=house info, td=follow info,
# fb=tags, jg=price info, wz=district text taken from the detail page.
mc, lj, xq, hx, td, fb, jg, wz = ([] for _ in range(8))

# The prompt asks (in Chinese) for the lowercase pinyin initials of the city
# name; the answer is interpolated into the listing URL and output file name.
city = input('请输入城市全称的小写首拼:')

# Browser-like User-Agent header sent with the HTTP requests.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 '
        'Safari/605.1.15'
    ),
}

# HTTPS proxy candidates.
# NOTE(review): nothing in the visible code reads this list - confirm whether
# it is still needed.
ip = [
    {"https": proxy}
    for proxy in ("103.103.3.6:8080", "61.37.223.152:8080", "211.24.95.49:47615")
]
# Walk result pages 1-9 of the city's second-hand listings and append one
# entry per listing to each of the module-level accumulator lists.
for q in range(1, 10):
    url = 'https://%s.ke.com/ershoufang/pg%s/' % (city, q)
    print(url)
    r = Request(url=url, headers=header)
    # Parse inside a context manager so the HTTP response is always closed.
    with urlopen(r) as html:
        bs = BeautifulSoup(html, 'lxml')
    div = bs.find_all('div', {'class': 'info clear'})
    for i in div:
        title_link = i.find('div', class_='title').a
        n = title_link.text
        print(n)
        u = title_link.get('href')
        a = i.find('div', class_='address').a.text
        h = i.find('div', class_='houseInfo').text
        d = i.find('div', class_='followInfo').text
        y = i.find('div', class_='tag').text
        p = i.find('div', class_='priceInfo').text
        mc.append(n)
        lj.append(u)
        xq.append(a)
        hx.append(h)
        td.append(d)
        fb.append(y)
        jg.append(p)

        # Fetch the detail page with the same browser-like headers as the
        # list page (a bare urlopen(u) would send Python's default
        # User-Agent), and close the response once it has been parsed.
        with urlopen(Request(url=u, headers=header)) as u1:
            bs1 = BeautifulSoup(u1, 'lxml')

        # Record exactly one district string per listing so that wz stays the
        # same length as the other lists: take the first 'areaName' block that
        # has an 'info' span, or an empty placeholder when there is none.
        a2 = ''
        for j in bs1.find_all('div', class_='areaName'):
            info_span = j.find('span', class_='info')
            if info_span is not None:
                a2 = info_span.text
                break
        wz.append(a2)
# Assemble the scraped lists into one table; every list must have the same
# length (one entry per listing).
df = pd.DataFrame({'名称': mc, '区': wz, '小区': xq, '户型': hx, '特点': fb, '发布时间': td, '价格': jg, '链接': lj})

# Normalise whitespace. Raw-string patterns avoid the invalid '\s' escape of a
# plain literal. These columns keep a single space between tokens because
# some of them are split on ' ' below...
for column in ('区', '特点', '价格'):
    df[column] = df[column].str.replace(r'\s+', ' ', regex=True).str.strip()
# ...while these have all whitespace removed (they are split on '|' and '/').
for column in ('户型', '发布时间'):
    df[column] = df[column].str.replace(r'\s+', '', regex=True).str.strip()

# Split the composite text columns once each. A row with fewer fields than
# expected yields NaN in the affected cell instead of aborting the export.
location_parts = df['区'].str.split(' ')
df['位置1'] = location_parts.str[0]
df['位置2'] = location_parts.str[1]

house_parts = df['户型'].str.split('|')
df['楼层'] = house_parts.str[0]
df['建设时间'] = house_parts.str[1]
df['户型1'] = house_parts.str[2]
df['面积'] = house_parts.str[-2]
df['朝向'] = house_parts.str[-1]

follow_parts = df['发布时间'].str.split('/')
df['发布日期'] = follow_parts.str[1]
df['关注度'] = follow_parts.str[0]

price_parts = df['价格'].str.split(' ')
df['总价'] = price_parts.str[0] + '万'
df['单价'] = price_parts.str[2].str.replace('单价', '', regex=False)

# Select the export columns in their final order and give the helper columns
# their output names.
data = df[['名称', '位置1', '位置2', '小区', '楼层', '建设时间', '户型1', '面积', '朝向', '特点', '发布日期',
           '关注度', '单价', '总价', '链接']].rename(
    columns={'位置1': '区', '位置2': '详细区域', '小区': '小区名称', '户型1': '户型'})
data.to_excel(r'C:\Users\admin\Desktop\%s_lj_data.xlsx' % city, index=False)
# The code above is the crawler that scrapes the listing data; the code below is the data-analysis script.
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyecharts.charts import Bar,Line,Grid,Page
from pyecharts import options as opts
# Load the listings workbook that the analysis works on.
df = pd.read_excel(r'C:\Users\admin\Desktop\20210813_bkdata.xlsx', sheet_name='Sheet1')

# Strip the unit suffix from the area and total-price text and convert to float.
for column, unit in (('面积', '平米'), ('总价', '万')):
    df[column] = df[column].apply(lambda x, unit=unit: float(re.sub(unit, '', x)))

# Remove the compass characters and the "N厅" fragments from the layout text,
# one fragment per pass and in this fixed order.
for fragment in ('东', '南', '西', '北', '0厅', '1厅', '2厅', '3厅'):
    df['户型'] = df['户型'].apply(lambda x, fragment=fragment: re.sub(fragment, '', x))
# Per district: mean unit price and number of listings (counted as the number
# of non-zero unit-price values).
qy_data = df.pivot_table(values=['单价'], index=['区'], aggfunc=[np.mean, np.count_nonzero])
qy_data.columns = ['平均单价', '套数']
qy_data = qy_data.round()

# Two panels side by side: bar chart of the average price on the left,
# line chart of the listing count on the right.
fig = plt.figure(num=1, figsize=(25, 16), dpi=100)
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)
ax1.bar(qy_data.index.values, qy_data['平均单价'].values)
ax2.plot(qy_data.index.values, qy_data['套数'].values)
plt.show()
# Bucket the listings by floor area, then compute the mean unit price and the
# number of listings (non-zero unit-price values) for every area band.
mf = [0, 40, 60, 70, 80, 90, 100, 120, 150, 1000]
mjf = ['40以下', '40-60', '61-70', '71-80', '81-90', '91-100', '101-120', '121-150', '150以上']
# The labels describe right-closed bands ('61-70' means above 60 up to and
# including 70), so the bins must be closed on the right; with right=False a
# 60 m2 flat was labelled '61-70'. include_lowest keeps a value equal to the
# first edge inside the first band.
df['面积分段'] = pd.cut(df['面积'], mf, labels=mjf, right=True, include_lowest=True)
mc_data = pd.pivot_table(df, values=['单价'], index=['面积分段'], aggfunc=[np.mean, np.count_nonzero])
mc_data = mc_data.round()
mc_data.columns = ['均价', '套数']

# Use a figure number of its own: asking pyplot for num=1 again returns the
# district figure whenever it is still open, and the new axes would be drawn
# on top of it.
fig2 = plt.figure(num=2, figsize=(25, 16), dpi=100)
ax1 = fig2.add_subplot(1, 2, 1)
ax2 = fig2.add_subplot(1, 2, 2)
# Left panel: average price per band; right panel: listing count per band.
ax1.bar(mc_data.index.values, mc_data['均价'].values)
ax2.plot(mc_data.index.values, mc_data['套数'].values)
plt.show()
# Per layout type: mean unit price and number of listings (counted as the
# number of non-zero unit-price values).
hx_data = pd.pivot_table(data=df, values=['单价'], index=['户型'], aggfunc=[np.mean, np.count_nonzero])
hx_data.columns = ['均价', '套数']
hx_data = hx_data.round()

# Use a figure number of its own: asking pyplot for num=1 again returns the
# earlier figure whenever it is still open, and the new axes would be drawn
# on top of it.
fig3 = plt.figure(num=3, figsize=(25, 16), dpi=100)
ax1 = fig3.add_subplot(1, 2, 1)
ax2 = fig3.add_subplot(1, 2, 2)
# Left panel: average price per layout; right panel: listing count per layout.
ax1.bar(hx_data.index.values, hx_data['均价'].values)
ax2.plot(hx_data.index.values, hx_data['套数'].values)
plt.show()
# Page that collects the pyecharts dashboards.
page = Page()

# Dashboard 1, left half: line chart of the average unit price per district.
line1 = Line()
line1.add_xaxis(qy_data.index.values.tolist())
line1.add_yaxis('各区均价', qy_data['平均单价'].values.tolist())
line1.set_global_opts(
    title_opts=opts.TitleOpts(title='各区平均单价', pos_left='20%'),
    legend_opts=opts.LegendOpts(pos_left='40%'),
)

# Dashboard 1, right half: bar chart of the listing count per district.
bar1 = Bar()
bar1.add_xaxis(qy_data.index.values.tolist())
bar1.add_yaxis('各区总套数', qy_data['套数'].values.tolist())
bar1.set_global_opts(
    title_opts=opts.TitleOpts(title='各区交易总套数', pos_right='15%'),
    legend_opts=opts.LegendOpts(pos_right='5%'),
)

# Place both charts in one grid; the grid options keep the line chart in the
# left part of the canvas and the bar chart in the right part.
grid1 = Grid()
grid1.add(line1, grid_opts=opts.GridOpts(pos_right='51%'))
grid1.add(bar1, grid_opts=opts.GridOpts(pos_left='57%'))
# Dashboard 2, left half: red line chart of the average unit price per area band.
line2 = Line()
line2.add_xaxis(mc_data.index.values.tolist())
line2.add_yaxis('各面积段均价', mc_data['均价'].values.tolist(), color='red')
line2.set_global_opts(
    title_opts=opts.TitleOpts(title='各面积段均价', pos_left='20%'),
    legend_opts=opts.LegendOpts(pos_left='40%'),
)

# Dashboard 2, right half: green bar chart of the listing count per area band.
bar2 = Bar()
bar2.add_xaxis(mc_data.index.values.tolist())
bar2.add_yaxis('各面积段套数', mc_data['套数'].values.tolist(), color='green')
bar2.set_global_opts(
    title_opts=opts.TitleOpts(title='各面积段交易总套数', pos_right='15%'),
    legend_opts=opts.LegendOpts(pos_right='5%'),
)

# Place both charts in one grid; the grid options keep the line chart in the
# left part of the canvas and the bar chart in the right part.
grid2 = Grid()
grid2.add(line2, grid_opts=opts.GridOpts(pos_right='51%'))
grid2.add(bar2, grid_opts=opts.GridOpts(pos_left='57%'))
# Dashboard 3, left half: red line chart of the average unit price per layout.
line3 = Line()
line3.add_xaxis(hx_data.index.values.tolist())
line3.add_yaxis('各户型均价', hx_data['均价'].values.tolist(), color='red')
line3.set_global_opts(
    title_opts=opts.TitleOpts(title='各户型均价', pos_left='20%'),
    legend_opts=opts.LegendOpts(pos_left='40%'),
)

# Dashboard 3, right half: green bar chart of the listing count per layout.
bar3 = Bar()
bar3.add_xaxis(hx_data.index.values.tolist())
bar3.add_yaxis('各户型套数', hx_data['套数'].values.tolist(), color='green')
bar3.set_global_opts(
    title_opts=opts.TitleOpts(title='各户型交易总套数', pos_right='15%'),
    legend_opts=opts.LegendOpts(pos_right='5%'),
)

# Place both charts in one grid; the grid options keep the line chart in the
# left part of the canvas and the bar chart in the right part.
grid3 = Grid()
grid3.add(line3, grid_opts=opts.GridOpts(pos_right='51%'))
grid3.add(bar3, grid_opts=opts.GridOpts(pos_left='57%'))
# Put all three dashboards on the page. The layout dashboard (grid3) was built
# but never added, so it could not appear in the output.
page.add(grid1, grid2, grid3)
# Write the page to disk; without a render call none of the charts are ever
# produced. With no argument pyecharts writes 'render.html' in the current
# working directory.
page.render()
# Figure: charts produced by the data analysis above.
# If you are interested, search WeChat official accounts for "数据处理和工作效率:Big_Data-" to read other articles.