使用Python爬取腾讯新闻疫情数据,并使用pyecharts进行可视化,绘制国内、国际现有确诊人数地图,再用matplotlib绘制“方寸间”图。
写在前面:这个已经不是什么新鲜的话题了,所以请大佬勿喷
导入相关模块
import time
import json
import requests
from datetime import datetime
import pandas as pd
import numpy as np
1. 疫情数据抓取
通过腾讯新闻公布的数据进行爬取
网址:https://news.qq.com/zt2020/page/feiyan.htm#/
对于静态网页,我们只需要把网页地址栏中的url传到get请求中,就可以轻松地获取到网页的数据。对于动态网页,抓取的关键是先分析网页数据获取和跳转的逻辑,再去写代码。
在页面上右击选择“检查”,切换到Network面板,按Ctrl+R刷新页面,即可看到页面发出的数据请求。
# 定义抓取数据函数:https://beishan.blog.csdn.net/
def Domestic():
    """Fetch the domestic epidemic data set from the Tencent News endpoint.

    Returns:
        The object decoded from the JSON string stored under the ``data``
        key of the response envelope.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status.
    """
    url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5'
    # A timeout keeps the call from hanging forever on a stalled connection.
    response = requests.get(url=url, timeout=10)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    # The envelope's 'data' field is itself a JSON-encoded string.
    return json.loads(response.json()['data'])
def Oversea():
    """Fetch the overseas epidemic data set from the Tencent News endpoint.

    Returns:
        The object decoded from the JSON string stored under the ``data``
        key of the response envelope.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status.
    """
    url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_foreign'
    # A timeout keeps the call from hanging forever on a stalled connection.
    response = requests.get(url=url, timeout=10)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    # The envelope's 'data' field is itself a JSON-encoded string.
    return json.loads(response.json()['data'])
# Download both data sets, domestic first, then list their top-level keys
# (one line per data set).
domestic = Domestic()
oversea = Oversea()
print(domestic.keys(), oversea.keys(), sep="\n")
dict_keys(['lastUpdateTime', 'chinaTotal', 'chinaAdd', 'isShowAdd', 'showAddSwitch', 'areaTree'])
dict_keys(['foreignList', 'globalStatis', 'globalDailyHistory', 'importStatis', 'countryAddConfirmRankList', 'countryConfirmWeekCompareRankList', 'continentStatis'])
2. 初步分析
提取各地区数据明细
# Extract the per-region detail records from the domestic data set
areaTree = domestic['areaTree']
# Display the value to inspect and analyse the concrete data
areaTree
提取国外地区数据明细
# Extract the per-country detail records from the overseas data set
foreignList = oversea['foreignList']
# Display the value to inspect and analyse the concrete data
foreignList
这样就可以看到数据在JSON中的存储结构了。
3. 数据处理
3.1 国内各省疫情数据提取
# Address: https://beishan.blog.csdn.net/
# Build one row per province from the children of the first areaTree entry.
# Current confirmed cases are derived as: confirmed - healed - dead.
china_list = [
    {
        'province': entry['name'],
        'nowConfirm': (entry['total']['confirm']
                       - entry['total']['heal']
                       - entry['total']['dead']),
    }
    for entry in areaTree[0]['children']
]
china_data = pd.DataFrame(china_list)
# Persist the table as an Excel file (no index column).
china_data.to_excel("国内疫情.xlsx", index=False)
china_data.head()
3.2 国际疫情数据提取
# Build one row per foreign country, keeping only the fields used later on.
# The key order below fixes the column order of the resulting DataFrame.
world_list = [
    {
        'country': entry['name'],
        'nowConfirm': entry['nowConfirm'],
        'confirm': entry['confirm'],
        'dead': entry['dead'],
        'heal': entry['heal'],
    }
    for entry in foreignList
]
world_data = pd.DataFrame(world_list)
# Persist the table as an Excel file (no index column).
world_data.to_excel("国外疫情.xlsx", index=False)
world_data.head()
3.3 数据整合
将国内数据和海外数据合并
查询数据中是否含有中国疫情数据
# Check whether the table already has a row for China ("中国").
is_china = world_data['country'] == "中国"
world_data.loc[is_china]
从areaTree中提取中国的汇总数据,并将其作为新的一行添加至world_data
# Country-level totals are read from the 'total' entry of areaTree[0].
china_total = areaTree[0]['total']
confirm = china_total['confirm']  # cumulative confirmed cases
heal = china_total['heal']  # cumulative healed cases
dead = china_total['dead']  # cumulative deaths
nowConfirm = confirm - heal - dead  # current confirmed cases

# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row
# frame appends the same row and renumbers the index.
china_row = pd.DataFrame([{
    'country': "中国",
    'nowConfirm': nowConfirm,
    'confirm': confirm,
    'heal': heal,
    'dead': dead,
}])
world_data = pd.concat([world_data, china_row], ignore_index=True)
再次查询数据中是否含有中国疫情数据
# Look up the row for China ("中国") in the table.
is_china = world_data['country'] == "中国"
world_data.loc[is_china]
4. 可视化展示
4.1 国内疫情态势可视化
导入pyecharts相关库
import pyecharts.options as opts
from pyecharts.charts import Map
from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB
国内各地区现有确诊人数地图
# Choropleth of current confirmed cases per province.
m = Map()
m.add(
    "",
    # [province, count] pairs; tolist() yields plain Python scalars.
    [
        list(pair)
        for pair in zip(china_data["province"].tolist(),
                        china_data["nowConfirm"].tolist())
    ],
    maptype="china",
    is_map_symbol_show=False,
)
m.set_global_opts(
    title_opts=opts.TitleOpts(title="COVID-19中国现有地区现有确诊人数地图"),
    visualmap_opts=opts.VisualMapOpts(
        is_piecewise=True,
        # Buckets are listed from highest to lowest and do not overlap.
        pieces=[
            # No "max" given: the bucket is open-ended upwards.
            {"min": 5000, "label": '>5000', "color": "#893448"},
            {"min": 1000, "max": 4999, "label": '1000-4999', "color": "#ff585e"},
            # Label now matches the 500-999 range (it used to read '500-1000').
            {"min": 500, "max": 999, "label": '500-999', "color": "#fb8146"},
            {"min": 101, "max": 499, "label": '101-499', "color": "#ffA500"},
            {"min": 10, "max": 100, "label": '10-100', "color": "#ffb248"},
            {"min": 1, "max": 9, "label": '1-9', "color": "#fff2d1"},
            # Upper bound is 0, not 1, so the value 1 belongs only to the
            # '1-9' bucket above.
            {"max": 0, "label": '0', "color": "#ffffff"},
        ],
    ),
)
m.render_notebook()
4.2 国际疫情态势可视化
将各国的中文名称转换成英文名称,使用pandas中的merge方法
pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None)
how:取值为 'left'、'right'、'outer'、'inner' 之一,默认为 'inner'。inner 取两表键的交集,outer 取并集。
# Load the Chinese/English country-name lookup table.
world_name = pd.read_excel("国家中英文对照表.xlsx")
# Inner-join on the Chinese name, so only countries present in both
# tables are kept.
world_data_t = world_data.merge(
    world_name,
    how="inner",
    left_on="country",
    right_on="中文",
)
world_data_t
169 rows × 7 columns
世界各国现有确诊人数地图
# Choropleth of current confirmed cases per country, keyed by English name.
m2 = Map()
m2.add(
    "",
    # [country, count] pairs; tolist() yields plain Python scalars.
    [
        list(pair)
        for pair in zip(world_data_t["英文"].tolist(),
                        world_data_t["nowConfirm"].tolist())
    ],
    maptype="world",
    is_map_symbol_show=False,
)
m2.set_global_opts(
    title_opts=opts.TitleOpts(title="COVID-19世界各国现有确诊人数地图"),
    visualmap_opts=opts.VisualMapOpts(
        is_piecewise=True,
        # Buckets are listed from highest to lowest and do not overlap.
        pieces=[
            # No "max" given: the bucket is open-ended upwards.
            {"min": 5000, "label": '>5000', "color": "#893448"},
            {"min": 1000, "max": 4999, "label": '1000-4999', "color": "#ff585e"},
            # Label now matches the 500-999 range (it used to read '500-1000').
            {"min": 500, "max": 999, "label": '500-999', "color": "#fb8146"},
            {"min": 101, "max": 499, "label": '101-499', "color": "#ffA500"},
            {"min": 10, "max": 100, "label": '10-100', "color": "#ffb248"},
            {"min": 0, "max": 9, "label": '0-9', "color": "#fff2d1"},
        ],
    ),
)
# Hide the country-name labels on the map.
m2.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
m2.render_notebook()
4.3 国内疫情方寸间
单独取出中国疫情数据
# 单独取出中国疫情数据
# Select the China ("中国") row and renumber the index from 0.
China_data = (
    world_data
    .loc[world_data['country'] == "中国"]
    .reset_index(drop=True)
)
China_data
提取China_data的累计确诊、累计治愈与累计死亡数据
# 提取China_data的累计确诊、累计治愈与累计死亡数据
# data.at[n,'name']代表根据行索引和列名,获取对应元素的值
# Read the cumulative confirmed / healed / dead scalars from row label 0.
w_confirm, w_heal, w_dead = (
    China_data.at[0, column] for column in ('confirm', 'heal', 'dead')
)
导入matplotlib相关库
import matplotlib.pyplot as plt
import matplotlib.patches as patches
构建国内疫情方寸间图示
# -*- coding: utf-8 -*-
%matplotlib inline
fig1 = plt.figure()
ax1 = fig1.add_subplot(111, aspect='equal', facecolor='#fafaf0')
ax1.set_xlim(-w_confirm / 2, w_confirm / 2)
ax1.set_ylim(-w_confirm / 2, w_confirm / 2)
ax1.spines['top'].set_color('none')
ax1.spines['right'].set_color('none')
ax1.spines['bottom'].set_position(('data', 0))
ax1.spines['left'].set_position(('data', 0))
ax1.set_xticks([])
ax1.set_yticks([])
p0 = patches.Rectangle((-w_confirm / 2, -w_confirm / 2),
width=w_confirm,
height=w_confirm,
facecolor='#29648c',
label='confirm')
p1 = patches.Rectangle((-w_heal / 2, -w_heal / 2),
width=w_heal,
height=w_heal,
facecolor='#69c864',
label='heal')
p2 = patches.Rectangle((-w_dead / 2, -w_dead / 2),
width=w_dead,
height=w_dead,
facecolor='#000000',
label='dead')
plt.gca().add_patch(p0)
plt.gca().add_patch(p1)
plt.gca().add_patch(p2)
plt.title('COVID-19 Square - China', fontdict={'size': 20})
plt.legend(loc='best')
plt.show()
4.4 国际疫情方寸间
重新排序数据
# Order countries by cumulative confirmed cases, largest first, and
# renumber the index from 0.
world_data = (
    world_data
    .sort_values("confirm", ascending=False)
    .reset_index(drop=True)
)
world_data
162 rows × 5 columns
构建国际疫情方寸间图示
# Grid of nested-squares charts for the first rows of world_data (up to 20),
# one subplot per country.
# A font with CJK glyphs is selected so the Chinese titles render.
plt.rcParams['font.sans-serif'] = [u'SimHei']
plt.rcParams['axes.unicode_minus'] = False

top_n = 20
n_cols = 4
# Integer division: add_subplot needs integer row/column counts, and
# `20 / 4` is the float 5.0 in Python 3.
n_rows = top_n // n_cols

fig1 = plt.figure(figsize=(25, 25))
# Never index past the end of the table if it has fewer than top_n rows.
for a in range(min(top_n, len(world_data))):
    w_confirm = world_data.at[a, 'confirm']
    w_heal = world_data.at[a, 'heal']
    w_dead = world_data.at[a, 'dead']
    ax1 = fig1.add_subplot(n_rows,
                           n_cols,
                           a + 1,
                           aspect='equal',
                           facecolor='#fafaf0')
    ax1.set_xlim(-w_confirm / 2, w_confirm / 2)
    ax1.set_ylim(-w_confirm / 2, w_confirm / 2)
    # Hide the top/right spines and move the other two through the origin.
    ax1.spines['top'].set_color('none')
    ax1.spines['right'].set_color('none')
    ax1.spines['bottom'].set_position(('data', 0))
    ax1.spines['left'].set_position(('data', 0))
    ax1.set_xticks([])
    ax1.set_yticks([])
    p0 = patches.Rectangle((-w_confirm / 2, -w_confirm / 2),
                           width=w_confirm,
                           height=w_confirm,
                           # Opacity grows with the case count; it is capped
                           # at 1.0 because alpha must stay within 0-1.
                           alpha=min(1.0, w_confirm / 90000),
                           facecolor='#29648c',
                           label='confirm')
    p1 = patches.Rectangle((-w_heal / 2, -w_heal / 2),
                           width=w_heal,
                           height=w_heal,
                           alpha=1,
                           facecolor='#69c864',
                           label='heal')
    p2 = patches.Rectangle((-w_dead / 2, -w_dead / 2),
                           width=w_dead,
                           height=w_dead,
                           alpha=1,
                           facecolor='black',
                           label='dead')
    # Add the squares largest first so the smaller ones sit on top.
    ax1.add_patch(p0)
    ax1.add_patch(p1)
    ax1.add_patch(p2)
    ax1.set_title(world_data.at[a, 'country'], fontdict={'size': 20})
    ax1.legend(loc='best')
plt.show()
这样就可以清楚地看到各个国家新冠确诊人数、治愈和死亡人数的关系了
作者:北山啦
原文链接:https://blog.csdn.net/qq_45176548/article/details/115017160