一、百度疫情
- 国内疫情汇总
- 分省市疫情统计
- 分省市新增疫情趋势
二、爬取内容
- 国内疫情汇总,截至时间的新增、累计汇总数据
- 分省市疫情统计,截至时间的新增、累计数据
- 分省市新增疫情趋势,包含二级地区,历史90天的新增(新增确诊+无症状感染)数据
三、R语言实战
# @项目名称:爬取百度疫情数据
# @项目时间:2022-4-16
# @项目作者:斑专先生
# 加载包
library(rvest)
library(dplyr)
library(jsonlite)
library(stringr)
library(rlist)
# Baidu epidemic landing page. The data is carried as JSON text inside the
# page's `#captain-config` node, so the node text is extracted and parsed
# into an R list.
bdyq_url <- 'https://voice.baidu.com/act/newpneumonia/newpneumonia/?from=osari_pc_3'
list_content <- fromJSON(
  html_text(
    html_nodes(read_html(bdyq_url), '#captain-config')
  )
)
# Nationwide summary: keep the twelve headline counters, append the report
# title and last-update time, then relabel every column for presentation.
summaryDataIn <- setNames(
  mutate(
    select(
      list_content$component$summaryDataIn,
      confirmedRelative, unOverseasInputNewAdd, overseasInputRelative, asymptomaticRelative,
      curConfirm, curLocalConfirm, curOverseasInput, asymptomatic,
      confirmed, overseasInput, cured, died
    ),
    title = list_content$component$title,
    mapLastUpdatedTime = list_content$component$mapLastUpdatedTime
  ),
  # Labels are positional: they follow the select() order above, with the
  # two mutate() columns last.
  c('新增确诊', '新增本土', '新增境外', '新增无症状',
    '现有确诊', '现有本土', '现有境外', '现有无症状',
    '累计确诊', '累计境外', '累计治愈', '累计死亡', '报告名称', '更新时间')
)
# Per-area case statistics for the domestic report.
# `caseList` is a list whose first element holds the area table; nested
# fields inside it are unpacked later, where they are used.
# (An earlier revision flattened it here with list.cbind() + as.data.frame();
# that step was dropped as a correction.)
pc_list <- list_content$component$caseList %>% .[[1]]
# Build the second-level (city) case table for the i-th first-level area.
#
# Arguments:
#   i  Index into the global `pc_list` (its `subList` and `area` fields).
#
# Returns a data frame with columns city, confirmedRelative, confirmed,
# crued, died, level (always 2) and kname (the parent area's name). When the
# area has no sub-areas a single placeholder row of NAs is returned, so the
# parent area is still represented once the per-area results are stacked.
# The spelling `crued` follows the column name selected from the data.
get_city_df <- function(i) {
  sub_areas <- pc_list$subList[[i]]
  parent_name <- pc_list$area[[i]]

  # NROW() rather than nrow(): nrow() yields NULL for NULL or an empty list,
  # which would make the `if` condition zero-length and raise an error.
  if (NROW(sub_areas) == 0) {
    # Typed NAs keep the placeholder's column types in line with the
    # non-empty branch (character name, integer counters).
    return(data.frame(
      city = NA_character_,
      confirmedRelative = NA_integer_,
      confirmed = NA_integer_,
      crued = NA_integer_,
      died = NA_integer_,
      level = 2,
      kname = parent_name,
      stringsAsFactors = FALSE
    ))
  }

  sub_areas %>%
    list.cbind() %>%
    as.data.frame() %>%
    select(city, confirmedRelative, confirmed, crued, died) %>%
    # Every column except the first (city) is converted to integer.
    mutate_at(.vars = -1, as.integer) %>%
    mutate(level = 2, kname = parent_name)
}
# First-level (area) table: pick the counters, flatten each column with
# unlist(), convert everything but the area name to integer, and tag the
# rows as level 1 with themselves as parent.
area_df <- pc_list %>%
  select(area, confirmedRelative, confirmed, crued, died) %>%
  lapply(unlist) %>%
  as.data.frame() %>%
  mutate_at(.vars = -1, as.integer) %>%
  mutate(level = 1, kname = area)

# Second-level (city) table: one get_city_df() result per area, stacked.
# seq_len() is used instead of 1:nrow() so that an empty area table does not
# produce the indices 1 and 0.
city_df <- lapply(seq_len(nrow(area_df)), get_city_df) %>%
  list.rbind() %>%
  rename(area = city)

# Stack both levels and relabel the columns for presentation. The labels are
# positional: area, new, cumulative, cured, died, level, parent area.
pc_df <- rbind.data.frame(area_df, city_df)
names(pc_df) <- c('地区', '新增', '累计', '治愈', '死亡', '层级', '地区所属')
# New-case trend for one first-level area.
#
# Arguments:
#   key_name  Area name passed to the endpoint's `area` parameter,
#             e.g. '浙江'.
#
# Returns a data frame with columns area, unOverseasInputNewAdd,
# asymptomaticRelative and updateDate. If the request fails or the response
# status is not 0, a single row with NA values is returned instead, so one
# bad area does not abort a batch run.
get_trend_area <- function(key_name) {
  xt_url <- 'https://voice.baidu.com/newpneumonia/getv2?from=mola-virus&stage=publish&target='
  # Percent-encode the area name: it is usually non-ASCII, and an unencoded
  # query value is not a valid URL on every platform/locale.
  html_url <- str_c(
    xt_url, 'trend&isCaseIn=1&area=',
    utils::URLencode(enc2utf8(key_name), reserved = TRUE)
  )

  json_data <- tryCatch(
    fromJSON(html_url),
    error = function(e) {
      warning('Trend request failed for ', key_name, ': ',
              conditionMessage(e), call. = FALSE)
      NULL
    }
  )

  # isTRUE() also covers a failed request (NULL) and a missing status field,
  # either of which would otherwise give `if` a zero-length condition.
  if (!isTRUE(json_data$status == 0)) {
    return(data.frame(area = key_name, unOverseasInputNewAdd = NA,
                      asymptomaticRelative = NA, updateDate = NA))
  }

  # The series are bound column-wise and picked by position (5th and 6th).
  # NOTE(review): presumably these are the local-new and asymptomatic series;
  # confirm against json_data$data$trend$list[[1]]$name.
  df <- json_data$data$trend$list[[1]]$data %>%
    list.cbind() %>%
    data.frame() %>%
    select(X5, X6) %>%
    rename(unOverseasInputNewAdd = X5, asymptomaticRelative = X6)
  updateDate <- json_data$data$trend$updateDate[[1]]
  cbind(area = key_name, df, updateDate)
}
# New-case trend for one second-level area (city).
#
# Arguments:
#   key_name  Key passed to the endpoint's `area` parameter, in the form
#             '<parent area>-<city>', e.g. '浙江-杭州'.
#
# Returns a data frame with columns area, asymptomaticRelative,
# unOverseasInputNewAdd and updateDate. If the request fails or the response
# status is not 0, a single row with NA values is returned instead, so one
# bad city does not abort a batch run.
get_trend_city <- function(key_name) {
  xt_url <- 'https://voice.baidu.com/newpneumonia/getv2?from=mola-virus&stage=publish&target='
  # Percent-encode the key: it is usually non-ASCII, and an unencoded query
  # value is not a valid URL on every platform/locale.
  html_url <- str_c(
    xt_url, 'trendCity&area=',
    utils::URLencode(enc2utf8(key_name), reserved = TRUE)
  )

  json_data <- tryCatch(
    fromJSON(html_url),
    error = function(e) {
      warning('Trend request failed for ', key_name, ': ',
              conditionMessage(e), call. = FALSE)
      NULL
    }
  )

  # isTRUE() also covers a failed request (NULL) and a missing status field,
  # either of which would otherwise give `if` a zero-length condition.
  if (!isTRUE(json_data$status == 0)) {
    return(data.frame(area = key_name, unOverseasInputNewAdd = NA,
                      asymptomaticRelative = NA, updateDate = NA))
  }

  # The series are bound column-wise and picked by position (1st and 2nd);
  # note the mapping is the reverse of the column order.
  # NOTE(review): presumably X1 is the asymptomatic series and X2 the
  # local-new series; confirm against json_data$data$trend$list[[1]]$name.
  df <- json_data$data$trend$list[[1]]$data %>%
    list.cbind() %>%
    data.frame() %>%
    select(X1, X2) %>%
    rename(unOverseasInputNewAdd = X2, asymptomaticRelative = X1)
  updateDate <- json_data$data$trend$updateDate[[1]]
  cbind(area = key_name, df, updateDate)
}
# Trends for every first-level area (the 34 province-level areas);
# a single call looks like get_trend_area('浙江').
area_name <- area_df$area
area_trend_data <- lapply(area_name, get_trend_area) %>%
  list.rbind() %>%
  data.frame()

# Trends for every city, keyed as '<parent area>-<city>';
# a single call looks like get_trend_city('浙江-杭州').
# Placeholder rows (NA city name) are dropped before building the keys.
city_name <- city_df %>%
  select(area, kname) %>%
  filter(!is.na(area)) %>%
  mutate(key_name = str_c(kname, area, sep = '-'))
# Kept under its own name: assigning this to `area_trend_data` would discard
# the area-level trends collected just above.
city_trend_data <- lapply(city_name$key_name, get_trend_city) %>%
  list.rbind() %>%
  data.frame()