数据可视化总是看着简单,但实操起来bug频出,究其原因,还是对数据处理和ggplot2的相关参数等掌握不精。
1. 数据准备
#加载包
library(dplyr)
library(readr)
library(ggplot2)
# Load the two input tables from the working directory.
athlete_events <- read_csv("athlete_events.csv")
noc_regions <- read_csv("noc_regions.csv")
head(athlete_events)
# Attach the region columns to every record by matching on the NOC code.
# This is an inner join, so records whose NOC has no match are dropped.
athletedata <- athlete_events %>%
  inner_join(noc_regions, by = "NOC")
head(athletedata)
2. 了解每个地区参赛人数的分布情况(条形图)
# Count the rows of athletedata per region and sort from largest to smallest.
region <- athletedata %>%
  group_by(region) %>%
  summarise(value = n()) %>%
  arrange(desc(value))
# Keep the 30 regions with the highest counts. head() simply returns fewer
# rows when fewer than 30 regions exist, whereas region[1:30, ] would index
# past the end of the table.
region30 <- head(region, 30)
# Horizontal bar chart of the counts: bars are ordered by value and coloured
# on a blue-to-red gradient; the redundant fill legend is hidden.
region_plot <- ggplot(region30, aes(x = reorder(region, value), y = value)) +
  theme_bw(base_family = "STKaiti") +
  geom_col(aes(fill = value), show.legend = FALSE) +
  coord_flip() +
  scale_fill_gradient(low = "blue", high = "red") +
  labs(x = "地区", y = "参赛人数", title = "每个地区参赛人数的分布情况") +
  theme(
    axis.text.x = element_text(vjust = 0.5),
    plot.title = element_text(hjust = 0.5)
  )
region_plot
条形图
3. 了解不同时间、不同地区、不同性别的参赛人数的分布情况(热力图)
# Names of the top-30 regions as a plain vector. `$` extracts the column
# itself, whereas region30[1:30, 1] would still be a one-column data frame.
index <- region30$region
# Count rows per year, region and sex for those regions only.
# The assignment must start on the same line as the target name: a line that
# begins with `<-` is a parse error in R.
region30_merge <- athletedata %>%
  filter(region %in% index) %>%
  group_by(Year, region, Sex) %>%
  summarise(value = n()) %>%
  ungroup() # drop the leftover Year/region grouping
# Heat map: one tile per (Year, region), filled by count, one panel per sex.
merge_plot <- ggplot(region30_merge, aes(x = Year, y = region)) +
  theme_bw(base_family = "STKaiti") +
  geom_tile(aes(fill = value), color = "white") +
  scale_fill_gradientn(colors = c("blue", "red")) +
  # One axis break for every year that actually occurs in the data.
  scale_x_continuous(breaks = unique(region30_merge$Year)) +
  theme(axis.text.x = element_text(hjust = 0.5, angle = 90)) +
  facet_wrap(~Sex, nrow = 2)
# Plot size for notebook front ends that honour the repr.plot.* options.
# NOTE(review): this changes a global option and is not restored afterwards.
options(repr.plot.width = 10, repr.plot.height = 8)
merge_plot
热力图
4. 将USA、Germany、France、UK、Russia、China这6个地区每年奥运会奖牌的数量可视化(折线图)
# The six regions to compare.
index <- c("USA", "Germany", "France", "UK", "Russia", "China")
# Count medal-winning rows per region and year.
region6 <- athletedata %>%
  filter(region %in% index) %>% # keep only the six regions
  # Keep only rows with a medal. read_csv() parses the text "NA" as a real
  # missing value, so test with is.na() rather than comparing to the string.
  filter(!is.na(Medal)) %>%
  group_by(region, Year) %>%
  summarise(value = n()) %>%
  ungroup() # drop the leftover region grouping
# Line chart of the yearly counts, one panel per region.
region6_plot <- ggplot(region6, aes(x = Year, y = value)) +
  theme_bw(base_family = "STKaiti") +
  geom_line() +
  facet_wrap(~region, nrow = 3)
region6_plot
折线图
5. 动态展示不同地区每年的奖牌获得情况
library(gganimate)
# Names of the top-30 regions as a plain vector.
index <- region30$region
# Count medal-winning rows per region and year for those regions.
region30_medal <- athletedata %>%
  filter(region %in% index) %>%
  # read_csv() parses the text "NA" as a real missing value, so test with
  # is.na() rather than comparing to the string "NA".
  filter(!is.na(Medal)) %>%
  group_by(region, Year) %>%
  summarise(value = n()) %>%
  ungroup() %>%
  # Store Year as an integer before it is used as the animation time variable.
  mutate(Year = as.integer(Year))
# Animated bar chart: one bar per region, stepping through the years, with the
# current frame time shown in the title.
region30_plot <- ggplot(region30_medal, aes(x = region, y = value)) +
  theme_bw() +
  geom_col(show.legend = FALSE) +
  theme(axis.text.x = element_text(hjust = 0.5, angle = 90)) +
  transition_time(Year) +
  labs(title = "Year: {frame_time}")
region30_plot
6. 对地区、运动员数量、性别、奖牌数量这些变量进行可视化分析(树图)
library(treemap)
# Medal count per region and sex: rows that carry a medal.
# read_csv() parses the text "NA" as a real missing value, so test with
# is.na() rather than comparing to the string "NA".
medal <- athletedata %>%
  filter(!is.na(Medal)) %>%
  group_by(region, Sex) %>%
  summarise(medalnum = n()) %>%
  ungroup()
# Athlete count per region and sex: all rows.
athelete <- athletedata %>%
  group_by(region, Sex) %>%
  summarise(atheletenum = n()) %>%
  ungroup()
# Combine both counts. This is an inner join, so region/sex pairs without any
# medal row do not appear in the result.
data <- inner_join(medal, athelete, by = c("region", "Sex"))
# Treemap nested by sex then region: tile area is the athlete count and tile
# colour is the medal count.
data_plot <- treemap(
  data,
  index = c("Sex", "region"),
  vSize = "atheletenum",
  vColor = "medalnum",
  type = "value",
  palette = c("blue", "red"),
  title = "不同性别下不同地区运动员的数量",
  title.legend = "奖牌数量",
  fontfamily.title = "STKaiti",
  fontfamily.legend = "STKaiti"
)
data_plot
树图
日常废话:很多参数的设置都可以理解为图层的叠加,因此只要心中有大致的图形轮廓,围绕这个轮廓进行添枝加叶,就能更加灵活地应用各种参数。可能每个新手都会同我一样,面对这些问题时措手不及,甚至产生畏惧心理,觉得自己可能就不是学代码的这块料,自己就是不行,学啥都慢,等等。有这些想法都是正常的,因为很多知识都是我们需要用到时才会去了解、去掌握,为了进步、为了更加优秀才会深入拓展,所以一定要给自己运用这些知识的机会,在实践中你才能享受到解决问题的快乐!!!