写在前面
还是想要放弃ggpubr做图了,主要问题是这东西和ggplot2体系的不兼容。因为ggpubr中好多变量其实是写死的,当你想要调整某个细节的时候就变得非常蛋疼了。最后不如统一逻辑,使用ggplot2吧。
今天无意中翻开了刚开始学习R语言时候折腾的柱状图,发现现在画图逻辑跟过去有了很大的不同。不如就再折腾一个柱状图,算是记录现在画图的逻辑吧。想想以后回来看这些代码又发现很多不同,也是一个有趣的事情。那就,开始吧。
折腾开始
先展示结果:
本次代码使用到的包
library(ggthemes)
library(ggplot2)
library(tidyverse)
library(ggsignif)
library(agricolae)
library(stringr)
library(palmerpenguins)
library(cowplot)
library(ggpubr)
#没装记得用install.packages()安装
数据集
最近迷上了TidyTuesday系列视频,纯DataScience出身玩大数据还是纯熟。最近看到他们推荐的一个关于企鹅的数据集,刚好鸢尾花数据集实在是用的太多了,现在换个新的数据看看。
数据解释
具体的原始内容可以跳转https://allisonhorst.github.io/palmerpenguins/articles/intro.html
palmerpenguins数据集包含8个变量,分别为:
品种:阿德利企鹅,南极企鹅和巴布亚企鹅(可惜没有帝企鹅)
岛屿:Torgersen, Dream, Biscoe
bill_length_mm:喙长
bill_depth_mm:喙宽
flipper_length_mm:翅膀(鳍?)长
body_mass_g:体重
性别
年
#remotes::install_github("allisonhorst/palmerpenguins")
rm(list = ls())
palmerpenguins::penguins
glimpse(penguins)
#> Rows: 344
#> Columns: 8
#> $ species <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Ade…
#> $ island <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgers…
#> $ bill_length_mm <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1,…
#> $ bill_depth_mm <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1,…
#> $ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 18…
#> $ body_mass_g <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475,…
#> $ sex <fct> male, female, female, NA, female, male, female, mal…
#> $ year <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 200…
数据处理
数据很多,这里我想看的是性别对三个品种企鹅的体重有没有影响。
首先提取需要的数据:
penguins %>%
select(species, sex, body_mass_g) %>% ##选择species,sex和body_mass_g三列
drop_na(body_mass_g, sex) %>% ##剔除空值
group_by(species, sex) %>% #按照species和sex分组
summarise(mean_mass_g = mean(body_mass_g), sd_mass_g = sd(body_mass_g)) #分别求得各品种和性别企鹅的平均体重和体重标准差
# A tibble: 6 x 4
# Groups: species [3]
species sex mean_mass_g sd_mass_g
<fct> <fct> <dbl> <dbl>
1 Adelie female 3369. 269.
2 Adelie male 4043. 347.
3 Chinstrap female 3527. 285.
4 Chinstrap male 3939. 362.
5 Gentoo female 4680. 282.
6 Gentoo male 5485. 313.
是不是很简单?这里一个小提示,可以复制前面几行看每次操作的结果。例如我想看看第一步干了什么只需要选择前两行(不要复制最后的通道符),然后Ctrl+回车即可。
> penguins %>%
+ select(species, sex, body_mass_g)
# A tibble: 344 x 3
species sex body_mass_g
<fct> <fct> <int>
1 Adelie male 3750
2 Adelie female 3800
3 Adelie female 3250
4 Adelie NA NA
5 Adelie female 3450
6 Adelie male 3650
7 Adelie female 3625
8 Adelie male 4675
9 Adelie NA 3475
10 Adelie NA 4250
# … with 334 more rows
ggplot做图
获得了数据那么就是作图环节,这里同样可以用通道符导入ggplot2
penguins %>%
select(species, sex, body_mass_g) %>%
drop_na(body_mass_g, sex) %>%
group_by(species, sex) %>%
mutate(mean_mass_g = mean(body_mass_g),
sd_mass_g = sd(body_mass_g)) %>%
ggplot(aes(x = species, y = mean_mass_g, fill = sex)) +
geom_bar(stat = 'identity',
position= position_dodge(width = 0.6),
width = .5)
缺少了误差线,补上,再做亿点点美化和该有的注释……
##调用两个配色,方便所有人更好的区分颜色
cbp1 <- c("#999999", "#E69F00", "#56B4E9", "#009E73",
"#F0E442", "#0072B2", "#D55E00", "#CC79A7")
cbp2 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
"#F0E442", "#0072B2", "#D55E00", "#CC79A7")
penguins %>%
select(species, sex, body_mass_g) %>%
drop_na(body_mass_g, sex) %>%
mutate(body_mass_g = body_mass_g/1000) %>%
group_by(species, sex) %>%
summarise(mean_mass_g = mean(body_mass_g),
sd_mass_g = sd(body_mass_g)
) %>%
ggplot(aes(x = species, y = mean_mass_g, fill = sex)) + ##设定x轴为品种,y轴为体重均值,用颜色区分性别。
geom_bar(stat = 'identity', #声明数据已经是调整好的,不需要做count处理
position = position_dodge(width = 0.6), ##设定subgroup两个bar之间的距离
width = .5 ##设定bar的宽度
) +
scale_y_continuous(limits = c(0, NA), ##强制y轴从0点开始计数
expand = expansion(mult = c(0,.1)) ##调大一点y轴范围,否则bar容易显示不全
) +
geom_errorbar(aes(ymin = mean_mass_g - sd_mass_g, ymax = mean_mass_g + sd_mass_g), ##设定误差线的上下限 (mean ± sd)
position = position_dodge(width = 0.6), ##位置要跟之前的bar对应上
width = .2 ##宽度设定一个你喜欢的数字
) +
scale_fill_manual(values = cbp1) + ##设定bar配色
theme_few() + ##去除背景框线
theme(legend.position = c(.2,.85)) + ##设定legend位置
labs(fill = "Sex",
x = "Penguin Species",
y = "Penguin biomass (1000 g)"
) ##设定x, y和legend的名字
差不多了,但是看起来还挺正经的。但正常的文章还需要做统计分析标记好显著性。普通的ANOVA字母标记用geom_text即可,而星号标记则需要ggsignif。
因为是演示这里就随便做个Bonferroni法ANOVA吧,这里没做方差齐性和正态分布检验(注意!)。方法不唯一,选个你需要的做。
penguins %>%
select(species, sex, body_mass_g) %>%
drop_na(body_mass_g, sex) %>%
mutate(body_mass_g = body_mass_g/1000) %>% ##跟之前一样的数据提取流程
mutate(group = paste0(species,' ', sex)) %>% ##新建一个列把species和sex两列合起来放入
aov(body_mass_g ~ group, .) %>% ##以group列做单因素多重检验
LSD.test('group', p.adj = 'bonferroni') %>% ##事后检验(Post hoc tests )
.$groups%>% ##提取需要的部分
mutate(species = word(rownames(.), 1), ##重新分开group变成species和sex两列,中间以空格隔开所以直接当成单词分词即可
sex = word(rownames(.), 2)
) %>%
select(species, sex, body_mass_g, signif = groups) %>% ##更换顺序,groups改名signif
arrange(species,sex) -> aov_res ##按照species和sex重新排序
> aov_res
species sex body_mass_g signif
1 Adelie female 3.368836 d
2 Adelie male 4.043493 c
3 Chinstrap female 3.527206 d
4 Chinstrap male 3.938971 c
5 Gentoo female 4.679741 b
6 Gentoo male 5.484836 a
至于signif反而要麻烦点,因为这里需要你传入x轴,y轴位置信息和星标。
##获取品种的个数
penguins %>%
select(species) %>%
unique() %>%
count() %>%
.$n -> n
##根据品种数量构建x轴的起始点和终止点。这里多说一句,x轴上有三个小ticks对应1,2,3三个值。
xmin <- rep(0.8: (n+0.8-1), by = 1)
xmax <- xmin +.4
##各品种间female和male的t检验
comp_res <- ggpubr::compare_means(body_mass_g ~ sex,
data = penguins,
method = "t.test",
group.by = "species",
p.adjust.method = "bonferroni"
)
# A tibble: 3 x 9
species .y. group1 group2 p p.adj p.format
<fct> <chr> <chr> <chr> <dbl> <dbl> <chr>
1 Adelie body… female male 6.40e-26 1.90e-25 < 2e-16
2 Gentoo body… female male 1.87e-28 5.60e-28 < 2e-16
3 Chinst… body… female male 2.26e- 6 6.80e- 6 2.3e-06
# … with 2 more variables: p.signif <chr>, method <chr>
注意:这里用了ggpubr::compare_means做检验。还是那句话,方法不唯一,如果你有自己的熟悉的方法,一定是你的更好。这里只是演示。
最后把所有命令放在一起,保存再打开运行一遍……嗯没问题了
library(ggthemes)
library(ggplot2)
library(tidyverse)
library(ggsignif)
library(agricolae)
library(stringr)
library(palmerpenguins)
library(cowplot)
library(ggpubr)
comp_res <- ggpubr::compare_means(body_mass_g ~ sex,
data = penguins,
method = "t.test",
group.by = "species",
p.adjust.method = "bonferroni"
)
penguins %>%
select(species) %>%
unique() %>%
count() %>%
.$n -> n
xmin <- rep(0.8: (n+0.8-1), by = 1)
xmax <- xmin +.4
penguins %>%
select(species, sex, body_mass_g) %>%
drop_na(body_mass_g, sex) %>%
mutate(body_mass_g = body_mass_g/1000) %>%
mutate(group = paste0(species,' ', sex)) %>%
aov(body_mass_g ~ group, .) %>%
LSD.test('group', p.adj = 'bonferroni') %>%
.$groups%>%
mutate(species = word(rownames(.), 1),
sex = word(rownames(.), 2)
) %>%
select(species, sex, body_mass_g, signif = groups) %>%
arrange(species,sex) -> aov_res
penguins %>%
select(species, sex, body_mass_g) %>%
drop_na(body_mass_g, sex) %>%
mutate(body_mass_g = body_mass_g/1000) %>%
group_by(species, sex) %>%
summarise(mean_mass_g = mean(body_mass_g), sd_mass_g = sd(body_mass_g)) %>%
summarise(value = mean_mass_g + sd_mass_g) %>%
group_by(species) %>%
summarise(y_position = max(value) + 0.3) %>%
.$y_position -> y_position
cbp1 <- c("#999999", "#E69F00", "#56B4E9", "#009E73",
"#F0E442", "#0072B2", "#D55E00", "#CC79A7")
cbp2 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
"#F0E442", "#0072B2", "#D55E00", "#CC79A7")
penguins %>%
select(species, sex, body_mass_g) %>%
drop_na(body_mass_g, sex) %>%
mutate(body_mass_g = body_mass_g/1000) %>%
group_by(species, sex) %>%
summarise(mean_mass_g = mean(body_mass_g),
sd_mass_g = sd(body_mass_g)
) %>%
ggplot(aes(x = species, y = mean_mass_g, fill = sex)) +
geom_bar(stat = 'identity',
position= position_dodge(width = .6),
width = .5
) +
scale_y_continuous(limits = c(0, NA),
expand = expansion(mult = c(0,.1))
) +
geom_errorbar(aes(ymin = mean_mass_g - sd_mass_g, ymax = mean_mass_g + sd_mass_g),
position = position_dodge(width = 0.6),
width = .1
) +
scale_fill_manual(values = cbp1) +
theme_few() +
theme(legend.position= c(.2,.85)) +
labs(fill = "Sex",
x = "Penguin Species",
y = "Penguin biomass (1000 g)"
) +
geom_signif(y_position = y_position, ##标注线高度
xmin = xmin, ##标注起始点
xmax = xmax, ##标注终点
annotations = comp_res$p.signif, ##标注符号
tip_length = 0 ##你改一下就知道了
) -> p1
penguins %>%
select(species, sex, body_mass_g) %>%
drop_na(body_mass_g, sex) %>%
mutate(body_mass_g = body_mass_g/1000) %>%
group_by(species, sex) %>%
summarise(mean_mass_g = mean(body_mass_g),
sd_mass_g = sd(body_mass_g)
) %>%
ggplot(aes(x = species, y = mean_mass_g, fill = sex)) +
geom_bar(stat = 'identity',
position= position_dodge(width = 0.6),
width = .5
) +
scale_y_continuous(limits = c(0, NA),
expand = expansion(mult = c(0,.1))
) +
geom_errorbar(aes(ymin = mean_mass_g - sd_mass_g, ymax = mean_mass_g + sd_mass_g),
position = position_dodge(width = 0.6),
width = .2
) +
scale_fill_manual(values = cbp2) +
theme_few() +
theme(legend.position= c(.25,.85)) +
labs(fill = "Sex",
x = "Penguin Species",
y = expression(paste("Penguin biomass (1x",10^3," g)"))
) +
geom_text(stat = 'identity', ##跟柱状图一致
label = aov_res$signif, ##字母
aes(vjust = -sd_mass_g - 2), ##字母高度
size = 5, ##字体大小
position = position_dodge(.6) ##跟柱状图一致
) -> p2
cowplot::plot_grid(p1, p2, labels = "AUTO") ##合一起看看
大概做个流程:
最后
祝元旦快乐……