ggplot2,我总结了下,主要就是数据变形、映射、几何对象、统计变换以及后期图片调整及美化
1、数据导入和变形
(1)数据导入及格式
# Set the working directory (machine-specific path; adjust it to wherever
# the three input files are stored).
setwd("/media/han/b/rosalind/ggplot2")
# Import the three tab-separated tables as data frames
# (type these into the RStudio script pane).
# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, whereas TRUE and FALSE are reserved words.
gene_exp <- read.table(
  file = "gene_exp.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)
gene_len <- read.table(
  file = "gene_len.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)
group <- read.table(
  file = "group.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)
#数据格式
> head(gene_exp)
Gene S1 S2 S3 S4
1 G1 844.9510 1301.7828 1207.7967 1153.3719
2 G2 1246.8492 785.4974 1182.4283 1193.2796
3 G3 1496.4822 1206.2611 1060.4760 1480.8871
4 G4 1392.3307 1100.6337 687.7282 781.1865
5 G5 1170.3425 857.2048 916.0348 1092.2339
6 G6 721.2031 1477.6733 1543.1986 824.7960
S5 S6 S7 S8
1 899.1235 1150.6556 957.9256 1728.3804
2 1922.2217 731.0976 631.0565 1318.9178
3 1276.8723 1174.2532 1037.7988 842.4523
4 1586.0368 1176.4862 1082.6887 896.8659
5 353.4801 808.1170 1246.9023 1066.0504
6 655.0933 442.7497 1089.1102 859.6822
S9
1 356.0858
2 1399.4487
3 1419.1017
4 1295.0609
5 671.8151
6 1223.9446
> head(gene_len)
Gene Length
1 G1 1712
2 G2 1884
3 G3 2514
4 G4 1559
5 G5 1952
6 G6 2295
> head(group)
Sample Group
1 S1 group1
2 S2 group1
3 S3 group1
4 S4 group2
5 S5 group2
6 S6 group2
(2)数据变形
library(tidyr)
library(dplyr)
# Reshape the wide expression table into long format with gather():
# every column except Gene is stacked into a Sample (former column name) /
# Expression (cell value) pair.
dexp <- gene_exp %>%
  gather(key = "Sample", value = "Expression", -Gene)
#查看数据变形后的格式
##tidyr数据格式所有列是变量,每行是观测值,可以直接调用列变量
>head(dexp)
Gene Sample Expression
1 G1 S1 844.9510
2 G2 S1 1246.8492
3 G3 S1 1496.4822
4 G4 S1 1392.3307
5 G5 S1 1170.3425
6 G6 S1 721.2031
(3)合并表格
# The %>% pipe passes the value on its left as the first argument of the call
# on its right, i.e. into the `a` slot of left_join(a, b, by = "Gene").
# In the joined table the columns of `a` come first, followed by the extra
# columns of `b`; swap the arguments (left_join(b, a, by = "Gene")) to put
# the columns of `b` first.
# left_join(a, b) keeps every row of `a`, right_join(a, b) keeps every row of
# `b`; in both cases the columns of `a` stay in front.
dexp <- gene_exp %>%
  # wide -> long: one row per Gene/Sample pair
  gather(key = "Sample", value = "Expression", -Gene) %>%
  # add the gene_len columns, matching rows on Gene
  left_join(gene_len, by = "Gene") %>%
  # add the group columns, matching rows on Sample
  left_join(group, by = "Sample") %>%
  # select() fixes which columns are kept and in what order
  select(Gene, Sample, Group, Expression, Length) %>%
  # sort the rows by gene name
  arrange(Gene)
> head(dexp)
Gene Sample Group Expression Length
1 G1 S1 group1 844.9510 1712
2 G1 S2 group1 1301.7828 1712
3 G1 S3 group1 1207.7967 1712
4 G1 S4 group2 1153.3719 1712
5 G1 S5 group2 899.1235 1712
6 G1 S6 group2 1150.6556 1712
(4)映射
#导入包
library(ggplot2)
library(dplyr)
# Reduce the data: keep only the group1 rows of genes G1..G10, then drop the
# Group and Length columns.
dexp_small <- dexp %>%
  filter(Group == "group1", Gene %in% paste0("G", 1:10)) %>%
  select(-Group, -Length)
>head(dexp_small)
Gene Sample Expression
1 G1 S1 844.951
2 G1 S2 1301.783
3 G1 S3 1207.797
4 G10 S1 1407.990
5 G10 S2 473.370
6 G10 S3 1134.640
# ggplot2 building blocks: mappings, geometric objects, layers.
# ggplot() sets up the base layer; aes() declares the mappings
# (x = Sample, y = Expression, colour = Gene) and geom_point() draws the
# observations as a scatter plot.
ggplot(dexp_small, mapping = aes(x = Sample, y = Expression, colour = Gene)) +
  geom_point()
##映射类型
#颜色类:color(颜色或边框颜色)、fill(填充颜色)和alpha(透明度)
#形状类:linetype(线型)、size(点的大小或线的宽度)和shape(形状)
#位置类:x, y, xmin, xmax, ymin, ymax, xend, yend
#特殊类:一类是group和order,另一类是字符串映射
# Base layer: Sample on the x axis, Expression on the y axis.
p <- ggplot(dexp, mapping = aes(x = Sample, y = Expression)) +
  # Individual geom: one point per observation, with colour, size and shape
  # mapped to Gene, Length and Group. alpha sits outside aes(), so it is a
  # fixed transparency applied to every point rather than a mapping.
  geom_point(
    mapping = aes(colour = Gene, size = Length, shape = Group),
    alpha = 0.8
  )
# Grouping with collective geoms.
# Boxplots grouped by sample name.
p +
  geom_boxplot(mapping = aes(group = Sample))
# Boxplots grouped by the Group column.
p +
  geom_boxplot(mapping = aes(group = Group))
# Expression trend of every gene (one line per gene) plus a smoothed curve.
# group = 1 puts all observations into one single group, so the curve is
# fitted over the whole data set (it does not refer to "group1").
p +
  geom_line(mapping = aes(group = Gene, colour = Gene)) +
  geom_smooth(mapping = aes(group = 1))
##分面
#将一个图形分配成多个小图形
#facet_wrap()通常按照一个变量进行分面(也可以写成 ~A + B 组合多个变量),并把得到的一维面板序列按行/列折叠排布
#facet_wrap(facets, nrow = NULL, ncol = NULL, scales = "fixed",
# shrink = TRUE, labeller = "label_value", as.table = TRUE,
# switch = NULL, drop = TRUE, dir = "h", strip.position = "top")
#重要参数:
#facets: 分面参数如 ~Group,表示用 Group 变量进行数据分类
#nrow: 绘制图形的行数
#ncol: 绘制图形的列数,一般nrow/ncol只设定一个即可
#scales: fixed,小图均使用统一坐标;
# free每个小图按照各自数据范围自由调整坐标;
# free_x为自由调整x轴刻度范围;
# free_y为自由调整y轴刻度范围。
# Base layer reused by the faceted plot below.
p <- ggplot(dexp, mapping = aes(x = Sample, y = Expression))
# One panel per Group, laid out in a single row (nrow = 1);
# scales = "free_x" lets every panel use its own x-axis range.
p +
  geom_point(mapping = aes(colour = Gene, size = Length)) +
  facet_wrap(facets = ~Group, nrow = 1, scales = "free_x")
#facet_grid():可以按照两个变量进行分面
#facet_grid(facets, margins = FALSE, scales = "fixed", space = "fixed",
# shrink = TRUE, labeller = "label_value", as.table = TRUE,
# switch = NULL, drop = TRUE)
#与facet_wrap不同的重要参数:
#facets: 应用两个标准分面,如Gene ~ Group
#margins: TRUE,额外绘制包含所有数据的汇总分面
#space: 每张小图的坐标宽度,值同scales(具有free, fixed等参数),类似于WORD按照内容进行调整
# Keep only genes G1..G9 for display.
dexp_small <- dexp %>%
  filter(Gene %in% paste0("G", 1:9))
# Base layer.
ps <- ggplot(dexp_small, mapping = aes(x = Sample, y = Expression))
# Points coloured by Length, faceted with Gene as rows and Group as columns.
# scales = "free" and space = "free" let the axis ranges and panel sizes
# follow each panel's data; margins = TRUE also draws the margin panels.
ps +
  geom_point(mapping = aes(colour = Length)) +
  facet_grid(Gene ~ Group, scales = "free", space = "free", margins = TRUE)