学习R包
学习R包
生信必备Bioconductor上生信分析R包
以dplyr为例
- 配置Rstudio镜像
#代码源生信星球
options("repos" = c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/")) #对应清华源
options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/") #对应中科大源
- 安装
install.packages(' ')
BiocManager::install(" ")
- 加载
library()
require()
dplyr五个基础函数
> test <- iris[c(1:2,51:52,101:102),]
> test
Sepal.Length Sepal.Width Petal.Length Petal.Width
1 5.1 3.5 1.4 0.2
2 4.9 3.0 1.4 0.2
51 7.0 3.2 4.7 1.4
52 6.4 3.2 4.5 1.5
101 6.3 3.3 6.0 2.5
102 5.8 2.7 5.1 1.9
Species
1 setosa
2 setosa
51 versicolor
52 versicolor
101 virginica
102 virginica
> library(dplyr)
-
mutate()
新增列
> mutate(test,new=Sepal.Length* Sepal.Width)
Sepal.Length Sepal.Width Petal.Length Petal.Width
1 5.1 3.5 1.4 0.2
2 4.9 3.0 1.4 0.2
51 7.0 3.2 4.7 1.4
52 6.4 3.2 4.5 1.5
101 6.3 3.3 6.0 2.5
102 5.8 2.7 5.1 1.9
Species new
1 setosa 17.85
2 setosa 14.70
51 versicolor 22.40
52 versicolor 20.48
101 virginica 20.79
102 virginica 15.66
-
select()
按列筛选
按列号筛选
select(test,1)
select(test,c(1,5))
select(test,Sepal.Length)
按列名筛选
select(test, Petal.Length, Petal.Width)
vars <- c("Petal.Length", "Petal.Width")
select(test, one_of(vars))
> select(test,Sepal.Length)
Sepal.Length
1 5.1
2 4.9
51 7.0
52 6.4
101 6.3
102 5.8
-
filter()
筛选行
filter(test, Species == "setosa")
#筛选Species == setosa的行
filter(test, Species == "setosa"&Sepal.Length > 5 )
#筛选Species == setosa并且Sepal.Length>5的行
filter(test, Species %in% c("setosa","versicolor"))
#筛选Species 属于 setosa或versicolor的行
-
arrange()
按某1列或某几列对整个表格进行排序
arrange(test, Sepal.Length)
#默认从小到大排序
arrange(test, desc(Sepal.Length))
#用desc从大到小
5.summarise()
汇总
summarise(test, mean(Sepal.Length), sd(Sepal.Length))# 计算Sepal.Length的平均值和标准差
# 先按照Species分组,计算每组Sepal.Length的平均值和标准差
group_by(test, Species)
summarise(group_by(test, Species),mean(Sepal.Length), sd(Sepal.Length))
dplyr两个实用技能
- 管道操作
%>%
(cmd/ctrl + shift + M)
test %>%
group_by(Species) %>%
summarise(mean(Sepal.Length), sd(Sepal.Length))
###等同于summarise(group_by(test, Species),mean(Sepal.Length), sd(Sepal.Length))
- count统计某列的unique值
count(test,Species)
dplyr 处理关系数据
> options(stringsAsFactors = F)
> test1 <- data.frame(x = c('b','e','f','x'),
+ z = c("A","B","C",'D'),
+ stringsAsFactors = F)
> test1
x z
1 b A
2 e B
3 f C
4 x D
> test2 <- data.frame(x = c('a','b','c','d','e','f'),
+ y = c(1,2,3,4,5,6),
+ stringsAsFactors = F)
> test2
x y
1 a 1
2 b 2
3 c 3
4 d 4
5 e 5
6 f 6
The mutating joins add columns from y to x, matching rows based on the keys:
inner_join()
: includes only the rows whose key values appear in both x and y.
left_join()
: includes all rows in x.
right_join()
: includes all rows in y.
full_join()
: includes all rows in x or y.
If a row in x matches multiple rows in y, all the rows in y will be returned once for each matching row in x.
> inner_join(test1,test2,by='x')
x z y
1 b A 2
2 e B 5
3 f C 6
> left_join(test1,test2,by='x')
x z y
1 b A 2
2 e B 5
3 f C 6
4 x D NA
> full_join(test1,test2,by='x')
x z y
1 b A 2
2 e B 5
3 f C 6
4 x D NA
5 a <NA> 1
6 c <NA> 3
7 d <NA> 4
> semi_join(test1,test2,by='x')
x z
1 b A
2 e B
3 f C
> anti_join(test1,test2,by='x')
x z
1 x D
> anti_join(test2,test1,by='x')
x y
1 a 1
2 c 3
3 d 4
简单合并
cbind()
rbind()
bind_rows()
函数按列名匹配来合并行,列不一致时缺失的位置用NA填充(基础函数rbind()才要求两个表格列数相同),而bind_cols()
函数则需要两个数据框有相同的行数