学习小组Day6-ksprings

学习小组Day6-dplyr基础.png

一、R包安装加载

1. 镜像

options("repos" = c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")

2. 安装

install.packages("包")
BiocManager::install("包")

3. 加载

library(包)
require(包)

4. 准备工作

> library(dplyr)
> head(iris)
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa
> test <- iris[c(1:2,51:52,101:102),]
> test
    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
1            5.1         3.5          1.4         0.2     setosa
2            4.9         3.0          1.4         0.2     setosa
51           7.0         3.2          4.7         1.4 versicolor
52           6.4         3.2          4.5         1.5 versicolor
101          6.3         3.3          6.0         2.5  virginica
102          5.8         2.7          5.1         1.9  virginica

二、dplyr五个基础函数

mutate(),新增列

Sepal.Length Sepal.Width Petal.Length Petal.Width    Species   new
1          5.1         3.5          1.4         0.2     setosa 17.85
2          4.9         3.0          1.4         0.2     setosa 14.70
3          7.0         3.2          4.7         1.4 versicolor 22.40
4          6.4         3.2          4.5         1.5 versicolor 20.48
5          6.3         3.3          6.0         2.5  virginica 20.79
6          5.8         2.7          5.1         1.9  virginica 15.66

select(),按列筛选

列号

> select(test,c(1,5))   
    Sepal.Length    Species
1            5.1     setosa
2            4.9     setosa
51           7.0 versicolor
52           6.4 versicolor
101          6.3  virginica
102          5.8  virginica

列名

> select(test, Petal.Length, Petal.Width)
    Petal.Length Petal.Width
1            1.4         0.2
2            1.4         0.2
51           4.7         1.4
52           4.5         1.5
101          6.0         2.5
102          5.1         1.9
> vars <- c("Petal.Length", "Petal.Width")
> select(test, one_of(vars))  
    Petal.Length Petal.Width
1            1.4         0.2
2            1.4         0.2
51           4.7         1.4
52           4.5         1.5
101          6.0         2.5
102          5.1         1.9

ps:one_of()---声明变量

filter()筛选行

> filter(test, Species == "setosa")
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
> filter(test, Species == "setosa"&Sepal.Length > 5 )
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
> filter(test, Species %in% c("setosa","versicolor"))
  Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
1          5.1         3.5          1.4         0.2     setosa
2          4.9         3.0          1.4         0.2     setosa
3          7.0         3.2          4.7         1.4 versicolor
4          6.4         3.2          4.5         1.5 versicolor

arrange(),按某1列或某几列对整个表格进行排序

> arrange(test, Sepal.Length)
  Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
1          4.9         3.0          1.4         0.2     setosa
2          5.1         3.5          1.4         0.2     setosa
3          5.8         2.7          5.1         1.9  virginica
4          6.3         3.3          6.0         2.5  virginica
5          6.4         3.2          4.5         1.5 versicolor
6          7.0         3.2          4.7         1.4 versicolor
> arrange(test, desc(Sepal.Length))
  Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
1          7.0         3.2          4.7         1.4 versicolor
2          6.4         3.2          4.5         1.5 versicolor
3          6.3         3.3          6.0         2.5  virginica
4          5.8         2.7          5.1         1.9  virginica
5          5.1         3.5          1.4         0.2     setosa
6          4.9         3.0          1.4         0.2     setosa

summarise()：汇总

小结：

5个函数的工作方式相同：
1.第一个参数是数据框
2.随后参数使用变量名称（不带引号）描述在数据框上的操作
3.输出一个新的数据框

6. group_by()改变上述5个函数作用范围，在每个分组上进行操作

> group_by(test, Species)
# A tibble: 6 x 5
# Groups:   Species [3]
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species   
*        <dbl>       <dbl>        <dbl>       <dbl> <fct>     
1          5.1         3.5          1.4         0.2 setosa    
2          4.9         3            1.4         0.2 setosa    
3          7           3.2          4.7         1.4 versicolor
4          6.4         3.2          4.5         1.5 versicolor
5          6.3         3.3          6           2.5 virginica 
6          5.8         2.7          5.1         1.9 virginica 
> summarise(group_by(test, Species),mean(Sepal.Length), sd(Sepal.Length))
# A tibble: 3 x 3
  Species    `mean(Sepal.Length)` `sd(Sepal.Length)`
  <fct>                     <dbl>              <dbl>
1 setosa                     5                 0.141
2 versicolor                 6.7               0.424
3 virginica                  6.05              0.354

三、dplyr两个实用技能

管道操作 %>% (ctr + shift + M)

> test %>% 
+   group_by(Species) %>% 
+   summarise(mean(Sepal.Length), sd(Sepal.Length))
# A tibble: 3 x 3
  Species    `mean(Sepal.Length)` `sd(Sepal.Length)`
  <fct>                     <dbl>              <dbl>
1 setosa                     5                 0.141
2 versicolor                 6.7               0.424
3 virginica                  6.05              0.354

count统计某列的unique值

> count(test,Species)
# A tibble: 3 x 2
  Species        n
  <fct>      <int>
1 setosa         2
2 versicolor     2
3 virginica      2

四、dplyr处理关系数据

测试数据

> options(stringsAsFactors = F)
> test1 <- data.frame(x = c('b','e','f','x'), 
+                     z = c("A","B","C",'D'),
+                     stringsAsFactors = F)
> test2 <- data.frame(x = c('a','b','c','d','e','f'), 
+                     y = c(1,2,3,4,5,6),
+                     stringsAsFactors = F)
> test1
  x z
1 b A
2 e B
3 f C
4 x D
> test2
  x y
1 a 1
2 b 2
3 c 3
4 d 4
5 e 5
6 f 6

內连inner_join,取交集

> inner_join(test1, test2, by = "x")
  x z y
1 b A 2
2 e B 5
3 f C 6

左连left_join

> left_join(test1, test2, by = 'x')
  x z  y
1 b A  2
2 e B  5
3 f C  6
4 x D NA
> left_join(test2, test1, by = 'x')
  x y    z
1 a 1 <NA>
2 b 2    A
3 c 3 <NA>
4 d 4 <NA>
5 e 5    B
6 f 6    C

全连full_join

> full_join( test1, test2, by = 'x')
  x    z  y
1 b    A  2
2 e    B  5
3 f    C  6
4 x    D NA
5 a <NA>  1
6 c <NA>  3
7 d <NA>  4

半连接：返回能够与y表匹配的x表所有记录semi_join

> semi_join(x = test1, y = test2, by = 'x')
  x z
1 b A
2 e B
3 f C

反连接：返回无法与y表匹配的x表的所记录anti_join

> anti_join(x = test2, y = test1, by = 'x')
  x y
1 a 1
2 c 3
3 d 4

简单合并
1.bind_rows()函数需要两个表格列数相同 == rbind()

> test1 <- data.frame(x = c(1,2,3,4), y = c(10,20,30,40))
> test2 <- data.frame(x = c(5,6), y = c(50,60))
> test1
  x  y
1 1 10
2 2 20
3 3 30
4 4 40
> test2
  x  y
1 5 50
2 6 60
> bind_rows(test1, test2)
  x  y
1 1 10
2 2 20
3 3 30
4 4 40
5 5 50
6 6 60
> rbind(test1, test2)
  x  y
1 1 10
2 2 20
3 3 30
4 4 40
5 5 50
6 6 60

2.bind_cols()函数则需要两个数据框有相同的行数 == cbind()

> bind_cols(test1, test3)
  x  y   z
1 1 10 100
2 2 20 200
3 3 30 300
4 4 40 400
> cbind(test1, test3)
  x  y   z
1 1 10 100
2 2 20 200
3 3 30 300
4 4 40 400

学习内容来自微信公众号--生信星球；引文来自《R数据科学》

学习小组Day6-ksprings

学习小组Day6-ksprings

一、R包安装加载

1. 镜像

2. 安装

3. 加载

4. 准备工作

二、dplyr五个基础函数

三、dplyr两个实用技能

四、dplyr处理关系数据

相关阅读更多精彩内容

友情链接更多精彩内容