学习R包
特别鸣谢:@公众号:生信星球 豆豆&花花
依照惯例,先来张思维导图~
下边以内置数据集iris中的部分数据为操作对象,练习今天学到的dplyr包里的几个函数。
1. 读取数据
#加载dplyr包
library(dplyr)
#取iris的指定行
test_data <- iris[seq(1,150,by=10),]
#查看行列数
dim(test_data)
2. mutate()
> mutate(test_data, ID=1:nrow(test_data))
Sepal.Length Sepal.Width Petal.Length Petal.Width Species ID
1 5.1 3.5 1.4 0.2 setosa 1
2 5.4 3.7 1.5 0.2 setosa 2
3 5.4 3.4 1.7 0.2 setosa 3
4 4.8 3.1 1.6 0.2 setosa 4
5 5.0 3.5 1.3 0.3 setosa 5
6 7.0 3.2 4.7 1.4 versicolor 6
7 5.0 2.0 3.5 1.0 versicolor 7
8 5.9 3.2 4.8 1.8 versicolor 8
9 5.5 2.4 3.8 1.1 versicolor 9
10 5.5 2.6 4.4 1.2 versicolor 10
11 6.3 3.3 6.0 2.5 virginica 11
12 6.5 3.2 5.1 2.0 virginica 12
13 6.9 3.2 5.7 2.3 virginica 13
14 7.4 2.8 6.1 1.9 virginica 14
15 6.7 3.1 5.6 2.4 virginica 15
3. %>% + group_by() + arrange()
> test_data = mutate(test_data, ID=1:nrow(test_data))
> test_data %>%
+ group_by(Species) %>%
+ arrange(desc(Petal.Width))
# A tibble: 15 x 6
# Groups: Species [3]
Sepal.Length Sepal.Width Petal.Length Petal.Width Species ID
<dbl> <dbl> <dbl> <dbl> <fct> <int>
1 6.3 3.3 6 2.5 virginica 11
2 6.7 3.1 5.6 2.4 virginica 15
3 6.9 3.2 5.7 2.3 virginica 13
4 6.5 3.2 5.1 2 virginica 12
5 7.4 2.8 6.1 1.9 virginica 14
6 5.9 3.2 4.8 1.8 versicolor 8
7 7 3.2 4.7 1.4 versicolor 6
8 5.5 2.6 4.4 1.2 versicolor 10
9 5.5 2.4 3.8 1.1 versicolor 9
10 5 2 3.5 1 versicolor 7
11 5 3.5 1.3 0.3 setosa 5
12 5.1 3.5 1.4 0.2 setosa 1
13 5.4 3.7 1.5 0.2 setosa 2
14 5.4 3.4 1.7 0.2 setosa 3
15 4.8 3.1 1.6 0.2 setosa 4
4. join
定义sub_test1和sub_test2
> test_data %>%
+ select(c("ID", "Species")) %>%
+ filter(Species %in% c("setosa","virginica")) -> sub_test1
> sub_test1
ID Species
1 1 setosa
2 2 setosa
3 3 setosa
4 4 setosa
5 5 setosa
6 11 virginica
7 12 virginica
8 13 virginica
9 14 virginica
10 15 virginica
> sub_test2 <- test_data[seq(1,nrow(test_data),2), c("ID", "Sepal.Length")]
> sub_test2
ID Sepal.Length
1 1 5.1
3 3 5.4
5 5 5.0
7 7 5.0
9 9 5.5
11 11 6.3
13 13 6.9
15 15 6.7
各种join
注意:实践发现,如果join里by参数所取的列(键)存在重复值,join并不会报错,而是返回所有匹配的组合,结果的行数会因此增多,可能与预期不符;所以join之前最好先确认键列的取值是否唯一(例如用duplicated()或count()检查)。
# inner-join
> inner_join(sub_test1, sub_test2, by = "ID")
ID Species Sepal.Length
1 1 setosa 5.1
2 3 setosa 5.4
3 5 setosa 5.0
4 11 virginica 6.3
5 13 virginica 6.9
6 15 virginica 6.7
# left-join
> left_join(sub_test1, sub_test2, by = "ID")
ID Species Sepal.Length
1 1 setosa 5.1
2 2 setosa NA
3 3 setosa 5.4
4 4 setosa NA
5 5 setosa 5.0
6 11 virginica 6.3
7 12 virginica NA
8 13 virginica 6.9
9 14 virginica NA
10 15 virginica 6.7
# full-join
> full_join(sub_test1, sub_test2, by = "ID")
ID Species Sepal.Length
1 1 setosa 5.1
2 2 setosa NA
3 3 setosa 5.4
4 4 setosa NA
5 5 setosa 5.0
6 11 virginica 6.3
7 12 virginica NA
8 13 virginica 6.9
9 14 virginica NA
10 15 virginica 6.7
11 7 <NA> 5.0
12 9 <NA> 5.5
# semi-join
> semi_join(sub_test1, sub_test2, by = "ID")
ID Species
1 1 setosa
2 3 setosa
3 5 setosa
4 11 virginica
5 13 virginica
6 15 virginica
# anti-join
> anti_join(sub_test1, sub_test2, by = "ID")
ID Species
1 2 setosa
2 4 setosa
3 12 virginica
4 14 virginica
combine行/列
> test1 <- data.frame(ID = c(1,2,3), num = c(1,11,111))
> test1
ID num
1 1 1
2 2 11
3 3 111
>
> test2 <- data.frame(ID = c(4,5), num = c(1111,11111))
> test2
ID num
1 4 1111
2 5 11111
>
> test3 <- data.frame(level = c("A","AA","AAA"))
> test3
level
1 A
2 AA
3 AAA
>
> bind_rows(test1, test2)
ID num
1 1 1
2 2 11
3 3 111
4 4 1111
5 5 11111
>
> bind_cols(test1, test3)
ID num level
1 1 1 A
2 2 11 AA
3 3 111 AAA
以上,就是今天学到的内容。