1.dplyr包了解及其安装
#安装加载:
install.packages("dplyr");library(dplyr)
2.mutate()新增列函数,在数据集的基础上新增列,不对原数据作更改
构建test数据框
> rm(list = ls())
> test <- iris[c(1:2,51:52,101:102),]
> library(dplyr)
> test
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
51 7.0 3.2 4.7 1.4 versicolor
52 6.4 3.2 4.5 1.5 versicolor
101 6.3 3.3 6.0 2.5 virginica
102 5.8 2.7 5.1 1.9 virginica
新增一列
test_mutate<-mutate(test,new=test$Sepal.Length*test$Sepal.Width)
> test_mutate
Sepal.Length Sepal.Width Petal.Length Petal.Width Species new
1 5.1 3.5 1.4 0.2 setosa 17.85
2 4.9 3.0 1.4 0.2 setosa 14.70
3 7.0 3.2 4.7 1.4 versicolor 22.40
4 6.4 3.2 4.5 1.5 versicolor 20.48
5 6.3 3.3 6.0 2.5 virginica 20.79
6 5.8 2.7 5.1 1.9 virginica 15.66
>
3.select()按列筛选
按照列号
> test
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
51 7.0 3.2 4.7 1.4 versicolor
52 6.4 3.2 4.5 1.5 versicolor
101 6.3 3.3 6.0 2.5 virginica
102 5.8 2.7 5.1 1.9 virginica
> test_select1<-select(test,1)
> test_select1
Sepal.Length
1 5.1
2 4.9
51 7.0
52 6.4
101 6.3
102 5.8
> test_select2<-select(test,c(1,3,5))
> test_select2
Sepal.Length Petal.Length Species
1 5.1 1.4 setosa
2 4.9 1.4 setosa
51 7.0 4.7 versicolor
52 6.4 4.5 versicolor
101 6.3 6.0 virginica
102 5.8 5.1 virginica
按照列名筛选
> test
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
51 7.0 3.2 4.7 1.4 versicolor
52 6.4 3.2 4.5 1.5 versicolor
101 6.3 3.3 6.0 2.5 virginica
102 5.8 2.7 5.1 1.9 virginica
> test_select3<-select(test,Sepal.Width,Species)
> test_select3
Sepal.Width Species
1 3.5 setosa
2 3.0 setosa
51 3.2 versicolor
52 3.2 versicolor
101 3.3 virginica
102 2.7 virginica
varss<-c("Sepal.Width","Species")
> test_select4<-select(test,varss)
Note: Using an external vector in selections is ambiguous.
i Use `all_of(varss)` instead of `varss` to silence this message.
i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This message is displayed once per session.
> test_select4
Sepal.Width Species
1 3.5 setosa
2 3.0 setosa
51 3.2 versicolor
52 3.2 versicolor
101 3.3 virginica
102 2.7 virginica
> test_select4<-select(test,one_of(varss)
> test_select4
Sepal.Width Species
1 3.5 setosa
2 3.0 setosa
51 3.2 versicolor
52 3.2 versicolor
101 3.3 virginica
102 2.7 virginica
4.filter()筛选行
filter() 返回行的子集,按照指定的条件筛选符合条件中逻辑判断要求的数据行。
> test
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
51 7.0 3.2 4.7 1.4 versicolor
52 6.4 3.2 4.5 1.5 versicolor
101 6.3 3.3 6.0 2.5 virginica
102 5.8 2.7 5.1 1.9 virginica
#筛选物种为setosa的行
test_filter1<-filter(test,Species=="setosa")
> test_filter1
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
test_filter2<-filter(test,test$Species=="setosa"&test$Sepal.Length>5)
> test_filter2
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
> test_filter3<-filter(test,Species %in% c("setosa","versicolor"))
> test_filter3
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 7.0 3.2 4.7 1.4 versicolor
4 6.4 3.2 4.5 1.5 versicolor
4.arrange()
按某1列或某几列对整个表格进行排序:默认从小到大排序
> test_arrange1<-arrange(test,test$Sepal.Length)
> test_arrange1
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 4.9 3.0 1.4 0.2 setosa
2 5.1 3.5 1.4 0.2 setosa
3 5.8 2.7 5.1 1.9 virginica
4 6.3 3.3 6.0 2.5 virginica
5 6.4 3.2 4.5 1.5 versicolor
6 7.0 3.2 4.7 1.4 versicolor
#desc()设置从大到小
test_arrange2<-arrange(test,desc(test$Sepal.Length))
> test_arrange2
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 7.0 3.2 4.7 1.4 versicolor
2 6.4 3.2 4.5 1.5 versicolor
3 6.3 3.3 6.0 2.5 virginica
4 5.8 2.7 5.1 1.9 virginica
5 5.1 3.5 1.4 0.2 setosa
6 4.9 3.0 1.4 0.2 setosa
>
5.summarise():汇总
计算Sepal.Length的平均值和标准差
summarise(test,mean(test$Sepal.Length),sd(test$Sepal.Width))
mean(test$Sepal.Length) sd(test$Sepal.Width)
1 5.916667 0.2738613
对数据进行汇总操作,结合group_by使用实用性强
test_summurize<-summarise(group_by(test,Species),mean(test$Sepal.Length),sd(test$Sepal.Length))
> test_summurize
# A tibble: 3 x 3
Species `mean(test$Sepal.Length)` `sd(test$Sepal.Length)`
<fct> <dbl> <dbl>
1 setosa 5.92 0.808
2 versicolor 5.92 0.808
3 virginica 5.92 0.808
>
dplyr两个实用技能
1.管道操作 %>% (cmd/ctr + shift + M)
test %>%
+ group_by(Species) %>%
+ summarise(mean(Sepal.Length), sd(Sepal.Length))
# A tibble: 3 x 3
Species `mean(Sepal.Length)` `sd(Sepal.Length)`
<fct> <dbl> <dbl>
1 setosa 5 0.141
2 versicolor 6.7 0.424
3 virginica 6.05 0.354
2:count统计某列的unique值
> count(test,Species)
# A tibble: 3 x 2
Species n
<fct> <int>
1 setosa 2
2 versicolor 2
3 virginica 2
> table(test$Species)
setosa versicolor virginica
2 2 2
dplyr处理关系数据
构建数据集
> test1 <- data.frame(x = c('b','e','f','x'),
+ z = c("A","B","C",'D'),
+ stringsAsFactors = F)
> test1
x z
1 b A
2 e B
3 f C
4 x D
> test2 <- data.frame(x = c('a','b','c','d','e','f'),
+ y = c(1,2,3,4,5,6),
+ stringsAsFactors = F)
> test2
x y
1 a 1
2 b 2
3 c 3
4 d 4
5 e 5
6 f 6
1.內连inner_join取交集
> test_join1<-inner_join(test1,test2,by="x")
> test_join1 #取交集
x z y
1 b A 2
2 e B 5
3 f C 6
2.左连left_join
>test_jion2<-left_join(test1,test2,by="x")
> test_jion2
x z y
1 b A 2
2 e B 5
3 f C 6
4 x D NA
3.全连full_join
> test_jion3<-full_join(test1,test2,by="x")
> test_jion3
x z y
1 b A 2
2 e B 5
3 f C 6
4 x D NA
5 a <NA> 1
6 c <NA> 3
7 d <NA> 4
4.半连接semi_join
返回能够与y表匹配的x表所有记录
> test_jion4<-semi_join(test1,test2,by="x")
> test_jion4
x z
1 b A
2 e B
3 f C
5.半连接semi_join
返回无法与y表匹配的x表的所记录anti_join
> test_jion5<-anti_join(test1,test2,by="x")
> test_jion5
x z
1 x D
6.简单合并:bind_rows(),bind_cols()
类似cbind()函数和rbind()函数:
注意:bind_rows()函数需要两个表格列数相同,而bind_cols()函数则需要两个数据框有相同的行数.
> test1 <- data.frame(x = c(1,2,3,4), y = c(10,20,30,40))
> test1
x y
1 1 10
2 2 20
3 3 30
4 4 40
> test2 <- data.frame(x = c(5,6), y = c(50,60))
> test2
x y
1 5 50
2 6 60
> test3 <- data.frame(z = c(100,200,300,400))
> test3
z
1 100
2 200
3 300
4 400
> bind_rows(test1, test2)
x y
1 1 10
2 2 20
3 3 30
4 4 40
5 5 50
6 6 60
> bind_cols(test1, test3)
x y z
1 1 10 100
2 2 20 200
3 3 30 300
4 4 40 400