I recently worked through 飞哥's 《R语言进阶笔记》 (R Language Advanced Notes, https://dengfei2013.gitee.io/r-language-advanced/), which is packed with practical material. Below is my condensed summary, for quick look-ups whenever I forget something.
install.packages("tidyverse")
library(tidyverse)
#Introduction to dplyr
#The keyboard shortcut for the pipe operator in RStudio is Ctrl + Shift + M
#Column-modification function mutate()
install.packages("agridat")
library(agridat)
dat <- shaw.oats
head(dat)
str(dat)
#Log-transform yield and save it as logyield
dat %>% mutate(logyield = log10(yield)) %>% head
#Column operation select(): extract the env, year and yield columns. If MASS (or a package that loads MASS) is attached, MASS's select() takes priority, so first point it back to dplyr with select = dplyr::select (example below)
dat %>% select(env,year,yield) %>% head()
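#As noted above, if MASS masks dplyr's select(), you can point it back to dplyr explicitly:
select <- dplyr::select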
#In select(), use a minus sign to drop a column
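#For example, dropping the yield column:
dat %>% select(-yield) %>% head()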
#Row operation filter(): extract the rows where year is 1933 and env is Karnal
dat %>% filter(year=="1933" & env=="Karnal") %>% head()
#Grouping with group_by(): count the observations for each year and location
dat %>% group_by(env,year) %>% count()
#Summary function summarise(): mean yield for each year and location
dat %>% group_by(env,year) %>% summarise(mean(yield))
#Sorting with arrange(): sort by year and yield; ascending by default, use a minus sign for descending (example below)
dat %>% arrange(year,yield) %>% head()
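#For example, ascending by year but descending by yield:
dat %>% arrange(year, -yield) %>% head()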
#Join functions: left_join() merges using the left table as the reference, right_join() uses the right table, inner_join() keeps the intersection, full_join() keeps the union
d1 <- data.frame(ID = c(1:4), x = rnorm(4))
d2 <- data.frame(ID = c(3:6), y = rnorm(4))
left_join(d1,d2,by="ID") #d1 is the reference; unmatched rows become NA
right_join(d1,d2,by="ID") #d2 is the reference; unmatched rows become NA
inner_join(d1,d2,by = "ID") #keep the intersection of the two
full_join(d1,d2,by = "ID") #keep the union of the two
#If the key columns have different names in the two tables, specify them with by = c("ID1" = "ID2"), for example:
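#For example, with a made-up table d3 whose key column is named ID2:
d3 <- data.frame(ID2 = c(3:6), y = rnorm(4))
left_join(d1, d3, by = c("ID" = "ID2"))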
#Reshape to long format with pivot_longer()
set.seed(123) #set the random seed to 123; a given seed produces a reproducible pseudo-random sequence, which makes the results repeatable
dat <- tibble(Loc = paste0("Loc",rep(1:3,each = 3)),
              Rep = rep(1:3,3),
              y1 = rnorm(9)+10,
              y2 = rnorm(9)+20,
              y3 = rnorm(9)+30,
              y4 = rnorm(9)+40,
              y5 = rnorm(9)+50)
dat
#Gather y1~y5 (columns 3:7) into one column Type, with their values in a column named values
re1 <- dat %>% pivot_longer(.,3:7,names_to="Type",values_to = "values")
re1
#Reshape back to wide format with pivot_wider(); columns can also be referred to by position
re2 <- re1 %>% pivot_wider(.,names_from = Type,values_from = 4)
re2
#Write a function that computes the maximum, minimum, range, mean, standard deviation and coefficient of variation
#Simulate a data frame with 20 rows and 5 columns
set.seed(123)
dat <- as.data.frame(matrix(rnorm(100)+100,20))
head(dat)
#Base-R approach
huizong <- function (dd) {
  func <- function(x) {
    c(Max = max(x, na.rm = T),
      Min = min(x, na.rm = T),
      Range = max(x, na.rm = T) - min(x, na.rm = T),
      Mean = mean(x, na.rm = T),
      SD = sd(x, na.rm = T),
      CV = sd(x, na.rm = T)/mean(x, na.rm = T) * 100)
  }
  sm <- as.data.frame(t(apply(dd, 2, func)))
  return(sm)
}
huizong(dat)
#The dplyr approach
func <- function(x) {
  c(Max = max(x, na.rm = T),
    Min = min(x, na.rm = T),
    Range = max(x, na.rm = T) - min(x, na.rm = T),
    Mean = mean(x, na.rm = T),
    SD = sd(x, na.rm = T),
    CV = sd(x, na.rm = T)/mean(x, na.rm = T)*100)
}
re <- dat %>% summarise_all(func) %>% t %>% as.data.frame()
names(re) <- c("Max","Min","Range","Mean","SD","CV")
re
#A more general approach: tidyverse works on long data, so reshape to long format first
d1 <- dat %>% pivot_longer(.,1:5,names_to = "Trait",values_to = "values")
d1 %>% group_by(Trait) %>% summarise(Max = max(values),
                                     Min = min(values),
                                     Range = max(values) - min(values),
                                     Mean = mean(values, na.rm = T),
                                     SD = sd(values, na.rm = T),
                                     CV = sd(values, na.rm = T)/mean(values, na.rm = T)*100)
#An even more concise approach: summarise_at() with a list of functions
ran <- function(values){
  max(values) - min(values)
}
cv <- function(values){
  sd(values, na.rm = T)/mean(values, na.rm = T)*100
}
d1 %>% group_by(Trait) %>% summarise_at(vars(values), list(Max = max, Min = min, Range = ran, Mean = mean, Sd = sd, CV = cv))
#Loops with purrr. From low to high there are three levels of looping: manual for loops, the apply family, and functional programming with the purrr package
#A function that takes a function as an argument is a functional: in map(x, f), map is a function and f, itself a function, is an argument of map, so map is a functional
dat <- data.frame(y1 = rnorm(10),y2 = rnorm(10)+10)
dat
map(dat,mean)
lapply(dat,mean)
dat <- data.frame(x1 =rnorm(10),x2 = rnorm(10),x3 = rnorm(10),x4 = rnorm(10))
#map() applies a one-argument function; ~max(.x) is the anonymous-function (formula) shorthand. A data frame is processed column by column, returning the max of each of x1-x4
map(dat,max)
map(dat,~max(.x))
map(dat,~max(..1))
#map2() applies a two-argument function; .x and .y stand for the two inputs dat$x1 and dat$x2, iterated element by element (i.e. row-wise), giving the max of x1 and x2 in each row
map2(dat$x1,dat$x2, max)
map2(dat$x1,dat$x2, ~max(.x,.y))
map2(dat$x1,dat$x2, ~max(..1,..2))
#pmap() supports functions of two or more arguments, iterating row-wise
pmap(dat,max)
pmap(dat,~max(..1,..2,..3,..4))
#map variants by suffix: *_chr (map_chr, map2_chr, pmap_chr, ...) return character vectors; *_lgl return logical vectors; *_dbl return double (numeric) vectors; *_int return integer vectors; *_df return data frames; *_dfr bind the results by rows; *_dfc bind the results by columns
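#For example, map_dbl() returns a numeric vector, and map_dfr() row-binds the per-column results into a data frame:
map_dbl(dat, mean)
map_dfr(dat, ~data.frame(Max = max(.x), Min = min(.x)))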
#Two ways to pass extra arguments, e.g. na.rm = T to allow missing values when computing each column's mean. First, inside the anonymous function:
map(dat,~mean(.x,na.rm = T))
#Second, passed directly to map() after the function name
map(dat,mean,na.rm=T)
#Use the MET data from the learnasreml package
if (!requireNamespace("devtools")) install.packages("devtools")
library(devtools)
install_github("dengfei2013/learnasreml")
library(learnasreml)
data(MET)
head(MET)
summary(MET)
#ANOVA of cultivar for each location; the conventional way is to fit the model once per location
summary(aov(Yield ~ Cul, data = MET[MET$Location == "CI",]))
#Fit the models in batch with map()
MET %>% split(.$Location) %>% map(.,~aov(Yield ~ Cul,.) %>% summary)
#walk() is similar to map(), except that walk() is called for its side effects and returns nothing visible; here one csv file is saved per location
MET %>% group_nest(Location) %>% pwalk(~write.csv(.y,paste0(.x,".csv")))
#Create a data frame
data <- data.frame(grammer = c("Python","C","Java","GO",NA,"SQL","PHP","Python"), score = c(1,2,NA,4,5,6,7,10))
data_tb <- tibble(grammer = c("Python","C","Java","GO",NA,"SQL","PHP","Python"), score = c(1,2,NA,4,5,6,7,10))
#Extract the rows where grammer is "Python"
data[which(data$grammer == "Python"),] #which() drops the NA row that == alone would keep
data_tb %>% filter(grammer == "Python")
#Print all the column names
names(data_tb)
#Rename the second column to "popularity"
names(data)[2] = "popularity"
data_tb <- data_tb %>% rename(popularity = score)
#Count how many times each programming language appears in the grammer column
data$grammer %>% table
data_tb %>% count(grammer)
#Fill missing values with the mean of the neighbouring values (linear interpolation)
install.packages("zoo")
library(zoo)
data_tb <- data_tb %>% mutate(popularity = zoo::na.approx(popularity))
#Extract the rows where popularity is greater than 3
data_tb %>% filter(popularity>3)
#Deduplicate by the grammer column; .keep_all defaults to FALSE, which returns only the processed column (example below)
data_tb[!duplicated(data_tb$grammer),]
data_tb %>% distinct(grammer,.keep_all = T)
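#With the default .keep_all = FALSE, only the grammer column is kept:
data_tb %>% distinct(grammer)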
#Compute the mean of the popularity column
data_tb %>% mutate(mean = mean(popularity))
#Convert the grammer column to a vector
data_tb$grammer
#Save the data frame as an Excel file
install.packages("openxlsx")
library(openxlsx)
write.xlsx(data_tb,"data_tb.xlsx")
#Check the numbers of rows and columns
dim(data_tb)
#Extract the rows where popularity is greater than 3 and less than 7
data_tb %>% filter(popularity>3 & popularity < 7)
#Swap the positions of the two columns
data_tb %>% select(2,1)
#Extract the row(s) where popularity takes its maximum
data_tb %>% filter(popularity == max(popularity))
data_tb %>% top_n(1,popularity)
#View the last few rows
tail(data_tb)
#Delete the last row: n() gives the number of rows, slice() selects rows by number, and the minus sign drops them
data_tb %>% slice(-n())
#Add a row: "Perl", 6
data_tb %>% rbind(tibble(grammer = "Perl",popularity = 6))
#Sort by popularity in descending order
data_tb %>% arrange(-popularity)
#Compute the length of each string in the grammer column
data_tb %>% mutate(len = str_length(grammer))
#Read local Excel data
df <- readxl::read_excel("pandas120.xlsx")
head(df)
#View the first few rows of df
df %>% head(10)
#Convert the salary column (a text range) to the mean of its lower and upper bounds
df1 %>% separate(salary, sep = "-", into = c("Low","High")) %>%
  mutate(salary = (parse_number(Low) + parse_number(High))/2) %>%
  select(-Low, -High) -> df1
head(df1)
#Group by education and compute the mean salary
df1 %>% group_by(education) %>% summarise(mm = mean(salary))
#Convert the createTime column to "month-day" format
library(lubridate)
df1 %>% mutate(createTime = paste(month(createTime),day(createTime),sep="-"))
#Check the structure of the data
str(df1)
#View a summary of the data
summary(df1)
#Add a new column that discretizes salary into three levels
ff <- function(x){
  if(x < 14){
    return("low")
  }else if(x >= 14 & x < 25){
    return("medium")
  }else if(x >= 25){
    return("high")
  }else{
    return("unknown")
  }
}
df1 %>% mutate(level = map_chr(salary,ff)) -> df2
#case_when() is even simpler
df1 %>% mutate(level = case_when(
  salary < 14 ~ "low",
  salary >= 14 & salary < 25 ~ "medium",
  salary >= 25 ~ "high"
))
#Sort the data by salary in descending order
df1 %>% arrange(-salary)
#Extract row 33
df1[33,]
slice(df1,33)
#Compute the median of the salary column
median(df1$salary)
#Plot a frequency histogram of salary
df1 %>% ggplot(aes(x = salary)) + geom_histogram()
#Plot a density curve of salary
df1 %>% ggplot(aes(x = salary)) + geom_density()
#Overlay the histogram and the density curve; in geom_histogram() the y axis must be switched to the computed density, otherwise the density curve is invisible
df1 %>% ggplot(aes(x = salary)) + geom_histogram(aes(y = after_stat(density))) + geom_density()
#Drop the last column, level
df2 %>% select(-level)
#Combine columns 1 and 2 into a new column
df1 %>% mutate(new_col = paste0(createTime,education))
#Or modify in place with unite(), the counterpart of separate()
df1 %>% unite("new_col",1:2,sep="")
#Compute the difference between the maximum and minimum salary
df1 %>% mutate(mm = max(salary) - min(salary))
#Bind the first row and the last row together
rbind(df1[1,],df1[nrow(df1),])
#Append row 8 to the end
rbind(df1,df1[8,])
#Use the createTime column as the row index: deduplicate first; column_to_rownames() turns a column into row names and removes it from the data
df1 %>% distinct(createTime,.keep_all = T) -> df4
df4 %>% column_to_rownames("createTime") %>% head
#Generate a data frame of random numbers with the same number of rows as the data
df5 <- tibble(y1 = rnorm(nrow(df4)))
#Bind the generated data frame to the data by columns
df4 %>% cbind(df5) -> df6
#Create a new column new = salary minus the random column
df6 %>% mutate(new = salary - y1)
#Check whether the data contains any missing values
anyNA(df6)
anyNA(df6$salary)
#Count how many rows have salary greater than 10K
df6 %>% filter(salary > 10) %>% dim #the first value returned by dim() is the row count
#Count how many times each education level appears
df6 %>% count(education)
#See how many distinct education levels there are
df6 %>% distinct(education)