同名公众号:BBio
tidyverse包实际上就是一些常用R包的集合,包括ggplot2(可视化)、dplyr(数据操作)、tidyr(数据整理)、tibble(更现代的数据框)、stringr(字符串操作)。加载tidyverse包后,这些包中的函数都可以直接使用。
//如何理解tidyverse的工作流呢?看完下面的例子就会有答案。
# Load the tidyverse before its first use: the pipe, the dplyr verbs, ggplot2
# and the `diamonds` dataset used below are all provided by it.
library(tidyverse)

# Workflow example 1: keep diamonds under 3 carats, then draw a histogram of
# carat with 0.1-wide bins.
diamonds %>%
  filter(carat < 3) %>%
  ggplot(mapping = aes(x = carat)) +
  geom_histogram(binwidth = 0.1)

# Workflow example 2: replace `y` values below 3 or above 20 with NA and store
# the result as `diamonds2`.
diamonds2 <- diamonds %>%
  mutate(y = ifelse(y < 3 | y > 20, NA, y))
//具体应用场景举例
#从panglaodb单细胞marker基因数据库中下载数据表格,每个细胞类型对应多个marker基因,格式如下
#species official gene symbol cell type nicknames ubiquitousness index product description gene type canonical marker germ la
#Mm Hs CTRB1 Acinar cells CTRB 0.017 chymotrypsinogen B1 protein-coding gene 1 Endoderm Pancreas 1.0 0.957143
#Mm Hs KLK1 Acinar cells Klk6 0.013 kallikrein 1 protein-coding gene 1 Endoderm Pancreas 0.833333 0.314286
#想要整理为每个组织类型对应的细胞类型,以及对应的所有marker list应该怎么做
# The conventional approach is clumsy and is omitted here; with the tidyverse
# the task takes two statements (read the table, then one pipeline).
# `header = TRUE` spells the argument out in full (no partial matching) and
# avoids `T`, which is an ordinary variable that can be reassigned.
# NOTE(review): read.table() treats quotes and `#` as special by default; if
# the description column can contain them, consider `quote = ""` and
# `comment.char = ""` -- confirm against the actual file.
data <- read.table(
  "PanglaoDB_markers_27_Mar_2020.tsv.gz",
  header = TRUE,
  sep = "\t"
)
# Keep the "Smooth muscle" rows whose species field contains "Hs", then gather
# the gene symbols of each cell type into one list element per cell type.
panglao_SMC <- data %>%
  filter(organ == "Smooth muscle", str_detect(species, "Hs")) %>%
  group_by(cell.type) %>%
  summarise(geneset = list(official.gene.symbol))
str(panglao_SMC)
# Example output of str(panglao_SMC):
#tibble [5 × 2] (S3: tbl_df/tbl/data.frame)
# $ cell.type: chr [1:5] "Airway smooth muscle cells" "Myoepithelial cells" "Myofibroblasts" "Pulmonary vascular smooth muscle cells" ...
# $ geneset :List of 5
# ..$ : chr [1:4] "NOG" "ACTA2" "FOXF1" "GATA5"
# ..$ : chr [1:26] "SFN" "ACTA2" "CNN1" "CA3" ...
# ..$ : chr [1:9] "CDH11" "DES" "PALLD" "ACTA2" ...
# ..$ : chr [1:2] "ANGPT1" "PDGFRB"
# ..$ : chr [1:6] "ACTA2" "MYH11" "PDGFRB" "SEMA3D" ...
//安装及资料
#https://github.com/tidyverse/tidyverse
# Install the released version from CRAN, but only when the package is missing,
# so re-running this script does not reinstall it every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
# Or the development version from GitHub. This is an alternative to the CRAN
# release, so both lines stay commented out; run them manually if needed.
# install.packages("devtools")
# devtools::install_github("tidyverse/tidyverse")
#https://r4ds.had.co.nz/index.html:学习资源
#ls("package:dplyr")
//dplyr包的主要函数
# Attach the example-data package first, then the tidyverse itself.
library(nycflights13) # provides the `flights` dataset
library(tidyverse)

# Print the test data (a tibble; tibbles are covered in a later section).
flights

# List every object exported by the attached dplyr package.
ls("package:dplyr")

# %>% is the pipe: it passes the value on its left into the call on its right.
#   x %>% f(y)           is equivalent to f(x, y)
#   x %>% f(y) %>% g(z)  is equivalent to g(f(x, y), z)
c(1, 2, 3) %>% mean()
# filter(): keep only the rows that satisfy the given conditions.
# Several conditions may be supplied; they are written with comparison and
# logical operators.
flights %>% filter(month == 1, day == 1)
flights %>% filter(month == 11 | month == 12)
flights %>% filter(month %in% c(11, 12))
flights %>% filter(!is.na(dep_delay))
# arrange(): reorder rows.
# Sorting by several columns is supported.
flights %>% arrange(year, month, day)
flights %>% arrange(desc(dep_delay)) # descending order
# select(): choose columns.
# Columns can be given by name (several at once), as a range, or excluded with
# a minus sign; selection helpers can be used as well.
select(flights, year, month, day)
select(flights, year:day)
select(flights, -(year:day))
# Use a prefix that actually occurs in `flights` (e.g. dep_delay); the previous
# prefix "abc" matched no column and returned a tibble with zero columns.
select(flights, starts_with("dep"))
select(flights, contains("a"))
# rename(): change column names, written as new_name = old_name.
flights %>% rename(tail_num = tailnum)
# mutate(): add columns.
# New columns are computed from existing ones, and a column created earlier in
# the same call (e.g. `gain`, `hours`) can be used by a later one.
flights_sml <- flights %>%
  select(year:day, ends_with("delay"), distance, air_time)

flights_sml %>%
  mutate(
    gain = dep_delay - arr_delay,
    speed = distance / air_time * 60
  )

flights_sml %>%
  mutate(
    gain = dep_delay - arr_delay,
    hours = air_time / 60,
    gain_per_hour = gain / hours
  )

# transmute(): same computation, but only the new columns are kept.
flights %>%
  transmute(
    gain = dep_delay - arr_delay,
    hours = air_time / 60,
    gain_per_hour = gain / hours
  )
# min_rank(): rank the values of a vector (the input here contains a tie and
# an NA).
y <- c(1, 2, 2, NA, 3, 4)
y %>% min_rank()
# summarise(): compute summaries.
# Used together with group_by() to summarise each group separately. group_by()
# does not modify the original data; it only attaches grouping information to
# the object it returns.
by_day <- flights %>% group_by(year, month, day)
by_day %>% summarise(delay = mean(dep_delay, na.rm = TRUE))
# Chaining several verbs with the pipe makes the steps much easier to read:
# group by destination, summarise each group, then keep destinations with more
# than 20 flights, excluding "HNL".
delays <- flights %>%
  group_by(dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20, dest != "HNL")
//dplyr包的其它函数
# between(): returns a logical vector telling which values lie in [7, 9].
1:12 %>% between(7, 9)

# group_split(): split a grouped data frame into one piece per group.
data.frame(
  celltype = rep(c("T cells", "B cells"), each = 2),
  marker = c("CD3D", "CD2", "MS4A1", "CD79A")
) %>%
  group_by(celltype) %>%
  group_split()
# across(): pick several columns and apply one function to each of them.
iris %>%
  as_tibble() %>%
  mutate(across(c(Sepal.Length, Sepal.Width), round))

# if_any() / if_all(): filter rows on a condition tested over several columns.
# if_any() keeps a row when at least one selected column passes the test,
# if_all() only when every selected column passes it.
iris %>% filter(if_any(ends_with("Width"), ~ .x > 4))
iris %>% filter(if_all(ends_with("Width"), ~ .x > 2))
# inner_join(): combine the rows of two datasets by a key column, keeping only
# keys present in both. The others are the left, right and full (union) joins.
# The key is named explicitly in every call so that all four joins are written
# the same way and none relies on the implicit natural join (which also prints
# a "Joining with ..." message).
band_members %>% inner_join(band_instruments, by = "name")
band_members %>% left_join(band_instruments, by = "name")
band_members %>% right_join(band_instruments, by = "name")
band_members %>% full_join(band_instruments, by = "name")
# Filtering joins: keep or drop rows of x depending on whether they match y.
# semi_join(x, y) keeps all observations in x that have a match in y.
# anti_join(x, y) drops all observations in x that have a match in y.
top_dest <- flights %>%
  count(dest, sort = TRUE) %>%
  head(10)
top_dest
# `dest` is the key shared by both tables; naming it makes the match explicit
# instead of relying on the implicit natural join.
flights %>% semi_join(top_dest, by = "dest")
# pull(): extract a single column as a vector -- here the last column, the
# first column, and the column named cyl.
pull(mtcars, -1)
pull(mtcars, 1)
pull(mtcars, cyl)
# slice(): choose rows by position; negative positions drop rows (here the
# first four).
mtcars %>% slice(-(1:4))
# slice_min(): keep the rows with the 5 smallest values of mpg.
slice_min(mtcars, mpg, n = 5)
//tibble包中的主要函数
tibble格式的数据和data.frame非常相似,但是更加现代化,更方便使用。tibble格式数据默认只输出前10行,以及适应屏幕的列,对大数据友好。列名还支持特殊字符,非常人性化。
library(tidyverse)
# Converting between data.frame and tibble.
class(iris)
iris_tibble <- as_tibble(iris)
class(iris_tibble)
# Store the converted data frame under its own name. Assigning to `iris` would
# create a global variable that masks the built-in dataset for the rest of the
# session.
iris_df <- as.data.frame(iris_tibble)
# Print the first 10 rows with every column shown (width = Inf lifts the
# screen-width limit).
print(nycflights13::flights, n = 10, width = Inf)
# Subsetting a tibble.
df <- tibble(x = runif(5), y = rnorm(5))
df$x      # vector
df[["x"]] # vector
df["x"]   # one-column tibble
# Adding rows and columns.
df <- tibble(x = 1:3, y = 3:1)
add_row(df, x = 4, y = 0)
add_column(df, z = -1:1, w = 0)
# Turn row names into a regular column, and turn a column back into row names.
mtcars_tbl <- mtcars %>%
  rownames_to_column(var = "car") %>%
  as_tibble()
mtcars_tbl
mtcars_tbl %>%
  column_to_rownames(var = "car") %>%
  head()
//tidyr包中的主要函数
library(tidyverse)
# pivot_longer(): reshape wide data into long data. The year columns become
# rows, with the old column name stored in `year` and the value in `cases`.
table4a
pivot_longer(
  table4a,
  cols = c(`1999`, `2000`),
  names_to = "year",
  values_to = "cases"
)
# pivot_wider(): reshape long data into wide data. Column names are taken from
# `type` and the cell values from `count`.
table2
pivot_wider(table2, names_from = type, values_from = count)
# separate(): split one column into several, either at a separator string or
# at a character position.
separate(table3, rate, into = c("cases", "population"), sep = "/")
separate(table3, year, into = c("century", "year"), sep = 2)
//stringr中的主要函数
# str_length(): number of characters in each string.
c("a", "R for data science", NA) %>% str_length()

# str_c(): concatenate strings; `sep` sets the text placed between the pieces,
# and a vector argument is combined element by element with the other pieces.
str_c("x", "y")
str_c("x", "y", sep = ", ")
str_c("prefix-", c("a", "b", "c"), "-suffix")
# str_sub(): extract a substring by position (characters 1 to 3 here).
x <- c("Apple", "Banana", "Pear")
x %>% str_sub(1, 3)

# Case conversion and sorting.
x %>% str_to_lower()
c("i", "ı") %>% str_to_upper()
x %>% str_sort(locale = "en")
# str_detect(): does each string contain the pattern?
x <- c("apple", "banana", "pear")
x %>% str_detect("e")

# str_count(): how many times the pattern occurs in each string (same `x`).
x %>% str_count("a")
# str_replace() replaces the first match in each string; str_replace_all()
# replaces every match.
x <- c("apple", "pear", "banana")
x %>% str_replace("[aeiou]", "-")
x %>% str_replace_all("[aeiou]", "-")

# str_split(): split the first five example sentences at single spaces.
str_split(head(sentences, 5), " ")