Ruizheng 的学习笔记
感谢 生信技能树 小洁老师
stringr
# Setup: make sure stringr is installed, attach it, and define the sentence
# used by the examples that follow.
# The workspace is deliberately not wiped with rm(list = ls()): that call
# deletes the caller's objects but leaves attached packages and options
# untouched, so it does not produce a clean session. Restart R instead.
# requireNamespace() only checks that the package is installed; library()
# then attaches it and stops with an error if it cannot be loaded.
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr")
}
library(stringr)
x <- "The birch canoe slid on the smooth planks."
1.检测字符串长度
# length() counts the elements of the vector; x is a single string, so it is 1.
length(x)
## [1] 1
# str_length() counts the characters of each string.
str_length(x)
## [1] 42
2.字符串拆分与组合
# Split x at every space; the result is a list, as the [[1]] in the output shows.
str_split(x," ")
## [[1]]
## [1] "The" "birch" "canoe" "slid" "on" "the" "smooth"
## [8] "planks."
# Keep the first list element: a character vector with one word per element.
x2 = str_split(x," ")[[1]]
# collapse joins all elements of the vector into one single string.
str_c(x2,collapse = " ")
## [1] "The birch canoe slid on the smooth planks."
# sep joins the arguments element by element; 1234 is appended to every word.
str_c(x2,1234,sep = "+")
## [1] "The+1234" "birch+1234" "canoe+1234" "slid+1234" "on+1234"
## [6] "the+1234" "smooth+1234" "planks.+1234"
3.提取字符串的一部分
# Extract the characters at positions 5 through 9 of x.
str_sub(x,5,9)
## [1] "birch"
4.大小写转换
# Convert every word to upper case.
str_to_upper(x2)
## [1] "THE" "BIRCH" "CANOE" "SLID" "ON" "THE" "SMOOTH"
## [8] "PLANKS."
# Convert every word to lower case.
str_to_lower(x2)
## [1] "the" "birch" "canoe" "slid" "on" "the" "smooth"
## [8] "planks."
# Capitalise the first letter of every word.
str_to_title(x2)
## [1] "The" "Birch" "Canoe" "Slid" "On" "The" "Smooth"
## [8] "Planks."
5.字符串排序
# Sort the words alphabetically; in this output lowercase "the" is placed
# before "The".
str_sort(x2)
## [1] "birch" "canoe" "on" "planks." "slid" "smooth" "the"
## [8] "The"
6.字符检测
# One logical value per word: does the word contain "h"?
str_detect(x2,"h")
## [1] TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE
# Does the word start with "T"? The match is case-sensitive ("the" is FALSE).
str_starts(x2,"T")
## [1] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Does the word end with "e"?
str_ends(x2,"e")
## [1] TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE
与sum和mean连用,可以统计匹配的个数和比例
# sum() of a logical vector counts the TRUE values: how many words contain "h".
sum(str_detect(x2,"h"))
## [1] 4
# mean() of a logical vector gives the proportion of TRUE values.
mean(str_detect(x2,"h"))
## [1] 0.5
7.提取匹配到的字符串
# Keep only the words that contain "h".
str_subset(x2,"h")
## [1] "The" "birch" "the" "smooth"
8.字符计数
# Number of spaces in the whole sentence.
str_count(x," ")
## [1] 7
# Number of "o" characters in each word.
str_count(x2,"o")
## [1] 0 0 1 0 1 0 2 0
9.字符串替换
# str_replace() changes only the first match in each string ("smooth" -> "smAoth").
str_replace(x2,"o","A")
## [1] "The" "birch" "canAe" "slid" "An" "the" "smAoth"
## [8] "planks."
# str_replace_all() changes every match ("smooth" -> "smAAth").
str_replace_all(x2,"o","A")
## [1] "The" "birch" "canAe" "slid" "An" "the" "smAAth"
## [8] "planks."
结合正则表达式更加强大
练习6-2
#Bioinformatics is a new subject of genetic data collection,analysis and dissemination to the research community.
#1.将上面这句话作为一个长字符串,赋值给tmp
# Exercise 6-2, step 1: store the sentence as one long string.
tmp <- "Bioinformatics is a new subject of genetic data collection,analysis and dissemination to the research community."
# Step 2: split the sentence into a vector of words, taking care of the
# punctuation. Every comma becomes a space and every full stop is dropped; the
# *_all variants are used so a sentence with several commas or full stops is
# handled too (str_replace()/str_remove() only touch the first match).
tmp2 <- tmp %>%
  str_replace_all(",", " ") %>%
  str_remove_all("[.]") %>%
  str_split(" ")
# str_split() returns a list; its first element is the vector of words.
tmp2 <- tmp2[[1]]
# Step 3: number of words in the sentence.
length(tmp2)
## [1] 16
# Step 4: number of letters in each word.
a <- str_length(tmp2)
a
## [1] 14 2 1 3 7 2 7 4 10 8 3 13 2 3 8 9
# Step 5: number of words in tmp2 that contain the letter "e".
sum(str_detect(tmp2, "e"))
## [1] 7
str_detect(x, "h") - 返回与 x 等长的逻辑值向量
条件语句
if条件语句:如果。。。就。。。,否则。。。
if(一个逻辑值){ 一段代码 } else { 一段代码 }
(1)只有if没有else,那么条件是FALSE时就什么都不做
# With no else branch, nothing happens when the condition is FALSE.
i <- -1
if (i < 0) {
  print("up")
}
## [1] "up"
if (i > 0) {
  print("up")
}
if(!require(tidyr)) install.packages("tidyr")
## Loading required package: tidyr
(2)有else
i <- 1
if (i > 0) {
  # cat() writes the content directly, with no index prefix and no quotes
  cat("+")
} else {
  # print() adds the [1] index prefix and the quotes
  print("-")
}
## +
ifelse 很重要
# ifelse() is vectorised: it labels each of the 10 random numbers by its sign.
x <- rnorm(10)
y <- ifelse(x > 0, "+", "-")
y
## [1] "+" "-" "-" "-" "-" "-" "-" "+" "+" "-"
(3)多个条件
# Three-way classification of a single number by its sign.
i <- 0
if (i > 0) {
  print("+")
} else if (i == 0) {
  print("0")
} else if (i < 0) {
  print("-")
}
## [1] "0"
# The same decision written as nested ifelse() calls.
ifelse(i > 0, "+", ifelse(i < 0, "-", "0"))
## [1] "0"
2.switch()
# switch() with a numeric EXPR returns the alternative at that position, so
# cd = 3 yields the third one (cc), which is the matrix printed below.
# The commented-out EXPR = "aa" shows the other form: selecting by name.
cd <- 3
foo <- switch(
  EXPR = cd,
  # EXPR = "aa",
  aa = c(3.4, 1),
  bb = matrix(1:4, 2, 2),
  # TRUE/FALSE are spelled out: T and F are ordinary variables that can be
  # reassigned, whereas TRUE and FALSE are reserved words.
  cc = matrix(c(TRUE, TRUE, FALSE, TRUE, FALSE, FALSE), 3, 2),
  dd = "string here",
  ee = matrix(c("red", "green", "blue", "yellow"))
)
foo
## [,1] [,2]
## [1,] TRUE TRUE
## [2,] TRUE FALSE
## [3,] FALSE FALSE
dplyr::case_when() 解决无限套娃问题
1. For循环
循环中 中括号建议写两个
顺便看一下next和break
# Loop over the values of x while keeping a running total in s.
x <- c(5, 6, 0, 3)
s <- 0
for (i in x) {
  s <- s + i
  # if (i == 0) next   # skip to the next iteration
  # if (i == 0) break  # leave the whole loop
  # Printed values: position of i in x, i, 1/i, running total.
  # which(x == i) returns every matching position, so it only gives a single
  # index as long as the values of x are unique.
  print(c(which(x == i), i, 1 / i, s))
}
## [1] 1.0 5.0 0.2 5.0
## [1] 2.0000000 6.0000000 0.1666667 11.0000000
## [1] 3 0 Inf 11
## [1] 4.0000000 3.0000000 0.3333333 14.0000000
# The same loop driven by position instead of by value.
x <- c(5, 6, 0, 3)
s <- 0
# seq_along(x) is empty for a zero-length x, whereas 1:length(x) would be
# c(1, 0) and run the body twice with invalid indices.
for (i in seq_along(x)) {
  s <- s + x[[i]] # use double brackets to extract a single element in a loop
  # if (i == 3) next
  # if (i == 3) break
  print(c(i, x[[i]], 1 / x[[i]], s))
}
## [1] 1.0 5.0 0.2 5.0
## [1] 2.0000000 6.0000000 0.1666667 11.0000000
## [1] 3 0 Inf 11
## [1] 4.0000000 3.0000000 0.3333333 14.0000000
如何将结果存下来?
# Store the values of every iteration instead of printing them.
x <- c(5, 6, 0, 3)
s <- 0
# Allocate one slot per element of x up front rather than growing the list.
result <- vector("list", length(x))
for (i in seq_along(x)) {
  s <- s + x[[i]]
  # Index, value, reciprocal of the value (1 / x[[i]], not 1 / i), running total.
  result[[i]] <- c(i, x[[i]], 1 / x[[i]], s)
}
# Bind the per-iteration vectors side by side: one column per iteration.
do.call(cbind, result)
## [,1] [,2] [,3] [,4]
## [1,] 1.0 2.0000000 3 4.0000000
## [2,] 5.0 6.0000000 0 3.0000000
## [3,] 0.2 0.1666667 Inf 0.3333333
## [4,] 5.0 11.0000000 11 14.0000000
练习6-3
注意最后aes()传参的问题
get()函数可以让字符变成变量名
# Exercise 1: use a loop to show the data type of "a", TRUE and 3.
m <- list("a", TRUE, 3)
for (i in seq_along(m)) {
  # Inside a loop the value of a bare expression is not auto-printed, so the
  # class has to be printed explicitly.
  print(class(m[[i]]))
}
## [1] "character"
## [1] "logical"
## [1] "numeric"
# Exercise 2: draw 10 random numbers and build a new vector from them, with
# "A" for values above the median and "B" for the others.
m <- rnorm(10)
m1 <- ifelse(m > median(m), "A", "B")
m1
## [1] "A" "A" "B" "A" "B" "A" "B" "B" "B" "A"
# Exercise 3: from the tmp2 word vector of the previous exercise, build a new
# vector with "A" for words that contain "e" and "B" for words that do not.
tmp <- "Bioinformatics is a new subject of genetic data collection,analysis and dissemination to the research community."
library(stringr)
# Turn every comma into a space and drop every full stop before splitting on
# spaces; the *_all variants also cope with repeated punctuation, which
# str_replace()/str_remove() (first match only) would miss.
tmp2 <- tmp %>%
  str_replace_all(",", " ") %>%
  str_remove_all("[.]") %>%
  str_split(" ")
# str_split() returns a list; its first element is the vector of words.
tmp2 <- tmp2[[1]]
ifelse(str_detect(tmp2, "e"), "A", "B")
## [1] "B" "B" "B" "A" "A" "B" "A" "B" "A" "B" "B" "A" "B" "A" "A" "B"
#4.生成一个随机数(rnorm)组成的10行6列的矩阵,列名为sample1,sample2….sample6,
# 行名为gene1,gene2…gene10,
# 分组为sample1、2、3属于A组,sample4、5、6属于B组。
# 用循环对每个基因画ggplot2箱线图。
set.seed(2020)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(cowplot)
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
library(patchwork)
##
## Attaching package: 'patchwork'
## The following object is masked from 'package:cowplot':
##
## align_plots
# Build a 10 x 6 matrix of random values: rows are genes, columns are samples.
# NOTE: the object is called exp like the base function; calls to exp() still
# resolve to the function because R skips non-function objects in a call.
exp <- matrix(rnorm(60), nrow = 10)
colnames(exp) <- paste0("sample", 1:6)
rownames(exp) <- paste0("gene", 1:10)
exp[1:4, 1:4]
## sample1 sample2 sample3 sample4
## gene1 0.3769721 -0.8531228 2.17436525 -0.8125047
## gene2 0.3015484 0.9092592 1.09818265 -0.7437022
## gene3 -1.0980232 1.1963730 0.31822032 1.0953451
## gene4 -1.1304059 -0.3715839 -0.07314756 2.4353737
# Transpose so that every sample is a row and every gene a column, then add
# the grouping columns: samples 1-3 are group A, samples 4-6 are group B.
dat <- data.frame(t(exp))
dat <- mutate(
  dat,
  group = rep(c("A", "B"), each = 3),
  pair = rep(c("AA", "BB", "cc"), each = 2)
)
# Long format: one row per sample/gene combination. pivot_longer() replaces
# the superseded gather(); group and pair are kept as identifier columns.
dat2 <- pivot_longer(
  dat,
  cols = -c(group, pair),
  names_to = "gene",
  values_to = "expression"
)
# One boxplot panel per gene, comparing the two groups.
ggplot(data = dat2) +
  geom_boxplot(aes(x = group, y = expression, color = group)) +
  theme_bw() +
  facet_wrap(~gene, nrow = 2)
# Draw one boxplot per gene in a loop and arrange the plots on a 2-row grid.
# The first 10 columns of dat are the genes; group and pair come after them.
gene_cols <- colnames(dat)[1:10]
# Allocate the list of plots up front and fill it by position.
p <- vector("list", length(gene_cols))
for (k in seq_along(gene_cols)) {
  gene <- gene_cols[[k]]
  # aes() stores its arguments unevaluated and ggplot2 evaluates them only when
  # the plot is drawn, which happens after the loop has finished. `!!gene`
  # injects the current column name into the mapping immediately, so every
  # plot keeps its own gene instead of all plots showing the last one.
  # .data[[...]] looks the column up in `dat` rather than embedding a copy of
  # the whole column in the mapping.
  p[[k]] <- ggplot(
    data = dat,
    aes(x = group, y = .data[[!!gene]], color = group)
  ) +
    geom_boxplot() +
    ylab(paste0("Expression of ", gene))
}
wrap_plots(p, nrow = 2, guides = "collect")
2.while 循环
# Print every integer from 0 to 4 together with its square.
i <- 0
while (i < 5) {
  square <- i^2
  print(c(i, square))
  i <- i + 1
}
## [1] 0 0
## [1] 1 1
## [1] 2 4
## [1] 3 9
## [1] 4 16
apply()族函数
1.apply 处理矩阵或数据框
apply(X, MARGIN, FUN, …)
其中X是数据框/矩阵名;
MARGIN为1表示取行,为2表示取列,FUN是函数
# Work on the four numeric measurement columns of iris.
test <- iris[, 1:4]
# MARGIN = 2: apply the function to every column.
apply(test, 2, mean)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 5.843333 3.057333 3.758000 1.199333
# MARGIN = 1: apply the function to every row.
a <- apply(test, 1, sum)
a[1:3]
## [1] 10.2 9.5 9.4
# The same row sums with an explicit loop. The result vector is allocated once
# and filled by position, so it is a plain numeric vector like the apply()
# result instead of a list that grows on every iteration.
res <- numeric(nrow(test))
for (i in seq_len(nrow(test))) {
  res[[i]] <- sum(test[i, ])
}
res[1:3]
## [1] 10.2 9.5 9.4
练习:
# Exercise 1: load test2.Rdata and compute the variance of every row.
# load(file = "test2.Rdata")
# The load() call is commented out, so the `test` data frame is used as a
# stand-in. var() is used because the task asks for the variance; sd() would
# return the standard deviation, i.e. its square root.
apply(test, 1, var)[1:3]
## [1] 4.750000 4.149167 3.990000
# Exercise 2: load class.Rdata and try to convert the first 6 columns to
# numeric, producing a new matrix.
# load(file = "class.Rdata")
# apply(y[,1:6], 2, as.numeric)
# Exercise 3: explain this code.
names(tail(sort(apply(test,1,sd)),1000))[1:10]
## NULL
# Reading from the inside out: compute the standard deviation of every row of
# test, sort the values in increasing order, keep the last 1000 of them (the
# largest ones), take their names and show the first 10. The result is NULL
# here, which means the vector returned by apply() carries no names.
重点函数
- sort
- match
- names
- ifelse 和 str_detect
- identical
- arrange
- merge 和 inner_join
- unique 和 duplicated
重点知识点
- 向量、数据框、列表取子集
- 数据框新增列
- 文件读取
- Rdata的加载与保存
- 作图保存
- R包安装和加载
- 形式参数、实际参数、默认参数
R语言遍历、创建、删除文件夹
- dir()
- file.create()
- file.exists(…)
- file.remove()
- file.rename(from, to)
- file.append(file1, file2)