导读

R语言笔记，持续更新。

cat(out, file="taxon.nwk")  # write the text in `out` to a file
save.image("test.RData")  # save the current workspace
load("test.RData")  # restore a saved workspace
options(digits = 3)  # number of significant digits used when printing
options(scipen = 3)  # penalty against scientific notation (higher favours fixed notation)
options(scipen=200)  # large penalty: stops the Y axis switching to scientific notation
round( , num) # round to `num` decimal places (signif() is the one for significant digits)
reshape2::melt  # refer to a function from a specific package

颜色

# 渐变色，数字匹配色
color=colorRampPalette(colors = c("purple", "snow", "green"))(10)  # 渐变的10种颜色
color=colorRampPalette(colors = c("snow", "green", "red"))(3)  # 只取三种颜色，与matrix值对应
# library("RColorBrewer")
brewer.pal(12, "Set3")
# 自制color库
col_list = read.table("C:/Users/hutongyuan/Desktop/group_color.list",sep="\t", check.names=F, na.string="", stringsAsFactors=F, quote="", comment.char="")
colors = col_list$V1[1:18]
names(colors) <- sort(unique(data$variable))

逻辑值

all(x)  # all()是在全部为TRUE时返回TRUE
all(df[,1] > 0.05)  # 是否全大于 0.05
all(sum == 0)  # 是否全为 0
any(x)  # any()是存在任何一个TRUE时返回TRUE
is.null(a)  # 是否为NULL
is.na(a)  # 是否为缺失值
df[is.na(df)] <- 0  # 0替换NA
anyNA(df)  # 检查表格中是否有NA，返回TRUE或者FALSE
na.omit(df)  # 剔除含NA的行
target$pectin[!is.na(target$pectin)]  # 取向量中非NA的值
length(tmp[,3][tmp[,3]!=0])  # 向量中非0值数量

去重

unique(a)
a[!duplicated(a),]  # 逻辑值
df[!duplicated(df[, c(1,2)]),]  # 两列重复去重，保留第一行

apply

apply(data[, 2:10], 1, FUN=sum)  # 统计第二列到第10列之和，1行 2列
sapply(data[, 1], function(x, y) x*y, y=10)  # 一列返回数组
sapply(card[, 1:2], function(x, y) x*y, y=s16_num[1])  # 两列及以上返回原列名数据框
sapply(list, function(x){sample(x, round(0.2 * length(x)))})  # 随机抽取list中每个向量的20%
apply(card[, 1:2], 2, function(x, y) x*y, y=s16_num[1])  # 必须两列以上，按1行计算，按2列计算
tapply分组统计
lapply返回列表，用法类似sapply
as.data.frame(lapply(df, as.numeric))  # df所有元素转成数字列表，再转成数据框

多文件操作

# List the files in the working directory whose names contain "nwk"
list.files(pattern = "nwk")
# [1] "demo.denovo.tree.nwk"

setwd("Bin_all/Bin_prokka/prokka_out_table")
# Full names (extension included) of every per-bin gene table in this folder.
# `pattern` is a regular expression, so the dots of the suffix are escaped and
# the suffix is anchored to the end of the file name.
files <- list.files(pattern = "bin.*\\.gene\\.txt$")
# Sample names: the file names with the ".gene.txt" suffix removed.
# sub() is vectorised, so no index loop is needed; it also stays correct when
# no file matched (a 1:length(files) loop would iterate over c(1, 0)).
name <- sub("\\.gene\\.txt$", "", files)

# Read every table into a list, one data frame per file, in the order of `files`.
ml <- lapply(
  files, read.table,
  sep = "\t", na.strings = "", stringsAsFactors = FALSE,
  header = TRUE, quote = "", comment.char = ""
)

merge

# 有多个公共列时，需指出使用哪一列作为连接列
merge(w, q , by = intersect(names(w)[1],names(q)[1]))
# 当两个数据集连接列名称相同或不同
merge(w, q, by.x = 'name', by.y = 'name') 
# 当两个数据集连接列名称相同
merge(w, q, by = 'name') 

merge(w, q, all.x = TRUE, sort = TRUE)  # 建议使用 指定了连接列 的情况
# 多个公共列，未指定连接列
# 左连接，设置 all.x = TRUE，结果保留数据w的所有行；w中在q里没有匹配的行，其来自q的列填充为NA

文件输出，文件、文件夹操作

pdf(paste(id[i], "circos.pdf", sep="."), width=7, height=14)   # 格式化图片输出名称
write.table(result, file=file.path(fold[i], name[j]), sep="\t", quote=F, row.names=F)  # 变量控制输出文本的路径和名称

# File and folder operations demo. String literals use straight quotes:
# typographic quotes are not valid R string delimiters.
rm(list = ls())  # clear the workspace before the demo
path <- "J:/lab/EX29 –在R语言中进行文件（夹）操作"
setwd(path)
cat("file A\n", file = "A")  # create file A containing "file A"; "\n" ends the line
cat("file B\n", file = "B")  # create file B
file.append("A", "B")  # append the contents of B to A (no blank line is inserted)
file.create("A")  # create an empty file A; an existing A is truncated
file.append("A", rep("B", 10))  # append the contents of B to A ten times in a row
file.show("A")  # display the contents of A in a separate pager window
file.copy("A", "C")  # copy A to a new file C in the same folder
dir.create("tmp")  # create a folder named tmp
file.copy(c("A", "B"), "tmp")  # copy files A and B into the tmp folder
list.files("tmp")  # list the file names inside tmp
unlink("tmp", recursive = FALSE)  # with recursive = FALSE a directory is NOT deleted, even an empty one
unlink("tmp", recursive = TRUE)  # delete tmp together with everything in it
file.remove("A", "B", "C")  # delete the three files

文件读取

# read.table参数
na.string="",
na.strings="NA",
stringsAsFactors=F,
header=T, 
quote="",  # 可读空值，解决缺失元素的问题，解决存在引号的问题
comment.char=""
check.names = F
header=T, sep="\t", row.names=1, na.string="", stringsAsFactors=F, quote="", comment.char="", check.names = F
# Select files by name. `pattern` is a regular expression, not a shell glob,
# so the glob "*_Microbe.txt" is written as an anchored regex
# (utils::glob2rx("*_Microbe.txt") builds an equivalent one).
microbes <- list.files(pattern = "_Microbe\\.txt$")
microbes[1]
df = na.omit(df)  # 删除含NA的行

参数传递

# Command-line arguments: args[1] is the input table, args[2] and args[3]
# are the output files.
args <- commandArgs(trailingOnly = TRUE)

# Split the input path into folder and file name. dirname() returns "." for a
# bare file name, whereas slicing strsplit() pieces with 1:(n - 1) yields the
# file name itself when the path contains no "/" and makes setwd() fail.
route <- dirname(args[1])
setwd(route)
file_name <- basename(args[1])

# input file
data <- read.table(file_name, header = TRUE, sep = "\t")

# output file
# NOTE(review): data2 and data3 are not created in this snippet; presumably
# they come from processing steps that are not shown here.
write.table(data2, file = args[2], row.names = FALSE, quote = FALSE,
            sep = "\t")
write.table(data3, file = args[3], row.names = FALSE, quote = FALSE,
            sep = "\t")

表格处理

df[c(), ]  行名、行号挑选行
df[, c()]  列名、列号挑选列
df[, which(var1%in%var2)]  变量选列
df[which(var1%in%var2), ]  变量选行
melt(df, id.vars=c("Gene", "Bin"))  # 三维表melt
melt(df, id.vars = "Gene")  # melt a wide table keyed by one id column (stray ")" removed, argument name spelled out)
nrow(data)  # 行数
length(data[, 1])  # 行数
length(data[1,])  # 列数
ncol(data)   # 列数
length(data)  # 列数
data[,-(length(data))]  # 删除最后一列
data[-vector,]  # 删除多行，行数vector
data[, c(-1, -2)]  # 删除多列
data[, -c(1, 2)]  # 删除多列
data[data$raw.p<=0.05, c(1,2,3,4)]  # 根据列值提取行、列
data$sum=apply(data[,2:length(data[1,])], 1, FUN=sum)  # 求和，加到列尾
df[order(row.names(df)),]  # 按行名排序
data[order(data[, 2], decreasing=T),]  # 以第二列倒序
data[order(data[, 1], data[, 2], decreasing=F),]  # 以第一列正序、第二列正序
# Drop every column whose sum is 0.
# `zero` holds the positions of the all-zero columns (empty when there are none).
# Assumes every column is numeric, as the per-column sum() did.
zero <- unname(which(colSums(data) == 0))
# Keep the complement instead of using negative indices: with no all-zero
# column, `data[, -c(zero)]` indexes with an empty vector and silently drops
# ALL columns. drop = FALSE keeps a data frame even if one column is left.
data <- data[, setdiff(seq_len(ncol(data)), zero), drop = FALSE]

字符串处理

library("stringr")

substr("abcd", 1, 2)  # 返回“ab”，取第1位到第2位（后两个参数是起止位置，不是长度）
unlist(strsplit(data, "/"))  # 特殊符切割，返回数组
# grep匹配，或，返回行数
# Drop columns by name with a logical mask. The earlier form was missing a
# closing parenthesis, and `-which(...)` selects nothing at all when no name
# matches (negating an empty index gives an empty index).
df[, !(names(df) %in% c("z", "u")), drop = FALSE]
df[, !(colnames(df) %in% c("z", "u")), drop = FALSE]  # same, using colnames()
# Drop rows by row name and keep only the columns whose names are in data[, 1].
df[!(rownames(df) %in% c("D45", "H49", "G33")),
   colnames(df) %in% data[, 1], drop = FALSE]
df[, -c(1, 2, 3, 4)]  # drop columns by position
# Remove the rows whose first column matches an "unknown" keyword.
# grepl() returns one TRUE/FALSE per element.
unknown <- grepl("Unspecified|unclassified|metagenome", data[, 1])
# grep() returns only the positions of the matching elements.
unknown <- grep("Unspecified|unclassified|metagenome", data[, 1])
# Guard the removal: with no match `unknown` is empty, and `data[-unknown, ]`
# would then return zero rows instead of leaving `data` untouched.
if (length(unknown) > 0) {
  data <- data[-unknown, ]
}

data[, colnames(data)%in%mark[,1]]  # data中的列名出现在mark，取出
length(grep(colnames(data)[50], mark[,1], fixed=T))  # 返回1 -> 匹配成功，返回0 -> 匹配失败
stringr::str_split_fixed(data$col, "-", 2)  # split on a separator into a fixed number of columns (here 2)
gsub("-", "_", colnames(data))  # 把向量中所有的，中划线->下划线

strsplit(string, "_|;|/")  # 随便切，用|隔开
paste("a", "b", sep="_")  # 普通粘贴
paste(c("a", "b"), collapse="_")  # 高级粘贴
paste(unlist(strsplit(rownames(data)[i], "__|;"))[c(10, 12)], collapse="_")  # 提取细菌分类信息
str_extract("aaa29", "[0-9]+")  # 得到“29”
unlist(strsplit("aaa29", "[0-9]+"))  # 得到“aaa”

ggplot图形

geom_boxplot(width=0.5, outlier.colour = NA, lwd = 1)
# 去除离群点，线条粗细
geom_bar(stat = "identity", position="stack", color = 'black', width=0.5, size=1)  # 柱形图，使用提供的数值，外色，宽度，粗细
geom_col(position=position_dodge(0.75), width=0.5)
# position还可以为以下几种取值：
# fill 堆叠元素，并标准化为1；
# dodge避免重叠；position_dodge设置两柱子距离
# identity不做任何调整；
# jitter给点添加扰动避免重合；
# stack将图形元素堆叠起来。
geom_line(aes(group=id), color="gray" ,position = position_dodge(0.2)) +
geom_point(aes(fill=group, group=id), position = position_dodge(0.2)) +
# 散点结合连线，箱图中
# position_dodge, position_nudge控制同一坐标上图的位置
geom_point(pch = 21, size = 4, color = "black")  # 散点图，fill内色，color外色
coord_flip()  # 图形翻转
geom_boxplot(lwd=1)  # 箱图，箱体边框粗细
# Change point shapes and colors
ggplot(df, aes(x=wt, y=mpg, group=cyl)) +
  geom_point(aes(shape=cyl, color=cyl))  # 根据分组改变形状
geom_line(group=1/"")  # 解决不出线的问题
geom_hline(aes(yintercept=12), colour="#990000", linetype="dashed")  ## 添加水平直线

因子

df$var = factor(df$var, levels=c("a", "b", "c"))  # 将某一个变量设为因子
scale_x_discrete(limits=factor(data_sort[,1]))  # ggplot参数：X轴设为因子，不排序

ggplot：坐标轴

labs(x = "CAZyme level 2", y = "Count", fill = "CAZyme", title = "HELLO")  # 标签
df$var = factor(df$var, levels=c("a", "b", "c"))  # 将某一个变量设为因子
scale_x_discrete(limits=factor(data_sort[,1]))  # X轴设为因子，不排序
scale_y_discrete(limits = factor(input$name))  # Y轴设为因子，不排序
scale_y_continuous(expand = c(0, 0))  # 去除X轴与图形间的空隙
scale_x_continuous(expand = c(0,0))  # 去掉与Y轴间隙
scale_y_continuous(limits=c(0, 115), 
    # 定义y轴范围
    expand = c(0, 0), 
    # 定义y轴外展范围，下方，上方
    breaks = c(0, 20, 40, 60, 80, 100))
    # 定义刻度
xlim(0, 50)  # X轴范围
ylim(0, 50)  # Y轴范围
scale_y_continuous(labels = scales :: percent)  # 百分比标签
scale_x_continuous(labels = scales :: percent)  # 百分比标签
scale_y_continuous(breaks = as.numeric(as.character/format(seq(0.4, 1, by=0.1))),
                   labels = scales::percent)  # 去除百分号后的.0
geom_text(aes(label = count, y = count/2), size = 5)  # 柱形图加数字
theme(axis.text.x = element_text(angle = 90, 
                                hjust = 1, vjust = 1, # 0表示左对齐,1表示右对齐
                                size = 15, 
                                face = "bold"))  # X轴文本
theme(title = element_text(size = 12))  # 标题
theme(axis.text.y = element_text(size = 12, face = "bold"))  # Y轴文本
theme(axis.title.y = element_text(size = 28, face = "bold"))  # Y轴标题
theme(axis.line = element_line(size = 1)) #  坐标线粗细
theme(axis.line.x = element_line(size = 1)) #  坐标线粗细
theme(axis.line.y = element_line(size = 1)) #  坐标线粗细
theme(axis.ticks = element_line(size = 1))  # 坐标刻度粗细
theme(axis.ticks.x = element_line(size = 1))  # 坐标刻度粗细
theme(axis.ticks.y = element_line(size = 1))  # 坐标刻度粗细
theme(panel.grid=element_blank(), 
  panel.background=element_rect(color='black', fill='transparent'))  # 去掉方格，清空背景，设置边框

ggplot: 主题和图例

labs(x="", y="Gene number", color="Group", size="Genome")  # 标题
theme(legend.position='none')  # 不加legend
theme(legend.title = element_blank(), legend.position = "bottom/top/right/none")  
# legend标题展示、位置
theme(legend.position = c(0.7, 0.1))  # legend position 位置
scale_fill_manual(  # 填充
scale_color_manual(  # 外围或点 
    #values = Palette, 
    values = c("breast_share" = "#ADD8E6", "breast_only" = "#90EE90"), 
    labels = c("Shared with Gut", "Breast Only"))  # legend颜色、标签
colours=c("#ADD8E6", "#90EE90")
names(colours) <- c("breast_share", "breast_only")  # names函数
theme(legend.title=element_text(face="bold"), 
  legend.position="bottom", legend.box="horizontal", 
  legend.text=element_text(face="italic", size=rel(0.5)),
  legend.key=element_rect(fill="transparent"))
# 图例位置、图例标题加粗、图例排版、文字斜体、大小，legend色块背景色
theme(legend.key.size=unit(2,'cm'),
  legend.key.width=unit(5,'cm'),
  legend.key=element_rect(fill="transparent"))
# legend色块尺寸，背景色
guides(color/fill= guide_legend(order=1), shape = guide_legend(order=2), size=FALSE)  # 调整legend位置，有无
guides(fill/color=guide_legend(ncol=1))  # legend列数
df$group <- factor(df$group, levels = unique(df$group))  # legend order = order of first appearance; unique() because factor() rejects duplicated levels
theme(legend.background = element_rect(color = "black", linetype = "solid", size = 1))  # legend加黑框

ggplot: 文本

geom_hline(yintercept = 0, size=0.1, linetype="dashed")  # 加横直线
geom_vline(xintercept = 0, size=0.1, linetype="dashed")  # 加竖直线
geom_text(aes(label = count, y = count/2), size = 7)  # 柱形图添加数字
ggsave(result, filename=args[2], height = 14, width = 7)  # 保存
geom_text(aes(label = Weight), vjust = 1.5, colour = "white", position = position_dodge(.9), size = 5)
# 标签函数：label设置展示标签，vjust设置标签偏移(正上负下)，position设置各标签的间距
ggsave(p, filename = "p.pdf",width = 12,height = 9)  # 保存
pdf("name", height=num, width=num)  # open a PDF device (height and width in inches)
# ... plotting calls go here ...
dev.off()  # close the device so the file is written to disk
# Family names of the default Windows font mappings, in windowsFonts() order.
family="serif"  # windowsFonts()[1]
family="sans"  # windowsFonts()[2]
family="mono"  # windowsFonts()[3]
theme(text=element_text(family="serif"))  # 主题中修改字体
pdf("test.pdf", width=25, height=21)
par(family = "serif")
plot()
dev.off()

dcast是压扁
melt是拉长

library(reshape2)
data=read.table("Correlation_result.txt", sep="\t", header=T)
# microbe metabolite r_value p_value
# microbe metabolite 两两配对，平方的那种
# 转换成以microbe metabolite为横纵，value为填充的矩阵
data_r=dcast(data, microbe ~ metabolite, value.var="r_value")
data_p=dcast(data, microbe ~ metabolite, value.var="p_value")

参考：
ggplot2画图：legend整理
 legend参考
 COLORBREWER 2.0
ggplot2 title : main, axis and legend titles
Combine Two ggplot2 Plots from Different Data Frames in R (Example)
How To Make Grouped Boxplots with ggplot2?
R ggplot2 修改默认颜色
 R|ggplot2(三)|coord 系列函数坐标轴转换
😊2020.1.15更新😊

R学习笔记

R学习笔记

导读

推荐阅读更多精彩内容