题目
链接:http://www.bio-info-trainee.com/4387.html
基础绘图
airway数据集可视化,首先载入表达矩阵。
rm(list = ls())
options(stringsAsFactors = F)
suppressMessages(library(airway))
data(airway)
# 这里需要自行学习bioconductor里面的RangedSummarizedExperiment对象
airway
RNAseq_expr=assay(airway)
colnames(RNAseq_expr)
RNAseq_expr[1:4,1:4]
# RNAseq_expr 是一个数值型矩阵,属于连续性变量,可以探索众数、分位数和平均数 ,极差,方差和标准差等统计学指标
RNAseq_gl=colData(airway)[,3]
table(RNAseq_gl)
箱型图
# 对矩阵进行过滤,过滤那些每一列都为0的行
e1 = RNAseq_expr[apply(RNAseq_expr,1,function(x) sum(x>0)>1),]
boxplot(e1,main = 'Boxplot of RNAseq-expr',
xlab = 'samples',ylab = 'expression')
sample <- colnames(e1)
# 对矩阵加1取log2
e2 = log2(e1+1)
boxplot(e2,main = 'Boxplot of RNAseq-expr',
xlab = 'samples',ylab = 'expression',col=RNAseq_gl)
密度图
# 对RNAseq_expr的每一列绘制density图且分组
opar <- par(no.readonly=T)
par(mfrow=c(3,3))
for (i in c(1:8)) {
plot(density(e2[,i]),col=as.integer(RNAseq_gl)[i],main = "Density")
}
par(opar)
条形图
# 太丑了...
col = c("lightblue","lightgreen")
barplot(e2, main = 'Barplot of RNAseq-expr',
xlab = 'samples',ylab = 'expression',col = RNAseq_gl)
散点图
# 对RNAseq_expr的前两列画散点图并且计算线性回归方程
pairs(RNAseq_expr[,1:2])
# 在R中,拟合线性模型最基本的函数就是lm()
Q6 = as.data.frame(e1)
fit <- lm(Q6[,1] ~ Q6[,2],data = Q6)
summary(fit)
# x = -12.41 + 1.14*y
plot(Q6[,1],Q6[,2],xlab="SRR1039508",ylab="SRR1039509")
abline(fit)
热图
# 对RNAseq_expr的所有列两两之间计算相关系数,并且热图可视化
Q7 = cor(e1)
heatmap(Q7,symm = TRUE )
折线图
# 第一行
plot(e1[1,],type="l",xlab = "gene",ylab="expression",col="red")
Q8_1 = data.frame(expression = e1[1,])
Q8_1$sample <- rownames(Q8_1)
# 表达量最高的10个基因的行
gene <- rownames(as.data.frame(sort(rowSums(RNAseq_expr),decreasing=T)[1:10]))
Q8_2 <- e1[gene,]
plot(Q8_2[1,],type="b",xlab = "gene",ylab="expression",pch = 1)
for (i in c(2:10)){
lines(Q8_2[i,],type="b",xlab = "gene",ylab="expression",pch = i)
}
ggplot绘图
学习链接:
http://biotrainee.com/jmzeng/markdown/ggplot-in-R.html
https://github.com/jmzeng1314/5years/blob/master/learn-R/tasks/top50ggplot.Rmd
# 准备数据
# 数据变成长格式
suppressMessages(library(reshape2))
box_e <- melt(e2)
colnames(box_e) <- c("gene","sample","expression")
tmp=data.frame(group_list=RNAseq_gl)
rownames(tmp)=colnames(RNAseq_expr)
tmp$sample <- rownames(tmp)
d = merge(box_e,tmp,by="sample")
# 分组
group = as.data.frame(colData(airway)[,c(3,5)])
group
## dex Run
## SRR1039508 untrt SRR1039508
## SRR1039509 trt SRR1039509
## SRR1039512 untrt SRR1039512
## SRR1039513 trt SRR1039513
## SRR1039516 untrt SRR1039516
## SRR1039517 trt SRR1039517
## SRR1039520 untrt SRR1039520
## SRR1039521 trt SRR1039521
suppressMessages(library(ggplot2))
# 用来拼图的包
suppressMessages(library("Rmisc"))
suppressMessages(library("plyr"))
# 箱型图
A <- ggplot(data = d,aes(y=expression,x=sample,fill=group_list)) + geom_boxplot() +
theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1,size=6 ))
# 密度图
B1 <- ggplot(box_e,aes(x=expression,colour=sample)) + geom_density() +
theme(legend.text = element_text(size = 8, face = "bold"),legend.key.size=unit(0.3,'cm'))
B2 <- ggplot(data= d,aes(expression,col=group_list)) + geom_density() +
theme(legend.text = element_text(size = 8, face = "bold"),legend.key.size=unit(0.3,'cm'))
# 条形图
C <- ggplot(data = d,aes(x=sample,y=expression,fill= group_list)) + geom_bar(stat="identity") +
theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1,size=6 ))
# factor() 函数将连续型变量转化为离散型变量
# 散点图
D <- ggplot(as.data.frame(RNAseq_expr[,1:2]),aes(x=SRR1039508,y=SRR1039509)) +
geom_point() +
geom_smooth(method = "lm")
# 热图
melt_Q7 <- melt(Q7)
E <- ggplot(data = melt_Q7, aes(x=Var1, y=Var2, fill=value)) + geom_tile()+
theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1,size= 4),
axis.text.y = element_text(size= 4)) + labs(x="",y="")
# 折线图
F1 <- ggplot(Q8_1,aes(x=sample, y=expression, group =1)) + geom_line() + geom_point() +
theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1,size=8 ))
Q8_2_m = melt(Q8_2)
colnames(Q8_2_m) = c("gene","sample","expression")
F2 <- ggplot(Q8_2_m,aes(x=sample,y=expression,colour=gene,group=gene)) + geom_line() + geom_point() +
theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1,size=8 ),
legend.text = element_text(size = 8, face = "bold"),legend.key.size=unit(0.3,'cm'))
multiplot(A,B1,B2, layout = matrix(c(1,1,2,3), nrow=2, byrow=TRUE))
multiplot(C,D,E, layout = matrix(c(1,1,2,3), nrow=2, byrow=TRUE))
multiplot(F1,F2)
生物信息绘图
对RNAseq_expr挑选MAD值最大的100个基因的表达矩阵绘制热图,对RNAseq_expr进行主成分分析并且绘图、进行差异分析并且绘制火山图、(平均值VS变化倍数)图。
top50_mad = rownames(as.data.frame(tail(sort(apply(RNAseq_expr,1,mad)),50)))
top50_matrix = RNAseq_expr[top50_mad,]
top50_matrix2 = t(scale(t(top50_matrix))) # 做个标准化
pheatmap::pheatmap(top50_matrix2,filename = 'cor2.png')
e_mad <- RNAseq_expr[names(sort(apply(RNAseq_expr,1,mad),decreasing=T)[1:50]),]
M = cor(log2(e_mad +1))
tmp=data.frame(group_list=RNAseq_gl)
rownames(tmp)=colnames(M)
pheatmap = pheatmap::pheatmap(M,annotation_col = tmp)
# pheatmap::pheatmap(M,annotation_col = tmp,filename = 'cor.png')
# 主成分分析
library(ggfortify)
df = as.data.frame(t(RNAseq_expr))
df$group=RNAseq_gl
#png('pca.png',res=120)
pca = autoplot(prcomp( df[,1:(ncol(df)-1)] ), data=df,colour = 'group_list')+theme_bw()
#dev.off()
# 差异分析——火山图/差异倍数的
suppressMessages(library(DESeq2))
dds <- DESeqDataSetFromMatrix(countData = RNAseq_expr,
colData = tmp,
design = ~ group_list)
# 这里还可以过滤掉一些基因
dds <- DESeq(dds)
res <- results(dds,
contrast=c("group_list","trt","untrt"))
resOrdered <- res[order(res$padj),]
head(resOrdered)
## log2 fold change (MLE): group_list trt vs untrt
## Wald test p-value: group_list trt vs untrt
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE
## <numeric> <numeric> <numeric>
## ENSG00000152583 997.439773207048 4.60252567692759 0.211770756715996
## ENSG00000148175 11193.718773887 1.45146587997406 0.0848248918317296
## ENSG00000179094 776.596665468835 3.18385728117598 0.201515441016454
## ENSG00000134686 2737.98198594824 1.38714097803914 0.0915842458278469
## ENSG00000125148 3656.252782247 2.20343864531793 0.147408663443996
## ENSG00000120129 3409.02937530956 2.94898340153752 0.201613621263967
## stat pvalue
## <numeric> <numeric>
## ENSG00000152583 21.733528029557 9.89036416171812e-105
## ENSG00000148175 17.1113201400054 1.22198224764203e-65
## ENSG00000179094 15.7995698251034 3.1324679455702e-56
## ENSG00000134686 15.1460654122389 8.04403913177459e-52
## ENSG00000125148 14.9478232407627 1.60923879656226e-50
## ENSG00000120129 14.6269055783513 1.89198465690132e-48
## padj
## <numeric>
## ENSG00000152583 1.83911321587149e-100
## ENSG00000148175 1.13613799474518e-61
## ENSG00000179094 1.94160804826259e-52
## ENSG00000134686 3.73947269138371e-48
## ENSG00000125148 5.98475908441505e-47
## ENSG00000120129 5.86357578251335e-45
DEG =as.data.frame(resOrdered) # 数据框
DEG = na.omit(DEG) # 返回删除NA后的向量a
nrDEG=DEG
DEseq_DEG=nrDEG
nrDEG=DEseq_DEG[,c(2,6)]
colnames(nrDEG)=c('log2FoldChange','pvalue')
# 丑一点的火山图
# png('volcano.png',res=120)
# plot(nrDEG$log2FoldChange, -log10(nrDEG$pvalue))
# dev.off()
logFC_cutoff <- with(nrDEG,mean(abs( log2FoldChange)) + 2*sd(abs( log2FoldChange)) )
nrDEG$change = as.factor(ifelse(nrDEG$pvalue < 0.05 & abs(nrDEG$log2FoldChange) > logFC_cutoff,
ifelse(nrDEG$log2FoldChange > logFC_cutoff ,'UP','DOWN'),'NOT'))
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
'\nThe number of up gene is ',nrow(nrDEG[nrDEG$change =='UP',]) ,
'\nThe number of down gene is ',nrow(nrDEG[nrDEG$change =='DOWN',]))
volcano = ggplot(data=nrDEG,
aes(x=log2FoldChange, y=-log10(pvalue),
color=change)) +
geom_point(alpha=0.4, size=1.75) +
xlab("log2 fold change") + ylab("-log10 p-value") +
ggtitle(this_tile) + theme(plot.title = element_text(size=15,hjust = 0.5)) +
scale_colour_manual(values = c('blue','black','red'))
# ggsave(volcano,filename = paste0('DEseq2_volcano.png'))
# M表示log fold change,衡量基因表达量变化,上调还是下调。A表示每个基因的count的均值。
png("MA.png")
plotMA(res,ylim=c(-5,5))
topGene <- rownames(res)[which.min(res$padj)]
with(res[topGene, ], {
points(baseMean, log2FoldChange, col="dodgerblue", cex=2, lwd=2)
text(baseMean, log2FoldChange, topGene, pos=2, col="dodgerblue")
})
dev.off()
# lfcShrink 收缩log2 fold change
resLFC <- lfcShrink(dds,coef = 2,res=res)
# png("MA2.png")
MA2 <- plotMA(resLFC, ylim=c(-5,5))
# 为了标记出topGene
topGene <- rownames(resLFC)[which.min(res$padj)]
with(resLFC[topGene, ], {
points(baseMean, log2FoldChange, col="dodgerblue", cex=2, lwd=2)
text(baseMean, log2FoldChange, topGene, pos=2, col="dodgerblue")
})
idx <- identify(res$baseMean, res$log2FoldChange)
# dev.off()
绘制其中一个差异基因在两个分组的表达量boxplot并且添加统计学显著性指标。
suppressMessages(library(ggpubr))
choose_gene = rownames(nrDEG)[66]
choose_gene_d <- cbind(as.data.frame(RNAseq_expr[choose_gene,]),as.data.frame(tmp$group_list))
choose_gene_d$sample = rownames(choose_gene_d)
colnames(choose_gene_d) = c("e","group","sample")
opar<-par(no.readonly=T)
ggplot(data = choose_gene_d,aes(y=e,x=group)) + geom_boxplot() +
stat_compare_means(method = "t.test")
通过org.Hs.eg.db包拿到RNAseq_expr所有基因的染色体信息,绘制染色体的基因数量条形图,在上面染色体的基因数量条形图并列叠加差异基因数量条形图。
suppressMessages(library(org.Hs.eg.db))
CHR <- toTable(org.Hs.egCHR)
ensembl <- toTable(org.Hs.egENSEMBL)
ensembl_id <- data.frame(rownames(RNAseq_expr))
colnames(ensembl_id) <- c("ensembl_id")
eg <- merge(ensembl_id,ensembl,by="ensembl_id")
egc <- merge(eg,CHR,by="gene_id")
colnames(egc)
ggplot(data = egc,aes(x=chromosome)) + geom_bar(fill="lightblue")
# 在上面染色体的基因数量条形图并列叠加差异基因数量条形图
DEG_ensembl <- data.frame(rownames(nrDEG))
colnames(DEG_ensembl) <- c("ensembl_id")
DEG_eg <- merge(DEG_ensembl,ensembl,by="ensembl_id")
DEG_egc <- merge(DEG_eg,CHR,by="gene_id")
colnames(DEG_egc) <- c("DEG_gene_id","DEG_ensembl_id","chromosome")
str(DEG_egc)
# 叠加
egc$DEG <- as.factor(ifelse(egc$ensembl_id %in% DEG_egc$DEG_ensembl_id,'DEG','NOT'))
ggplot(egc,aes(x=chromosome,y=..count..,fill=DEG)) + geom_bar(stat ='count')
在oncolnc网页工具拿到CUL5基因在BRCA数据集的表达量及病人生存资料自行本地绘制生存分析图。
获取表格步骤
# 生存分析
suppressMessages(library(ggstatsplot))
CUL5 <- read.csv(file = 'BRCA_8065_50_50.csv',header = T)
ggbetweenstats(data = CUL5, x='Status', y='Expression')
suppressMessages(library(survival))
suppressMessages(library(survminer))
table(CUL5$Status)
CUL5$Status = ifelse(CUL5$Status=='Dead',1,0)
sfit <- survfit(Surv(Days,Status)~Group,data = CUL5)
ggsurvplot(sfit, conf.int = F, pval=T)
在xena网页工具拿到CUL5基因在BRCA数据集的表达量及病人的PAM50分类并且绘制分类的boxplot。
cul5 <- read.csv(file = 'denseDataOnlyDownload.tsv',sep="\t",header = T,na.strings="")
# 去除NA以后还有...读取的时候加上参数 na.strings=""
cul5<- cul5[complete.cases(cul5),]
library(RColorBrewer)
ggplot(data = cul5,aes(y=CUL5,x=PAM50_mRNA_nature2012,color=PAM50_mRNA_nature2012)) + geom_boxplot() +
theme_light() + stat_boxplot(geom = "errorbar",width=0.2)
画图还是蛮有趣的!!!
更多学习资源:
生信技能树公益视频合辑
生信技能树简书账号
生信工程师入门最佳指南
生信技能树全球公益巡讲
招学徒
...
你的宣传能让数以万计的初学者找到他们的家,技能树平台一定不会辜负每一个热爱学习和分享的同道中人 😎