【LGG数据下载】
从CGGA数据官网下载RNA-seq数据,选择其中包含325个样本的测序数据,下载表达矩阵和临床信息
表达矩阵:CGGA.mRNAseq_325.RSEM-genes.20200506.txt
临床信息:CGGA.mRNAseq_325_clinical.20200506.txt
lncRNA注释文件:gencode.v35.long_noncoding_RNAs.gtf
链接:http://www.cgga.org.cn/
读取到Rstudio里,并提取m6a相关基因的表达矩阵,以及所有lncRNA的表达矩阵
## 读取表达矩阵
expr<-read.table("CGGA.mRNAseq_325.RSEM-genes.20200506.txt",header = T,check.names = F)
# 读取需要提取的m6a基因文件
m6a_gene<-read.csv("m6a.csv",header = T,check.names = F)
## 根据基因名提取m6a基因的表达矩阵
m6a_expr<-expr[expr$Gene_Name%in%m6a_gene$`Gene name`,]
# 保存为csv文件
write.csv(m6a_expr,file = "m6a_expr.csv")
## 提取lncRNA的表达矩阵
library("rtracklayer") #载入rtracklayer包用于读取gtf文件
lnc_gtf<-rtracklayer::import("gencode.v35.long_noncoding_RNAs.gtf")
lnc_gtf<-as.data.frame(lnc_gtf)
## 从lnc_gft提取需要的信息
lnc_id<- lnc_gtf[,10:12]
unique(lnc_id$gene_type)
## 提取lncRNA对应的基因名
# 给注释文件按基因名去重
lnc_id<-lnc_id[!duplicated(lnc_id$gene_name),]
# 按lncRNA基因名提取表达矩阵
lnc_expr<-expr[expr$Gene_Name%in%lnc_id$gene_name,]
write.csv(lnc_expr,file = "lnc_expr.csv")
结果:
20个m6a相关基因的表达矩阵
1076个lncRNA基因的表达矩阵
【结合临床信息筛选需要的样本】
因为后面需要做单因素cox,lasso回归,列线图,需要有预后生存信息的样本,所以需要提取哪些对应的样本
## 读取临床信息
clinic<-read.table("CGGA.mRNAseq_325_clinical.20200506.txt",header = T,check.names = F,sep = "\t")
clinic_II<-clinic[clinic$Grade=="WHO II",c(1,7,8)]
clinic_III<-clinic[clinic$Grade=="WHO III",c(1,7,8)]
## 选择低级别胶质瘤
clinic<-rbind(clinic_II,clinic_III)
## LGG patients with missing OS values or OS < 30 days were excluded in order to reduce statistical bias in our analysis.
clinic<-clinic[!is.na(clinic$OS),]
clinic<-clinic[!is.na(clinic$`Censor (alive=0; dead=1)`),]
clinic<-clinic[!clinic$OS<30,]
## 提取匹配临床信息的样本表达矩阵
m6a_expr<-m6a_expr[,c(T,colnames(m6a_expr)[-1]%in%clinic$CGGA_ID)]
lnc_expr<-lnc_expr[,c(T,colnames(lnc_expr)[-1]%in%clinic$CGGA_ID)]
save(lnc_expr,m6a_expr,clinic,file = "lnc_m6a_clinic.Rdata")
结果:
171个带有临床信息LGG样本,并匹配了m6a和lncRNA
【pearson相关性分析】
筛选条件:pvalue<0.001&abs(correlation)>0.5
## 作pearson相关性分析
library(dplyr)
library(tidyr)
rownames(m6a_expr)<-m6a_expr$Gene_Name;m6a_expr<-m6a_expr[,-1]
rownames(lnc_expr)<-lnc_expr$Gene_Name;lnc_expr<-lnc_expr[,-1]
## for循环
#lnc_expr1<-lnc_expr[!apply(lnc_expr, 1,mean)==0,]
## for循环
# i<-1
cor_result<-list()
for (i in 1:ncol(m6a_expr)){
exp_cor<-rbind(m6a_expr[i,],lnc_expr)
## exp_cor已就绪
y <- as.numeric(exp_cor[1,])
colnames <- rownames(exp_cor)
cor_data_df <- data.frame(colnames)
for (m in 1:length(colnames)){
test <- cor.test(as.numeric(exp_cor[m,]),y,type="pearson")
cor_data_df[m,2] <- test$estimate
cor_data_df[m,3] <- test$p.value
}
names(cor_data_df) <- c("symbol","correlation","pvalue")
## pvalue<0.001&abs(correlation)>0.5
cor_data_sig <- cor_data_df %>%
filter(pvalue < 0.001) %>%
filter(abs(correlation)>0.5)
cor_result[[i]]<-cor_data_sig
print(i)
}
save(cor_result,file = "cor_result.Rdata")
cor_result[[3]]
## 将相关性分析后得到的基因名提取出来
cor_gene<-matrix(data = NA,ncol = 1)
for (i in 1:20) {
mt<-cor_result[[i]]$symbol[-1]
mt<-as.matrix(mt[!mt%in%cor_gene])
cor_gene<-rbind(cor_gene,mt)
print(i)
}
cor_gene<-cor_gene[-1,]
length(unique(cor_gene))
## 162个
cor_gene_num<-unique(cor_gene)
## 保存txt文件
write.table(cor_gene_num,file = "cor_gene_num.txt")
【单因素cox分析】
筛选条件:pvalue<0.05
## 单因素cox分析
## 数据准备:162个lncRNA的表达矩阵和之前匹配好的临床信息
lnc<-read.table("cor_gene_num.txt",header = T,sep = " ")
exp_lnc<-t(lnc_expr)
exp_lnc<-exp_lnc[,colnames(exp_lnc)%in%lnc$x]
## 提取477个有表达矩阵的样本的临床信息
ovtime01<-clinic[match(rownames(exp_lnc),clinic$CGGA_ID),]
colnames(ovtime01)<-c("Patient_ID","OStime","OSstatus")
## 为单因素cox分析做准备
unicox<-cbind(ovtime01,exp_lnc)
## 创建一个空白的数据框用于存储分析结果
univar_out <- data.frame(matrix(NA,ncol(unicox)-3,6))
univar_out[,1]<-colnames(unicox)[-(1:3)]
colnames(univar_out)<-c("Genesymbol","Coeffcient","HR","lower .95","upper .95","P-value")
## cox_data即为输入文件,关于为什么不用unicox,因为matrix把数据类型给弄乱了而且相关性分析需要数据服从方差齐正太分布特点
cox_data<-unicox[,c(4:ncol(unicox))]
class(cox_data)
cox_data<-as.matrix(cox_data)
cox_data<-apply(cox_data, c(1,2),as.numeric)
# cox_data<-log2(cox_data + 0.01)# 左偏转正太分布,好像不用转换
cox_data<-cbind(unicox[,1:3],cox_data)
cox_data$OSstatus<-as.numeric(cox_data$OSstatus)
library(survival)
for (i in 1:(ncol(cox_data)-3)){
cox = coxph(Surv(OStime,OSstatus) ~ cox_data[,i+3],data = cox_data)
cox_summ = summary(cox)
univar_out[i,2]=cox_summ$coefficients[,1]
univar_out[i,3]=cox_summ$coefficients[,2]
univar_out[i,4]=cox_summ$conf.int[,3]
univar_out[i,5]=cox_summ$conf.int[,4]
univar_out[i,6]=cox_summ$coefficients[,5]
}
save(univar_out,file = "univar_out.RData")
univar_out_0.05 = univar_out[univar_out[,6]<0.05,]
write.table(univar_out_0.05,file = "univar_out_0.05.txt",sep = "\t",quote = F)
结果:
最后得到68个与LGG预后相关的lncRNA
【给TCGA和CGGA单因素cox回归得到的基因取交集,绘制热图】
lnc_cor<-read.table("VEEN-lnc.txt",header = F)
class(cor_result[[2]])
## 创建两个表格存储相关系数和pvalue
cor<-matrix(NA,nrow = nrow(lnc_cor),ncol = 21) ## TCGA是21个m6a基因,CGGA是20个m6a基因
rownames(cor)<-lnc_cor$V1
pvalue<-matrix(NA,nrow = nrow(lnc_cor),ncol = 21)
rownames(pvalue)<-lnc_cor$V1
## 给这两个表格标列名
colnames<-do.call(cbind,lapply(cor_result, function(data){data[1,1]}))
colnames(cor)<-colnames[1,]
colnames(pvalue)<-colnames[1,]
library(tidyr)
## 提取相关系数
for (i in 1:ncol(cor)) {
cor[rownames(cor)%in%cor_result[[i]]$symbol,i]<-cor_result[[i]]$correlation[cor_result[[i]]$symbol%in%rownames(cor)]
}
cor<-replace_na(cor,0)
write.csv(cor,file = "cor_matrix.csv")
## 提取p值
for (i in 1:ncol(pvalue)) {
pvalue[rownames(pvalue)%in%cor_result[[i]]$symbol,i]<-cor_result[[i]]$pvalue[cor_result[[i]]$symbol%in%rownames(pvalue)]
}
pvalue<-replace_na(pvalue,1)
write.csv(pvalue,file = "pvalue_matrix.csv")
## 绘制热图
#install.packages("reshape2")
#install.packages("RColorBrewer")
library(reshape2)
library(RColorBrewer)
options(stringsAsFactors = F)
up <- pvalue
dn <- cor
dn=t(dn)
up=t(up)
#设置颜色
colVector=c("#AB221F","#FFFADD","#3878C1")
#设置标签
m6a.level <- as.character(rownames(dn))
lnc.level <- as.character(colnames(dn))
#把行转为列
dn.long <- setNames(melt(dn), c('Gene', 'lnc', 'Frequency'))
dn.long$Categrory <- "DN"
up.long <- setNames(melt(up), c('Gene', 'lnc', 'Frequency'))
up.long$Categrory <- "UP"
#右下角颜色
dn.long$range <- cut(dn.long$Frequency,
breaks = seq(floor(min(dn.long$Frequency)),
ceiling(max(dn.long$Frequency)),0.01))
rangeMat1 <- levels(dn.long$range)
rbPal1 <- colorRampPalette(colors = c(colVector[3],"white",colVector[1]))
col.vec1 <- rbPal1(length(rangeMat1)); names(col.vec1) <- rangeMat1
dn.long$color <- col.vec1[as.character(dn.long$range)]
#左上角颜色
up.long$range <- cut(up.long$Frequency, breaks = seq(floor(min(up.long$Frequency)),ceiling(max(up.long$Frequency)),0.01))
rangeMat2 <- levels(up.long$range)
rbPal2 <- colorRampPalette(colors = c(colVector[3],colVector[2]))
col.vec2 <- rbPal2(length(rangeMat2)); names(col.vec2) <- rangeMat2
up.long$color <- col.vec2[as.character(up.long$range)]
#合并右上角和左上角
heatmat <- rbind.data.frame(dn.long,up.long)
pdf("heatmap.pdf",width = 7,height = 6)
layout(mat=matrix(c(1,0,1,2,1,0,1,3,1,0),5,2,byrow=T),widths=c(length(lnc.level),2))
#绘制热图区域
par(bty="n", mgp = c(2,0.5,0), mar = c(5.0, 5.5, 3, 3),tcl=-.25,xpd = T)
x=as.numeric(factor(heatmat$lnc,levels = lnc.level))
y=as.numeric(factor(heatmat$Gene,levels = m6a.level))
#创建空白画布
plot(1,xlim=c(1,length(unique(x))+1),ylim=c(1,length(unique(y))+1),
xaxs="i", yaxs="i",xaxt="n",yaxt="n",
type="n",bty="n",xlab="",ylab="",
main = "Correlation between m6a genes and lncRNA",cex.main=2)
#填充颜色
for(i in 1:nrow(heatmat)) {
if(heatmat$Categrory[i]=="DN") polygon(x[i]+c(0,1,1),y[i]+c(0,0,1),col=heatmat$color[i])
if(heatmat$Categrory[i]=="UP") {
polygon(x[i]+c(0,1,0),y[i]+c(0,1,1),col=heatmat$color[i])
if(heatmat$Frequency[i]<0.001){
text(x[i]+0.5,y[i]+0.8,"***",cex=0.8)
}else if(heatmat$Frequency[i]<0.01){
text(x[i]+0.5,y[i]+0.8,"**",cex=0.8)
}else if(heatmat$Frequency[i]<0.05){
text(x[i]+0.5,y[i]+0.8,"*",cex=0.8)
}
}
}
#基因名和癌症
axis(1,at = sort(unique(x)) + 0.5,labels = lnc.level,lty = 0,las = 2)
axis(2,at = sort(unique(y)) + 0.5,labels = m6a.level,lty = 0,las = 1)
#mtext("lnc types",side = 1,line = 3.5,cex=1.2)
#绘制图例
par(mar=c(0,0,0,5),xpd = T,cex.axis=1.6)
barplot(rep(1,length(col.vec2)),border = NA, space = 0,ylab="",xlab="",ylim=c(1,length(col.vec2)),horiz=TRUE,
axes = F, col=col.vec2) # Loss
axis(4,at=c(1,ceiling(length(col.vec2)/2),length(col.vec2)),c(round(min(up),1),'Pvalue',round(max(up),1)),tick=FALSE)
par(mar=c(0,0,0,5),xpd = T,cex.axis=1.6)
barplot(rep(1,length(col.vec1)),border = NA, space = 0,ylab="",xlab="",ylim=c(1,length(col.vec1)),horiz=TRUE,
axes = F, col=col.vec1) # Gain
axis(4,at=c(1,ceiling(length(col.vec1)/2),length(col.vec1)),c(round(min(dn),1),'Cor',round(max(dn),1)),tick=FALSE)
dev.off()