m6A相关的lncRNA怎么进行计算?下面我们就来探究下
1.表达矩阵下载
UCSC Xena是University of California开发的针对多个数据库的综合网站,上面有针对TCGA数据库整理好的RNA-seq表达矩阵。
常用的数据是FPKM,可以转换为TPM。
注意:
1.FPKM是log2(FPKM+1)
2.基因注释版本是gencode.v22.annotation
3.表达矩阵是ENSG格式
2.区分mRNA和lncRNA
打开GENCODE中gtf文件的biotype的说明,可以发现lncRNA包含9种类型。
对lncRNA的说明:
Generic long non-coding RNA biotype that replaced the following biotypes: 3prime_overlapping_ncRNA, antisense, bidirectional_promoter_lncRNA, lincRNA, macro_lncRNA, non_coding, processed_transcript, sense_intronic and sense_overlapping.
# 读取并探索gtf文件
gtf <- rtracklayer::import("D:/jianshu/R-TCGA-m6AlncRNA/gencode.v22.annotation.gtf")
gtf <- as.data.frame(gtf)
head(gtf,6)
## seqnames start end width strand source type score phase
## 1 chr1 11869 14409 2541 + HAVANA gene NA NA
## 2 chr1 11869 14409 2541 + HAVANA transcript NA NA
## 3 chr1 11869 12227 359 + HAVANA exon NA NA
## 4 chr1 12613 12721 109 + HAVANA exon NA NA
## 5 chr1 13221 14409 1189 + HAVANA exon NA NA
## 6 chr1 12010 13670 1661 + HAVANA transcript NA NA
## gene_id gene_type gene_status gene_name
## 1 ENSG00000223972.5 transcribed_unprocessed_pseudogene KNOWN DDX11L1
## 2 ENSG00000223972.5 transcribed_unprocessed_pseudogene KNOWN DDX11L1
## 3 ENSG00000223972.5 transcribed_unprocessed_pseudogene KNOWN DDX11L1
## 4 ENSG00000223972.5 transcribed_unprocessed_pseudogene KNOWN DDX11L1
## 5 ENSG00000223972.5 transcribed_unprocessed_pseudogene KNOWN DDX11L1
## 6 ENSG00000223972.5 transcribed_unprocessed_pseudogene KNOWN DDX11L1
## level havana_gene transcript_id
## 1 2 OTTHUMG00000000961.2 <NA>
## 2 2 OTTHUMG00000000961.2 ENST00000456328.2
## 3 2 OTTHUMG00000000961.2 ENST00000456328.2
## 4 2 OTTHUMG00000000961.2 ENST00000456328.2
## 5 2 OTTHUMG00000000961.2 ENST00000456328.2
## 6 2 OTTHUMG00000000961.2 ENST00000450305.2
## transcript_type transcript_status transcript_name tag
## 1 <NA> <NA> <NA> <NA>
## 2 processed_transcript KNOWN DDX11L1-002 basic
## 3 processed_transcript KNOWN DDX11L1-002 basic
## 4 processed_transcript KNOWN DDX11L1-002 basic
## 5 processed_transcript KNOWN DDX11L1-002 basic
## 6 transcribed_unprocessed_pseudogene KNOWN DDX11L1-001 basic
## transcript_support_level havana_transcript exon_number exon_id
## 1 <NA> <NA> <NA> <NA>
## 2 1 OTTHUMT00000362751.1 <NA> <NA>
## 3 1 OTTHUMT00000362751.1 1 ENSE00002234944.1
## 4 1 OTTHUMT00000362751.1 2 ENSE00003582793.1
## 5 1 OTTHUMT00000362751.1 3 ENSE00002312635.1
## 6 NA OTTHUMT00000002844.2 <NA> <NA>
## ont protein_id ccdsid
## 1 <NA> <NA> <NA>
## 2 <NA> <NA> <NA>
## 3 <NA> <NA> <NA>
## 4 <NA> <NA> <NA>
## 5 <NA> <NA> <NA>
## 6 PGO:0000019 <NA> <NA>
需要的列有:
gene_id:与TCGA数据ENSG格式是一致的
gene_type:区分lncRNA
gene_name:即gene symbol
type:区分gene和其他类型,gene只有60483个
table(gtf$type)
##
## gene transcript exon CDS start_codon
## 60483 198442 1172082 699443 82228
## stop_codon UTR Selenocysteine
## 74337 276542 114
if(!file.exists("D:/jianshu/R-TCGA-m6AlncRNA/gtf_gene.Rdata")){
library(tidyverse)
gtf_gene <- gtf[gtf$type=="gene",] %>% #提取type为gene的行
select(gene_id,gene_name,gene_type)
gtf_mRNA <- gtf_gene[gtf_gene$gene_type=="protein_coding",] #提取mRNA
lnc = c("3prime_overlapping_ncRNA", "antisense", "bidirectional_promoter_lncRNA", "lincRNA", "macro_lncRNA", "non_coding", "processed_transcript", "sense_intronic" , "sense_overlapping")
gtf_lncRNA <- gtf_gene[gtf_gene$gene_type %in% lnc,]
save(gtf_mRNA,gtf_lncRNA,file = "D:/jianshu/R-TCGA-m6AlncRNA/gtf_gene.Rdata")
}
3. m6A和lncRNA求相关性
分别提取m6A基因表达矩阵、lncRNA表达矩阵
删除正常样品
相关性检验: ** a.检测lncRNA sd是否大于0.1; ** b.检验lncRNA 在正常与肿瘤组间是否有差异; ** b.判断p.value是否小于0.05; ** c.判断相关系数(cor)是否大于阈值
rm(list = ls())
library(limma)
corFilter=0.4 #设置相关系数阈值
pvalueFilter=0.001 #设置p值过滤标准
# lncRNA数据处理
rt <- data.table::fread("D:/jianshu/R-TCGA-m6AlncRNA/lncRNA.txt",data.table = F)
rt <- as.matrix(rt) #变为matrix
rownames(rt) <- rt[,1]
exp <- rt[,2:ncol(rt)]
dimnames <- list(rownames(exp),colnames(exp))
data <- matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames = dimnames)
data <- avereps(data)
data <- data[rowMeans(data)>0.1,] #去掉数值比较小的lncRNA
library(tidyverse)
lncRNA <- data[,str_sub(colnames(data),14,15)<10]
# m6A数据处理
rt1 <- data.table::fread("D:/jianshu/R-TCGA-m6AlncRNA/m6aGeneExp.txt",data.table = F)
rt1 <- as.matrix(rt1) #变为matrix
rownames(rt1) <- rt1[,1]
exp1 <- rt1[,2:ncol(rt1)]
dimnames1 <- list(rownames(exp1),colnames(exp1))
m6A <- matrix(as.numeric(as.matrix(exp1)),nrow=nrow(exp1),dimnames = dimnames1)
m6A <- avereps(m6A)
m6A <- m6A[rowMeans(m6A)>0.1,] #去掉数值比较小的lncRNA
m6A <- m6A[,str_sub(colnames(m6A),14,15)<10]
# m6A和lncRNA相关性检验
sampleType <- c(rep(1,ncol(data[,str_sub(colnames(data),14,15)>10])),
rep(2,ncol(data[,str_sub(colnames(data),14,15)<10])))
outTab=data.frame()
for(i in row.names(lncRNA)){
if(sd(lncRNA[i,])>0.1){
test=wilcox.test(data[i,] ~ sampleType)
if(test$p.value<0.05){
for(j in row.names(m6A)){
x=as.numeric(lncRNA[i,])
y=as.numeric(m6A[j,])
corT=cor.test(x,y)
cor=corT$estimate
pvalue=corT$p.value
if((cor>corFilter) & (pvalue<pvalueFilter)){
outTab=rbind(outTab,cbind(m6A=j,lncRNA=i,cor,pvalue,Regulation="postive"))
}
if((cor< -corFilter) & (pvalue<pvalueFilter)){
outTab=rbind(outTab,cbind(m6A=j,lncRNA=i,cor,pvalue,Regulation="negative"))
}
}
}
}
}
write.table(outTab,file = "D:/jianshu/R-TCGA-m6AlncRNA/m6AlncRNA-network.txt",sep = "\t",quote = F,col.names = F)
参考文献:
从TCGA表达矩阵中分别提取mRNA和lncRNA→生信星球(公众号)