https://cloud.tencent.com/developer/article/1906294
####################定义基因长度为 非冗余exon长度之和
library("TxDb.Hsapiens.UCSC.hg38.knownGene")
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
if(!file.exists('gene_length.Rdata')){
exon_txdb=exons(txdb)
genes_txdb=genes(txdb)
o = findOverlaps(exon_txdb,genes_txdb)
o
t1=exon_txdb[queryHits(o)]
t2=genes_txdb[subjectHits(o)]
t1=as.data.frame(t1)
t1$geneid=mcols(t2)[,1]
g_l = lapply(split(t1,t1$geneid),function(x){
# x=split(t1,t1$geneid)[[1]]
head(x)
tmp=apply(x,1,function(y){
y[2]:y[3]
})
length(unique(unlist(tmp)))
})
head(g_l)
g_l=data.frame(gene_id=names(g_l),length=as.numeric(g_l))
save(g_l,file = 'gene_length.Rdata')
}
#检查表达量矩阵额基因长度信息
load(file = 'gene_length.Rdata')
head(g_l)
library(org.Hs.eg.db)
g2s=toTable(org.Hs.egSYMBOL)
genelength <- merge(g_l,g2s,by='gene_id')
head(genelength)
genelength=genelength[,c(3,2)]
colnames(genelength) <- c("gene","length")
#排序
counts <- exp[rownames(exp) %in% genelength$gene,]
#转换成矩阵
count<- as.matrix(counts)
#存贮基因长度信息
genelength[match(rownames(counts),
genelength$gene),"length"]
# eff_length<- genelength$length
eff_length <- genelength[match(rownames(counts),genelength$gene),"length"]/1000
class(count[1,1])
#count转为数值
data=apply(count,2,as.numeric)
row.names(data)<-row.names(count)
count<-data
x <- count/eff_length
mat_tpm <- t( t(x) / colSums(x) ) * 1e6
rownames(mat_tpm)<- rownames(count)
mat_tpm[1:4,1:4]
#mat_tpm<-mat_tpm[-1,]
write.table(mat_tpm, file="paca.tidy.tpm.txt", sep="\t", col.names=T, quote=F,row.names = T)