提起lncRNA
options(stringsAsFactors = F)
expr_df_nopoint=read.table("RNA_count_data.txt", sep = "\t",header = T)
names(expr_df_nopoint)[1]="gene_id"
首先我定义了一个非编码RNA的集合,这个每个人的标准不一样,但是我的原则是,多多益善,这样出来以后会有个问题,就是编码基因转录出非编码基因会无法从基因名称上区分,可以在运行时把geneid换成转录本id,必须要记在心里。
详见:http://www.bioinfo-scrounger.com/archives/323
lncRNA <- c("3prime_overlapping_ncRNA","antisense_RNA","bidirectional_promoter_lncRNA","lincRNA","macro_lncRNA","non_coding","processed_transcript","sense_intronic","sense_overlapping")
读入GTF数据
library(dplyr)
library(rtracklayer)
AnnoData <- rtracklayer::import('E:/GTF_R/Homo_sapiens.GRCh38.90.chr.gtf')
AnnoData1 =data.frame(AnnoData )
用 gene_biotype来筛选lncRNA
LncRNA_exprSet <- AnnoData1 %>%
filter(gene_biotype %in% lncRNA) %>% #注意这里是gene_biotype
select(c(gene_name,gene_id,gene_biotype)) %>%
distinct() %>% #删除多余行
inner_join(expr_df_nopoint,by ="gene_id")%>%
select(-gene_id,-gene_biotype)
LncRNA_exprSet2<-LncRNA_exprSet%>%
distinct(LncRNA_exprSetgene_name`)
LncRNA_exprSet2gene_name`
Metadata 分组文件
save(LncRNA_exprSet2,file = "LncRNA_exprSet2.Rda")
write.table(LncRNA_exprSet2, "LNC_RNA_count_data.txt", row.names = F, sep = "\t", quote = F)
metadata <- data.frame(names(LncRNA_exprSet2)[-1])
for (i in 1:length(metadata[,1]))
{
num <- as.numeric(substr(metadata[i,1],14,15))
if (num %in% seq(1,9)) {metadata[i,2] <- "T"}
if (num %in% seq(10,29)) {metadata[i,2] <- "N"}
}
加个名称
names(metadata) <- c("TCGA_id","sample")
将sample转化成factor
metadatasample)
table(metadatasample)
我们可总结一下,有44例normal,502例肿瘤组织
metadata %>% dplyr::group_by(sample) %>% summarise(n())
############# 说明所有的样本都是肿瘤样本
制作countData
Lnc_RNA_mycounts <- LncRNA_exprSet2
row.names(Lnc_RNA_mycounts)<-Lnc_RNA_mycounts$gene_name
Lnc_RNA_mycounts=Lnc_RNA_mycounts[-1]
str(Lnc_RNA_mycounts)
############## log2(count+1) 归一化 #############
tt=head(Lnc_RNA_mycounts)
my_normalize<-function(x){
x=log2(x+1)
}
tt2=apply(tt,2,my_normalize)
Lnc_RNA_mycounts_norm<-apply(Lnc_RNA_mycounts,2,my_normalize)
save(Lnc_RNA_mycounts_norm,file = "lncRNA_norm.Rda")