先查看gtf注释文件中的gene_biotype个数
zcat GCF_001605985.2_ASM160598v2_genomic.gtf.gz |perl -alne '{next unless $F[2] eq "gene" ;/gene_biotype "(.*?)";/;print $1}' |sort |uniq -c
用R提取
#需要把压缩包提前放在目录下
install.packages("refGenome_1.7.7.tar.gz",repos=NULL, type="source")
#依赖的包
install.packages("DBI")
install.packages("doBy")
install.packages("make")
install.packages("RSQLite")
#依赖Rtools的安装
install.packages("installr")
install.packages("stringr") ###依赖包
library(stringr)
library(installr)
install.Rtools()
########
library(refGenome)
ens <- ensemblGenome()
read.gtf(ens,"GCF_001605985.2_ASM160598v2_genomic.gtf")###导入gtf文件 比较耗时
class(ens)
my_gene <- getGenePositions(ens)
colnames(my_gene)#看列名
gtf_df=as.data.frame(my_gene)#变成数据框
lncRNA <- gtf_df[gtf_df$gene_biotype=="lncRNA",]#提取lncRNA
protein_coding <- gtf_df[gtf_df$gene_biotype=="protein_coding",]
write.table(lncRNA,file="lncRNA.txt",sep="\t",quote=F,row.names = T)#保存为txt
write.table(protein_coding,file="protein_coding.txt",sep="\t",quote=F,row.names = T)