rm(list=ls())
my_data <- read.csv("expr_counts.csv")
library('tidyr')
library('rtracklayer')
my_data<- tidyr :: separate(my_data, gene_id,into = c('gene_id' , 'junk'), sep='\\.')
my_data<- my_data[,-2]
#去掉行名中的小数点后面数字。
download.file('ftp://ftp.ensembl.org/pub/release-99/gtf/homo_sapiens.GRCh38.99.chr.gtf.gz','Homo_sapiens.GRCh38.99.chr.gtf.gz')
#后续语句不work,因为通过这种方式下载的homo_sapiens文件是损坏的文件,不能解压缩。所以需要在网站下载最新版Homo_sapiens.GRCh38.103.chr.gtf.gz,
进入网站:https://useast.ensembl.org/info/data/index.html
然后将该文件copy到文件夹中。之后再用下面的语句解压缩:
R.utils::gunzip('Homo_sapiens.GRCh38.104.chr.gtf.gz')
gtf1<- rtracklayer::import('Homo_sapiens.GRCh38.104.chr.gtf')
gtf_df<- as.data.frame(gtf1)
mRNA_exprSet<- gtf_df %>%
dplyr::filter(type=="gene",gene_biotype=="protein_coding")%>%
dplyr::select(c(gene_name,gene_id,gene_biotype))%>%
dplyr::inner_join(my_data, by ="gene_id")
# onlyselect the protein coding genes.
mRNA_exprSet<-mRNA_exprSet[!duplicated(mRNA_exprSet$gene_name),]
总结,整个语句如下:
rm(list=ls())
my_data <- read.csv("exp-1.csv")
library('tidyr')
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("rtracklayer")
library('rtracklayer')
R.utils::gunzip('Homo_sapiens.GRCh38.107.chr.gtf.gz')
gtf1<- rtracklayer::import('Homo_sapiens.GRCh38.107.chr.gtf')
gtf_df<- as.data.frame(gtf1)
mRNA_exprSet<- gtf_df %>%
dplyr::filter(type=="gene",gene_biotype=="protein_coding")%>%
dplyr::select(c(gene_name,gene_id,gene_biotype)) %>%
dplyr::inner_join(my_data, by = "gene_id")
# only select the protein coding genes.
mRNA_exprSet<- mRNA_exprSet[!duplicated(mRNA_exprSet$gene_name),]
write.csv(mRNA_exprSet, 'mRNA_exprSet.csv', quote = F, row.names = T)