在R语言中如何将ENSEMBL ID转换成Gene ID或者Symbol

rm(list=ls())

my_data <- read.csv("expr_counts.csv")

library('tidyr')

library('rtracklayer')

my_data<- tidyr :: separate(my_data, gene_id,into = c('gene_id' , 'junk'), sep='\\.')

my_data<- my_data[,-2]

#去掉行名中的小数点后面数字。

~~download.file('ftp://ftp.ensembl.org/pub/release-99/gtf/homo_sapiens.GRCh38.99.chr.gtf.gz','Homo_sapiens.GRCh38.99.chr.gtf.gz')~~

#后续语句不work，因为通过这种方式下载的homo_sapiens文件是损坏的文件，不能解压缩。所以需要在网站下载最新版Homo_sapiens.GRCh38.103.chr.gtf.gz，

进入网站：https://useast.ensembl.org/info/data/index.html

然后将该文件copy到文件夹中。之后再用下面的语句解压缩：

R.utils::gunzip('Homo_sapiens.GRCh38.104.chr.gtf.gz')

gtf1<- rtracklayer::import('Homo_sapiens.GRCh38.104.chr.gtf')

gtf_df<- as.data.frame(gtf1)

mRNA_exprSet<- gtf_df %>%

dplyr::filter(type=="gene",gene_biotype=="protein_coding")%>%

dplyr::select(c(gene_name,gene_id,gene_biotype))%>%

dplyr::inner_join(my_data, by ="gene_id")

# onlyselect the protein coding genes.

mRNA_exprSet<-mRNA_exprSet[!duplicated(mRNA_exprSet$gene_name),]

总结，整个语句如下：

rm(list=ls())

my_data <- read.csv("exp-1.csv")

library('tidyr')

if (!require("BiocManager", quietly = TRUE))

install.packages("BiocManager")

BiocManager::install("rtracklayer")

library('rtracklayer')

R.utils::gunzip('Homo_sapiens.GRCh38.107.chr.gtf.gz')

gtf1<- rtracklayer::import('Homo_sapiens.GRCh38.107.chr.gtf')

gtf_df<- as.data.frame(gtf1)

mRNA_exprSet<- gtf_df %>%

dplyr::filter(type=="gene",gene_biotype=="protein_coding")%>%

dplyr::select(c(gene_name,gene_id,gene_biotype)) %>%

dplyr::inner_join(my_data, by = "gene_id")

# only select the protein coding genes.

mRNA_exprSet<- mRNA_exprSet[!duplicated(mRNA_exprSet$gene_name),]

write.csv(mRNA_exprSet, 'mRNA_exprSet.csv', quote = F, row.names = T)

在R语言中如何将ENSEMBL ID转换成Gene ID或者Symbol

推荐阅读更多精彩内容