参考网页:https://www.jianshu.com/p/a9f50f4fca25
想做烟草的org.nt.tn90.db,还在摸索当中。
看到有前辈以拟南芥为例做了org.At.tair.db,先拿来试试手。
从头到尾想顺下来还是需要一点时间的。中间还出现了一些莫名其妙的错误。
第一步下载原始数据:
https://www.arabidopsis.org/download/index-auto.jsp?dir=%2Fdownload_files%2FPublic_Data_Releases

image.png

image.png
需要用到TAIR Data 20210331目录下的4个文件:
(1) Araport11_functional_descriptions_20210331.txt
(2) ATH_GO_GOSLIM.txt
(3) gene_aliases_20210331.txt
(4) Locus_Published_20210331.txt
代码运行环境:win10 64bit + Rstudio (Version 1.2.1335) + R-4.0.5
第二步:从ATH_GO_GOSLIM.txt文件中提取需要的内容,赋给go_df
library(RSQLite)
library(AnnotationForge)
options(stringsAsFactors = F)
##先设置工作目录
setwd("D:/rr00/做自己的物种注释包/TAIR Data 20210331")
#从 ATH_GO_GOSLIM.txt 中提取信息
# 这里与参考不同的是,原文使用read.table,这里使用了read.csv。后面在处理其他文件时,如果用read.table会出问题:line 765 did not have 3 elements。所以这里统一使用read.csv。
go_df <-read.csv("ATH_GO_GOSLIM.txt",sep="\t",header=FALSE, as.is=TRUE)
go_df <-go_df[,c(1,6,8,10)]
head(go_df)
# V1 V6 V8 V10
#1 AT1G01010 GO:0006355 P ISS
#2 AT1G01010 GO:0006355 P IEA
#3 AT1G01010 GO:0005634 C ISM
#4 AT1G01010 GO:0006355 P ISS
#5 AT1G01010 GO:0003700 F ISS
#6 AT1G01010 GO:0006355 P ISS
#这里是为了将C替换为CC; P替换为BP; F替换为MF;
#用了三层嵌套语句:ifelse(go_df$V8=="C","CC",B),
# 意思是go_df的第三列字符如果等于C,那么就返回CC,不等于C,就进入B。
go_df$V8 <- ifelse(go_df$V8=="C","CC",ifelse(go_df$V8=="P","BP",ifelse(go_df$V8=="F","MF","")))
nrow(go_df)
#[1] 340482
## go_df$V10 %in% "IEA" 这里为取 go_df$V10 与 "IEA" 的交集,即"IEA"
## !的意思是取除"IEA"之外的元素。
go_df <- go_df[! go_df$V10 %in% "IEA",]
nrow(go_df)
[1] 302997
head(go_df)
#给df赋予一个表头,go_df的处理就结束了。
colnames(go_df)<-c("GID","GO","ONTOLOGY","EVIDENCE")
第三步:从 Locus_Published_20210331.txt提取需要的内容,赋给pub_df
# Locus_Published_20210331.txt,这里原文用read.table没出问题。
pub_df<-read.csv("Locus_Published_20210331.txt",sep="\t",header=TRUE)
##只保留基因编号开头为AT的记录
pub_df <- pub_df[grepl(pattern = "^AT\\d", pub_df$name),]
pub_df <- cbind(GID=do.call(rbind,strsplit(pub_df$name, split = "\\."))[,1],
pub_df)
head(pub_df)
GID name reference_id pubmed_id publication_year
135 AT1G01010 AT1G01010 1345963 11118137 2000
136 AT1G01010 AT1G01010 501706904 12820902 2003
137 AT1G01010 AT1G01010 501712076 15029955 2003
138 AT1G01010 AT1G01010 501712147 15010618 2003
139 AT1G01010 AT1G01010 501712210 15108305 2004
140 AT1G01010 AT1G01010 501712561 15173566 2004
## pubmed_id 不能为空
pub_df <- pub_df[!is.na(pub_df$PMID),]
head(pub_df) #在上一步筛选之后,再用head看数据就没信息了,不知道为什么,对R语言还需要继续学习。
[1] GID name reference_id pubmed_id
[5] publication_year
<0 行> (或0-长度的row.names)
colnames(pub_df) <- c("GID","GENEID","REFID","PMID","PUBYEAR")
第四步:从gene_aliases_20210331.txt中使用read.table提取信息赋给symbol_df出错了,后将read.table调整为read.csv就没有出现问题。
在执行语句symbol_df <- read.table("gene_aliases_20210331.txt",sep="\t",header=TRUE)时,出现了下面的错误代码:
line 765 did not have 3 elements
line 1077 did not have 3 elements
line 1158 did not have 3 elements
line 27832 did not have 3 elements
通过排查,出现此类错误的原因有两个:(1)文本里的特殊符号,read.table无法处理,如#,", ' 等字符;(2)这一行的内容的确无法用分割符"\t"分割为3列。解决方法,由于R语言处理文字的能力偏弱,我处理的方法是通过ultraedit文本工具,直接手工替换、去除掉,然后再用Rstudio进行处理。
line765:AT1G06960 U2BL U2B"-LIKE # 特殊符号"影响了读取
line1077: AT1G09020 KINβγ NULL # 特殊符号#影响了读取。
line1158:AT1G09760 U2A' U2 small nuclear ribonucleoprotein A #t特殊符号' 影响了读取
line27832: Tables_in_pub
第四步:从gene_aliases_20210331.txt中提取信息赋给symbol_df ,可以正常运行的代码如下:
# GENE-SYMBOL的注释数据库
symbol_df <- read.csv("gene_aliases_20210331.txt",sep="\t",header=TRUE)
symbol_df <- symbol_df[grepl(pattern = "^AT\\d", symbol_df$name),]
colnames(symbol_df) <- c("GID","SYMBOL","FULL_NAME")
第五步:从Araport11_functional_descriptions_20210331.txt中提取信息赋给func_df
# GENE-FUNCTION
func_df <- read.csv("Araport11_functional_descriptions_20210331.txt",sep = "\t",header=TRUE)
func_df <- func_df[grepl(pattern = "^AT\\d", func_df$name),]
func_df <- cbind(GID=do.call(rbind,strsplit(func_df$name, split = "\\."))[,1],
func_df)
colnames(func_df) <- c("GID","TXID","GENE_MODEL_TYPE",
"SHORT_DESCRIPTION",
"CURATOR_SUMMARY",
"COMPUTATIONAL_DESCRIPTION")
第六步:去除内容完全重复的行
go_df <- go_df[!duplicated(go_df),]
go_df <- go_df[,c(1,2,4)] ###这里是原文的内容,想不通为啥前面费劲整理了5列内容,到最后一步了反而只取了其中3列,如果只取3列,安装可以运行;如果不运行此行,安装数据库时会出错。
pub_df <- pub_df[!duplicated(pub_df),]
symbol_df <- symbol_df[!duplicated(symbol_df),]
func_df <- func_df[!duplicated(func_df),]
第七步:
file_path <- file.path( getwd())
makeOrgPackage(go=go_df,
pub_info = pub_df,
symbol_info = symbol_df,
function_info = func_df,
version = "1.0",
maintainer = "chaojiangtao <chaojiangtao@caas.cn>",
author="chaojiangtao <chaojiangtao@caas.cn>",
outputDir = file_path,
tax_id = "3702",
genus = "At",
species = "tair11",
goTable = "go"
)
#输出的提示信息~~~~
Populating genes table:
genes table filled
Populating go table:
go table filled
Populating pub_info table:
Populating symbol_info table:
Populating function_info table:
function_info table filled
table metadata filled
'select()' returned many:1 mapping between keys and columns
Dropping GO IDs that are too new for the current GO.db
Populating go table:
go table filled
Populating go_bp table:
go_bp table filled
Populating go_cc table:
go_cc table filled
Populating go_mf table:
go_mf table filled
'select()' returned many:1 mapping between keys and columns
Populating go_bp_all table:
go_bp_all table filled
Populating go_cc_all table:
go_cc_all table filled
Populating go_mf_all table:
go_mf_all table filled
Populating go_all table:
go_all table filled
Creating package in D:/rr00/做自己的物种注释包/TAIR Data 20210331/org.Atair10.eg.db
Now deleting temporary database file
[1] "D:/rr00/做自己的物种注释包/TAIR Data 20210331/org.Atair10.eg.db"
There were 50 or more warnings (use warnings() to see the first 50)
最后会在指定目录下生成"org.Atair11.eg.db", 然后就可以用,试试运行安装命令
install.packages("org.Atair11.eg.db", repos = NULL, type = "source")