1. 做植物组学数据时经常会遇到这样的情况:不同的基因组版本、不同的公司给出的结果偶尔会不太一样。比如这次老师提供的是 locus tag,但它和 PlantTFDB 里转录因子的 ID 对不上。我试了好多种方法,最后还是靠最原始的爬虫解决了问题。代码放在下面。
# Script setup: load the scraping dependencies, read the gene table and build
# one NCBI Gene search URL per row.
#
# NOTE: the former `rm(list = ls())` was dropped on purpose. It wipes the
# caller's whole workspace when the script is sourced and does not give a
# clean session anyway (loaded packages and options survive); restart R
# instead if a fresh session is needed.
library(bitops)
library(RCurl)
library(stringr)
library(XML)

# gene_url.txt is read as a whitespace-delimited table with a header row.
genes <- read.table("gene_url.txt", header = TRUE, stringsAsFactors = FALSE)

# Fail loudly if the ID column is absent: `genes$ENTREZID` would be NULL and
# every row would silently receive the same URL with an empty search term.
stopifnot("ENTREZID" %in% names(genes))

genes$NCBI_url <- paste0(
  "https://www.ncbi.nlm.nih.gov/gene/?term=",
  genes[["ENTREZID"]]
)
head(genes)
# Extract the text content of every node matched by an XPath query.
#
# html_txt1: parsed document to search (the script passes the result of
#            htmlParse()).
# xpath_p:   XPath expression selecting the nodes of interest.
#
# Returns a character vector with one entry per matched node whose text is
# not the empty string, after deleting every run of "newline followed by a
# space" from it. Returns character(0) when nothing matches.
getNodesTxt <- function(html_txt1, xpath_p) {
  els1 <- getNodeSet(html_txt1, xpath_p)
  # lapply + unlist always ends up as a character vector; sapply() returned an
  # empty list when the query matched nothing, which made the result type
  # depend on the input. The node values are also computed only once now.
  els1_txt <- as.character(unlist(lapply(els1, xmlValue), use.names = FALSE))
  # Drop nodes whose text is the empty string.
  els1_txt <- els1_txt[nzchar(els1_txt)]
  # Remove "\n " runs (a newline immediately followed by a space).
  str_replace_all(els1_txt, "(\\n )+", "")
}
# Reduce extracted node text to a single string.
#
# NodeTxt: the value to normalise, typically the character vector returned by
#          getNodesTxt().
#
# Returns the first element of NodeTxt (without names) when NodeTxt is a
# non-empty character vector, and NA in every other case (NULL, empty vector,
# non-character input).
dealNodeTxt <- function(NodeTxt) {
  # A plain if/else replaces ifelse(): the condition is a scalar, and ifelse()
  # only ever returned the first element, which is now explicit.
  if (is.character(NodeTxt) && length(NodeTxt) > 0) {
    NodeTxt[[1L]]
  } else {
    NA
  }
}
# Scrape the NCBI Gene page of every row in `genes` and store the rice locus
# tag(s) found there in genes$Locustag. A locus tag is any space-separated
# token containing "LOC_Os" in the first "gene-general-protein-info" section
# of the page. Rows whose page cannot be downloaded, or whose page contains no
# such token, are left as NA.
genes$Locustag <- rep(NA_character_, nrow(genes))

# seq_len() makes the loop a no-op for an empty table (1:0 would run twice).
for (i in seq_len(nrow(genes))) {
  # A failed download becomes NULL; the error text is still reported.
  doc <- tryCatch(
    getURL(genes[i, "NCBI_url"]),
    error = function(e) {
      message(conditionMessage(e))
      NULL
    }
  )

  if (is.null(doc)) {
    cat('第',i,'个失败!\n')
    print('-------------------------------------------------')
    # Skip parsing: previously the page parsed for the PREVIOUS gene was
    # reused here, so a failed gene inherited its neighbour's locus tag (or
    # the script stopped with "object not found" on the first row).
    next
  }

  cat('第',i,'个成功!\n')
  html_txt1 <- htmlParse(doc, asText = TRUE)
  print('-------------------------------------------------')

  info_txt <- dealNodeTxt(
    getNodesTxt(html_txt1, '//*[@class="rprt-section gene-general-protein-info"][1]')
  )
  # dealNodeTxt() yields NA when the section is missing from the page.
  tokens <- if (is.na(info_txt)) character(0) else str_split(info_txt, ' ')[[1]]
  dd <- grep('LOC_Os', tokens, value = TRUE)

  if (length(dd) > 0) {
    # Several matches are joined with ";" because assigning more than one
    # value to a single data-frame cell is an error.
    genes[i, "Locustag"] <- paste(unique(dd), collapse = ";")
  }
}