The boss is organizing a conference, and the original plan was for each of us to search the literature by hand, dig out the authors' emails, and send invitations one by one. Wanting to be lazy ("laziness" is one of a programmer's virtues), I looked for an R package that can pull the corresponding author's affiliation and email straight from PubMed, and wrapped it into the function below:
I suggest trying your PubMed query on the PubMed website first and only then deciding which query to use. In my tests, a single run seems to retrieve at most about 500 articles.
Reference
PubMed search syntax
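Before downloading anything, it is worth checking how many records a query actually matches. Below is a minimal sketch, under the assumption that the list returned by easyPubMed's get_pubmed_ids() carries the hit count in a Count element (the query string is just the example used later in this post):
suppressMessages(library(easyPubMed))
test_query <- 'lncRNAs AND Liver AND "2021/09":"2021/10"[PDAT]'
test_ids <- get_pubmed_ids(test_query)
# total number of records PubMed reports for this query (assumed Count element)
as.numeric(test_ids$Count)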
rm(list = ls())
#https://cran.r-project.org/web/packages/easyPubMed/vignettes/getting_started_with_easyPubMed.html
#install.packages("easyPubMed")
suppressMessages(library(easyPubMed))
out_dir <- "~/Desktop/pubmed"
dir.create(out_dir, recursive = TRUE)
setwd(out_dir)
# the maximum number of records retrieved per run seems to be 5000
get_author <- function(new_query) {
  suppressMessages(library(easyPubMed))
  suppressMessages(library(tidyverse))
  # query PubMed and download all matching records as XML
  my_entrez_id <- get_pubmed_ids(new_query)
  # my_abstracts_txt <- fetch_pubmed_data(my_entrez_id, format = "abstract")
  my_abstracts_xml <- fetch_pubmed_data(pubmed_id_list = my_entrez_id, retmax = 10000000)
  # split the XML into one record per article and convert each to a data frame
  my_PM_list <- articles_to_list(pubmed_data = my_abstracts_xml)
  xx <- lapply(my_PM_list, article_to_df, autofill = TRUE, max_chars = 2000, getKeywords = TRUE)
  full_df <- do.call(rbind, xx)
  # keep only the last author row of each article (usually the corresponding author)
  articles <- as.character(unique(full_df$pmid))
  out_tab <- data.frame()
  for (i in articles) {
    rt <- full_df[full_df$pmid == i, ]
    rrt <- rt[nrow(rt), ]
    out_tab <- rbind(out_tab, rrt)
  }
  # add convenience links and sort by publication date, newest first
  out_tab$pubmed_link <- paste0("https://pubmed.ncbi.nlm.nih.gov/", out_tab$pmid, "/")
  out_tab$doi_link <- paste0("https://doi.org/", out_tab$doi)
  out_tab <- as.data.frame(out_tab)
  out_tab <- arrange(out_tab, desc(year), desc(month), desc(day))
  return(out_tab)
}
#example
new_query_2020_1 <- 'lincRNAs OR lincRNA OR lncRNAs OR lncRNA AND "2020/07":"2020/12"[PDAT]'
authors_2020_1 <- get_author(new_query = new_query_2020_1)
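Since the whole point is collecting contact information, the next step is usually to keep just the author and contact columns and dump them to a CSV. A minimal sketch, assuming the data frame returned above contains the lastname, firstname, address, and email columns that article_to_df() typically produces; the output file name is just an example:
suppressMessages(library(tidyverse))
contact_tab <- authors_2020_1 %>%
  dplyr::select(pmid, title, journal, lastname, firstname, address, email, pubmed_link)
write.csv(contact_tab, "lncRNA_2020_contacts.csv", row.names = FALSE)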
To keep only articles from journals with an impact factor above 5, you first need a table of journal impact factors.
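The function below only relies on two columns of that table: the full journal title and the impact factor. Here is a minimal sanity check, assuming the CSV (the same file used in the example further down) exposes them as Full.Journal.Title and Journal.Impact.Factor after read.csv(); adjust the rename step in get_paper() if your export names them differently:
sci_table <- read.csv("~/Box/HPC/R/often_use/pubmed/2020JournalImpactFactorandQuartile.csv")
# the join and filter inside get_paper() expect these two columns to exist
stopifnot(all(c("Full.Journal.Title", "Journal.Impact.Factor") %in% colnames(sci_table)))
head(sci_table[, c("Full.Journal.Title", "Journal.Impact.Factor")])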
rm(list = ls())
#https://cran.r-project.org/web/packages/easyPubMed/vignettes/getting_started_with_easyPubMed.html
#install.packages("easyPubMed")
# the maximum number of records retrieved per run seems to be 500
get_paper <- function(new_query, sci_dir, IF_cutoff, out_dir) {
  suppressMessages(library(easyPubMed))
  suppressMessages(library(tidyverse))
  # query PubMed and download all matching records as XML
  my_entrez_id <- get_pubmed_ids(new_query)
  # my_abstracts_txt <- fetch_pubmed_data(my_entrez_id, format = "abstract")
  my_abstracts_xml <- fetch_pubmed_data(pubmed_id_list = my_entrez_id, retmax = 10000000)
  my_PM_list <- articles_to_list(pubmed_data = my_abstracts_xml)
  xx <- lapply(my_PM_list, article_to_df, autofill = TRUE, max_chars = 10000, getKeywords = TRUE)
  full_df <- do.call(rbind, xx)
  # keep only the last author row of each article (usually the corresponding author)
  articles <- as.character(unique(full_df$pmid))
  out_tab <- data.frame()
  for (i in articles) {
    rt <- full_df[full_df$pmid == i, ]
    rrt <- rt[nrow(rt), ]
    out_tab <- rbind(out_tab, rrt)
  }
  out_tab$pubmed_link <- paste0("https://pubmed.ncbi.nlm.nih.gov/", out_tab$pmid, "/")
  out_tab$doi_link <- paste0("https://doi.org/", out_tab$doi)
  out_tab <- as.data.frame(out_tab)
  out_tab$journal <- toupper(out_tab$journal)
  # join the impact factor table by (upper-cased) full journal name
  sci_table <- read.csv(sci_dir)
  sci_table <- sci_table %>% dplyr::rename(journal = Full.Journal.Title)
  sci_table$journal <- toupper(sci_table$journal)
  # out_tab <- arrange(out_tab, desc(year), desc(month), desc(day))
  out_tab <- left_join(out_tab, sci_table, by = "journal")
  # keep journals above the impact factor cutoff, sorted by impact factor
  out_tab <- arrange(out_tab, desc(Journal.Impact.Factor)) %>% filter(Journal.Impact.Factor > IF_cutoff)
  # write the result to a CSV named after the query (text before the date filter) and today's date
  dir.create(out_dir, recursive = TRUE)
  prefix <- strsplit(new_query, split = 'AND "', fixed = TRUE)[[1]][1]
  write.csv(out_tab, paste0(out_dir, prefix, "-", as.character(Sys.Date()), ".csv"))
}
sci_dir <- "~/Box/HPC/R/often_use/pubmed/2020JournalImpactFactorandQuartile.csv"
new_query <- 'lncRNAs AND Liver AND "2021/09":"2021/10"[PDAT]'
out_dir <- "~/Desktop/pubmed/"
get_paper(new_query = new_query, sci_dir = sci_dir, IF_cutoff = 5, out_dir = out_dir)
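For the conference use case you will probably want to run several topics in one go. A minimal sketch that loops get_paper() over a vector of queries; the Kidney query is only a placeholder to show the pattern:
queries <- c('lncRNAs AND Liver AND "2021/09":"2021/10"[PDAT]',
             'lncRNAs AND Kidney AND "2021/09":"2021/10"[PDAT]')  # placeholder query
# each query gets its own CSV in out_dir, named from the text before the date filter
invisible(lapply(queries, get_paper, sci_dir = sci_dir, IF_cutoff = 5, out_dir = out_dir))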