首先,这个PDF文件必须是文字可以复制的,而不是由图片(扫描件)转换而来的。一种方法是使用PDF转Word软件(比如Foxit Phantom),它可以自动识别表格;另一种方法是在R中利用tabulizer包进行操作。下面只介绍后一种。
## Install packages
# Each package is installed only when it is missing, so re-running the script
# does not reinstall everything.
if (!requireNamespace("rJava", quietly = TRUE)) {
  install.packages("rJava")
}
if (!requireNamespace("tabulizer", quietly = TRUE)) {
  install.packages("tabulizer")
}
## If the step above reported "package 'tabulizer' is not available
## (for R version xxx)", the package is still missing: install from GitHub
if (!requireNamespace("tabulizer", quietly = TRUE)) {
  if (!requireNamespace("remotes", quietly = TRUE)) {
    install.packages("remotes")
  }
  gh_repos <- c("ropensci/tabulizerjars", "ropensci/tabulizer")
  # Only one of the two variants must run: 64-bit Windows needs
  # --no-multiarch, every other platform uses the plain call.
  is_win64 <- .Platform$OS.type == "windows" &&
    .Machine$sizeof.pointer == 8L
  if (is_win64) {
    # on 64-bit Windows
    remotes::install_github(gh_repos, INSTALL_opts = "--no-multiarch")
  } else {
    # elsewhere
    remotes::install_github(gh_repos)
  }
}
## 加载包
library(tabulizer)
library(tidyverse)
## Path and name of the PDF file to read
f <- "D:/R/Mendeley_PDF/Annals of Forest Science/Fries - 2012 - Genetic parameters, genetic gain and correlated responses in growth, fibre dimensions and wood density in a Scots pine br.pdf"
## Extract every table that matches in the whole file; some body text may be
## wrongly recognised as a table
out1 <- extract_tables(f)
## Extract from a specified page only
out2 <- extract_tables(f, pages = 1, guess = FALSE, output = "data.frame")
## Extract from a specified area: the page is opened automatically and the
## table region is selected by dragging a box around it.
# The result is stored in `out3` so the selected table can be used afterwards;
# the surrounding parentheses keep it printed at the console.
(out3 <- extract_areas(f, pages = 2))
## Pick the target table from the extracted object, convert it to a data frame
## and write it out as CSV. Recognition is sometimes inaccurate; that is a
## problem of the PDF file itself and has to be corrected by hand.
# Position of the wanted table inside `out1`. The value 7 is specific to this
# PDF: inspect `out1` and adjust it when working with another file.
tab_index <- 7
if (length(out1) < tab_index) {
  stop(
    "out1 holds only ", length(out1), " table(s); cannot select table ",
    tab_index,
    call. = FALSE
  )
}
tab1 <- out1[[tab_index]] %>% data.frame()
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable names.
write.csv(tab1, file = "tab1.csv", quote = FALSE, row.names = FALSE)