# A short Python script that reads the text of PDF documents and records keyword matches in a CSV file.
import pdfplumber
import pandas as pd
import os
import csv
# Usage: create a folder named data_pdf in the current working directory,
# put the PDF files into it, then run this script. Results are appended to
# pdf_read.csv in the current working directory.

# Keywords to look for; a page matches when it contains any one of them.
KEYWORDS = ('a', 'b', 'c', 'd', 'e', 'f')


def is_pdf_name(name):
    """Return True when the file name *name* has a ``.pdf`` extension.

    The comparison is case-insensitive and only looks at the last
    extension, so ``report.v2.pdf`` and ``SCAN.PDF`` both qualify while
    names without any extension do not.
    """
    return os.path.splitext(name)[1].lower() == '.pdf'


def contains_keyword(text, keywords=KEYWORDS):
    """Return True when *text* contains at least one entry of *keywords*.

    *text* may be None or empty (a page without extractable text); both
    are treated as containing no keyword instead of raising.
    """
    text = text or ''
    return any(keyword in text for keyword in keywords)


with open('pdf_read.csv', 'a+', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, dialect='excel')
    for root, _dirs, files in os.walk("./data_pdf/"):
        for index_file, name in enumerate(files):
            # Skip anything that is not a PDF; such files produce no CSV row.
            if not is_pdf_name(name):
                continue
            path = os.path.join(root, name)
            try:
                with pdfplumber.open(path) as pdf:
                    for index, page in enumerate(pdf.pages):
                        # Only the page text is inspected; tables are not
                        # extracted.
                        if contains_keyword(page.extract_text()):
                            print("{}/{},{}在第{}页存在关键词,处理完毕".format(index_file+1,len(files),name,index+1))
                            count = 1
                            state = 'T'
                            # Row layout: file name, count, 1-based page
                            # number of the first match, state flag.
                            writer.writerow([name, count, index + 1, state])
                            # Stop at the first matching page of this file.
                            break
                        print("{}/{},{}的第{}页正在处理".format(index_file+1,len(files),name,index+1))
            except Exception as exc:
                # Record the failure and keep going with the next file.
                # Exception (not a bare except) lets Ctrl+C still abort.
                print("{}/{},{}处理失败:{}".format(index_file+1,len(files),name,exc))
                state = 'F'
                writer.writerow([name, state])