本文的去水印方法只针对 PDF 文件。
根据给定的水印样图,在 PDF 中找出与之相似的图片并将其删除。
def is_same_img(area_chart, target_img, VPT):
    """Decide whether two PNG files are similar enough to count as the same image.

    ``target_img`` is converted to RGB, resized to the pixel size of
    ``area_chart`` and written back over its own file. Four perceptual hashes
    (pHash, aHash, dHash, wHash) are then computed for both files, each hash
    distance is normalised to a 0..1 similarity, and the best of the four
    similarities is compared against ``VPT``.

    The directory the hashes and file sizes are read from depends on the
    module-level name ``switch`` (``'./'`` when it equals 0, ``'./imgs'``
    otherwise); that name is not defined inside this function.

    :param area_chart: path of the image that supplies the reference size
    :param target_img: path of the image that is resized in place and compared
    :param VPT: similarity threshold; a score strictly above it is a match
    :return: True when both files end in ``.png`` and their best similarity
        exceeds ``VPT``, otherwise False
    """
    # Bring target_img to the same pixel size as area_chart before hashing.
    with Image.open(area_chart) as reference:
        size = reference.size
    with Image.open(target_img) as original:
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
        # under its current name.
        resized = original.convert('RGB').resize(size, Image.LANCZOS)
    # NOTE(review): this overwrites target_img on disk, so callers must not
    # pass a file they need to keep unchanged.
    resized.save(target_img, quality=100)

    base_dir = './' if switch == 0 else './imgs'

    highfreq_factor = 4  # pHash oversampling factor
    hash_size = 32  # side length of every hash matrix
    image_scale = 64  # working image size for the wavelet hash

    names = []
    hashes = []
    for name in (area_chart, target_img):
        # Only PNG files take part in the comparison.
        if os.path.splitext(name)[1] != '.png':
            continue
        # Open each file once and close it again instead of leaking four
        # handles per file.
        with Image.open(os.path.join(base_dir, name)) as img:
            hashes.append((
                imagehash.phash(img, hash_size=hash_size,
                                highfreq_factor=highfreq_factor),
                imagehash.average_hash(img, hash_size=hash_size),
                imagehash.dhash(img, hash_size=hash_size),
                imagehash.whash(img, image_scale=image_scale,
                                hash_size=hash_size, mode='db4'),
            ))
        names.append(name)

    # Without two hashed files there is nothing to compare.
    if len(hashes) < 2:
        return False

    first, second = hashes
    # Subtracting two hashes yields their Hamming distance; dividing by the
    # number of bits (side length squared) turns it into a 0..1 similarity.
    value_hash = max(
        1 - (left - right) / len(left.hash) ** 2
        for left, right in zip(first, second)
    )
    if value_hash > VPT:
        # os.path.join keeps the size lookup portable; the previous
        # hard-coded backslash only resolved on Windows.
        sizes = [os.path.getsize(os.path.join(base_dir, name)) for name in names]
        for name, size_bytes in zip(names, sizes):
            print(name, str(size_bytes / 1024) + 'KB')
        print(value_hash)
        print('***********************')
        return True
    return False
def delete_wartermark(target_path, area_chart, VPT=0.9):
    """Remove watermark images from a PDF and save the result as a new file.

    Every GRAY/RGB image referenced by a page is dumped to a temporary PNG in
    the current working directory and compared with ``area_chart`` through
    ``is_same_img``; image objects that match are deleted from the document.
    The cleaned document is written next to the source with ``1`` appended to
    the base name (``a.pdf`` -> ``a1.pdf``), after which the source file is
    removed.

    Any exception is caught, printed and reported as a failure; it is never
    propagated to the caller.

    :param target_path: path of the PDF to clean; paths that do not end in
        ``.pdf`` (case-insensitive) are ignored
    :param area_chart: path of the sample watermark image
    :param VPT: similarity threshold handed to ``is_same_img``
    :return: path of the cleaned PDF, or ``''`` when nothing was saved
    """
    save_pdf_path = ''
    try:
        # Test the suffix rather than a substring so that names such as
        # "report.pdf.bak" are not mistaken for PDFs.
        if target_path.lower().endswith('.pdf'):
            base, ext = os.path.splitext(target_path)
            output_path = base + '1' + ext
            # One image object can be referenced from many pages; checking
            # each xref once avoids redundant work and avoids touching an
            # object that has already been deleted.
            checked_xrefs = set()
            # NOTE(review): getPageImageList / writePNG / _deleteObject are
            # legacy PyMuPDF names - confirm against the installed version.
            with fitz.open(target_path) as pdf_document:
                for current_page in range(len(pdf_document)):
                    for image in pdf_document.getPageImageList(current_page):
                        xref = image[0]
                        if xref in checked_xrefs:
                            continue
                        checked_xrefs.add(xref)
                        pix = fitz.Pixmap(pdf_document, xref)
                        if pix.n >= 4:  # only GRAY or RGB pixmaps are handled
                            continue
                        save_path = "page%s_%s.png" % (current_page, xref)
                        try:
                            pix.writePNG(save_path)
                            # The watermark sample goes first so that only the
                            # temporary PNG is resized and overwritten, never
                            # the caller's sample file.
                            if is_same_img(area_chart, save_path, VPT):
                                pdf_document._deleteObject(xref)
                        finally:
                            # Remove the temporary PNG even when the
                            # comparison raises.
                            if os.path.exists(save_path):
                                os.remove(save_path)
                pdf_document.save(output_path)
            # Report the output path only once it has actually been written.
            save_pdf_path = output_path
            print('成功----删除水印')
            # The document is closed at this point, so the source can be
            # removed without an open handle on it.
            if os.path.exists(target_path):
                os.remove(target_path)
    except Exception as e:
        print(e)
        print('失败----删除水印')
    return save_pdf_path
# Run the example only when this file is executed as a script, so that
# importing the module does not rewrite and delete a PDF as a side effect.
if __name__ == '__main__':
    pdf_document = r'C:\Users\Administrator\OneDrive\all_huaqiu\huaqiu_spider\test\input.pdf'
    delete_wartermark(pdf_document, area_chart='area_chart.png', VPT=0.73)
本文参考资料:
Python操作PDF-文本和图片提取(使用PyPDF2和PyMuPDF)
Python处理PDF的实用姿势
使用PyPDF2在PDF上去除水印