Python实现识别文件夹以及子文件夹中的重复文件(按文件内容的MD5值判断是否重复,默认只检查 .png、.jpg、.jpeg、.gif、.bmp、.apk 这几种扩展名的文件),使用前需要注意几点:
1、推荐使用python3运行(见第一部分代码);如果只能使用python2,请使用第二部分的python2版本
2、确保代码中的folder_path路径正确
3、确保代码中的output_file所在的目录已经存在且可写(结果文件本身不需要事先存在,运行时会自动创建;已有的同名文件会被覆盖)
4、确保对要扫描的文件夹及其中的文件有读取权限;权限不足时可以使用sudo运行脚本
一、python3版本:
import os
import hashlib
def calculate_md5(file_path, chunk_size=4096):
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is opened in binary mode and read in fixed-size chunks, so
    the whole file never has to be held in memory at once.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration. Defaults to
            4096.

    Returns:
        The MD5 digest of the file contents as a hexadecimal string.

    Raises:
        ValueError: If chunk_size is not positive.
        OSError: If the file cannot be opened or read.
    """
    # read(0) returns b"" immediately, which would end the loop before any
    # data is hashed and silently yield the digest of an empty file.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        # iter(callable, sentinel) keeps calling read() until it returns
        # b"", i.e. until end of file.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
def find_duplicate_images(
    folder_path,
    extensions=('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.apk'),
):
    """Group files under a folder by content hash and return the duplicates.

    The folder is walked recursively with os.walk. Every file whose name
    ends with one of the given extensions (compared case-insensitively) is
    hashed with calculate_md5, and files sharing a digest are grouped.

    Args:
        folder_path: Root folder to scan, including its subfolders.
        extensions: Iterable of file-name suffixes to consider. Defaults to
            the original hard-coded set. Pass None to consider every file.
            Must be an iterable of strings, not a single string.

    Returns:
        A dict mapping each MD5 hex digest to the list of paths that share
        it, containing only digests seen for more than one file.
    """
    import sys  # local import: only needed for the warning stream

    suffixes = None
    if extensions is not None:
        suffixes = tuple(ext.lower() for ext in extensions)

    md5_dict = {}
    for root, _, files in os.walk(folder_path):
        for file in files:
            if suffixes is not None and not file.lower().endswith(suffixes):
                continue
            file_path = os.path.join(root, file)
            try:
                file_md5 = calculate_md5(file_path)
            except OSError as exc:
                # A single unreadable file (permission denied, broken
                # symlink, file removed mid-scan) must not abort the scan.
                print(f"跳过无法读取的文件 {file_path}: {exc}", file=sys.stderr)
                continue
            md5_dict.setdefault(file_md5, []).append(file_path)

    # Keep only digests that were seen for more than one path.
    return {k: v for k, v in md5_dict.items() if len(v) > 1}
def save_to_txt(md5_dict, output_file):
    """Write duplicate-file groups to a text file.

    For each entry the digest is written followed by a colon, then one
    path per line, then a blank line separating it from the next group.
    An existing file at output_file is overwritten.

    Args:
        md5_dict: Mapping of MD5 hex digest to a list of file paths.
        output_file: Path of the text file to create.

    Raises:
        OSError: If the output file cannot be opened for writing.
    """
    # Write UTF-8 explicitly: the locale default encoding may be unable to
    # represent non-ASCII file names. surrogateescape lets file names that
    # were not decodable on the file system round-trip instead of raising
    # UnicodeEncodeError.
    with open(output_file, "w", encoding="utf-8", errors="surrogateescape") as f:
        for md5, files in md5_dict.items():
            f.write(f"{md5}:\n")
            f.writelines(f"{file}\n" for file in files)
            f.write("\n")
if __name__ == "__main__":
    import sys

    # Usage: python3 script.py [folder_path] [output_file]
    # Both arguments are optional and fall back to the defaults below.
    default_folder = "/home/ubuntu/Downloads"  # replace with your folder path
    default_output = "duplicate_files.txt"
    folder_path = sys.argv[1] if len(sys.argv) > 1 else default_folder
    output_file = sys.argv[2] if len(sys.argv) > 2 else default_output

    # os.walk yields nothing for a missing path, which would otherwise
    # produce an empty report that looks like "no duplicates found".
    if not os.path.isdir(folder_path):
        sys.exit(f"文件夹不存在或不是目录: {folder_path}")

    duplicates = find_duplicate_images(folder_path)
    save_to_txt(duplicates, output_file)
    print(f"结果已保存到 {output_file}")
二、python2版本:
# -*- coding: utf-8 -*-
import os
import hashlib
def calculate_md5(file_path, chunk_size=4096):
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is opened in binary mode and read in fixed-size chunks, so
    the whole file never has to be held in memory at once.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration. Defaults to
            4096.

    Returns:
        The MD5 digest of the file contents as a hexadecimal string.

    Raises:
        ValueError: If chunk_size is not positive.
        IOError: If the file cannot be opened or read (OSError on
            Python 3, where the two are the same class).
    """
    # read(0) returns an empty string immediately, which would end the
    # loop before any data is hashed.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                # An empty read means end of file.
                break
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
def find_duplicate_images(
    folder_path,
    extensions=('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.apk'),
):
    """Group files under a folder by content hash and return the duplicates.

    The folder is walked recursively with os.walk. Every file whose name
    ends with one of the given extensions (compared case-insensitively) is
    hashed with calculate_md5, and files sharing a digest are grouped.

    Args:
        folder_path: Root folder to scan, including its subfolders.
        extensions: Iterable of file-name suffixes to consider. Defaults to
            the original hard-coded set. Pass None to consider every file.
            Must be an iterable of strings, not a single string.

    Returns:
        A dict mapping each MD5 hex digest to the list of paths that share
        it, containing only digests seen for more than one file.
    """
    import sys  # local import: only needed for the warning stream

    suffixes = None
    if extensions is not None:
        suffixes = tuple(ext.lower() for ext in extensions)

    md5_dict = {}
    for root, _, files in os.walk(folder_path):
        for file in files:
            if suffixes is not None and not file.lower().endswith(suffixes):
                continue
            file_path = os.path.join(root, file)
            try:
                file_md5 = calculate_md5(file_path)
            except (IOError, OSError) as exc:
                # Python 2's open() raises IOError, which is not an OSError
                # subclass there, so both are caught. A single unreadable
                # file must not abort the whole scan.
                sys.stderr.write(
                    "跳过无法读取的文件 {}: {}\n".format(file_path, exc)
                )
                continue
            md5_dict.setdefault(file_md5, []).append(file_path)

    # Keep only digests that were seen for more than one path.
    return dict((k, v) for k, v in md5_dict.items() if len(v) > 1)
def save_to_txt(md5_dict, output_file):
    """Write duplicate-file groups to a text file.

    For each entry the digest is written followed by a colon, then one
    path per line, then a blank line separating it from the next group.
    An existing file at output_file is overwritten.

    Args:
        md5_dict: Mapping of MD5 hex digest to a list of file paths.
        output_file: Path of the text file to create.
    """
    with open(output_file, "w") as f:
        # Python 2 dicts iterate in arbitrary order; sorting the digests
        # makes the report identical from run to run for the same input.
        for md5 in sorted(md5_dict):
            f.write("{}:\n".format(md5))
            for file in md5_dict[md5]:
                f.write("{}\n".format(file))
            f.write("\n")
if __name__ == "__main__":
    import sys

    # Usage: python script.py [folder_path] [output_file]
    # Both arguments are optional and fall back to the defaults below.
    default_folder = "/home/ubuntu/Downloads"  # replace with your folder path
    default_output = "duplicate_files.txt"
    folder_path = sys.argv[1] if len(sys.argv) > 1 else default_folder
    output_file = sys.argv[2] if len(sys.argv) > 2 else default_output

    # os.walk yields nothing for a missing path, which would otherwise
    # produce an empty report that looks like "no duplicates found".
    if not os.path.isdir(folder_path):
        sys.exit("文件夹不存在或不是目录: {}".format(folder_path))

    duplicates = find_duplicate_images(folder_path)
    save_to_txt(duplicates, output_file)
    # Parenthesised single-argument form works as a print statement on
    # Python 2 and as a function call on Python 3.
    print("结果已保存到 {}".format(output_file))