处理文件
- file_encoding(filepath):适用于小文件。一次性读取整个文件并返回检测到的编码,返回值可以直接传给 open() 函数的 encoding 参数。
filepath 必须是文件路径,不能是目录。
- file_encoding_big(filepath):作用同上,区别是适用于大文件:只读取文件开头的若干数据块来判断编码,并返回文件编码。
filepath 必须是文件路径,不能是目录。
- change_to_utf8(filepath):解决目录(含子目录)下所有文件的乱码问题,无论是 CSV、文本还是二进制文件都可以交给它处理。filepath 必须是文件夹路径。
- clear_dir(filepath, save_dir=True):删除目录(含子目录)下的所有文件;save_dir=False 表示连同这个目录本身一并删除。
- file_backups(old_dir, new_dir): 目录整体备份,从old_dir,备份到new_dir文件夹
# -*- coding: utf-8 -*-
"""
===========================
# @Time : 2020/8/5 16:39
# @File : handele_file.py
# @Author: adeng
# @Date : 2020/8/5
============================
"""
import chardet
import os
from datetime import datetime
class HanldeFile():
    """Static helpers for encoding detection and bulk directory operations.

    Methods:
        file_encoding: detect the encoding of a (small) file in one read.
        file_encoding_big: detect the encoding of a large file from samples.
        change_to_utf8: re-encode every file under a directory tree as UTF-8.
        clear_dir: delete all files under a tree, optionally the tree itself.
        file_backups: copy a directory tree byte-for-byte to another location.

    Encoding detection relies on the third-party ``chardet`` package imported
    at module level.
    """

    # Sampling limits for file_encoding_big: at most 50 chunks of 6000 bytes.
    _SAMPLE_CHUNKS = 50
    _SAMPLE_SIZE = 6000
    # Buffer size used when copying files in file_backups.
    _COPY_CHUNK = 1024 * 1024

    @staticmethod
    def file_encoding(filepath):
        """Return the encoding chardet detects for the whole file.

        Args:
            filepath: path of a regular file (not a directory).

        Returns:
            The detected encoding name, or None when ``filepath`` is not a
            file or chardet cannot decide. The full chardet result
            (encoding / confidence / language) is printed as a side effect.
        """
        if not os.path.isfile(filepath):
            print("这不是一个文件")
            return None
        with open(filepath, "rb") as stream:
            data = stream.read()
        # result looks like {'encoding': ..., 'confidence': ..., 'language': ...}
        result = chardet.detect(data)
        print(result)
        return result.get("encoding")

    @staticmethod
    def file_encoding_big(filepath):
        """Return the most frequently detected encoding among sampled chunks.

        Only the first ``_SAMPLE_CHUNKS * _SAMPLE_SIZE`` bytes are inspected,
        so the whole file never has to be held in memory.

        Args:
            filepath: path of a regular file (not a directory).

        Returns:
            * None when ``filepath`` is not a file, or no chunk yielded a
              detection;
            * "utf-8" for an empty file (the file itself is left untouched);
            * otherwise the encoding detected for the most chunks; ties go to
              the encoding that was seen first, so the result is stable
              between runs.
        """
        if not os.path.isfile(filepath):
            print("这不是一个文件")
            return None

        detections = []
        chunks_read = 0
        with open(filepath, mode="rb") as stream:
            for _ in range(HanldeFile._SAMPLE_CHUNKS):
                data = stream.read(HanldeFile._SAMPLE_SIZE)
                if not data:
                    break
                chunks_read += 1
                encoding = chardet.detect(data).get("encoding")
                if encoding:
                    detections.append(encoding)
        print(detections)

        if chunks_read == 0:
            # An empty file has no bytes to decode; report utf-8 without
            # deleting/recreating it (a detector must not modify its input).
            print("此文件为空,按【utf-8】编码处理")
            return "utf-8"
        if not detections:
            return None

        # Majority vote. dict preserves insertion order, so max() resolves
        # ties in favour of the encoding encountered first.
        tally = {}
        for encoding in detections:
            tally[encoding] = tally.get(encoding, 0) + 1
        return max(tally, key=tally.get)

    @staticmethod
    def change_to_utf8(filepath):
        """Re-encode every file under ``filepath`` (recursively) as UTF-8.

        Files whose encoding cannot be detected, that are already
        UTF-8/ASCII, or that fail to decode with the detected encoding
        (typically binary files) are left untouched.

        NOTE(review): detection is heuristic; a binary file that happens to
        decode under the detected encoding will still be rewritten.

        Args:
            filepath: path of a directory.
        """
        if not os.path.isdir(filepath):
            print(f"{filepath}这不是一个目录")
            return
        for root, _dirs, files in os.walk(filepath):
            for name in files:
                HanldeFile._rewrite_as_utf8(root, name)

    @staticmethod
    def _rewrite_as_utf8(root, name):
        """Re-encode the single file ``root/name`` as UTF-8, if possible."""
        old_path = os.path.join(root, name)
        encoding = HanldeFile.file_encoding_big(old_path)
        # None -> undetectable: never fall back to the locale default.
        # utf-8 / ascii -> bytes would be identical after rewriting.
        if not encoding or encoding.lower() in ("utf-8", "ascii"):
            return
        try:
            # newline="" keeps the original line endings intact.
            with open(old_path, mode="r", encoding=encoding, newline="") as src:
                text = src.read()
        except (UnicodeError, LookupError):
            # Not decodable with the detected encoding, or the codec name is
            # unknown to Python: treat as binary and leave the file alone.
            return
        stamp = datetime.now().strftime('%Y%m%d%H%M%S')
        write_path = os.path.join(root, stamp + name)
        with open(write_path, mode="w", encoding="utf8", newline="") as dst:
            dst.write(text)
        # Swap the new file into place in one step instead of remove + rename.
        os.replace(write_path, old_path)

    @staticmethod
    def _remove_dir_entry(path):
        """Remove an empty directory, or a symlink that points at a directory."""
        if os.path.islink(path):
            os.remove(path)
        else:
            os.rmdir(path)

    @staticmethod
    def clear_dir(filepath, save_dir=True):
        """Delete every file under ``filepath``, including sub-directories.

        Args:
            filepath: path of a directory.
            save_dir: when True (default) the directory tree itself is kept
                and only files are deleted; when False the sub-directories
                and ``filepath`` itself are removed as well. Nothing above
                ``filepath`` is ever touched.
        """
        if not os.path.isdir(filepath):
            print(f"{filepath}这不是一个目录")
            return
        # Bottom-up walk: children are emptied before their parent is visited,
        # so each directory can be removed with a plain rmdir.
        for root, dirs, files in os.walk(filepath, topdown=False):
            for name in files:
                os.remove(os.path.join(root, name))
            if save_dir:
                continue
            for name in dirs:
                HanldeFile._remove_dir_entry(os.path.join(root, name))
        if not save_dir:
            HanldeFile._remove_dir_entry(filepath)

    @staticmethod
    def file_backups(old_dir, new_dir):
        """Copy the directory tree ``old_dir`` into ``new_dir`` byte-for-byte.

        Args:
            old_dir: existing directory to back up.
            new_dir: destination directory; it and any missing parents are
                created. Existing files with the same name are overwritten.
        """
        if not os.path.isdir(old_dir):
            print(f"传入的{old_dir}不是一个目录")
            return
        for root, _dirs, files in os.walk(old_dir):
            # Map the path relative to old_dir rather than using str.replace,
            # which would rewrite every occurrence of old_dir in the path.
            relative = os.path.relpath(root, old_dir)
            if relative == os.curdir:
                new_root = new_dir
            else:
                new_root = os.path.join(new_dir, relative)
            os.makedirs(new_root, exist_ok=True)
            for name in files:
                source = os.path.join(root, name)
                target = os.path.join(new_root, name)
                with open(source, mode="rb") as src, open(target, "wb") as dst:
                    # Fixed-size chunks: "lines" are meaningless in binary data
                    # and a newline-free file would be loaded whole.
                    while True:
                        chunk = src.read(HanldeFile._COPY_CHUNK)
                        if not chunk:
                            break
                        dst.write(chunk)
# Script entry point: intentionally a no-op, so running this module directly
# does nothing and importing it has no side effects.
if __name__ == '__main__':
    pass
上面代码已经测试过,测试过程我就不发了