处理文件

file_encoding(filepath)：适用于小文件，返回文件的编码，返回值可以直接传给open()函数的encoding参数。filepath是文件路径
file_encoding_big(filepath)：作用同上，区别是用于大文件（分块读取后再判断），返回文件编码。filepath是文件路径
change_to_utf8(filepath)：把目录（含子目录）下所有文件转成utf-8编码，解决乱码问题，csv、文本、二进制文件都可以处理。filepath是文件夹路径
clear_dir(filepath, save_dir=True)：清空目录（含子目录）下的所有文件；save_dir=False表示连目录本身也一并删除
file_backups(old_dir, new_dir)：目录整体备份，把old_dir文件夹备份到new_dir文件夹

# -*- coding: utf-8 -*-
"""
===========================
# @Time : 2020/8/5 16:39
# @File  : handele_file.py
# @Author: adeng
# @Date  : 2020/8/5
============================
"""

import chardet
import os
from datetime import datetime


class HanldeFile:
    """
    Static helpers for detecting file encodings and for bulk directory work.

    file_encoding:      detect the encoding of a small file (reads it whole)
    file_encoding_big:  detect the encoding of a large file by sampling chunks
    change_to_utf8:     re-encode every decodable file under a directory as UTF-8
    clear_dir:          delete every file under a directory (optionally the tree)
    file_backups:       copy a directory tree to another location
    """

    @staticmethod
    def file_encoding(filepath):
        """
        Return the encoding chardet detects for a (small) file.

        The whole file is read into memory, so use file_encoding_big for
        large files.

        filepath: path of a regular file, not a directory.
        Returns the detected encoding name, or None when filepath is not a
        file or chardet reports no encoding.
        """
        if not os.path.isfile(filepath):
            print("这不是一个文件")
            return None
        with open(filepath, "rb") as f1:
            # {'encoding': name, 'confidence': score, 'language': language}
            result = chardet.detect(f1.read())
        print(result)
        return result.get("encoding")

    @staticmethod
    def file_encoding_big(filepath, chunk_size=6000, max_chunks=50):
        """
        Return the most frequently detected encoding among sampled chunks.

        Up to max_chunks chunks of chunk_size bytes are read from the start
        of the file and each one is passed to chardet on its own; the
        encoding reported most often wins. Ties go to the encoding that was
        seen first, so the result is deterministic.

        filepath:   path of a regular file, not a directory.
        chunk_size: bytes per sample (default 6000).
        max_chunks: maximum number of samples (default 50).
        Returns the encoding name, "utf-8" for an empty file (the file is
        left untouched), or None when filepath is not a file or no chunk
        yielded an encoding.
        Raises ValueError if chunk_size or max_chunks is smaller than 1.
        """
        if chunk_size < 1 or max_chunks < 1:
            raise ValueError("chunk_size and max_chunks must be >= 1")
        if not os.path.isfile(filepath):
            print("这不是一个文件")
            return None

        detected = []
        is_empty = True
        with open(filepath, mode="rb") as f:
            for _ in range(max_chunks):
                data = f.read(chunk_size)
                if not data:
                    break
                is_empty = False
                encoding = chardet.detect(data).get("encoding")
                if encoding:
                    detected.append(encoding)
        print(detected)

        if is_empty:
            # Nothing to detect; an empty file is valid UTF-8 as it stands,
            # so there is no reason to delete and recreate it.
            print("此文件为空，默认使用【utf-8】编码")
            return "utf-8"
        if not detected:
            return None

        # Insertion-ordered tally: max() returns the first key holding the
        # highest count, i.e. the earliest-seen encoding on a tie.
        counts = {}
        for name in detected:
            counts[name] = counts.get(name, 0) + 1
        return max(counts, key=counts.get)

    @staticmethod
    def _rewrite_as_utf8(path, encoding):
        """
        Re-encode one file from `encoding` to UTF-8 in place.

        The converted text is written to a temporary file in the same
        directory and then swapped in with os.replace, so the original is
        never left half-written. Line endings are preserved (newline="").

        Returns True when the file was converted, False when it cannot be
        decoded with `encoding` (the original is then left untouched).
        """
        import tempfile  # local import: only this helper needs it

        fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(path) or ".")
        try:
            with os.fdopen(fd, mode="w", encoding="utf8", newline="") as dst, \
                    open(path, mode="r", encoding=encoding, newline="") as src:
                while True:
                    chunk = src.read(65536)
                    if not chunk:
                        break
                    dst.write(chunk)
            # mkstemp creates the file owner-only; carry over the original
            # permission bits before swapping the new file in.
            os.chmod(tmp_path, os.stat(path).st_mode & 0o7777)
            os.replace(tmp_path, path)
            return True
        except (UnicodeError, LookupError):
            # Undecodable content (e.g. binary data) or an encoding name
            # Python does not know: keep the original file as it is.
            return False
        finally:
            # Only still present when the conversion did not complete.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    @staticmethod
    def change_to_utf8(filepath):
        """
        Re-encode every file under a directory tree as UTF-8.

        Each file's encoding is taken from file_encoding_big. Files whose
        encoding cannot be detected, that are already ASCII/UTF-8, or that
        fail to decode are left byte-for-byte unchanged.

        filepath: path of a directory (sub-directories are processed too).
        """
        if not os.path.isdir(filepath):
            print(f"{filepath}这不是一个目录")
            return
        for root, _dirs, files in os.walk(filepath):
            for name in files:
                old_path = os.path.join(root, name)
                encoding = HanldeFile.file_encoding_big(old_path)
                # None would make open() fall back to the locale default and
                # silently mis-decode; ASCII/UTF-8 content needs no rewrite.
                if not encoding or encoding.lower() in ("ascii", "utf-8"):
                    continue
                HanldeFile._rewrite_as_utf8(old_path, encoding)

    @staticmethod
    def clear_dir(filepath, save_dir=True):
        """
        Delete every file under a directory, including sub-directories.

        filepath: path of a directory.
        save_dir: True (default) keeps the directory structure and removes
                  only the files; False removes the whole tree, filepath
                  included. Directories above filepath are never removed.
        """
        if not os.path.isdir(filepath):
            print(f"{filepath}这不是一个目录")
            return
        # Bottom-up, so each directory is already empty by the time it is
        # removed and the walk ends at filepath itself.
        for root, dirs, files in os.walk(filepath, topdown=False):
            for name in files:
                os.remove(os.path.join(root, name))
            if save_dir:
                continue
            for name in dirs:
                # os.walk does not descend into symlinked directories; drop
                # the link itself so the parent can be removed.
                entry = os.path.join(root, name)
                if os.path.islink(entry):
                    os.unlink(entry)
            os.rmdir(root)

    @staticmethod
    def file_backups(old_dir, new_dir):
        """
        Copy the directory tree old_dir into new_dir.

        old_dir: the directory to back up.
        new_dir: the backup directory; it and any missing parents are
                 created. Existing files with the same name are overwritten.
        Backing a directory up onto itself is refused, and a backup
        directory located inside old_dir is not copied into itself.
        """
        if not os.path.isdir(old_dir):
            print(f"传入的{old_dir}不是一个目录")
            return
        backup_abs = os.path.abspath(new_dir)
        if os.path.abspath(old_dir) == backup_abs:
            # Reading and writing the same path would truncate every file.
            print(f"备份目录{new_dir}不能与原目录{old_dir}相同")
            return
        for root, dirs, files in os.walk(old_dir):
            # Prune the backup directory when it lives inside the source,
            # otherwise the walk would keep copying its own output.
            dirs[:] = [
                d for d in dirs
                if os.path.abspath(os.path.join(root, d)) != backup_abs
            ]
            # Map by relative path rather than by string replacement.
            relative = os.path.relpath(root, old_dir)
            new_root = os.path.normpath(os.path.join(new_dir, relative))
            os.makedirs(new_root, exist_ok=True)
            for name in files:
                old_filename = os.path.join(root, name)
                new_filename = os.path.join(new_root, name)
                with open(old_filename, mode="rb") as f1, \
                        open(new_filename, mode="wb") as f2:
                    # Fixed-size chunks: binary data may contain no newline.
                    while True:
                        chunk = f1.read(65536)
                        if not chunk:
                            break
                        f2.write(chunk)


if __name__ == '__main__':
    # Running the module as a script does nothing; this guard is a placeholder.
    pass

上面代码已经测试过，测试过程我就不发了

实战：文件处理：获取编码，解决乱码，备份目录，清空文件

实战：文件处理：获取编码，解决乱码，备份目录，清空文件

处理文件