使用python转换文件编码。写中途遇到问题
- 不知道文件具体是什么编码
- str 和 unicode 没理清:在 Python 2 中,str 是字节数组,unicode 才是字符串,用 type(xx) 可获得变量的类型。
搜集的资料
python 转化文件编码 utf8
python 中str和unicode
Python中如何将文件保存为utf-8(带BOM)的格式
巧用notepad++ 批量转换ansi 和 utf8,notepad++中使用python脚本
1. 使用chardet检测字符串编码
import chardet

# Read the raw bytes of the file: chardet.detect() inspects the file's
# content, so it must be given the data itself rather than the file object.
with open("a.txt", "rb") as f:
    content = f.read()  # str (byte string) in Python 2
source_encoding = chardet.detect(content)['encoding']
# chardet reports None as the encoding when it cannot make a guess.
if source_encoding is None:
    print("can not detect")
PS: utf8结果字符串是'utf-8',utf8-bom结果是'UTF-8-SIG'。
2. 使用codecs读写指定格式编码文件
import codecs

# Read a.txt, decoding it from ASCII into a unicode string.
with codecs.open("a.txt", 'r', "ascii") as src:
    content = src.read()
# Write the text to b.txt encoded as UTF-8 with a BOM.
with codecs.open("b.txt", 'w', encoding="UTF-8-SIG") as dst:
    dst.write(content)
codecs.open 读取时如果不指定编码,就和 open 一样,返回 str 类型。
3. 综合起来就可以转换了
import chardet
import codecs
def convert_file_to_utf8(filename):
    """Rewrite ``filename`` in place as UTF-8 with a BOM (UTF-8-SIG).

    The source encoding is guessed with ``chardet``. When no encoding can be
    detected the file is reported with a ``??`` prefix and left untouched.

    WARNING: the original file is overwritten; no backup copy is made.
    """
    # Read raw bytes so chardet sees the data exactly as stored on disk.
    with open(filename, 'rb') as src:
        raw = src.read()
    source_encoding = chardet.detect(raw)['encoding']
    if source_encoding is None:
        print("?? %s" % filename)
        return
    print("  %s %s" % (source_encoding, filename))
    # Always decode before writing, including for UTF-8 input: passing the
    # undecoded bytes to the encoding writer fails on non-ASCII content, and
    # decoding 'UTF-8-SIG' strips the existing BOM so it is not duplicated.
    # 'ignore' drops undecodable bytes instead of aborting (best effort).
    text = raw.decode(source_encoding, 'ignore')
    # Decoding happens before the file is reopened for writing, so a decode
    # failure cannot leave the file truncated.
    with codecs.open(filename, 'w', encoding='UTF-8-SIG') as dst:
        dst.write(text)
遇到的一些问题
- chardet 安装不生效:在 Windows 上直接双击压缩包里的 setup.py 不生效,需要在命令行中运行 python setup.py install。
遍历文件夹,转换所有文件为utf-8编码。
基于blog里的代码修改了一点。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
import os
import sys
import shutil
import re
import chardet
# File-name suffixes (lower case, including the dot) that check_need_convert
# accepts for conversion.
convertfiletypes = [".xml", ".lua", ".csd", ".py"]
def check_need_convert(filename):
    """Return True when ``filename`` ends with a suffix in ``convertfiletypes``.

    The file name is lower-cased before the comparison; the suffixes are
    compared as listed.
    """
    lowered = filename.lower()
    return any(lowered.endswith(suffix) for suffix in convertfiletypes)
# Running totals updated by convert_encoding_to_utf_8 and reported by
# convert_dir: files examined, files rewritten, files with unknown encoding.
total_cnt = success_cnt = unkown_cnt = 0
def convert_encoding_to_utf_8(filename):
    """Rewrite ``filename`` in place as UTF-8 with a BOM and update counters.

    The source encoding is guessed with ``chardet``. ``total_cnt`` is
    incremented for every file examined, ``unkown_cnt`` when no encoding can
    be detected (the file is then left untouched), and ``success_cnt`` after
    the file has been rewritten.

    WARNING: the original file is overwritten; no backup copy is made.
    """
    global total_cnt, success_cnt, unkown_cnt
    # Read raw bytes so chardet sees the data exactly as stored on disk.
    with open(filename, 'rb') as src:
        raw = src.read()
    source_encoding = chardet.detect(raw)['encoding']
    total_cnt += 1
    if source_encoding is None:
        print("?? %s" % filename)
        unkown_cnt += 1
        return
    print("  %s %s" % (source_encoding, filename))
    # Always decode before writing, including for UTF-8 input: passing the
    # undecoded bytes to the encoding writer fails on non-ASCII content, and
    # decoding 'UTF-8-SIG' strips the existing BOM so it is not duplicated.
    # 'ignore' drops undecodable bytes instead of aborting (best effort).
    text = raw.decode(source_encoding, 'ignore')
    # Decoding happens before the file is reopened for writing, so a decode
    # failure cannot leave the file truncated.
    with codecs.open(filename, 'w', encoding='UTF-8-SIG') as dst:
        dst.write(text)
    success_cnt += 1
def convert_dir(root_dir):
    """Convert every matching file below ``root_dir`` to UTF-8 with a BOM.

    Walks the directory tree and passes each file accepted by
    ``check_need_convert`` to ``convert_encoding_to_utf_8``. A failure on one
    file is reported with a ``WA`` prefix and does not stop the walk. The
    global counters are printed once the walk is finished.

    Returns None; prints an error and returns early when ``root_dir`` does
    not exist.
    """
    if not os.path.exists(root_dir):
        print("[error] dir: %s does not exist" % root_dir)
        return
    # Report the directory actually passed in, so the function also works
    # when it is called without the script-level ``convertdir`` being set.
    print("work in %s" % root_dir)
    for root, dirs, files in os.walk(root_dir):
        for f in files:
            if not check_need_convert(f):
                continue
            filename = os.path.join(root, f)
            try:
                convert_encoding_to_utf_8(filename)
            except Exception as e:
                # Best effort: report the failing file and keep going.
                print("WA %s %s" % (filename, e))
    print("finish total: %s success: %s unkown_cnt %s"
          % (total_cnt, success_cnt, unkown_cnt))
if __name__ == '__main__':
    # Usage: script.py <root_dir>
    if len(sys.argv) != 1:
        convertdir = sys.argv[1]
        convert_dir(convertdir)
    else:
        # No directory given: show the error as an input prompt and wait for
        # Enter (presumably so the message stays visible), then exit with -1.
        raw_input("[error] need root dir")
        sys.exit(-1)