一、发现串行串列数据
1.1 根据分隔符每行出现次数判断是否串列
根据分隔符出现的次数判断是否串列
#发现串行
with open(file,'r',encoding='utf-8') as file:
rows = file.readlines()
sep_cnt = rows[0].count('|')
num = 0
for i in rows:
cnt = i.count('|')
if cnt!=sep_cnt:
num = num + 1
print(num,'|',cnt,':',i)
1.2 知道在第几行串列
pandas 在读取时会报错,错误会告知在第几行多一个列,可以根据错误查看具体行的数据
pandas.errors.ParserError: Error tokenizing data. C error: Expected 35 fields in line 191072, saw 36
1.2.1 pandas 读取串列行
import pandas as pd
file ='tmp.csv'
#读取错误行
dat = pd.read_table(file, sep='|',encoding = "utf8",
dtype=str,skiprows=191072, nrows=2 ,low_memory=False,
header=None
)
1.2.2 linecache 读取串列行
import linecache
error_line= linecache.getline(file,191073)
二、替换串列数据
2.1 替换第n次出现的某个字符
def nth_repl(s, sub, repl, n):
"""
替换第n次出现的字符
:s:字符串
:sub:被替换的字符串
:repl:替换为的新字符串
:n:字符串第n次出现的次数
-------------------------
替换第7次出现的位置
nth_repl(z,'|','_',7)
"""
find = s.find(sub)
# If find is not -1 we have found at least one match for the substring
i = find != -1
# loop util we find the nth or we find no match
while find != -1 and i != n:
# find + 1 means we start searching from after the last match
find = s.find(sub, find + 1)
i += 1
# If i is equal to n we found nth match so replace
if i == n:
return s[:find] + repl + s[find+len(sub):]
return s
参考:https://stackoverflow.com/questions/35091557/replace-nth-occurrence-of-substring-in-string
2.2 替换空字符
#替换\x00 与空字符串(\r,\n,\t)
z=re.sub('\x00','',re.sub('\s','',count))
2.3 逐行替换
列表效率更高
#清洗数据
def file_sub(old_file,new_file):
file_data = [] # 初始化
with open(old_file, "r", encoding="utf-8") as f:
print('开始替换...')
rows = f.readlines()
sep_cnt = rows[0].count('|')
for line in tqdm(rows): # line一行行读取替换文本
cnt = line.count('|')
if cnt!=sep_cnt:
a = re.sub('\x00','',re.sub('\s','',line))
a = nth_repl(a,'|','_',30)
#如果仍然大于30个,继续替换
while a.count('|')>sep_cnt:
a = nth_repl(a,'|','_',30)
else :
a = re.sub('\x00','',re.sub('\s','',line))
file_data.append(a)
with open(new_file, "w", encoding="utf-8") as f: # 写入替换好的文本
print('写入替换文本...')
for line in tqdm(file_data):
f.write(line + '\n')
print('批量替换完成')
def main():
file_sub('hdx_finance_20211130_f.csv','hdx_finance_20211130_new.csv')