# -*- coding:utf-8 -*-
import codecs
import os
import shutil
import jieba
import jieba.analyse
# Read each numbered text file, segment it with jieba, and write the result.
# Input/output directories (hard-coded Windows paths from the original script).
path = "E:\\python\\"
respath = "E:\\fc\\"


def _load_stopwords(stopword_file):
    """Load one stopword per line into a set for O(1) membership tests.

    The original opened this file without closing it; a ``with`` block
    fixes the handle leak.  Encoding is assumed UTF-8 — TODO confirm the
    actual encoding of the HIT stopword list file.
    """
    with codecs.open(stopword_file, 'r', encoding='utf-8') as f:
        return {line.strip() for line in f}


def _segment_line(line, stopwords):
    """Segment one line with jieba in accurate mode.

    Drops stopwords and single-character tokens (as the original did) and
    returns the kept tokens, each followed by a single space — matching the
    original's ``output += seg; output += ' '`` accumulation exactly.
    """
    seglist = jieba.cut(line, cut_all=False)  # accurate (non-full) mode
    return ''.join(seg + ' '
                   for seg in seglist
                   if seg not in stopwords and len(seg) > 1)


def main():
    """Segment ``1.txt`` and ``2.txt`` from ``path`` into ``respath``."""
    # Recreate the output directory from scratch; ignore_errors=True keeps
    # the original's tolerant rmtree behavior.
    if os.path.isdir(respath):
        shutil.rmtree(respath, True)
    os.makedirs(respath)

    # Stopword list (HIT stopword table).
    stopwords = _load_stopwords("E:\\哈工大停用词表.txt")

    for num in range(1, 3):  # files 1.txt and 2.txt, as in the original
        fileName = path + str(num) + ".txt"
        resName = respath + str(num) + ".txt"
        # NOTE(review): input assumed UTF-8 — the original used the platform
        # default encoding, which corrupts Chinese text on Windows; confirm.
        with codecs.open(fileName, 'r', encoding='utf-8') as source, \
                codecs.open(resName, 'w', encoding='utf-8') as result:
            for line in source:
                output = _segment_line(line.rstrip('\r\n'), stopwords)
                print(output)
                # Bug fix: the original wrote no newline, fusing every
                # segmented line into one long output line.
                result.write(output + '\n')
        print('End file: ' + str(num))
    print('End All')


if __name__ == "__main__":
    main()