文本预处理——去停用词
停用词文本可以从 https://pan.baidu.com/s/1q21hIK95QU9qDstptd8V8g 自行下载,不谢。
该停用词文本转自 https://blog.csdn.net/FontThrone/article/details/74200026 ,笔者尚未创建新的停用词表,后续更新……
# -*- coding: utf-8 -*-
import sys
# 获取停用词的List
def GetListOfStopWords(filepath):
    """Read a stop-word file and return its lines as a list of strings.

    The file is decoded as UTF-8. A leading byte-order mark (as written by
    some Windows editors) is skipped, so it does not end up attached to the
    first stop word and prevent that word from ever matching.

    The text is split on the newline character only, so a file that ends
    with a newline still yields a trailing empty string.

    Args:
        filepath: Path of the stop-word file, one entry per line.

    Returns:
        list: The entries of the file in file order.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # 'utf-8-sig' behaves like 'utf-8' but drops an optional leading BOM.
    with open(filepath, encoding='utf-8-sig') as f_stop:
        return f_stop.read().split('\n')
# 保存List
# def SaveFile(list, filename):
# f_stop = open(filename, 'w', encoding='utf-8')
# for item in range(len(list)):
# if item != len(list):
# f_stop.writelines((list[item].encode('utf-8')) + '\n')
# else:
# f_stop.writelines(list[item].encode('utf-8'))
# f_stop.close()
# 求List并集
def GetListUnion(listName):
    """Merge several stop-word files into one duplicate-free list.

    Every path in ``listName`` is loaded with ``GetListOfStopWords``; the
    entries are pooled together with the seed entry '!' and de-duplicated
    through a set, so the order of the returned list is unspecified.

    Args:
        listName: Iterable of stop-word file paths.

    Returns:
        list: The distinct entries found across all files, plus '!'.
    """
    pooled = ['!'] + [
        word
        for path in listName
        for word in GetListOfStopWords(path)
    ]
    return list(set(pooled))
def GetStopWords(listOfFileName, FileName='CNstopwords.txt', keynumber=1):
    """Build the union of the given stop-word files and selected master lists.

    ``keynumber`` is a sum of flags choosing which master lists under the
    module-level ``stop_dir`` are merged in as well:

        1 -> 'CNstopwords.txt'    (Chinese master list)
        2 -> 'ENstopwords.txt'    (English master list)
        4 -> 'CNENstopwords.txt'  (mixed Chinese/English master list)

    so 3 selects Chinese + English, 5 Chinese + mixed, 6 English + mixed and
    7 all three. Any value outside 1..7 prints a warning and is treated as 1,
    which is what the warning text announces.

    Args:
        listOfFileName: Paths of the stop-word files to merge. The sequence
            is copied and is not modified.
        FileName: Not used by this function; kept so existing calls that pass
            it keep working.
        keynumber: Flag sum in the range 1..7 as described above.

    Returns:
        list: The distinct stop words from all selected files, as returned by
        ``GetListUnion`` (order unspecified).
    """
    # One master list per flag bit, in the order they are appended.
    master_lists = (
        (1, stop_dir + 'CNstopwords.txt'),
        (2, stop_dir + 'ENstopwords.txt'),
        (4, stop_dir + 'CNENstopwords.txt'),
    )

    if keynumber not in range(1, 8):
        print('The keynumber is wrong,change keynumber to 1 ')
        keynumber = 1
    flags = int(keynumber)

    # Work on a copy so repeated calls do not keep growing the caller's list.
    paths = list(listOfFileName)
    paths.extend(path for bit, path in master_lists if flags & bit)

    return GetListUnion(paths)
# SaveFile(ListUnion, FileName)
# Directory that holds every stop-word list.
stop_dir = "./stopwords/"

# Chinese stop-word lists to merge.
stopwords_path1, stopwords_path2, stopwords_path3, stopwords_path4 = [
    stop_dir + name
    for name in (
        'stopwords1893.txt',
        'stopwords1229.txt',
        'stopwordshagongdakuozhan.txt',
        'stop_words_zh.txt',
    )
]

# English stop-word lists to merge.
stopwords_path5, stopwords_path6 = [
    stop_dir + name
    for name in ('stop_words_eng.txt', 'ENstopwords891.txt')
]

# Input paths handed to GetStopWords: Chinese lists first, then English.
listOfFileName = [
    stopwords_path1,
    stopwords_path2,
    stopwords_path3,
    stopwords_path4,
    stopwords_path5,
    stopwords_path6,
]

# keynumber=2 additionally merges in the English master list.
res = GetStopWords(
    listOfFileName,
    FileName=stop_dir + 'ENstopwords.txt',
    keynumber=2,
)
NLP入门:文本预处理(一)停用词
最后编辑于 :
©著作权归作者所有,转载或内容合作请联系作者
- 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
- 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
- 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...