Implement forward maximum matching (FMM) and backward maximum matching (BMM) word segmentation based on a corpus.
Requirements:
1. Read the file 语料库.txt.
Extract the vocabulary: every distinct word and its frequency. Output format:
one word and its frequency per line, sorted by frequency in ascending order, e.g.:
今天:100
中国:200
Remove the segmentation and part-of-speech tags from the corpus.
2. Based on this vocabulary, write FMM and BMM programs.
Input: unsegmented text (a passage or a single string),
Output: the segmentation result.
Implement in Python.
Step 1: word frequency statistics
(Screenshot of the corpus)
At first I wanted to do the substitution with the replace function combined with regular expressions to get it done quickly, but str.replace does not accept regex patterns (it only replaces literal substrings), so I gave up on that.
(Screenshot: details of the replace function)
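A quick illustration of the problem (the sample line is made up, shaped like a corpus line): str.replace searches for the literal characters of the pattern, which never occur, so nothing is replaced.

line = "19980101-01-001-001/m  迈向/v"
# str.replace looks for the literal text '\d{8}-...' and finds no match:
print(line.replace(r"\d{8}-\d{2}-\d{3}-\d{3}/m", ""))  # prints the line unchanged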
Searching further, I found the function re.sub():
re.sub(pattern, repl, string, count=0, flags=0)
The first three parameters are required.
pattern: the regular expression to match
repl: the replacement text, a string or a function
string: the string to be processed
count: replace at most this many matches (0, the default, replaces them all)
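A minimal demonstration of the signature (the sample strings are made up):

import re

print(re.sub(r"\d+", "#", "a1 b22 c333"))           # 'a# b# c#'   (all matches)
print(re.sub(r"\d+", "#", "a1 b22 c333", count=2))  # 'a# b# c333' (first two only)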
First, write the regular expression for the timestamp that appears at the start of each corpus line: \d{8}-\d{2}-\d{3}-\d{3}/m
Then a character class for the other characters that occur besides Chinese characters and digits: [/a-z!。”“,、——\[\]():《》……A-Z?]
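Combining the two patterns with | and applying them to a made-up line in the corpus's shape shows the effect: the timestamp and part-of-speech tags vanish, the words survive.

import re

line = "19980101-01-001-001/m  迈向/v  充满/v  希望/a  的/u  新/a  世纪/n"
cleaned = re.sub(r'\d{8}-\d{2}-\d{3}-\d{3}/m|[/a-z!。”“,、——\[\]():《》……A-Z?]',
                 "", line)
print(cleaned)  # '  迈向  充满  希望  的  新  世纪'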
After segmentation the duplicate words have to be removed, and then each unique word is counted against the original segmented result.
To drop the duplicates I used dict.fromkeys(): adding the words in the list as dictionary keys discards the repeats. Without deduplication the amount of computation would be enormous.
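A small sketch of the dedup step (the word list is made up). For comparison, collections.Counter would compute all the frequencies in one pass, avoiding a list.count scan per unique word:

from collections import Counter

words = ["今天", "中国", "今天", "天气", "中国", "今天"]

# dict keys are unique and keep first-seen order, so this deduplicates:
print(list(dict.fromkeys(words)))  # ['今天', '中国', '天气']

# Counter counts everything in a single pass over the list:
print(Counter(words))              # Counter({'今天': 3, '中国': 2, '天气': 1})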
The complete code is as follows:
import re

max_length = 15

def string_process(x):  # clean one line of the corpus
    # The first alternative matches the timestamp at the start of a line;
    # the character class strips POS tags, Latin letters and punctuation.
    a = re.sub(r'\d{8}-\d{2}-\d{3}-\d{3}/m|[/a-z!。”“,、——\[\]():《》……A-Z?]', "", x)
    b = a.replace("  ", " ")  # collapse double spaces; called repeatedly below
    return b.rstrip()

'''
process_1.txt holds the cleaned, still-segmented data
process_2.txt holds the deduplicated word list
result.txt holds the final word:frequency result
'''
def file_process():  # strip the extra characters from the corpus and write a clean copy
    s = ""
    with open('语料库.txt', 'r', encoding='UTF-8') as f:
        for line1 in f:
            a = string_process(line1)
            a = string_process(a)  # apply repeatedly so longer runs of spaces collapse too
            a = string_process(a)
            s += a + "\n"
    with open("process_1.txt", 'w', encoding='UTF-8') as w:
        w.write(s)
def count_string_0():  # remove the duplicate words
    with open("process_1.txt", 'r', encoding='UTF-8') as f, open("process_2.txt", 'w', encoding='UTF-8') as w:
        words = f.read().split(" ")
        dict_temp = dict.fromkeys(words)  # dict keys drop the repeats
        for word in dict_temp:
            w.write(word + " ")
def count_string_1():  # count the frequency of every unique word
    total_words_list0 = []
    total_words_list1 = []
    with open("process_1.txt", 'r', encoding='UTF-8') as f:
        total_words_list0 = f.read().split(" ")
        for word in total_words_list0:
            total_words_list1.append(word.rstrip())
    with open("process_2.txt", 'r', encoding="UTF-8") as f1, open("result.txt", "w", encoding="UTF-8") as w:
        words_list = f1.read().split(" ")
        for word in words_list:
            new_count_word = word.rstrip()
            count_num = total_words_list1.count(new_count_word)  # one O(n) scan per unique word
            print("{}:{}".format(new_count_word, count_num))
            w.write("{}:{}\n".format(new_count_word, count_num))
def sort_result():  # sort the words by ascending frequency
    count_dict = {}
    with open("result.txt", 'r', encoding='UTF-8') as f:
        for line in f:  # one 'word:frequency' pair per line
            temp_list = line.split(":")
            count_num = temp_list[-1].replace("\n", "")
            count_dict[temp_list[0]] = int(count_num)
    count_dict_result = sorted(count_dict.items(), key=lambda d: d[1])
    with open("end_result.txt", "w", encoding="UTF-8") as w:
        for word, num in count_dict_result:
            # keep the required 'word:frequency' format so the dictionary
            # can be reloaded below by stripping ':\d*'
            w.write("{}:{}\n".format(word, num))
with open("end_result.txt","r",encoding="UTF-8") as f:
temp=f.read()
temp1=re.sub(r':\d*',"",temp)
word_dict=temp1.split("\n")
def getword(text):  # longest prefix of text that appears in word_dict
    if len(text) == 0:
        return ""
    if len(text) == 1:
        return text
    if text in word_dict:
        return text
    else:
        return getword(text[0:len(text) - 1])  # drop the last character and retry
def getword_bmm(text):  # longest suffix of text that appears in word_dict
    if len(text) == 0:
        return ""
    if len(text) == 1:
        return text
    if text in word_dict:
        return text
    else:
        return getword_bmm(text[1:])  # drop the first character and retry
def FMM():  # forward maximum matching on a phrase typed by the user
    max_length = 15
    test_str = input("Enter a phrase of no more than eight characters: ")
    result_str = ""
    while test_str != "":
        tmp_str = test_str[0:max_length]  # at most max_length characters
        seg_str = getword(tmp_str)        # longest dictionary prefix
        seg_len = len(seg_str)
        if seg_str.strip():
            result_str = result_str + seg_str + ' / '
        test_str = test_str[seg_len:]
    print(result_str + "\n")
def BMM():  # backward maximum matching on the first ten lines of input.txt
    max_length = 15
    with open("input.txt", "r", encoding="UTF-8") as f, open("output1.txt", "w", encoding="UTF-8") as w:
        for i in range(0, 10):
            result_str = ""
            test_str = f.readline().rstrip("\n")
            while test_str != "":
                tmp_str = test_str[-max_length:]  # at most the last max_length characters
                seg_str = getword_bmm(tmp_str)    # longest dictionary suffix
                seg_len = len(seg_str)
                if seg_str.strip():
                    result_str = seg_str + ' / ' + result_str
                test_str = test_str[0:len(test_str) - seg_len]
            w.write(result_str + "\n")
def main():
    FMM()

if __name__ == '__main__':
    main()
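A quick sanity check of the matching helper, assuming a tiny hand-built dictionary instead of the one loaded from end_result.txt (the sample words and the segment_fmm helper are made up for the test):

word_dict = ["今天", "天气", "真好"]  # overrides the loaded dictionary for the test

def segment_fmm(text):  # same loop as FMM(), but on a given string
    result = []
    while text:
        seg = getword(text[0:max_length])
        result.append(seg)
        text = text[len(seg):]
    return " / ".join(result)

print(segment_fmm("今天天气真好"))  # -> '今天 / 天气 / 真好'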