题目:针对“语料库.txt”文件,实现基于TF-IDF的检索系统。(每一行看做是一个文档)
基于TF-IDF的检索系统,
输入不超过8个字的短语,系统首先自动进行分词,然后按照这些词语在各文档中的TF-IDF值之和进行排序,返回前10项结果。
提示:注意检索速度,提前计算每个文档的词频并存储。
先利用实验一分词完成的结果,对每一行进行分词统计词频,并记录
image.png
然后利用实验一的FMM来对输入的短语进行分词,分词完成后先计算idf,因为输入确定后,idf就确定了。计算idf时,对分词得到的每个词逐行遍历所有文档,若某一行包含该词,则该词的文档频数加1;最后取 idf = log2(总行数 / 文档频数)。
image.png
然后计算tf,tf每一行都不一样,需要利用前面统计好的每行词频来计算;最后计算tf-idf,将数值最大的前十项放进结果列表中,并记录对应的行号。
image.png
最后输出前十符合的句子
image.png
完整代码:
import heapq
import math
import re
from collections import Counter
#process_1.txt分词完成
#process_2.txt去重完成
#result.txt每行的词频统计结果
#end_result.txt 实验一去重排序的文本
def quchong(src_path="process_1.txt", dst_path="process_2.txt"):
    """Write the de-duplicated tokens of every segmented line.

    Each line of ``src_path`` is split on single spaces and its first field
    is skipped (presumably a document id -- TODO confirm against the real
    process_1.txt). The remaining tokens are written to ``dst_path`` in
    first-occurrence order, each followed by one space, with exactly one
    output line per input line.

    The file is processed to its real end rather than for a fixed number of
    iterations, so a corpus of any size is neither truncated nor padded with
    blank lines.

    Args:
        src_path: Segmented corpus, one space-separated document per line.
        dst_path: Destination for the de-duplicated token lines.
    """
    with open(src_path, 'r', encoding='UTF-8') as src, \
            open(dst_path, 'w', encoding='UTF-8') as dst:
        for line in src:
            tokens = line.replace("\n", "").split(" ")[1:]
            # dict.fromkeys keeps the first occurrence of each token, in order.
            unique_tokens = dict.fromkeys(tokens)
            # The trailing space after every token is part of the file format
            # that count() reads back, so it is preserved.
            dst.write("".join(token + " " for token in unique_tokens))
            dst.write("\n")
def count(tokens_path="process_1.txt", vocab_path="process_2.txt",
          out_path="result.txt"):
    """Store the per-line term counts as ``word:count`` entries.

    For every line of ``tokens_path`` (first field skipped, as in
    ``quchong``) the matching line of ``vocab_path`` supplies the words to
    report. Each word is written to ``out_path`` as ``word:count`` followed
    by one space, one output line per input line.

    The output format is unchanged, including the empty-word entry produced
    by the trailing space of each vocabulary line. The counting is done with
    a ``Counter`` (one pass per line instead of one ``list.count`` scan per
    word), and the files are read to their real end instead of for a fixed
    number of iterations.

    Args:
        tokens_path: Segmented corpus, one space-separated document per line.
        vocab_path: De-duplicated tokens per line, aligned with tokens_path.
        out_path: Destination for the ``word:count`` lines.
    """
    with open(tokens_path, 'r', encoding='UTF-8') as tokens_file, \
            open(vocab_path, 'r', encoding='UTF-8') as vocab_file, \
            open(out_path, "w", encoding="UTF-8") as out:
        for line in tokens_file:
            frequencies = Counter(line.replace("\n", "").split(" ")[1:])
            # readline() yields "" once the vocabulary file is exhausted, so
            # a short vocabulary file never drops a document line.
            vocabulary = vocab_file.readline().replace("\n", "").split(" ")
            out.write("".join(
                "{}:{} ".format(word, frequencies[word]) for word in vocabulary
            ))
            out.write("\n")
# Load the segmentation dictionary once, at import time.
# end_result.txt presumably holds one "word:count" entry per line (TODO
# confirm); every ":<digits>" run is removed so that only the words remain.
with open("end_result.txt", "r", encoding="UTF-8") as f:
    # A set gives O(1) membership tests; getword() probes this dictionary
    # once per candidate prefix.
    word_dict = set(re.sub(r':\d*', "", f.read()).split("\n"))
def getword(text, dictionary=None):
    """Return the longest prefix of ``text`` that is a dictionary word.

    The candidate is shortened one character at a time from the right until
    it is found in the dictionary or only one character is left. A single
    character (or the empty string) is returned as-is, without a lookup.

    Args:
        text: Candidate window taken from the start of the unsegmented text.
        dictionary: Container of known words supporting ``in``. Defaults to
            the module-level ``word_dict``.

    Returns:
        The matched prefix; ``""`` only when ``text`` is empty.
    """
    if dictionary is None:
        dictionary = word_dict
    # Iterative form of the original recursion: drop the last character
    # until the prefix is a known word or cannot be shortened further.
    while len(text) > 1 and text not in dictionary:
        text = text[:-1]
    return text
def FMM():
    """Prompt for a phrase and segment it by forward maximum matching.

    A window of up to 15 characters is taken from the front of the remaining
    input and handed to ``getword``; the returned word is consumed and the
    process repeats until nothing is left.

    Returns:
        The segments joined as ``"w1 / w2 / "``: every non-blank segment is
        followed by ``" / "``, including the last one.
    """
    window = 15
    segments = []
    remaining = input("输入不超过八个字短语:")
    while remaining != "":
        segment = getword(remaining[:window])
        # Whitespace-only segments are consumed but not emitted.
        if segment.strip():
            segments.append(segment + ' / ')
        remaining = remaining[len(segment):]
    return "".join(segments)
def idf(text, index_path="result.txt"):
    """Compute the inverse document frequency of every query term.

    ``text`` is the ``" / "``-separated output of ``FMM``. Every line of
    ``index_path`` is treated as one document made of space-separated
    ``word:count`` entries. A term's value is ``log2(N / df)``, where ``N``
    is the number of lines read and ``df`` the number of lines containing
    the term.

    Args:
        text: Segmented query, e.g. ``"w1 / w2 / "``.
        index_path: Per-document term-count file written by ``count``.

    Returns:
        A list aligned with ``text.replace("'", "").split(" / ")``, padded
        with zeros to at least 8 entries (the historical fixed length).
        Empty fields and terms that occur in no document get 0.
    """
    words = text.replace("'", "").split(" / ")
    # The trailing " / " of the query yields an empty field; it is not a term.
    targets = {word for word in words if word}
    doc_freq = dict.fromkeys(targets, 0)
    total_docs = 0
    with open(index_path, 'r', encoding="UTF-8") as index_file:
        for line in index_file:
            total_docs += 1
            present = set()
            for entry in line.rstrip("\n").split(" "):
                # Split on the LAST colon so only the word part is matched,
                # never the count.
                word, sep, _ = entry.rpartition(":")
                if sep and word in targets:
                    present.add(word)
            for word in present:
                doc_freq[word] += 1

    # Sized from the query so a phrase with more than 8 fields cannot
    # overflow the list.
    weights = [0] * max(8, len(words))
    for position, word in enumerate(words):
        hits = doc_freq.get(word, 0)
        if hits:
            # Each position gets its own value, so repeated terms are handled.
            weights[position] = math.log(total_docs / hits, 2)
    return weights
def tf(text, idf, index_path="result.txt", corpus_path="语料库.txt", top_n=10):
    """Rank documents by summed TF-IDF and print the best matches.

    Every line of ``index_path`` holds the ``word:count`` entries of one
    document. Line ``i`` is assumed to describe line ``i`` of
    ``corpus_path`` (TODO confirm the two files are aligned). A document's
    score is ``sum(count(term) / document_length * idf(term))`` over the
    query terms, where ``document_length`` is the sum of all counts on the
    line. Only documents with a positive score are reported.

    The whole index is scanned, results are printed from highest to lowest
    score (ties broken by line order), and empty slots are never reported
    as hits.

    Args:
        text: Segmented query as produced by ``FMM``, e.g. ``"w1 / w2 / "``.
        idf: IDF values aligned with ``text.replace("'", "").split(" / ")``;
            positions beyond its length count as 0.
        index_path: Per-document term-count file written by ``count``.
        corpus_path: Original corpus, one document per line.
        top_n: Maximum number of documents to report.

    Returns:
        The reported hits as ``(score, line_index, document_text)`` tuples
        in printed order; ``line_index`` is 0-based.
    """
    words = text.replace("'", "").split(" / ")
    print(words)

    # One weight per distinct non-empty term; a repeated term adds up.
    weights = {}
    for position, word in enumerate(words):
        if word and position < len(idf):
            weights[word] = weights.get(word, 0) + idf[position]

    scored = []
    with open(index_path, 'r', encoding="UTF-8") as index_file:
        for line_no, line in enumerate(index_file):
            counts = {}
            for entry in line.rstrip("\n").split(" "):
                # Split on the LAST colon so a word containing ":" survives.
                word, sep, number = entry.rpartition(":")
                if sep and word and number.isdigit():
                    counts[word] = int(number)
            doc_length = sum(counts.values())
            if not doc_length:
                continue
            score = sum(
                weight * counts.get(word, 0) for word, weight in weights.items()
            ) / doc_length
            if score > 0:
                scored.append((score, line_no))

    # Highest score first; the earlier document wins a tie.
    best = heapq.nlargest(top_n, scored, key=lambda hit: (hit[0], -hit[1]))

    wanted = {line_no for _, line_no in best}
    documents = {}
    with open(corpus_path, 'r', encoding="UTF-8") as corpus_file:
        for line_no, line in enumerate(corpus_file):
            if len(documents) == len(wanted):
                break
            if line_no in wanted:
                documents[line_no] = line

    results = []
    for score, line_no in best:
        document = documents.get(line_no, "")
        print(str(score) + " " + document)
        print("-----------------------------------------------")
        results.append((score, line_no, document))
    return results
def main():
    """Run one interactive query: segment it, weight it, print the ranking."""
    query = FMM()
    tf(query, idf(query))


if __name__ == '__main__':
    main()