信息内容安全实验-TF-IDF信息检索

题目：针对“语料库.txt”文件，实现基于TF-IDF的检索系统。（每一行看做是一个文档）
基于TF-IDF的检索系统，
输入不超过8个字的短语，系统首先自动进行分词，然后按照这些词语在所有文档中的TF-IDF值求和、排序，返回前10项结果。
提示：注意检索速度，提前计算每个文档的词频并存储。

先利用实验一分词完成的结果，对每一行进行分词统计词频，并记录

image.png

然后利用实验一的FMM对输入的短语进行分词。分词完成后先计算idf，因为输入确定后，idf就确定了。计算idf时，对分词得到的每个词逐行遍历词频文件，若某一行包含该词，则该词的文档频数加1；最后用总行数除以文档频数，并取以2为底的对数。

image.png

然后计算tf，tf每一行都不一样，需要利用前面的每行的统计词频来计算，最后计算tf-idf，将前十大的数值放进结果列表中，并记录行数。

image.png

最后输出前十符合的句子

image.png

完整代码：

import re
import math
#process_1.txt分词完成
#process_2.txt去重完成
#result.txt每行的词频统计结果
#end_result.txt 实验一去重排序的文本
def quchong():
    """Write a de-duplicated copy of every segmented line.

    Reads ``process_1.txt`` line by line, discards the first
    space-separated field of each line and writes the remaining words to
    ``process_2.txt`` with repeats removed (first-occurrence order is kept).
    Each kept word is echoed to stdout and is followed by one space in the
    output file. Exactly 23062 lines are processed; reading past the end of
    the input simply produces empty output lines.
    """
    with open("process_1.txt", 'r', encoding='UTF-8') as src, \
            open("process_2.txt", 'w', encoding='UTF-8') as dst:
        for _ in range(23062):
            fields = src.readline().replace("\n", "").split(" ")
            # dict keys preserve insertion order, so this is a stable
            # de-duplication of everything after the first field.
            for word in dict.fromkeys(fields[1:]):
                print(word)
                dst.write(word + " ")
            dst.write("\n")
def count():  # count word frequencies for every line
    """Write the per-line frequency of each distinct word to ``result.txt``.

    For each of 23062 lines, the occurrences come from ``process_1.txt``
    (first space-separated field dropped) and the list of words to count
    comes from the matching line of ``process_2.txt``. Every word is written
    as ``word:count`` followed by a space, and the same ``word:count`` text
    is echoed to stdout.
    """
    with open("process_1.txt", 'r', encoding='UTF-8') as segmented, \
            open("process_2.txt", 'r', encoding='UTF-8') as distinct, \
            open("result.txt", "w", encoding="UTF-8") as out:
        for _ in range(23062):
            occurrences = segmented.readline().replace("\n", "").split(" ")[1:]
            candidates = distinct.readline().replace("\n", "").split(" ")
            for word in candidates:
                entry = "{}:{}".format(word, occurrences.count(word))
                print(entry)
                out.write(entry + " ")
            out.write("\n")
# Load the vocabulary consulted by getword(): the whole of end_result.txt is
# read, every ':' together with the digits that follow it is stripped, and the
# remaining text is split into one entry per line.
with open("end_result.txt", "r", encoding="UTF-8") as f:
    temp = f.read()
temp1 = re.sub(r':\d*', "", temp)
word_dict = temp1.split("\n")

def getword(text):
    """Return the longest prefix of *text* that is a known word.

    The candidate is shortened one character at a time from the right until
    it is found in ``word_dict``. A single remaining character is returned
    as-is even when it is not in the vocabulary, and an empty input yields
    an empty string.
    """
    if not text:
        return ""
    candidate = text
    # Stop as soon as the candidate is a vocabulary word or cannot shrink
    # any further.
    while len(candidate) > 1 and candidate not in word_dict:
        candidate = candidate[:-1]
    return candidate
def FMM():
    """Prompt for a phrase and segment it by forward maximum matching.

    At each step the next (up to) 15 characters are handed to ``getword``
    and the segment it returns is consumed from the front of the input.

    Returns:
        The segments joined into one string, each followed by ``' / '``.
        Segments consisting only of whitespace are consumed but not emitted.
    """
    window = 15
    remaining = input("输入不超过八个字短语:")
    pieces = []
    while remaining != "":
        segment = getword(remaining[:window])
        if segment.strip():
            pieces.append(segment + ' / ')
        remaining = remaining[len(segment):]
    return "".join(pieces)



def idf(text):  # compute idf
    """Return the inverse document frequency of every term in *text*.

    Args:
        text: Segmented query as produced by ``FMM``: terms separated by
            ``" / "``. A trailing separator yields one blank term.

    Returns:
        A list aligned with ``text.split(" / ")``. Each entry is
        ``log2(N / df)``, where ``N`` is the number of lines in
        ``result.txt`` and ``df`` is the number of those lines that list the
        term. Blank terms and terms found in no line get ``0``.

    Raises:
        OSError: if ``result.txt`` cannot be opened.
    """
    words = text.replace("'", "").split(" / ")
    # One counter per distinct non-blank term, so a repeated query term gets
    # the same document frequency at every position it occupies.
    doc_freq = {word: 0 for word in words if word}
    total_docs = 0
    with open("result.txt", 'r', encoding="UTF-8") as f:
        for line in f:
            total_docs += 1
            # Entries look like "word:count". Only the part before the last
            # ':' is a word; the counts must never be matched as terms.
            present = {
                entry.rpartition(":")[0]
                for entry in line.rstrip("\n").split(" ")
            }
            for word in doc_freq:
                if word in present:
                    doc_freq[word] += 1
    # The result is sized from the query, not fixed at eight slots, so the
    # blank term left by the trailing separator cannot overflow it.
    return [
        math.log2(total_docs / doc_freq[word]) if doc_freq.get(word) else 0
        for word in words
    ]

def _tf_term_counts(line):
    """Parse one ``result.txt`` line into a ``{word: count}`` dict."""
    counts = {}
    for entry in line.rstrip("\n").split(" "):
        word, _, number = entry.rpartition(":")
        # Skip blank words and malformed entries instead of crashing.
        if word and number.isdecimal():
            counts[word] = int(number)
    return counts


def tf(text, idf):  # compute tf-idf scores and print the best matches
    """Score every document against the query and print the top ten.

    Args:
        text: Segmented query as produced by ``FMM``: terms separated by
            ``" / "``.
        idf: IDF weights aligned with ``text.split(" / ")``.

    A document's score is the sum over the query terms of ``tf * idf``,
    where ``tf`` is the term's count on that line of ``result.txt`` divided
    by the total number of word occurrences on the line. Documents with a
    positive score are ranked from highest to lowest (ties keep file order)
    and the ten best are printed as ``<score>   <line of 语料库.txt>``
    followed by a separator line. The parsed query terms are printed first.

    Raises:
        OSError: if ``result.txt`` or ``语料库.txt`` cannot be opened.
    """
    words = text.replace("'", "").split(" / ")
    print(words)
    # Pair each term with its weight; the blank term produced by a trailing
    # separator carries no information and is dropped.
    weighted_terms = [(word, weight) for word, weight in zip(words, idf) if word]

    scored = []
    with open("result.txt", 'r', encoding="UTF-8") as f:
        # Iterate over the whole file rather than a hard-coded line count so
        # that every indexed document is considered.
        for line_no, line in enumerate(f):
            counts = _tf_term_counts(line)
            doc_length = sum(counts.values())
            if not doc_length:
                continue
            score = sum(
                counts.get(word, 0) / doc_length * weight
                for word, weight in weighted_terms
            )
            # Only real matches are kept, so no placeholder document can
            # leak into the output when fewer than ten lines match.
            if score > 0:
                scored.append((score, line_no))

    scored.sort(key=lambda hit: (-hit[0], hit[1]))
    top_hits = scored[:10]

    # Fetch the original text of just the winning lines.
    wanted = {line_no for _, line_no in top_hits}
    documents = {}
    with open("语料库.txt", 'r', encoding="UTF-8") as f:
        for line_no, line in enumerate(f):
            if line_no in wanted:
                documents[line_no] = line

    for score, line_no in top_hits:
        print(str(score) + "   " + documents.get(line_no, ""))
        print("-----------------------------------------------")
def main():
    """Run one interactive query: segment the input, weight it, print the hits."""
    segmented_query = FMM()
    tf(segmented_query, idf(segmented_query))


if __name__ == '__main__':
    main()