BM25 代码(未完待续)

from gensim.summarization import bm25

class BM25Util(object):
    """Small convenience wrapper around gensim's ``bm25.BM25`` scorer.

    On construction it builds the BM25 index for a corpus and caches the mean
    of the index's IDF values, which is later handed to ``get_scores``.
    """

    def __init__(self, corpus):
        """Index ``corpus`` and precompute the average IDF.

        Args:
            corpus: Passed unchanged to ``bm25.BM25``; presumably a list of
                tokenized documents (lists of terms) -- confirm with caller.
        """
        self.bm25 = bm25.BM25(corpus)
        idf_table = self.bm25.idf
        # Mean IDF across every key of the idf table.
        idf_total = sum(float(idf_table[term]) for term in idf_table.keys())
        self.average_idf = idf_total / len(idf_table.keys())

    def similarity(self, query, size):
        """Return the ``size`` best-scoring documents for ``query``.

        Args:
            query: Forwarded to ``BM25.get_scores`` as the document to score.
            size: Upper bound of the slice taken from the ranked list.

        Returns:
            A list of ``(document_index, score)`` tuples ordered by
            descending score. Ties keep their original index order because
            the sort is stable.
        """
        # NOTE(review): get_scores is called with an explicit average_idf;
        # confirm the installed gensim version's signature accepts it.
        doc_scores = self.bm25.get_scores(query, self.average_idf)
        ranked = list(enumerate(doc_scores))
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked[:size]
import time
import codecs
from utils.bm25_util import BM25Util
from gensim.summarization import bm25

# When true, BM25Model.__init__ caps the corpus at 500000 lines instead of
# 10000000, which shortens index construction while developing.
DEBUG_MODE = True

class BM25Model(object):
    """BM25 retrieval over a tab-separated context/response corpus.

    Each usable corpus line has the form ``context<TAB>response``. Contexts
    are whitespace-tokenized, filtered by ``word2id`` and indexed; responses
    are stored with their spaces removed at the same index as their context.
    """

    def __init__(self, corpus_file, word2id):
        """Load the corpus and build the BM25 index.

        Lines that do not split into exactly two tab-separated fields (for
        example blank lines) are skipped, so ``self.contexts`` and
        ``self.responses`` always stay aligned.

        Args:
            corpus_file: Path of a UTF-8 text file with one
                ``context<TAB>response`` pair per line.
            word2id: Container supporting ``in``; context tokens that are not
                members are dropped before indexing.
        """
        time_s = time.time()
        # Maximum number of corpus lines to read.
        size = 500000 if DEBUG_MODE else 10000000
        self.contexts = []
        self.responses = []
        with codecs.open(corpus_file, "r", "utf-8") as rfd:
            # Stream the file: readlines()[:size] would load every line into
            # memory before truncating.
            for line_no, line in enumerate(rfd):
                if line_no >= size:
                    break
                fields = line.strip().split("\t")
                if len(fields) != 2:
                    # Malformed line: skip it rather than abort the build.
                    continue
                context, response = fields
                self.contexts.append(
                    [w for w in context.split() if w in word2id]
                )
                self.responses.append(response.replace(" ", ""))
        self.bm25_inst = BM25Util(self.contexts)
        # NOTE(review): "%2.f" prints whole seconds (width 2, no decimals);
        # "%.2f" may have been intended -- confirm before changing.
        print("Time to build bm25 model: %2.f seconds." % (time.time() - time_s))

    def similarity(self, query, size=10):
        """Return the ``size`` best matches for ``query``.

        Args:
            query: Forwarded to the underlying ``similarity`` call.
            size: Maximum number of results to return (default 10).

        Returns:
            Whatever ``self.bm25_inst.similarity(query, size)`` returns.
        """
        # Honor the caller's size; it used to be ignored in favor of 10.
        return self.bm25_inst.similarity(query, size)

    def get_docs(self, sim_items):
        """Look up the contexts and responses for ranked results.

        Args:
            sim_items: Sequence of ``(document_index, score)`` pairs; only
                the index is used.

        Returns:
            A ``(docs, answers)`` tuple of parallel lists: the tokenized
            contexts and the stored responses for each index, in order.
        """
        docs = [self.contexts[id_] for id_, _ in sim_items]
        answers = [self.responses[id_] for id_, _ in sim_items]
        return docs, answers
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。