Source code for claf.machine.components.retrieval.tfidf


from pathlib import Path

from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import MatrixSimilarity, SparseMatrixSimilarity

from tqdm import tqdm

from claf.decorator import register


@register("component:tfidf")
class TFIDF:
    """
    TF-IDF document retrieval model

    Builds a gensim ``Dictionary`` over the tokenized texts, fits a gensim
    ``TfidfModel`` on their bag-of-words vectors and indexes the weighted
    documents with ``SparseMatrixSimilarity`` so that the closest texts to a
    query can be looked up.

    - Term Frequency - Inverse Document Frequency
        - log(tf + 1) * log((N - Nt + 0.5) / (Nt + 0.5))
        - NOTE(review): ``TfidfModel`` is constructed with its default
          arguments in ``init_model``; the formula above may not match
          gensim's default weighting -- confirm against the gensim docs.

    * Args:
        texts: collection of documents; iterated to build the corpus and
            indexed by integer position in ``get_closest``
        word_tokenizer: object exposing ``tokenize(text)``

    * Kwargs:
        k: the number of top k results
    """

    # File names used inside the directory given to save() / load().
    VOCAB_FNAME = "vocab.txt"
    TFIDF_FNAME = "tfidf.model"
    INDEX_FNAME = "similarities.index"

    def __init__(self, texts, word_tokenizer, k=1):
        super(TFIDF, self).__init__()

        self.texts = texts
        self.word_tokenizer = word_tokenizer
        self.k = k

    def init(self):
        """Tokenize every text, build the vocabulary, then fit model and index."""
        tokenized_texts = []
        for text in tqdm(self.texts, desc="make corpus (Tokenize)"):
            tokenized_texts.append(self.word_tokenizer.tokenize(text))

        self.vocab = Dictionary(tokenized_texts)
        self.init_model()

    def init_model(self):
        """
        Fit the TF-IDF model and build the similarity index.

        Requires ``self.vocab`` to be set already (``init`` does this before
        calling here), since ``parse`` and the index size both rely on it.
        """
        bow_corpus = [
            self.parse(text) for text in tqdm(self.texts, desc="make corpus (BoW)")
        ]

        self.model = TfidfModel(bow_corpus)
        self.index = SparseMatrixSimilarity(
            self.model[bow_corpus], num_features=len(self.vocab)
        )

    def get_closest(self, query):
        """
        Look up the texts most similar to ``query``.

        * Args:
            query: raw query text

        * Returns:
            list of ``(index, text, score)`` tuples, one per result returned
            by the similarity index
        """
        weighted_query = self.text_to_tfidf(query)

        # Limit the index to the configured number of results before querying.
        self.index.num_best = self.k

        closest = []
        for doc_id, similarity in self.index[weighted_query]:
            closest.append((doc_id, self.texts[doc_id], similarity))
        return closest

    def parse(self, query, ngram=1):
        """
        Tokenize ``query`` and convert the tokens to a bag-of-words vector
        using the vocabulary.

        ``ngram`` is accepted but not used by this implementation.
        """
        return self.vocab.doc2bow(self.word_tokenizer.tokenize(query))

    def text_to_tfidf(self, query):
        """
        Create a tfidf-weighted word vector from query.

        The query is converted to bag-of-words with ``parse`` and then passed
        through the fitted ``TfidfModel``.

        tfidf = log(tf + 1) * log((N - Nt + 0.5) / (Nt + 0.5))
        (NOTE(review): formula kept from the original documentation; the model
        is built with gensim defaults -- confirm it matches.)
        """
        return self.model[self.parse(query)]

    def save(self, dir_path):
        """
        Write the vocabulary, TF-IDF model and similarity index into
        ``dir_path``, creating the directory (and parents) when missing.
        """
        target_dir = Path(dir_path)
        target_dir.mkdir(parents=True, exist_ok=True)

        self.vocab.save(str(target_dir / self.VOCAB_FNAME))
        self.model.save(str(target_dir / self.TFIDF_FNAME))
        self.index.save(str(target_dir / self.INDEX_FNAME))

    def load(self, dir_path):
        """
        Restore the vocabulary, TF-IDF model and similarity index from
        ``dir_path``.

        Only these three attributes are set here; ``texts`` and the tokenizer
        are the ones given to the constructor.
        """
        source_dir = Path(dir_path)

        self.vocab = Dictionary.load(str(source_dir / self.VOCAB_FNAME))
        self.model = TfidfModel.load(str(source_dir / self.TFIDF_FNAME))
        self.index = SparseMatrixSimilarity.load(str(source_dir / self.INDEX_FNAME))