Source code for claf.tokens.tokenizer.base

class Tokenizer:
    """ Tokenizer Base Class """

    MAX_TO_KEEP_CACHE = 3

    def __init__(self, name, cache_name):
        self.cache = {}  # dict: {text: tokenized_tokens}
        self.name = name  # selects the concrete tokenize function: `_{name}`
        self.cache_name = cache_name
    def tokenize(self, text, unit="text"):
        # Return the cached result when this exact text was tokenized before.
        if isinstance(text, str) and text in self.cache:
            return self.cache[text]

        tokenized_tokens = self._tokenize(text, unit=unit)

        # Cache (keep at most MAX_TO_KEEP_CACHE entries; evict the oldest entry first).
        if len(self.cache) <= self.MAX_TO_KEEP_CACHE:
            self.cache[text] = tokenized_tokens
        else:
            first_key = next(iter(self.cache.keys()))
            del self.cache[first_key]
            self.cache[text] = tokenized_tokens

        return tokenized_tokens
    def _tokenize(self, text, unit="text"):
        """ Split text into tokens. """
        if not isinstance(text, str):
            raise ValueError(f"text must be str, not {type(text)}")

        # Dispatch to the tokenize function implemented by the subclass, named `_{self.name}`.
        return getattr(self, f"_{self.name}")(text, unit=unit)
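
Since `_tokenize` dispatches via `getattr(self, f"_{self.name}")`, a subclass provides a method whose name matches the `name` it was constructed with. The following is a minimal sketch of that pattern; the `WhitespaceTokenizer` class, the name "space", and the `_space` method are hypothetical and chosen here only for illustration, not part of the claf source above.

from claf.tokens.tokenizer.base import Tokenizer


class WhitespaceTokenizer(Tokenizer):
    """ Hypothetical subclass that splits text on whitespace. """

    def __init__(self):
        # name="space" makes Tokenizer._tokenize dispatch to self._space
        super().__init__(name="space", cache_name="space")

    def _space(self, text, unit="text"):
        # Concrete tokenize function resolved by getattr in Tokenizer._tokenize
        return text.split()


tokenizer = WhitespaceTokenizer()
print(tokenizer.tokenize("hello tokenizer world"))  # ['hello', 'tokenizer', 'world']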