Source code for claf.tokens.tokenizer.sent


import nltk.data

from .base import Tokenizer


class SentTokenizer(Tokenizer):
    """
    Sentence Tokenizer

    text -> [sent tokens]

    * Args:
        name: tokenizer name [punkt]
        config: optional tokenizer configuration dict (defaults to {})
    """

    def __init__(self, name, config=None):
        super(SentTokenizer, self).__init__(name, f"sent-{name}")
        # `config=None` instead of `config={}`: a mutable default argument is
        # shared across every call, so one caller mutating it would leak into
        # all later instances.
        self.config = config if config is not None else {}
        # Lazily-loaded NLTK punkt model; cached so the pickle is read from
        # disk at most once per instance instead of once per _punkt() call.
        self._sent_tokenizer = None

    # --- Tokenizers ---

    def _punkt(self, text, unit="text"):
        """
        Split `text` into sentences with NLTK's pre-trained English punkt model.

        ex) Hello World. This is punkt tokenizer
        -> ['Hello World', 'This is punkt tokenizer']

        * Args:
            text: raw text to split
            unit: unused here; kept for a uniform tokenizer-method signature
        * Returns:
            list of sentence strings
        """
        if self._sent_tokenizer is None:
            # Hoisted out of the per-call path: loading the pickle on every
            # call was the dominant cost of tokenizing.
            self._sent_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
        return self._sent_tokenizer.tokenize(text)