Source code for claf.tokens.tokenizer.word



import re

from overrides import overrides

from claf import utils as common_utils

from .base import Tokenizer


class WordTokenizer(Tokenizer):
    """
    Word Tokenizer

    * Args:
        name: tokenizer name [treebank_en|spacy_en|mecab_ko|bert_basic]

    * Kwargs:
        flatten: return tokens as a flattened list
        split_with_regex: post-split action; split tokens that the tokenizer cannot split.
    """

    def __init__(self, name, sent_tokenizer, config={}, split_with_regex=True):
        super(WordTokenizer, self).__init__(name, f"word-{name}+{sent_tokenizer.cache_name}")
        self.config = config
        self.sent_tokenizer = sent_tokenizer
        self.word_tokenizer = None  # backing tokenizer is created lazily on first use

        self.split_with_regex = split_with_regex
        if split_with_regex:
            self.extra_split_chars_re = self.make_split_regex_expression()

    def make_split_regex_expression(self):
        """
        Apply a small amount of extra splitting to the given tokens, in particular to avoid
        UNK tokens caused by contractions, quotation marks, or other forms of punctuation.
        I haven't really run tests to see if/how much difference this makes, but it does
        avoid some common UNKs I noticed in SQuAD/TriviaQA.
        """
        extra_split_chars = (
            "-", "£", "€", "¥", "¢", "₹", "*",
            "\u2212",  # minus sign
            "\u2014",  # em dash
            "\u2013",  # en dash
            "/", "~", '"', "'",
            "\ud01C",
            "\u2019", "\u201D", "\u2018",  # curly quotes
            "\u00B0",  # degree sign
            ".", ":",
        )
        extra_split_tokens = (
            "``",
            "(?<=[^_])_(?=[^_])",  # underscores without a preceding or following underscore, so __wow___ -> __ wow ___
            "''",
            "[" + "".join(extra_split_chars) + "]",
        )
        return re.compile("(" + "|".join(extra_split_tokens) + ")")

    @overrides
    def _tokenize(self, text, unit="text"):
        """ Text -> word tokens """
        if type(text) != str:
            raise ValueError(f"text type must be str, not {type(text)}")

        if unit == "sentence":
            tokens = getattr(self, f"_{self.name}")(text)
        else:
            # Split into sentences first, then word-tokenize each sentence.
            sentences = self.sent_tokenizer.tokenize(text)
            tokens = [getattr(self, f"_{self.name}")(sentence) for sentence in sentences]
            if self.split_with_regex and self.name != "spacy_en":
                tokens = self._split_with_regex(tokens)

        return list(common_utils.flatten(tokens))

    def _split_with_regex(self, sentences):
        for i, sentence in enumerate(sentences):
            sentences[i] = [token for token in self._post_split_tokens(sentence)]
        return sentences

    def _post_split_tokens(self, tokens):
        return [[x for x in self.extra_split_chars_re.split(token) if x != ""] for token in tokens]

    """ Tokenizers """

    def _space_all(self, text):
        # Whitespace-driven splitting: start a new token after any run of whitespace.
        def is_whitespace(c):
            if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
                return True
            return False

        prev_is_whitespace = True
        tokens = []
        for char in text:
            if is_whitespace(char):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    tokens.append(char)
                else:
                    tokens[-1] += char
                prev_is_whitespace = False
        return tokens

    def _treebank_en(self, text):
        if self.word_tokenizer is None:
            import nltk

            self.word_tokenizer = nltk.TreebankWordTokenizer()
        # Normalize Treebank-style quotes back to plain double quotes.
        return [
            token.replace("''", '"').replace("``", '"')
            for token in self.word_tokenizer.tokenize(text)
        ]

    def _spacy_en(self, text):
        if self.word_tokenizer is None:
            from claf.tokens.tokenizer.utils import load_spacy_model_for_tokenizer

            # The spaCy model is configured with the extra-split pattern directly,
            # so no regex post-split is applied for this tokenizer.
            self.word_tokenizer = load_spacy_model_for_tokenizer(self.extra_split_chars_re)

        def _remove_spaces(tokens):
            return [token.text for token in tokens if not token.is_space]

        return _remove_spaces(self.word_tokenizer(text))

    def _bert_basic(self, text):
        if self.word_tokenizer is None:
            from pytorch_transformers import BasicTokenizer

            self.word_tokenizer = BasicTokenizer(**self.config)
        return self.word_tokenizer.tokenize(text)

    def _mecab_ko(self, text):
        if self.word_tokenizer is None:
            from konlpy.tag import Mecab

            # Korean morpheme tokenization via KoNLPy's Mecab wrapper.
            self.word_tokenizer = Mecab()
        return self.word_tokenizer.morphs(text)
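
For reference, here is a minimal, standalone sketch of what the extra-split regex does to individual tokens. It rebuilds a reduced version of the pattern above (only a subset of extra_split_chars) rather than importing the class, and the sample tokens are illustrative only.

import re

# Reduced rebuild of the pattern produced by make_split_regex_expression
# (subset of extra_split_chars, kept short for illustration).
extra_split_chars_re = re.compile("(``|(?<=[^_])_(?=[^_])|''|[-\u2212\u2014\u2013/~\"'.:])")

def post_split(token):
    # Mirrors WordTokenizer._post_split_tokens for a single token:
    # split on the captured separators and drop empty pieces.
    return [piece for piece in extra_split_chars_re.split(token) if piece != ""]

print(post_split("state-of-the-art"))  # ['state', '-', 'of', '-', 'the', '-', 'art']
print(post_split("U.S."))              # ['U', '.', 'S', '.']
print(post_split("snake_case"))        # ['snake', '_', 'case']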
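
A rough usage sketch follows. It is not taken from claf's documentation: it assumes the package exposes a SentTokenizer companion class (e.g. with a "punkt" backend) and that the Tokenizer base class provides a public tokenize() wrapper around _tokenize(); check both against your installed claf version.

from claf.tokens.tokenizer import SentTokenizer, WordTokenizer  # assumed re-exports

sent_tokenizer = SentTokenizer("punkt")  # assumed constructor signature
word_tokenizer = WordTokenizer("treebank_en", sent_tokenizer, split_with_regex=True)

text = "The U.S. isn't a state-of-the-art example. It's just sample text."
tokens = word_tokenizer.tokenize(text)  # assumed public wrapper on the Tokenizer base class
print(tokens)  # flat list of word tokens; exact output depends on the NLTK backend and the extra-split regex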