Source code for claf.tokens.tokenizer.utils


import spacy


def create_tokenizer_with_regex(nlp, split_regex):
    """Build a spaCy Tokenizer that uses `split_regex` as the infix pattern."""
    prefixes_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes)
    infix_re = split_regex
    suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)

    return spacy.tokenizer.Tokenizer(
        nlp.vocab,
        nlp.Defaults.tokenizer_exceptions,
        prefix_search=prefixes_re.search,
        infix_finditer=infix_re.finditer,
        suffix_search=suffix_re.search,
        token_match=None,
    )
def load_spacy_model_for_tokenizer(split_regex):
    """Load `en_core_web_sm` with unused pipes disabled and, if `split_regex`
    is given, replace its tokenizer with one built by `create_tokenizer_with_regex`."""
    model = spacy.load(
        "en_core_web_sm", disable=["vectors", "textcat", "tagger", "parser", "ner"]
    )
    if split_regex is not None:
        spacy_tokenizer = create_tokenizer_with_regex(model, split_regex)
        model.tokenizer = spacy_tokenizer
    return model
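
A minimal usage sketch, not part of the module: it assumes `split_regex` is a compiled regular expression, since `create_tokenizer_with_regex` passes it directly as the infix pattern and calls its `finditer` method.

    import re

    # Sketch only: split on hyphens in addition to spaCy's default rules.
    # The pattern must be pre-compiled because `.finditer` is used as the infix finder.
    hyphen_regex = re.compile(r"-")

    nlp = load_spacy_model_for_tokenizer(hyphen_regex)
    doc = nlp("a well-known state-of-the-art tokenizer")
    print([token.text for token in doc])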