Source code for claf.tokens.indexer.linguistic_indexer


from overrides import overrides
import spacy

from claf.tokens.linguistic import POSTag, NER

from .base import TokenIndexer


class LinguisticIndexer(TokenIndexer):
    """
    Linguistic Token Indexer

    Converts a text into one list of integer feature indices per token,
    using the linguistic features (POS tag, NER label) that were enabled
    at construction time.

    * Property
        vocab: Vocab (claf.tokens.vocabulary)

    * Args:
        tokenizer: WordTokenizer

    * Kwargs:
        pos_tag: POS Tagging (truthy to enable)
        ner: Named Entity Recognition (truthy to enable)
        dep: Dependency Parser (not implemented; a truthy value raises
            NotImplementedError)
    """

    def __init__(self, tokenizer, pos_tag=None, ner=None, dep=None):
        super(LinguisticIndexer, self).__init__(tokenizer)

        # The spaCy pipeline is loaded lazily on the first `_spacy_en` call.
        self.spacy_model = None

        # Features
        self.use_pos_tag = pos_tag
        self.pos_to_index = {t: i for i, t in enumerate(POSTag.classes)}

        self.use_ner = ner
        self.ner_to_index = {t: i for i, t in enumerate(NER.classes)}

        self.use_dep = dep
        if dep:
            raise NotImplementedError("Dependency Parser feature")

    @overrides
    def index(self, text):
        """
        Index `text` with the backend that matches the tokenizer's package.

        The call is dispatched to the method named `_<tokenizer.name>`
        (e.g. `_spacy_en`), so an unknown tokenizer name surfaces as an
        AttributeError from the attribute lookup.

        * Args:
            text: the text to index

        * Returns:
            whatever the selected backend method returns
        """
        package = self.tokenizer.name
        return getattr(self, f"_{package}")(text)

    # The method names below need to match the Tokenizer's package name,
    # because `index` looks them up dynamically.

    def _mecab_ko(self, text):
        """Not supported: always raises NotImplementedError."""
        raise NotImplementedError("Linguistic Feature with mecab package")

    def _nltk_en(self, text):
        """Not supported: always raises NotImplementedError."""
        raise NotImplementedError("Linguistic Feature with nltk package")

    def _load_spacy_model(self):
        """
        Build the spaCy pipeline used by `_spacy_en`.

        Pipeline components that are not needed for the enabled features are
        disabled, and the pipeline's tokenizer is replaced with the one built
        from the word tokenizer's `extra_split_chars_re`.
        """
        from claf.tokens.tokenizer.utils import load_spacy_model_for_tokenizer

        disables = ["vectors", "textcat", "parser"]
        if not self.use_pos_tag:
            disables.append("tagger")
        if not self.use_ner:
            disables.append("ner")

        model = spacy.load("en_core_web_sm", disable=disables)
        model.tokenizer = load_spacy_model_for_tokenizer(
            self.tokenizer.extra_split_chars_re
        )
        return model

    def _spacy_en(self, text):
        """
        Extract linguistic features from `text` with spaCy.

        The text is split with the tokenizer's sentence tokenizer and each
        sentence is run through the spaCy pipeline.

        * Args:
            text: the text to index

        * Returns:
            a list with one entry per non-whitespace token, in order; each
            entry is a list holding the POS index (if POS tagging is enabled)
            followed by the NER index (if NER is enabled)
        """
        if self.spacy_model is None:
            # Cache only a fully configured model, so a failure while
            # configuring it cannot leave a half-initialized pipeline behind.
            self.spacy_model = self._load_spacy_model()

        sent_tokenizer = self.tokenizer.sent_tokenizer
        sentences = sent_tokenizer.tokenize(text)

        # First pass: parse every sentence and collect entity labels keyed by
        # the entity's text, across all sentences of this input.
        ner_entities = {}
        docs = []
        for sentence in sentences:
            doc = self.spacy_model(sentence)
            docs.append(doc)

            if self.use_ner:
                for e in doc.ents:
                    ner_entities[e.text] = e.label_

        # Second pass: emit one feature list per non-whitespace token.
        linguistic_features = []
        for doc in docs:
            for token in doc:
                if token.is_space:
                    continue

                feature = []
                if self.use_pos_tag:
                    feature.append(self.pos_to_index[token.pos_])
                if self.use_ner:
                    # NOTE(review): the lookup is by exact token text, so a
                    # token that is only part of a longer entity span
                    # presumably falls back to "NONE" -- confirm this is the
                    # intended behavior.
                    feature.append(self.ner_to_index[ner_entities.get(token.text, "NONE")])
                linguistic_features.append(feature)
        return linguistic_features