Source code for claf.tokens


from claf.decorator import register
from claf.tokens import indexer, embedding
from claf.tokens.linguistic import POSTag, NER
from claf.tokens.token_maker import TokenMaker
from claf.tokens.tokenizer import PassText


def basic_embedding_fn(embedding_config, module):
    def wrapper(vocab):
        embedding_config["vocab"] = vocab
        return module(**embedding_config)

    return wrapper
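
# Hedged usage sketch (added for illustration; not part of the original
# module). basic_embedding_fn is a closure factory: the embedding module is
# not constructed until the Vocab exists, at which point the vocab is
# injected into the config. DummyEmbedding is a stand-in for a real class
# such as embedding.WordEmbedding.
def _demo_basic_embedding_fn():
    class DummyEmbedding:
        def __init__(self, vocab=None, embed_dim=0):
            self.vocab, self.embed_dim = vocab, embed_dim

    embed_fn = basic_embedding_fn({"embed_dim": 100}, DummyEmbedding)
    module = embed_fn("some_vocab")  # vocab is supplied only at call time
    assert module.embed_dim == 100 and module.vocab == "some_vocab"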
[docs]@register(f"token:{TokenMaker.FEATURE_TYPE}") class FeatureTokenMaker(TokenMaker): """ Feature Token Do not use Embedding function. Just pass indexed_feature example. hello -> ['hello', 'world'] -> [3, 5] -> tensor consisting of - tokenizer: Tokenizer (need to define unit) - indexer: WordIndexer - embedding: None - vocab: Vocab """ def __init__(self, tokenizers, indexer_config, embedding_config, vocab_config): tokenizer = PassText() do_tokenize = indexer_config.get("do_tokenize", False) if do_tokenize: text_unit = indexer_config.get("unit", None) if text_unit is None: raise ValueError("When use 'do_tokenize', 'unit' is required. ") del indexer_config["unit"] tokenizer = tokenizers[text_unit] super(FeatureTokenMaker, self).__init__( TokenMaker.FEATURE_TYPE, tokenizer=tokenizer, indexer=indexer.WordIndexer(tokenizer, **indexer_config), embedding_fn=None, vocab_config=vocab_config, )
[docs]@register(f"token:{TokenMaker.BERT_TYPE}") class BertTokenMaker(TokenMaker): """ BERT Token Pre-training of Deep Bidirectional Transformers for Language Understanding example. hello -> ['[CLS]', 'he', '##llo', [SEP]] -> [1, 4, 7, 2] -> BERT -> tensor consisting of - tokenizer: WordTokenizer - indexer: WordIndexer - embedding: ELMoEmbedding (Language Modeling BiLSTM) - vocab: Vocab """ def __init__(self, tokenizers, indexer_config, embedding_config, vocab_config): tokenizer = tokenizers["subword"] super(BertTokenMaker, self).__init__( TokenMaker.BERT_TYPE, tokenizer=tokenizer, indexer=indexer.BertIndexer(tokenizer, **indexer_config), embedding_fn=basic_embedding_fn(embedding_config, embedding.BertEmbedding), vocab_config=vocab_config, )
[docs]@register(f"token:{TokenMaker.CHAR_TYPE}") class CharTokenMaker(TokenMaker): """ Character Token Character-level Convolutional Networks for Text Classification (https://arxiv.org/abs/1509.01626) example. hello -> ['h', 'e', 'l', 'l', 'o'] -> [2, 3, 4, 4, 5] -> CharCNN -> tensor consisting of - tokenizer: CharTokenizer - indexer: CharIndexer - embedding: CharEmbedding (CharCNN) - vocab: Vocab """ def __init__(self, tokenizers, indexer_config, embedding_config, vocab_config): super(CharTokenMaker, self).__init__( TokenMaker.CHAR_TYPE, tokenizer=tokenizers["char"], indexer=indexer.CharIndexer(tokenizers["char"], **indexer_config), embedding_fn=basic_embedding_fn(embedding_config, embedding.CharEmbedding), vocab_config=vocab_config, )
[docs]@register(f"token:{TokenMaker.COVE_TYPE}") class CoveTokenMaker(TokenMaker): """ CoVe Token Learned in Translation: Contextualized Word Vectors (McCann et. al. 2017) (https://github.com/salesforce/cove) example. hello -> ['hello'] -> [2] -> CoVe -> tensor consisting of - tokenizer: WordTokenizer - indexer: WordIndexer - embedding: CoveEmbedding (Machine Translation LSTM) - vocab: Vocab """ def __init__(self, tokenizers, indexer_config, embedding_config, vocab_config): super(CoveTokenMaker, self).__init__( TokenMaker.CHAR_TYPE, tokenizer=tokenizers["word"], indexer=indexer.WordIndexer(tokenizers["word"], **indexer_config), embedding_fn=basic_embedding_fn(embedding_config, embedding.CoveEmbedding), vocab_config=vocab_config, )
[docs]@register(f"token:{TokenMaker.ELMO_TYPE}") class ElmoTokenMaker(TokenMaker): """ ELMo Token Embedding from Language Modeling Deep contextualized word representations (https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py) example. hello -> ['h', 'e', 'l', 'l', 'o'] -> [2, 3, 4, 4, 5] -> ELMo -> tensor consisting of - tokenizer: WordTokenizer - indexer: WordIndexer - embedding: ELMoEmbedding (Language Modeling BiLSTM) - vocab: Vocab """ def __init__(self, tokenizers, indexer_config, embedding_config, vocab_config): super(ElmoTokenMaker, self).__init__( TokenMaker.WORD_TYPE, tokenizer=tokenizers["word"], indexer=indexer.ELMoIndexer(tokenizers["word"], **indexer_config), embedding_fn=basic_embedding_fn(embedding_config, embedding.ELMoEmbedding), vocab_config="elmo", )
[docs]@register(f"token:{TokenMaker.EXACT_MATCH_TYPE}") class ExactMatchTokenMaker(TokenMaker): """ Exact Match Token (Sparse Feature) Three simple binary features, indicating whether p_i can be exactly matched to one question word in q, either in its original, lowercase or lemma form. example. c: i do, q: i -> ['i', 'do'] -> [1, 0] -> tensor consisting of - tokenizer: WordTokenizer - indexer: WordIndexer - embedding: SparseFeature - vocab: Vocab """ def __init__(self, tokenizers, indexer_config, embedding_config, vocab_config): super(ExactMatchTokenMaker, self).__init__( TokenMaker.EXACT_MATCH_TYPE, tokenizer=tokenizers["word"], indexer=indexer.ExactMatchIndexer(tokenizers["word"], **indexer_config), embedding_fn=self._embedding_fn(embedding_config, indexer_config), vocab_config=vocab_config, ) def _embedding_fn(self, embedding_config, indexer_config): def wrapper(vocab): embed_type = embedding_config.get("type", "sparse") if "type" in embedding_config: del embedding_config["type"] binary_classes = ["False", "True"] feature_count = 1 # origin embedding_config["classes"] = [binary_classes] if indexer_config.get("lower", False): feature_count += 1 embedding_config["classes"].append(binary_classes) if indexer_config.get("lemma", False): feature_count += 1 embedding_config["classes"].append(binary_classes) return embedding.SparseFeature( vocab, embed_type, feature_count, params=embedding_config ) return wrapper
[docs]@register(f"token:{TokenMaker.WORD_TYPE}") class WordTokenMaker(TokenMaker): """ Word Token (default) i do -> ['i', 'do'] -> [1, 2] -> Embedding Matrix -> tensor consisting of - tokenizer: WordTokenizer - indexer: WordIndexer - embedding: WordEmbedding - vocab: Vocab """ def __init__(self, tokenizers, indexer_config, embedding_config, vocab_config): super(WordTokenMaker, self).__init__( TokenMaker.WORD_TYPE, tokenizer=tokenizers["word"], indexer=indexer.WordIndexer(tokenizers["word"], **indexer_config), embedding_fn=basic_embedding_fn(embedding_config, embedding.WordEmbedding), vocab_config=vocab_config, )
[docs]@register(f"token:{TokenMaker.FREQUENT_WORD_TYPE}") class FrequentWordTokenMaker(TokenMaker): """ Frequent-Tuning Word Token word token + pre-trained word embeddings fixed and only fine-tune the N most frequent example. i do -> ['i', 'do'] -> [1, 2] -> Embedding Matrix -> tensor finetuning only 'do' consisting of - tokenizer: WordTokenizer - indexer: WordIndexer - embedding: FrequentTuningWordEmbedding - vocab: Vocab """ def __init__(self, tokenizers, indexer_config, embedding_config, vocab_config): super(FrequentWordTokenMaker, self).__init__( TokenMaker.FREQUENT_WORD_TYPE, tokenizer=tokenizers["word"], indexer=indexer.WordIndexer(tokenizers["word"], **indexer_config), embedding_fn=basic_embedding_fn( embedding_config, embedding.FrequentTuningWordEmbedding ), vocab_config=vocab_config, )
[docs]@register(f"token:{TokenMaker.LINGUISTIC_TYPE}") class LinguisticTokenMaker(TokenMaker): """ Exact Match Token (Sparse Feature) Three simple binary features, indicating whether p_i can be exactly matched to one question word in q, either in its original, lowercase or lemma form. example. c: i do, q: i -> ['i', 'do'] -> [1, 0] -> tensor consisting of - tokenizer: WordTokenizer - indexer: WordIndexer - embedding: SparseFeature - vocab: Vocab """ def __init__(self, tokenizers, indexer_config, embedding_config, vocab_config): super(LinguisticTokenMaker, self).__init__( TokenMaker.LINGUISTIC_TYPE, tokenizer=tokenizers["word"], indexer=indexer.LinguisticIndexer(tokenizers["word"], **indexer_config), embedding_fn=self._embedding_fn(embedding_config, indexer_config), vocab_config=vocab_config, ) def _embedding_fn(self, embedding_config, indexer_config): def wrapper(vocab): embed_type = embedding_config.get("type", "sparse") if "type" in embedding_config: del embedding_config["type"] feature_count = 0 embedding_config["classes"] = [] if indexer_config.get("pos_tag", False): feature_count += 1 embedding_config["classes"].append(POSTag.classes) if indexer_config.get("ner", False): feature_count += 1 embedding_config["classes"].append(NER.classes) return embedding.SparseFeature( vocab, embed_type, feature_count, params=embedding_config ) return wrapper