Source code for claf.tokens.embedding.word_embedding


import logging
from overrides import overrides
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from claf.data.data_handler import CachePath, DataHandler

from .base import TokenEmbedding

logger = logging.getLogger(__name__)


class WordEmbedding(TokenEmbedding):
    """
    Word Embedding
    Default Token Embedding

    * Args:
        vocab: Vocab (claf.tokens.vocab)

    * Kwargs:
        dropout: dropout probability
        embed_dim: embedding dimension (size of each embedding vector)
        padding_idx: if given, pads the output with the embedding vector at
            padding_idx (initialized to zeros) whenever it encounters the index.
        max_norm: if given, renormalizes the embedding vectors to have a norm
            less than this before extracting. Note: this modifies weight in-place.
        norm_type: the p of the p-norm to compute for the max_norm option. Default 2.
        scale_grad_by_freq: if given, scales gradients by the inverse of the
            frequency of the words in the mini-batch. Default False.
        sparse: if True, the gradient w.r.t. weight will be a sparse tensor.
            See the notes under torch.nn.Embedding for details on sparse gradients.
        pretrained_path: pretrained vector path (e.g. GloVe)
        trainable: fine-tune the embedding (True) or keep it fixed (False)
    """

    def __init__(
        self,
        vocab,
        dropout=0.2,
        embed_dim=100,
        padding_idx=None,
        max_norm=None,
        norm_type=2,
        scale_grad_by_freq=False,
        sparse=False,
        pretrained_path=None,
        trainable=True,
    ):
        super(WordEmbedding, self).__init__(vocab)
        self.data_handler = DataHandler(cache_path=CachePath.PRETRAINED_VECTOR)

        self.embed_dim = embed_dim
        if dropout and dropout > 0:
            self.dropout = nn.Dropout(p=dropout)
        else:
            self.dropout = lambda x: x

        if pretrained_path:
            weight = self._read_pretrained_file(pretrained_path)
            self.weight = torch.nn.Parameter(weight, requires_grad=trainable)
        else:
            self.weight = self._init_weight(trainable=trainable)

        # Optional parameters forwarded to nn.functional.embedding
        # (padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse).
        # See https://pytorch.org/docs/master/nn.html#torch.nn.functional.embedding
        self.padding_idx = padding_idx
        self.max_norm = max_norm
        self.norm_type = norm_type
        self.scale_grad_by_freq = scale_grad_by_freq
        self.sparse = sparse

    def _init_weight(self, trainable=True):
        weight = torch.FloatTensor(self.get_vocab_size(), self.embed_dim)
        weight = torch.nn.Parameter(weight, requires_grad=trainable)
        torch.nn.init.xavier_uniform_(weight)
        return weight
    @overrides
    def forward(self, words):
        input_size = words.size()
        if len(input_size) > 2:
            words = words.view(-1, input_size[-1])

        embedded_words = F.embedding(
            words,
            self.weight,
            padding_idx=self.padding_idx,
            max_norm=self.max_norm,
            norm_type=self.norm_type,
            scale_grad_by_freq=self.scale_grad_by_freq,
            sparse=self.sparse,
        )

        if len(input_size) > 2:
            embedded_size = list(input_size) + [embedded_words.size(-1)]
            embedded_words = embedded_words.view(*embedded_size)
        return self.dropout(embedded_words)
    def _read_pretrained_file(self, file_path):
        words_to_keep = set(self.vocab.get_all_tokens())
        vocab_size = self.get_vocab_size()
        embeddings = {}

        # First we read the embeddings from the file, only keeping vectors for the words we need.
        logger.info("Reading embeddings from file")
        file_path = self.data_handler.read(file_path, return_path=True)
        with open(file_path, "rb") as embeddings_file:
            for line in embeddings_file:
                fields = line.decode("utf-8").rstrip().split(" ")
                if len(fields) - 1 != self.embed_dim:
                    logger.info(
                        f"Found line with wrong number of dimensions (expected {self.embed_dim}, was {len(fields) - 1}): {line}"
                    )
                    continue
                word = fields[0]
                if word in words_to_keep:
                    vector = np.asarray(fields[1:], dtype="float32")
                    embeddings[word] = vector

        if not embeddings:
            raise ValueError(
                "No embeddings of correct dimension found. Check the input dimension value."
            )

        all_embeddings = np.asarray(list(embeddings.values()))
        embeddings_mean = float(np.mean(all_embeddings))
        embeddings_std = float(np.std(all_embeddings))

        # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
        # then filling in the word vectors we just read.
        logger.info("Initializing pre-trained embedding layer")
        embedding_matrix = torch.FloatTensor(vocab_size, self.embed_dim).normal_(
            embeddings_mean, embeddings_std
        )

        match_count = 0
        for i in range(vocab_size):
            word = self.vocab.get_token(i)
            if word in embeddings:
                embedding_matrix[i] = torch.FloatTensor(embeddings[word])
                match_count += 1
            else:
                # Word was not found in the embedding file; keep its random initialization.
                pass

        logger.info(f"Match embedding vocab size: {match_count}. [{match_count}/{vocab_size}]")
        return embedding_matrix
    @overrides
    def get_output_dim(self):
        return self.embed_dim
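

# Illustrative sketch (an addition for this documentation page, not part of the
# original claf module): demonstrates the reshaping done in `forward`, where a
# 3-D index tensor such as (batch, num_sentences, seq_len) is flattened before
# nn.functional.embedding and restored afterwards. The helper name and all
# sizes below are arbitrary assumptions used only for the demo.
def _demo_forward_shape_handling():
    vocab_size, embed_dim = 10, 4
    weight = torch.nn.Parameter(torch.empty(vocab_size, embed_dim))
    torch.nn.init.xavier_uniform_(weight)

    word_ids = torch.randint(0, vocab_size, (2, 3, 5))  # (batch, sentences, words)
    flat_ids = word_ids.view(-1, word_ids.size(-1))     # collapse leading dims to 2-D
    embedded = F.embedding(flat_ids, weight)            # (6, 5, embed_dim)
    embedded = embedded.view(*word_ids.size(), embed_dim)
    return embedded.shape                               # torch.Size([2, 3, 5, 4])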
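

# Illustrative sketch (an addition for this documentation page, not part of the
# original claf module): shows the whitespace-separated text format that
# `_read_pretrained_file` expects, i.e. GloVe-style lines of
# "<token> <float_1> ... <float_embed_dim>", parsed the same way (decode,
# rstrip, split on spaces). The helper name, file name, and vectors are made up.
def _demo_pretrained_file_format(tmp_path="demo_vectors.txt"):
    import os

    # Write a toy 3-dimensional "pretrained" file.
    with open(tmp_path, "w", encoding="utf-8") as f:
        f.write("the 0.1 0.2 0.3\n")
        f.write("cat 0.4 0.5 0.6\n")

    vectors = {}
    with open(tmp_path, "rb") as embeddings_file:
        for line in embeddings_file:
            fields = line.decode("utf-8").rstrip().split(" ")
            vectors[fields[0]] = np.asarray(fields[1:], dtype="float32")

    os.remove(tmp_path)
    return vectors  # {"the": array([0.1, 0.2, 0.3], dtype=float32), "cat": ...}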