diff --git a/fastNLP/embeddings/__init__.py b/fastNLP/embeddings/__init__.py index 37881f17..8a970e25 100644 --- a/fastNLP/embeddings/__init__.py +++ b/fastNLP/embeddings/__init__.py @@ -18,7 +18,6 @@ __all__ = [ "get_embeddings", ] - from .embedding import Embedding, TokenEmbedding from .static_embedding import StaticEmbedding from .elmo_embedding import ElmoEmbedding diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index 6a10c489..e8844aa1 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -1,3 +1,12 @@ +""" +.. todo:: + doc +""" + +__all__ = [ + "BertEmbedding", + "BertWordPieceEncoder" +] import os import collections @@ -13,6 +22,7 @@ from ..modules.encoder.bert import _WordPieceBertModel, BertModel, BertTokenizer from .contextual_embedding import ContextualEmbedding import warnings + class BertEmbedding(ContextualEmbedding): """ 别名::class:`fastNLP.embeddings.BertEmbedding` :class:`fastNLP.embeddings.bert_embedding.BertEmbedding` @@ -54,11 +64,12 @@ class BertEmbedding(ContextualEmbedding): word pieces后的内容,并将第512个word piece置为[SEP]。超过长度的部分的encode结果直接全部置零。一般仅有只使用[CLS] 来进行分类的任务将auto_truncate置为True。 """ - def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en-base-uncased', layers: str='-1', - pool_method: str='first', word_dropout=0, dropout=0, include_cls_sep: bool=False, - pooled_cls=True, requires_grad: bool=False, auto_truncate:bool=False): + + def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en-base-uncased', layers: str = '-1', + pool_method: str = 'first', word_dropout=0, dropout=0, include_cls_sep: bool = False, + pooled_cls=True, requires_grad: bool = False, auto_truncate: bool = False): super(BertEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - + # 根据model_dir_or_name检查是否存在并下载 if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: if 'cn' in model_dir_or_name.lower() and pool_method not in ('first', 'last'): @@ -71,21 +82,21 @@ class BertEmbedding(ContextualEmbedding): model_dir = os.path.abspath(os.path.expanduser(model_dir_or_name)) else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") - + self._word_sep_index = None if '[SEP]' in vocab: self._word_sep_index = vocab['[SEP]'] - + self.model = _WordBertModel(model_dir=model_dir, vocab=vocab, layers=layers, pool_method=pool_method, include_cls_sep=include_cls_sep, pooled_cls=pooled_cls, auto_truncate=auto_truncate, min_freq=2) - + self.requires_grad = requires_grad - self._embed_size = len(self.model.layers)*self.model.encoder.hidden_size - + self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size + def _delete_model_weights(self): del self.model - + def forward(self, words): """ 计算words的bert embedding表示。计算之前会在每句话的开始增加[CLS]在结束增加[SEP], 并根据include_cls_sep判断要不要 @@ -100,9 +111,9 @@ class BertEmbedding(ContextualEmbedding): return self.dropout(outputs) outputs = self.model(words) outputs = torch.cat([*outputs], dim=-1) - + return self.dropout(outputs) - + def drop_word(self, words): """ 按照设定随机将words设置为unknown_index。 @@ -120,7 +131,7 @@ class BertEmbedding(ContextualEmbedding): if self._word_sep_index: words.masked_fill_(sep_mask, self._word_sep_index) return words - + @property def requires_grad(self): """ @@ -129,12 +140,12 @@ class BertEmbedding(ContextualEmbedding): :return: """ requires_grads = set([param.requires_grad for name, param in self.named_parameters() - if 'word_pieces_lengths' not in name]) + if 'word_pieces_lengths' not in name]) if 
len(requires_grads) == 1: return requires_grads.pop() else: return None - + @requires_grad.setter def requires_grad(self, value): for name, param in self.named_parameters(): @@ -155,10 +166,11 @@ class BertWordPieceEncoder(nn.Module): :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 :param bool requires_grad: 是否需要gradient。 """ - def __init__(self, model_dir_or_name: str='en-base-uncased', layers: str='-1', pooled_cls: bool = False, - word_dropout=0, dropout=0, requires_grad: bool=False): + + def __init__(self, model_dir_or_name: str = 'en-base-uncased', layers: str = '-1', pooled_cls: bool = False, + word_dropout=0, dropout=0, requires_grad: bool = False): super().__init__() - + if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: model_url = _get_embedding_url('bert', model_dir_or_name.lower()) model_dir = cached_path(model_url, name='embedding') @@ -167,7 +179,7 @@ class BertWordPieceEncoder(nn.Module): model_dir = model_dir_or_name else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") - + self.model = _WordPieceBertModel(model_dir=model_dir, layers=layers, pooled_cls=pooled_cls) self._sep_index = self.model._sep_index self._wordpiece_unk_index = self.model._wordpiece_unknown_index @@ -175,7 +187,7 @@ class BertWordPieceEncoder(nn.Module): self.requires_grad = requires_grad self.word_dropout = word_dropout self.dropout_layer = nn.Dropout(dropout) - + @property def requires_grad(self): """ @@ -187,24 +199,24 @@ class BertWordPieceEncoder(nn.Module): return requires_grads.pop() else: return None - + @requires_grad.setter def requires_grad(self, value): for name, param in self.named_parameters(): param.requires_grad = value - + @property def embed_size(self): return self._embed_size - + @property def embedding_dim(self): return self._embed_size - + @property def num_embedding(self): return self.model.encoder.config.vocab_size - + def index_datasets(self, *datasets, field_name, add_cls_sep=True): """ 使用bert的tokenizer新生成word_pieces列加入到datasets中,并将他们设置为input,且将word_pieces这一列的pad value设置为了 @@ -216,7 +228,7 @@ class BertWordPieceEncoder(nn.Module): :return: """ self.model.index_dataset(*datasets, field_name=field_name, add_cls_sep=add_cls_sep) - + def forward(self, word_pieces, token_type_ids=None): """ 计算words的bert embedding表示。传入的words中应该自行包含[CLS]与[SEP]的tag。 @@ -233,13 +245,13 @@ class BertWordPieceEncoder(nn.Module): token_type_ids = sep_mask_cumsum.fmod(2) if token_type_ids[0, 0].item(): # 如果开头是奇数,则需要flip一下结果,因为需要保证开头为0 token_type_ids = token_type_ids.eq(0).long() - + word_pieces = self.drop_word(word_pieces) outputs = self.model(word_pieces, token_type_ids) outputs = torch.cat([*outputs], dim=-1) - + return self.dropout_layer(outputs) - + def drop_word(self, words): """ 按照设定随机将words设置为unknown_index。 @@ -260,10 +272,10 @@ class BertWordPieceEncoder(nn.Module): class _WordBertModel(nn.Module): - def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1', pool_method:str='first', - include_cls_sep:bool=False, pooled_cls:bool=False, auto_truncate:bool=False, min_freq=2): + def __init__(self, model_dir: str, vocab: Vocabulary, layers: str = '-1', pool_method: str = 'first', + include_cls_sep: bool = False, pooled_cls: bool = False, auto_truncate: bool = False, min_freq=2): super().__init__() - + self.tokenzier = BertTokenizer.from_pretrained(model_dir) self.encoder = BertModel.from_pretrained(model_dir) self._max_position_embeddings = self.encoder.config.max_position_embeddings @@ -271,23 +283,23 @@ class _WordBertModel(nn.Module): encoder_layer_number 
= len(self.encoder.encoder.layer) self.layers = list(map(int, layers.split(','))) for layer in self.layers: - if layer<0: - assert -layer<=encoder_layer_number, f"The layer index:{layer} is out of scope for " \ - f"a bert model with {encoder_layer_number} layers." + if layer < 0: + assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \ + f"a bert model with {encoder_layer_number} layers." else: - assert layer=min_freq and not vocab._is_word_no_create_entry(word): #出现次数大于这个次数才新增 + if index != vocab.unknown_idx and word_pieces[0] == '[UNK]': # 说明这个词不在原始的word里面 + if vocab.word_count[word] >= min_freq and not vocab._is_word_no_create_entry( + word): # 出现次数大于这个次数才新增 word_piece_dict[word] = 1 # 新增一个值 continue for word_piece in word_pieces: @@ -327,7 +340,7 @@ class _WordBertModel(nn.Module): new_word_piece_vocab[token] = len(new_word_piece_vocab) self.tokenzier._reinit_on_new_vocab(new_word_piece_vocab) self.encoder.embeddings.word_embeddings = embed - + word_to_wordpieces = [] word_pieces_lengths = [] for word, index in vocab: @@ -347,7 +360,7 @@ class _WordBertModel(nn.Module): self.word_to_wordpieces = np.array(word_to_wordpieces) self.word_pieces_lengths = nn.Parameter(torch.LongTensor(word_pieces_lengths), requires_grad=False) print("Successfully generate word pieces.") - + def forward(self, words): """ @@ -358,34 +371,37 @@ class _WordBertModel(nn.Module): batch_size, max_word_len = words.size() word_mask = words.ne(self._word_pad_index) # 为1的地方有word seq_len = word_mask.sum(dim=-1) - batch_word_pieces_length = self.word_pieces_lengths[words].masked_fill(word_mask.eq(0), 0) # batch_size x max_len + batch_word_pieces_length = self.word_pieces_lengths[words].masked_fill(word_mask.eq(0), + 0) # batch_size x max_len word_pieces_lengths = batch_word_pieces_length.sum(dim=-1) # batch_size word_piece_length = batch_word_pieces_length.sum(dim=-1).max().item() # 表示word piece的长度(包括padding) - if word_piece_length+2>self._max_position_embeddings: + if word_piece_length + 2 > self._max_position_embeddings: if self.auto_truncate: - word_pieces_lengths = word_pieces_lengths.masked_fill(word_pieces_lengths+2>self._max_position_embeddings, - self._max_position_embeddings-2) + word_pieces_lengths = word_pieces_lengths.masked_fill( + word_pieces_lengths + 2 > self._max_position_embeddings, + self._max_position_embeddings - 2) else: - raise RuntimeError("After split words into word pieces, the lengths of word pieces are longer than the " - f"maximum allowed sequence length:{self._max_position_embeddings} of bert.") - + raise RuntimeError( + "After split words into word pieces, the lengths of word pieces are longer than the " + f"maximum allowed sequence length:{self._max_position_embeddings} of bert.") + # +2是由于需要加入[CLS]与[SEP] - word_pieces = words.new_full((batch_size, min(word_piece_length+2, self._max_position_embeddings)), + word_pieces = words.new_full((batch_size, min(word_piece_length + 2, self._max_position_embeddings)), fill_value=self._wordpiece_pad_index) attn_masks = torch.zeros_like(word_pieces) # 1. 
获取words的word_pieces的id,以及对应的span范围 word_indexes = words.cpu().numpy() for i in range(batch_size): word_pieces_i = list(chain(*self.word_to_wordpieces[word_indexes[i, :seq_len[i]]])) - if self.auto_truncate and len(word_pieces_i)>self._max_position_embeddings-2: - word_pieces_i = word_pieces_i[:self._max_position_embeddings-2] - word_pieces[i, 1:word_pieces_lengths[i]+1] = torch.LongTensor(word_pieces_i) - attn_masks[i, :word_pieces_lengths[i]+2].fill_(1) + if self.auto_truncate and len(word_pieces_i) > self._max_position_embeddings - 2: + word_pieces_i = word_pieces_i[:self._max_position_embeddings - 2] + word_pieces[i, 1:word_pieces_lengths[i] + 1] = torch.LongTensor(word_pieces_i) + attn_masks[i, :word_pieces_lengths[i] + 2].fill_(1) # 添加[cls]和[sep] word_pieces[:, 0].fill_(self._cls_index) batch_indexes = torch.arange(batch_size).to(words) - word_pieces[batch_indexes, word_pieces_lengths+1] = self._sep_index - if self._has_sep_in_vocab: #但[SEP]在vocab中出现应该才会需要token_ids + word_pieces[batch_indexes, word_pieces_lengths + 1] = self._sep_index + if self._has_sep_in_vocab: # 但[SEP]在vocab中出现应该才会需要token_ids sep_mask = word_pieces.eq(self._sep_index) # batch_size x max_len sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1]) token_type_ids = sep_mask_cumsum.fmod(2) @@ -396,9 +412,9 @@ class _WordBertModel(nn.Module): # 2. 获取hidden的结果,根据word_pieces进行对应的pool计算 # all_outputs: [batch_size x max_len x hidden_size, batch_size x max_len x hidden_size, ...] bert_outputs, pooled_cls = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks, - output_all_encoded_layers=True) + output_all_encoded_layers=True) # output_layers = [self.layers] # len(self.layers) x batch_size x real_word_piece_length x hidden_size - + if self.include_cls_sep: outputs = bert_outputs[-1].new_zeros(len(self.layers), batch_size, max_word_len + 2, bert_outputs[-1].size(-1)) @@ -414,7 +430,7 @@ class _WordBertModel(nn.Module): real_word_piece_length = output_layer.size(1) - 2 if word_piece_length > real_word_piece_length: # 如果实际上是截取出来的 paddings = output_layer.new_zeros(batch_size, - word_piece_length-real_word_piece_length, + word_piece_length - real_word_piece_length, output_layer.size(2)) output_layer = torch.cat((output_layer, paddings), dim=1).contiguous() # 从word_piece collapse到word的表示 @@ -423,27 +439,27 @@ class _WordBertModel(nn.Module): if self.pool_method == 'first': for i in range(batch_size): i_word_pieces_cum_length = batch_word_pieces_cum_length[i, :seq_len[i]] # 每个word的start位置 - outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length] # num_layer x batch_size x len x hidden_size + outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[ + i, i_word_pieces_cum_length] # num_layer x batch_size x len x hidden_size elif self.pool_method == 'last': for i in range(batch_size): - i_word_pieces_cum_length = batch_word_pieces_cum_length[i, 1:seq_len[i]+1] - 1 # 每个word的end + i_word_pieces_cum_length = batch_word_pieces_cum_length[i, 1:seq_len[i] + 1] - 1 # 每个word的end outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length] elif self.pool_method == 'max': for i in range(batch_size): for j in range(seq_len[i]): - start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1] - outputs[l_index, i, j+s_shift], _ = torch.max(truncate_output_layer[i, start:end], dim=-2) + start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j + 1] + 
outputs[l_index, i, j + s_shift], _ = torch.max(truncate_output_layer[i, start:end], dim=-2) else: for i in range(batch_size): for j in range(seq_len[i]): - start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1] - outputs[l_index, i, j+s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2) + start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j + 1] + outputs[l_index, i, j + s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2) if self.include_cls_sep: - if l in (len(bert_outputs)-1, -1) and self.pooled_cls: + if l in (len(bert_outputs) - 1, -1) and self.pooled_cls: outputs[l_index, :, 0] = pooled_cls else: outputs[l_index, :, 0] = output_layer[:, 0] - outputs[l_index, batch_indexes, seq_len+s_shift] = output_layer[batch_indexes, seq_len+s_shift] + outputs[l_index, batch_indexes, seq_len + s_shift] = output_layer[batch_indexes, seq_len + s_shift] # 3. 最终的embedding结果 return outputs - diff --git a/fastNLP/embeddings/char_embedding.py b/fastNLP/embeddings/char_embedding.py index 520e85e6..24c84314 100644 --- a/fastNLP/embeddings/char_embedding.py +++ b/fastNLP/embeddings/char_embedding.py @@ -3,6 +3,10 @@ 词的index而不需要使用词语中的char的index来获取表达。 """ +__all__ = [ + "CNNCharEmbedding", + "LSTMCharEmbedding" +] import torch import torch.nn as nn @@ -16,6 +20,7 @@ from .embedding import TokenEmbedding from .utils import _construct_char_vocab_from_vocab from .utils import get_embeddings + class CNNCharEmbedding(TokenEmbedding): """ 别名::class:`fastNLP.embeddings.CNNCharEmbedding` :class:`fastNLP.embeddings.char_embedding.CNNCharEmbedding` @@ -49,14 +54,15 @@ class CNNCharEmbedding(TokenEmbedding): (文件夹下应该只有一个以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型, 没有的话将自动下载。如果输入为None则使用embedding_dim的维度随机初始化一个embedding. """ - def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0, - dropout:float=0, filter_nums: List[int]=(40, 30, 20), kernel_sizes: List[int]=(5, 3, 1), - pool_method: str='max', activation='relu', min_char_freq: int=2, pre_train_char_embed: str=None): + + def __init__(self, vocab: Vocabulary, embed_size: int = 50, char_emb_size: int = 50, word_dropout: float = 0, + dropout: float = 0, filter_nums: List[int] = (40, 30, 20), kernel_sizes: List[int] = (5, 3, 1), + pool_method: str = 'max', activation='relu', min_char_freq: int = 2, pre_train_char_embed: str = None): super(CNNCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - + for kernel in kernel_sizes: assert kernel % 2 == 1, "Only odd kernel is allowed." 
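# ---- Usage sketch (editor's addition, not part of the patch above) ----------
# A minimal, hedged illustration of the two public classes touched in
# bert_embedding.py, based only on the signatures visible in this diff.
# 'en-base-uncased' is the documented default and triggers a model download;
# the package-level imports assume the exports declared in
# fastNLP/embeddings/__init__.py, and the field name 'raw_words' is illustrative.
import torch
from fastNLP import Vocabulary, DataSet
from fastNLP.embeddings import BertEmbedding, BertWordPieceEncoder

vocab = Vocabulary()
vocab.add_word_lst("this is a demo sentence".split())

# Word-level usage: pool_method='first' keeps the first word piece of every
# word; include_cls_sep=True would additionally return the [CLS]/[SEP] slots.
embed = BertEmbedding(vocab, model_dir_or_name='en-base-uncased',
                      layers='-1', pool_method='first', include_cls_sep=False)
words = torch.LongTensor([[vocab.to_index(w) for w in "this is a demo".split()]])
print(embed(words).size())        # (1, 4, embed.embed_size)

# Word-piece-level usage: index_datasets() adds a 'word_pieces' field
# (with [CLS]/[SEP]) to each DataSet and marks it as input.
ds = DataSet({'raw_words': ["this is a demo".split(), "another sentence".split()]})
encoder = BertWordPieceEncoder(model_dir_or_name='en-base-uncased', layers='-1')
encoder.index_datasets(ds, field_name='raw_words', add_cls_sep=True)
# At training time the model receives batches of word_pieces and calls
# encoder(word_pieces) to obtain one vector per word piece.
# ------------------------------------------------------------------------------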
- + assert pool_method in ('max', 'avg') self.pool_method = pool_method # activation function @@ -74,7 +80,7 @@ class CNNCharEmbedding(TokenEmbedding): else: raise Exception( "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]") - + print("Start constructing character vocabulary.") # 建立char的词表 self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq) @@ -96,14 +102,14 @@ class CNNCharEmbedding(TokenEmbedding): self.char_embedding = StaticEmbedding(self.char_vocab, model_dir_or_name=pre_train_char_embed) else: self.char_embedding = get_embeddings((len(self.char_vocab), char_emb_size)) - + self.convs = nn.ModuleList([nn.Conv1d( char_emb_size, filter_nums[i], kernel_size=kernel_sizes[i], bias=True, padding=kernel_sizes[i] // 2) for i in range(len(kernel_sizes))]) self._embed_size = embed_size self.fc = nn.Linear(sum(filter_nums), embed_size) self.reset_parameters() - + def forward(self, words): """ 输入words的index后,生成对应的words的表示。 @@ -114,14 +120,14 @@ class CNNCharEmbedding(TokenEmbedding): words = self.drop_word(words) batch_size, max_len = words.size() chars = self.words_to_chars_embedding[words] # batch_size x max_len x max_word_len - word_lengths = self.word_lengths[words] # batch_size x max_len + word_lengths = self.word_lengths[words] # batch_size x max_len max_word_len = word_lengths.max() chars = chars[:, :, :max_word_len] # 为1的地方为mask chars_masks = chars.eq(self.char_pad_index) # batch_size x max_len x max_word_len 如果为0, 说明是padding的位置了 chars = self.char_embedding(chars) # batch_size x max_len x max_word_len x embed_size chars = self.dropout(chars) - reshaped_chars = chars.reshape(batch_size*max_len, max_word_len, -1) + reshaped_chars = chars.reshape(batch_size * max_len, max_word_len, -1) reshaped_chars = reshaped_chars.transpose(1, 2) # B' x E x M conv_chars = [conv(reshaped_chars).transpose(1, 2).reshape(batch_size, max_len, max_word_len, -1) for conv in self.convs] @@ -129,13 +135,13 @@ class CNNCharEmbedding(TokenEmbedding): conv_chars = self.activation(conv_chars) if self.pool_method == 'max': conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf')) - chars, _ = torch.max(conv_chars, dim=-2) # batch_size x max_len x sum(filters) + chars, _ = torch.max(conv_chars, dim=-2) # batch_size x max_len x sum(filters) else: conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), 0) - chars = torch.sum(conv_chars, dim=-2)/chars_masks.eq(0).sum(dim=-1, keepdim=True).float() + chars = torch.sum(conv_chars, dim=-2) / chars_masks.eq(0).sum(dim=-1, keepdim=True).float() chars = self.fc(chars) return self.dropout(chars) - + @property def requires_grad(self): """ @@ -151,21 +157,21 @@ class CNNCharEmbedding(TokenEmbedding): return requires_grads.pop() else: return None - + @requires_grad.setter def requires_grad(self, value): for name, param in self.named_parameters(): if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能加入到requires_grad中 continue param.requires_grad = value - + def reset_parameters(self): for name, param in self.named_parameters(): if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能reset continue if 'char_embedding' in name: continue - if param.data.dim()>1: + if param.data.dim() > 1: nn.init.xavier_uniform_(param, 1) else: nn.init.uniform_(param, -1, 1) @@ -203,13 +209,15 @@ class LSTMCharEmbedding(TokenEmbedding): (文件夹下应该只有一个以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型, 没有的话将自动下载。如果输入为None则使用embedding_dim的维度随机初始化一个embedding. 
""" - def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0, - dropout:float=0, hidden_size=50,pool_method: str='max', activation='relu', min_char_freq: int=2, - bidirectional=True, pre_train_char_embed: str=None): + + def __init__(self, vocab: Vocabulary, embed_size: int = 50, char_emb_size: int = 50, word_dropout: float = 0, + dropout: float = 0, hidden_size=50, pool_method: str = 'max', activation='relu', + min_char_freq: int = 2, + bidirectional=True, pre_train_char_embed: str = None): super(LSTMCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - + assert hidden_size % 2 == 0, "Only even kernel is allowed." - + assert pool_method in ('max', 'avg') self.pool_method = pool_method # activation function @@ -227,7 +235,7 @@ class LSTMCharEmbedding(TokenEmbedding): else: raise Exception( "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]") - + print("Start constructing character vocabulary.") # 建立char的词表 self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq) @@ -249,14 +257,14 @@ class LSTMCharEmbedding(TokenEmbedding): self.char_embedding = StaticEmbedding(self.char_vocab, pre_train_char_embed) else: self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) - + self.fc = nn.Linear(hidden_size, embed_size) hidden_size = hidden_size // 2 if bidirectional else hidden_size - + self.lstm = LSTM(char_emb_size, hidden_size, bidirectional=bidirectional, batch_first=True) self._embed_size = embed_size self.bidirectional = bidirectional - + def forward(self, words): """ 输入words的index后,生成对应的words的表示。 @@ -278,7 +286,7 @@ class LSTMCharEmbedding(TokenEmbedding): char_seq_len = chars_masks.eq(0).sum(dim=-1).reshape(batch_size * max_len) lstm_chars = self.lstm(reshaped_chars, char_seq_len)[0].reshape(batch_size, max_len, max_word_len, -1) # B x M x M x H - + lstm_chars = self.activation(lstm_chars) if self.pool_method == 'max': lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf')) @@ -286,11 +294,11 @@ class LSTMCharEmbedding(TokenEmbedding): else: lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), 0) chars = torch.sum(lstm_chars, dim=-2) / chars_masks.eq(0).sum(dim=-1, keepdim=True).float() - + chars = self.fc(chars) - + return self.dropout(chars) - + @property def requires_grad(self): """ @@ -307,7 +315,7 @@ class LSTMCharEmbedding(TokenEmbedding): return requires_grads.pop() else: return None - + @requires_grad.setter def requires_grad(self, value): for name, param in self.named_parameters(): diff --git a/fastNLP/embeddings/contextual_embedding.py b/fastNLP/embeddings/contextual_embedding.py index 152b0ab9..2a1e2f82 100644 --- a/fastNLP/embeddings/contextual_embedding.py +++ b/fastNLP/embeddings/contextual_embedding.py @@ -1,3 +1,12 @@ +""" +.. 
todo:: + doc +""" + +__all__ = [ + "ContextualEmbedding" +] + from abc import abstractmethod import torch @@ -8,16 +17,12 @@ from ..core.sampler import SequentialSampler from ..core.utils import _move_model_to_device, _get_model_device from .embedding import TokenEmbedding -__all__ = [ - "ContextualEmbedding" -] - class ContextualEmbedding(TokenEmbedding): - def __init__(self, vocab: Vocabulary, word_dropout:float=0.0, dropout:float=0.0): + def __init__(self, vocab: Vocabulary, word_dropout: float = 0.0, dropout: float = 0.0): super(ContextualEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - - def add_sentence_cache(self, *datasets, batch_size=32, device='cpu', delete_weights: bool=True): + + def add_sentence_cache(self, *datasets, batch_size=32, device='cpu', delete_weights: bool = True): """ 由于动态embedding生成比较耗时,所以可以把每句话embedding缓存下来,这样就不需要每次都运行生成过程。 @@ -34,7 +39,7 @@ class ContextualEmbedding(TokenEmbedding): except Exception as e: print(f"Exception happens at {index} dataset.") raise e - + sent_embeds = {} _move_model_to_device(self, device=device) device = _get_model_device(self) @@ -54,7 +59,7 @@ class ContextualEmbedding(TokenEmbedding): word_embeds = self(words).detach().cpu().numpy() for b in range(words.size(0)): length = seq_len_from_behind[b] - if length==0: + if length == 0: sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b] else: sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b, :-length] @@ -65,7 +70,7 @@ class ContextualEmbedding(TokenEmbedding): self.sent_embeds = sent_embeds if delete_weights: self._delete_model_weights() - + def _get_sent_reprs(self, words): """ 获取sentence的表示,如果有缓存,则返回缓存的值; 没有缓存则返回None @@ -88,12 +93,12 @@ class ContextualEmbedding(TokenEmbedding): embeds[i, :len(embed)] = torch.FloatTensor(embed).to(words.device) return embeds return None - + @abstractmethod def _delete_model_weights(self): """删除计算表示的模型以节省资源""" raise NotImplementedError - + def remove_sentence_cache(self): """ 删除缓存的句子表示. 删除之后如果模型权重没有被删除,将开始使用动态计算权重。 diff --git a/fastNLP/embeddings/elmo_embedding.py b/fastNLP/embeddings/elmo_embedding.py index 24cd052e..fb5388fd 100644 --- a/fastNLP/embeddings/elmo_embedding.py +++ b/fastNLP/embeddings/elmo_embedding.py @@ -1,6 +1,13 @@ +""" +.. 
todo:: + doc +""" -import os +__all__ = [ + "ElmoEmbedding" +] +import os import torch import torch.nn as nn import torch.nn.functional as F @@ -49,11 +56,11 @@ class ElmoEmbedding(ContextualEmbedding): :param cache_word_reprs: 可以选择对word的表示进行cache; 设置为True的话,将在初始化的时候为每个word生成对应的embedding, 并删除character encoder,之后将直接使用cache的embedding。默认为False。 """ - + def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en', layers: str = '2', requires_grad: bool = False, word_dropout=0.0, dropout=0.0, cache_word_reprs: bool = False): super(ElmoEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - + # 根据model_dir_or_name检查是否存在并下载 if model_dir_or_name.lower() in PRETRAINED_ELMO_MODEL_DIR: model_url = _get_embedding_url('elmo', model_dir_or_name.lower()) @@ -64,7 +71,7 @@ class ElmoEmbedding(ContextualEmbedding): else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") self.model = _ElmoModel(model_dir, vocab, cache_word_reprs=cache_word_reprs) - + if layers == 'mix': self.layer_weights = nn.Parameter(torch.zeros(self.model.config['lstm']['n_layers'] + 1), requires_grad=requires_grad) @@ -79,16 +86,16 @@ class ElmoEmbedding(ContextualEmbedding): self.layers = layers self._get_outputs = self._get_layer_outputs self._embed_size = len(self.layers) * self.model.config['lstm']['projection_dim'] * 2 - + self.requires_grad = requires_grad - + def _get_mixed_outputs(self, outputs): # outputs: num_layers x batch_size x max_len x hidden_size # return: batch_size x max_len x hidden_size weights = F.softmax(self.layer_weights + 1 / len(outputs), dim=0).to(outputs) outputs = torch.einsum('l,lbij->bij', weights, outputs) return self.gamma.to(outputs) * outputs - + def set_mix_weights_requires_grad(self, flag=True): """ 当初始化ElmoEmbedding时layers被设置为mix时,可以通过调用该方法设置mix weights是否可训练。如果layers不是mix,调用 @@ -100,15 +107,15 @@ class ElmoEmbedding(ContextualEmbedding): if hasattr(self, 'layer_weights'): self.layer_weights.requires_grad = flag self.gamma.requires_grad = flag - + def _get_layer_outputs(self, outputs): if len(self.layers) == 1: outputs = outputs[self.layers[0]] else: outputs = torch.cat(tuple([*outputs[self.layers]]), dim=-1) - + return outputs - + def forward(self, words: torch.LongTensor): """ 计算words的elmo embedding表示。根据elmo文章中介绍的ELMO实际上是有2L+1层结果,但是为了让结果比较容易拆分,token的 @@ -125,12 +132,12 @@ class ElmoEmbedding(ContextualEmbedding): outputs = self.model(words) outputs = self._get_outputs(outputs) return self.dropout(outputs) - + def _delete_model_weights(self): for name in ['layers', 'model', 'layer_weights', 'gamma']: if hasattr(self, name): delattr(self, name) - + @property def requires_grad(self): """ @@ -144,7 +151,7 @@ class ElmoEmbedding(ContextualEmbedding): return requires_grads.pop() else: return None - + @requires_grad.setter def requires_grad(self, value): for name, param in self.named_parameters(): @@ -162,7 +169,7 @@ class _ElmoModel(nn.Module): (4) 设计一个保存token的embedding,允许缓存word的表示。 """ - + def __init__(self, model_dir: str, vocab: Vocabulary = None, cache_word_reprs: bool = False): super(_ElmoModel, self).__init__() self.model_dir = model_dir @@ -187,14 +194,14 @@ class _ElmoModel(nn.Module): config = json.load(config_f) self.weight_file = os.path.join(model_dir, weight_file) self.config = config - + OOV_TAG = '' PAD_TAG = '' BOS_TAG = '' EOS_TAG = '' BOW_TAG = '' EOW_TAG = '' - + # For the model trained with character-based word encoder. 
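# ---- Usage sketch (editor's addition, not part of the patch above) ----------
# Hedged example for the ElmoEmbedding class reformatted above. layers is a
# comma-separated list of layer indices; layers='mix' would instead learn
# softmax weights over all layers, toggled via set_mix_weights_requires_grad().
# The default name 'en' downloads the pretrained ELMo weights, and
# add_sentence_cache() (inherited from ContextualEmbedding) can afterwards
# pre-compute and cache sentence representations.
import torch
from fastNLP import Vocabulary
from fastNLP.embeddings import ElmoEmbedding

vocab = Vocabulary()
vocab.add_word_lst("the quick brown fox jumps".split())

embed = ElmoEmbedding(vocab, model_dir_or_name='en', layers='1,2',
                      requires_grad=False, cache_word_reprs=False)
words = torch.LongTensor([[vocab.to_index(w) for w in "the quick fox".split()]])
print(embed(words).size())        # (1, 3, len(layers) * 2 * projection_dim)
# ------------------------------------------------------------------------------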
char_lexicon = {} with codecs.open(os.path.join(model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi: @@ -204,29 +211,29 @@ class _ElmoModel(nn.Module): tokens.insert(0, '\u3000') token, i = tokens char_lexicon[token] = int(i) - + # 做一些sanity check for special_word in [PAD_TAG, OOV_TAG, BOW_TAG, EOW_TAG]: assert special_word in char_lexicon, f"{special_word} not found in char.dic." - + # 从vocab中构建char_vocab char_vocab = Vocabulary(unknown=OOV_TAG, padding=PAD_TAG) # 需要保证在里面 char_vocab.add_word_lst([BOW_TAG, EOW_TAG, BOS_TAG, EOS_TAG]) - + for word, index in vocab: char_vocab.add_word_lst(list(word)) - + self.bos_index, self.eos_index, self._pad_index = len(vocab), len(vocab) + 1, vocab.padding_idx # 根据char_lexicon调整, 多设置一位,是预留给word padding的(该位置的char表示为全0表示) char_emb_layer = nn.Embedding(len(char_vocab) + 1, int(config['char_cnn']['embedding']['dim']), padding_idx=len(char_vocab)) - + # 读入预训练权重 这里的elmo_model 包含char_cnn和 lstm 的 state_dict elmo_model = torch.load(os.path.join(self.model_dir, weight_file), map_location='cpu') - + char_embed_weights = elmo_model["char_cnn"]['char_emb_layer.weight'] - + found_char_count = 0 for char, index in char_vocab: # 调整character embedding if char in char_lexicon: @@ -235,11 +242,11 @@ class _ElmoModel(nn.Module): else: index_in_pre = char_lexicon[OOV_TAG] char_emb_layer.weight.data[index] = char_embed_weights[index_in_pre] - + print(f"{found_char_count} out of {len(char_vocab)} characters were found in pretrained elmo embedding.") # 生成words到chars的映射 max_chars = config['char_cnn']['max_characters_per_token'] - + self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab) + 2, max_chars), fill_value=len(char_vocab), dtype=torch.long), @@ -258,20 +265,20 @@ class _ElmoModel(nn.Module): char_vocab.to_index(EOW_TAG)] char_ids += [char_vocab.to_index(PAD_TAG)] * (max_chars - len(char_ids)) self.words_to_chars_embedding[index] = torch.LongTensor(char_ids) - + self.char_vocab = char_vocab - + self.token_embedder = ConvTokenEmbedder( config, self.weight_file, None, char_emb_layer) elmo_model["char_cnn"]['char_emb_layer.weight'] = char_emb_layer.weight self.token_embedder.load_state_dict(elmo_model["char_cnn"]) - + self.output_dim = config['lstm']['projection_dim'] - + # lstm encoder self.encoder = ElmobiLm(config) self.encoder.load_state_dict(elmo_model["lstm"]) - + if cache_word_reprs: if config['char_cnn']['embedding']['dim'] > 0: # 只有在使用了chars的情况下有用 print("Start to generate cache word representations.") @@ -280,7 +287,7 @@ class _ElmoModel(nn.Module): word_size = self.words_to_chars_embedding.size(0) num_batches = word_size // batch_size + \ int(word_size % batch_size != 0) - + self.cached_word_embedding = nn.Embedding(word_size, config['lstm']['projection_dim']) with torch.no_grad(): @@ -291,12 +298,12 @@ class _ElmoModel(nn.Module): word_reprs = self.token_embedder(words.unsqueeze(1), chars).detach() # batch_size x 1 x config['encoder']['projection_dim'] self.cached_word_embedding.weight.data[words] = word_reprs.squeeze(1) - + print("Finish generating cached word representations. 
Going to delete the character encoder.") del self.token_embedder, self.words_to_chars_embedding else: print("There is no need to cache word representations, since no character information is used.") - + def forward(self, words): """ @@ -321,7 +328,7 @@ class _ElmoModel(nn.Module): else: chars = None token_embedding = self.token_embedder(expanded_words, chars) # batch_size x max_len x embed_dim - + encoder_output = self.encoder(token_embedding, seq_len) if encoder_output.size(2) < max_len + 2: num_layers, _, output_len, hidden_size = encoder_output.size() @@ -332,7 +339,7 @@ class _ElmoModel(nn.Module): token_embedding = token_embedding.masked_fill(mask, 0) token_embedding = torch.cat((token_embedding, token_embedding), dim=2).view(1, sz[1], sz[2], sz[3]) encoder_output = torch.cat((token_embedding, encoder_output), dim=0) - + # 删除, . 这里没有精确地删除,但应该也不会影响最后的结果了。 encoder_output = encoder_output[:, :, 1:-1] return encoder_output diff --git a/fastNLP/embeddings/embedding.py b/fastNLP/embeddings/embedding.py index 8b746c0d..7ac841ce 100644 --- a/fastNLP/embeddings/embedding.py +++ b/fastNLP/embeddings/embedding.py @@ -3,6 +3,10 @@ """ +__all__ = [ + "Embedding", + "TokenEmbedding" +] import torch.nn as nn from abc import abstractmethod @@ -33,11 +37,11 @@ class Embedding(nn.Module): :param float dropout: 对Embedding的输出的dropout。 :param int unk_index: drop word时替换为的index。fastNLP的Vocabulary的unk_index默认为1。 """ - + def __init__(self, init_embed, word_dropout=0, dropout=0.0, unk_index=None): - + super(Embedding, self).__init__() - + self.embed = get_embeddings(init_embed) self.dropout = nn.Dropout(dropout) @@ -48,44 +52,44 @@ class Embedding(nn.Module): self._embed_size = self.embed.embedding_dim else: self._embed_size = self.embed.weight.size(1) - if word_dropout>0 and not isinstance(unk_index, int): + if word_dropout > 0 and not isinstance(unk_index, int): raise ValueError("When drop word is set, you need to pass in the unk_index.") else: self._embed_size = self.embed.embed_size unk_index = self.embed.get_word_vocab().unknown_idx self.unk_index = unk_index self.word_dropout = word_dropout - + def forward(self, words): """ :param torch.LongTensor words: [batch, seq_len] :return: torch.Tensor : [batch, seq_len, embed_dim] """ - if self.word_dropout>0 and self.training: + if self.word_dropout > 0 and self.training: mask = torch.ones_like(words).float() * self.word_dropout mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 words = words.masked_fill(mask, self.unk_index) words = self.embed(words) return self.dropout(words) - + @property - def num_embedding(self)->int: + def num_embedding(self) -> int: if isinstance(self.embed, nn.Embedding): return self.embed.weight.size(0) else: return self.embed.num_embedding - + def __len__(self): return len(self.embed) - + @property def embed_size(self) -> int: return self._embed_size - + @property def embedding_dim(self) -> int: return self._embed_size - + @property def requires_grad(self): """ @@ -96,14 +100,14 @@ class Embedding(nn.Module): return self.embed.weight.requires_grad else: return self.embed.requires_grad - + @requires_grad.setter def requires_grad(self, value): if not isinstance(self.embed, TokenEmbedding): self.embed.weight.requires_grad = value else: self.embed.requires_grad = value - + @property def size(self): if isinstance(self.embed, TokenEmbedding): @@ -120,12 +124,12 @@ class TokenEmbedding(nn.Module): assert vocab.padding is not None, "Vocabulary must have a padding entry." 
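# ---- Usage sketch (editor's addition, not part of the patch above) ----------
# A short, hedged sketch of the generic Embedding wrapper and the
# get_embeddings helper whose code appears above. Passing a
# (num_embeddings, embedding_dim) tuple builds a randomly initialised
# nn.Embedding; word_dropout then requires an explicit unk_index, as enforced
# by the check in Embedding.__init__. Runs offline, no pretrained weights.
import torch
from fastNLP.embeddings import Embedding, get_embeddings

embed = Embedding((1000, 50), word_dropout=0.1, dropout=0.1, unk_index=1)
words = torch.randint(0, 1000, (4, 7))       # batch_size=4, seq_len=7
print(embed(words).size())                    # torch.Size([4, 7, 50])

raw = get_embeddings((1000, 50))              # plain randomly initialised module
print(raw.weight.size())                      # torch.Size([1000, 50])
# ------------------------------------------------------------------------------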
self._word_vocab = vocab self._word_pad_index = vocab.padding_idx - if word_dropout>0: + if word_dropout > 0: assert vocab.unknown is not None, "Vocabulary must have unknown entry when you want to drop a word." self.word_dropout = word_dropout self._word_unk_index = vocab.unknown_idx self.dropout_layer = nn.Dropout(dropout) - + def drop_word(self, words): """ 按照设定随机将words设置为unknown_index。 @@ -138,7 +142,7 @@ class TokenEmbedding(nn.Module): mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 words = words.masked_fill(mask, self._word_unk_index) return words - + def dropout(self, words): """ 对embedding后的word表示进行drop。 @@ -147,7 +151,7 @@ class TokenEmbedding(nn.Module): :return: """ return self.dropout_layer(words) - + @property def requires_grad(self): """ @@ -159,23 +163,23 @@ class TokenEmbedding(nn.Module): return requires_grads.pop() else: return None - + @requires_grad.setter def requires_grad(self, value): for param in self.parameters(): param.requires_grad = value - + def __len__(self): return len(self._word_vocab) - + @property def embed_size(self) -> int: return self._embed_size - + @property def embedding_dim(self) -> int: return self._embed_size - + @property def num_embedding(self) -> int: """ @@ -183,7 +187,7 @@ class TokenEmbedding(nn.Module): :return: """ return len(self._word_vocab) - + def get_word_vocab(self): """ 返回embedding的词典。 @@ -191,11 +195,11 @@ class TokenEmbedding(nn.Module): :return: Vocabulary """ return self._word_vocab - + @property def size(self): return torch.Size(self.num_embedding, self._embed_size) - + @abstractmethod def forward(self, words): raise NotImplementedError diff --git a/fastNLP/embeddings/stack_embedding.py b/fastNLP/embeddings/stack_embedding.py index d3ce462b..14781945 100644 --- a/fastNLP/embeddings/stack_embedding.py +++ b/fastNLP/embeddings/stack_embedding.py @@ -1,3 +1,12 @@ +""" +.. todo:: + doc +""" + +__all__ = [ + "StackEmbedding", +] + from typing import List import torch @@ -26,6 +35,7 @@ class StackEmbedding(TokenEmbedding): :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 """ + def __init__(self, embeds: List[TokenEmbedding], word_dropout=0, dropout=0): vocabs = [] for embed in embeds: @@ -34,14 +44,14 @@ class StackEmbedding(TokenEmbedding): _vocab = vocabs[0] for vocab in vocabs[1:]: assert vocab == _vocab, "All embeddings in StackEmbedding should use the same word vocabulary." - + super(StackEmbedding, self).__init__(_vocab, word_dropout=word_dropout, dropout=dropout) assert isinstance(embeds, list) for embed in embeds: assert isinstance(embed, TokenEmbedding), "Only TokenEmbedding type is supported." 
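# ---- Usage sketch (editor's addition, not part of the patch above) ----------
# Hedged example of StackEmbedding, also exercising the two character-level
# embeddings from char_embedding.py earlier in this diff. All stacked
# embeddings must share the same word vocabulary (checked in __init__) and the
# resulting embed_size is the sum of the parts; nothing is downloaded here
# because both parts are randomly initialised.
import torch
from fastNLP import Vocabulary
from fastNLP.embeddings import CNNCharEmbedding, LSTMCharEmbedding, StackEmbedding

vocab = Vocabulary()
vocab.add_word_lst("a tiny corpus for the stack embedding demo".split())

cnn_char = CNNCharEmbedding(vocab, embed_size=30, char_emb_size=20,
                            filter_nums=(20, 10), kernel_sizes=(3, 1))
lstm_char = LSTMCharEmbedding(vocab, embed_size=30, char_emb_size=20,
                              hidden_size=20)
stack = StackEmbedding([cnn_char, lstm_char])
words = torch.LongTensor([[vocab.to_index(w) for w in "a tiny demo".split()]])
print(stack(words).size())        # (1, 3, 60) == (1, 3, stack.embed_size)
# ------------------------------------------------------------------------------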
self.embeds = nn.ModuleList(embeds) self._embed_size = sum([embed.embed_size for embed in self.embeds]) - + def append(self, embed: TokenEmbedding): """ 添加一个embedding到结尾。 @@ -50,18 +60,18 @@ class StackEmbedding(TokenEmbedding): """ assert isinstance(embed, TokenEmbedding) self.embeds.append(embed) - + def pop(self): """ 弹出最后一个embed :return: """ return self.embeds.pop() - + @property def embed_size(self): return self._embed_size - + @property def requires_grad(self): """ @@ -73,12 +83,12 @@ class StackEmbedding(TokenEmbedding): return requires_grads.pop() else: return None - + @requires_grad.setter def requires_grad(self, value): for embed in self.embeds(): embed.requires_grad = value - + def forward(self, words): """ 得到多个embedding的结果,并把结果按照顺序concat起来。 diff --git a/fastNLP/embeddings/static_embedding.py b/fastNLP/embeddings/static_embedding.py index a75ad18f..1c66e52b 100644 --- a/fastNLP/embeddings/static_embedding.py +++ b/fastNLP/embeddings/static_embedding.py @@ -1,4 +1,11 @@ +""" +.. todo:: + doc +""" +__all__ = [ + "StaticEmbedding" +] import os import torch @@ -13,6 +20,7 @@ from ..modules.utils import _get_file_name_base_on_postfix from copy import deepcopy from collections import defaultdict + class StaticEmbedding(TokenEmbedding): """ 别名::class:`fastNLP.embeddings.StaticEmbedding` :class:`fastNLP.embeddings.static_embedding.StaticEmbedding` @@ -55,15 +63,16 @@ class StaticEmbedding(TokenEmbedding): :param bool normalize: 是否对vector进行normalize,使得每个vector的norm为1。 :param int min_freq: Vocabulary词频数小于这个数量的word将被指向unk。 """ - def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', embedding_dim=-1, requires_grad: bool=True, + + def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en', embedding_dim=-1, requires_grad: bool = True, init_method=None, lower=False, dropout=0, word_dropout=0, normalize=False, min_freq=1, **kwargs): super(StaticEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - if embedding_dim>0: + if embedding_dim > 0: model_dir_or_name = None - + # 得到cache_path if model_dir_or_name is None: - assert embedding_dim>=1, "The dimension of embedding should be larger than 1." + assert embedding_dim >= 1, "The dimension of embedding should be larger than 1." embedding_dim = int(embedding_dim) model_path = None elif model_dir_or_name.lower() in PRETRAIN_STATIC_FILES: @@ -76,9 +85,9 @@ class StaticEmbedding(TokenEmbedding): model_path = _get_file_name_base_on_postfix(os.path.abspath(os.path.expanduser(model_dir_or_name)), '.txt') else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") - + # 根据min_freq缩小vocab - truncate_vocab = (vocab.min_freq is None and min_freq>1) or (vocab.min_freq and vocab.min_freq 1) or (vocab.min_freq and vocab.min_freq < min_freq) if truncate_vocab: truncated_vocab = deepcopy(vocab) truncated_vocab.min_freq = min_freq @@ -89,14 +98,14 @@ class StaticEmbedding(TokenEmbedding): lowered_word_count[word.lower()] += count for word in truncated_vocab.word_count.keys(): word_count = truncated_vocab.word_count[word] - if lowered_word_count[word.lower()]>=min_freq and word_count= min_freq and word_count < min_freq: + truncated_vocab.add_word_lst([word] * (min_freq - word_count), no_create_entry=truncated_vocab._is_word_no_create_entry(word)) - + # 只限制在train里面的词语使用min_freq筛选 if kwargs.get('only_train_min_freq', False) and model_dir_or_name is not None: for word in truncated_vocab.word_count.keys(): - if truncated_vocab._is_word_no_create_entry(word) and truncated_vocab.word_count[word]
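# ---- Usage sketch (editor's addition, not part of the patch above) ----------
# Hedged example for StaticEmbedding as refactored above. With embedding_dim > 0
# the model name is ignored and a randomly initialised matrix of that dimension
# is built (no download); passing a supported name such as the default 'en'
# would load pretrained vectors instead, and lower=True / min_freq would then
# merge case variants and redirect rare words to unk as described in the
# docstring.
import torch
from fastNLP import Vocabulary
from fastNLP.embeddings import StaticEmbedding

vocab = Vocabulary()
vocab.add_word_lst("the cat sat on the mat".split())

embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=50,
                        word_dropout=0.01, dropout=0.1)
words = torch.LongTensor([[vocab.to_index(w) for w in ["the", "cat", "sat"]]])
print(embed(words).size())        # (1, 3, 50)
# ------------------------------------------------------------------------------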