@@ -18,7 +18,6 @@ __all__ = [ | |||
"get_embeddings", | |||
] | |||
from .embedding import Embedding, TokenEmbedding | |||
from .static_embedding import StaticEmbedding | |||
from .elmo_embedding import ElmoEmbedding | |||
@@ -1,3 +1,12 @@ | |||
""" | |||
.. todo:: | |||
doc | |||
""" | |||
__all__ = [ | |||
"BertEmbedding", | |||
"BertWordPieceEncoder" | |||
] | |||
import os | |||
import collections | |||
@@ -13,6 +22,7 @@ from ..modules.encoder.bert import _WordPieceBertModel, BertModel, BertTokenizer | |||
from .contextual_embedding import ContextualEmbedding | |||
import warnings | |||
class BertEmbedding(ContextualEmbedding): | |||
""" | |||
别名::class:`fastNLP.embeddings.BertEmbedding` :class:`fastNLP.embeddings.bert_embedding.BertEmbedding` | |||
@@ -54,11 +64,12 @@ class BertEmbedding(ContextualEmbedding): | |||
word pieces后的内容,并将第512个word piece置为[SEP]。超过长度的部分的encode结果直接全部置零。一般仅有只使用[CLS] | |||
来进行分类的任务将auto_truncate置为True。 | |||
""" | |||
def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en-base-uncased', layers: str='-1', | |||
pool_method: str='first', word_dropout=0, dropout=0, include_cls_sep: bool=False, | |||
pooled_cls=True, requires_grad: bool=False, auto_truncate:bool=False): | |||
def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en-base-uncased', layers: str = '-1', | |||
pool_method: str = 'first', word_dropout=0, dropout=0, include_cls_sep: bool = False, | |||
pooled_cls=True, requires_grad: bool = False, auto_truncate: bool = False): | |||
super(BertEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) | |||
# 根据model_dir_or_name检查是否存在并下载 | |||
if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: | |||
if 'cn' in model_dir_or_name.lower() and pool_method not in ('first', 'last'): | |||
@@ -71,21 +82,21 @@ class BertEmbedding(ContextualEmbedding): | |||
model_dir = os.path.abspath(os.path.expanduser(model_dir_or_name)) | |||
else: | |||
raise ValueError(f"Cannot recognize {model_dir_or_name}.") | |||
self._word_sep_index = None | |||
if '[SEP]' in vocab: | |||
self._word_sep_index = vocab['[SEP]'] | |||
self.model = _WordBertModel(model_dir=model_dir, vocab=vocab, layers=layers, | |||
pool_method=pool_method, include_cls_sep=include_cls_sep, | |||
pooled_cls=pooled_cls, auto_truncate=auto_truncate, min_freq=2) | |||
self.requires_grad = requires_grad | |||
self._embed_size = len(self.model.layers)*self.model.encoder.hidden_size | |||
self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size | |||
def _delete_model_weights(self): | |||
del self.model | |||
def forward(self, words): | |||
""" | |||
计算words的bert embedding表示。计算之前会在每句话的开始增加[CLS]在结束增加[SEP], 并根据include_cls_sep判断要不要 | |||
@@ -100,9 +111,9 @@ class BertEmbedding(ContextualEmbedding): | |||
return self.dropout(outputs) | |||
outputs = self.model(words) | |||
outputs = torch.cat([*outputs], dim=-1) | |||
return self.dropout(outputs) | |||
def drop_word(self, words): | |||
""" | |||
按照设定随机将words设置为unknown_index。 | |||
@@ -120,7 +131,7 @@ class BertEmbedding(ContextualEmbedding): | |||
if self._word_sep_index: | |||
words.masked_fill_(sep_mask, self._word_sep_index) | |||
return words | |||
@property | |||
def requires_grad(self): | |||
""" | |||
@@ -129,12 +140,12 @@ class BertEmbedding(ContextualEmbedding): | |||
:return: | |||
""" | |||
requires_grads = set([param.requires_grad for name, param in self.named_parameters() | |||
if 'word_pieces_lengths' not in name]) | |||
if 'word_pieces_lengths' not in name]) | |||
if len(requires_grads) == 1: | |||
return requires_grads.pop() | |||
else: | |||
return None | |||
@requires_grad.setter | |||
def requires_grad(self, value): | |||
for name, param in self.named_parameters(): | |||
@@ -155,10 +166,11 @@ class BertWordPieceEncoder(nn.Module): | |||
:param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 | |||
:param bool requires_grad: 是否需要gradient。 | |||
""" | |||
def __init__(self, model_dir_or_name: str='en-base-uncased', layers: str='-1', pooled_cls: bool = False, | |||
word_dropout=0, dropout=0, requires_grad: bool=False): | |||
def __init__(self, model_dir_or_name: str = 'en-base-uncased', layers: str = '-1', pooled_cls: bool = False, | |||
word_dropout=0, dropout=0, requires_grad: bool = False): | |||
super().__init__() | |||
if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: | |||
model_url = _get_embedding_url('bert', model_dir_or_name.lower()) | |||
model_dir = cached_path(model_url, name='embedding') | |||
@@ -167,7 +179,7 @@ class BertWordPieceEncoder(nn.Module): | |||
model_dir = model_dir_or_name | |||
else: | |||
raise ValueError(f"Cannot recognize {model_dir_or_name}.") | |||
self.model = _WordPieceBertModel(model_dir=model_dir, layers=layers, pooled_cls=pooled_cls) | |||
self._sep_index = self.model._sep_index | |||
self._wordpiece_unk_index = self.model._wordpiece_unknown_index | |||
@@ -175,7 +187,7 @@ class BertWordPieceEncoder(nn.Module): | |||
self.requires_grad = requires_grad | |||
self.word_dropout = word_dropout | |||
self.dropout_layer = nn.Dropout(dropout) | |||
@property | |||
def requires_grad(self): | |||
""" | |||
@@ -187,24 +199,24 @@ class BertWordPieceEncoder(nn.Module): | |||
return requires_grads.pop() | |||
else: | |||
return None | |||
@requires_grad.setter | |||
def requires_grad(self, value): | |||
for name, param in self.named_parameters(): | |||
param.requires_grad = value | |||
@property | |||
def embed_size(self): | |||
return self._embed_size | |||
@property | |||
def embedding_dim(self): | |||
return self._embed_size | |||
@property | |||
def num_embedding(self): | |||
return self.model.encoder.config.vocab_size | |||
def index_datasets(self, *datasets, field_name, add_cls_sep=True): | |||
""" | |||
使用bert的tokenizer新生成word_pieces列加入到datasets中,并将他们设置为input,且将word_pieces这一列的pad value设置为了 | |||
@@ -216,7 +228,7 @@ class BertWordPieceEncoder(nn.Module): | |||
:return: | |||
""" | |||
self.model.index_dataset(*datasets, field_name=field_name, add_cls_sep=add_cls_sep) | |||
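A small sketch of the indexing workflow (illustrative only; it assumes the DataSet stores raw tokens in a field named 'words' and that the pretrained weights are available):
from fastNLP import DataSet
from fastNLP.embeddings import BertWordPieceEncoder

ds = DataSet({'words': [['this', 'is', 'a', 'demo', '.']]})
encoder = BertWordPieceEncoder(model_dir_or_name='en-base-uncased', layers='-1')
encoder.index_datasets(ds, field_name='words', add_cls_sep=True)  # adds a padded 'word_pieces' input field
# batches of the new 'word_pieces' column can then be passed to encoder(word_pieces)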
def forward(self, word_pieces, token_type_ids=None): | |||
""" | |||
计算words的bert embedding表示。传入的words中应该自行包含[CLS]与[SEP]的tag。 | |||
@@ -233,13 +245,13 @@ class BertWordPieceEncoder(nn.Module): | |||
token_type_ids = sep_mask_cumsum.fmod(2) | |||
if token_type_ids[0, 0].item(): # 如果开头是奇数,则需要flip一下结果,因为需要保证开头为0 | |||
token_type_ids = token_type_ids.eq(0).long() | |||
word_pieces = self.drop_word(word_pieces) | |||
outputs = self.model(word_pieces, token_type_ids) | |||
outputs = torch.cat([*outputs], dim=-1) | |||
return self.dropout_layer(outputs) | |||
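The flip/cumsum trick above can be checked in isolation; a toy sketch with made-up word-piece ids, using 102 as the [SEP] id as in the uncased BERT vocabulary:
import torch

word_pieces = torch.LongTensor([[101, 7, 8, 102, 9, 10, 11, 102]])  # two packed segments
sep_mask = word_pieces.eq(102)
sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1])
token_type_ids = sep_mask_cumsum.fmod(2)
if token_type_ids[0, 0].item():  # the first segment must be 0, so flip if it starts at 1
    token_type_ids = token_type_ids.eq(0).long()
print(token_type_ids)  # tensor([[0, 0, 0, 0, 1, 1, 1, 1]])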
def drop_word(self, words): | |||
""" | |||
按照设定随机将words设置为unknown_index。 | |||
@@ -260,10 +272,10 @@ class BertWordPieceEncoder(nn.Module): | |||
class _WordBertModel(nn.Module): | |||
def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1', pool_method:str='first', | |||
include_cls_sep:bool=False, pooled_cls:bool=False, auto_truncate:bool=False, min_freq=2): | |||
def __init__(self, model_dir: str, vocab: Vocabulary, layers: str = '-1', pool_method: str = 'first', | |||
include_cls_sep: bool = False, pooled_cls: bool = False, auto_truncate: bool = False, min_freq=2): | |||
super().__init__() | |||
self.tokenzier = BertTokenizer.from_pretrained(model_dir) | |||
self.encoder = BertModel.from_pretrained(model_dir) | |||
self._max_position_embeddings = self.encoder.config.max_position_embeddings | |||
@@ -271,23 +283,23 @@ class _WordBertModel(nn.Module): | |||
encoder_layer_number = len(self.encoder.encoder.layer) | |||
self.layers = list(map(int, layers.split(','))) | |||
for layer in self.layers: | |||
if layer<0: | |||
assert -layer<=encoder_layer_number, f"The layer index:{layer} is out of scope for " \ | |||
f"a bert model with {encoder_layer_number} layers." | |||
if layer < 0: | |||
assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \ | |||
f"a bert model with {encoder_layer_number} layers." | |||
else: | |||
assert layer<encoder_layer_number, f"The layer index:{layer} is out of scope for " \ | |||
f"a bert model with {encoder_layer_number} layers." | |||
assert layer < encoder_layer_number, f"The layer index:{layer} is out of scope for " \ | |||
f"a bert model with {encoder_layer_number} layers." | |||
assert pool_method in ('avg', 'max', 'first', 'last') | |||
self.pool_method = pool_method | |||
self.include_cls_sep = include_cls_sep | |||
self.pooled_cls = pooled_cls | |||
self.auto_truncate = auto_truncate | |||
# 将所有vocab中word的wordpiece计算出来, 需要额外考虑[CLS]和[SEP] | |||
print("Start to generating word pieces for word.") | |||
# 第一步统计出需要的word_piece, 然后创建新的embed和word_piece_vocab, 然后填入值 | |||
word_piece_dict = {'[CLS]':1, '[SEP]':1} # 用到的word_piece以及新增的 | |||
word_piece_dict = {'[CLS]': 1, '[SEP]': 1} # 用到的word_piece以及新增的 | |||
found_count = 0 | |||
self._has_sep_in_vocab = '[SEP]' in vocab # 用来判断传入的数据是否需要生成token_ids | |||
if '[sep]' in vocab: | |||
@@ -302,10 +314,11 @@ class _WordBertModel(nn.Module): | |||
elif index == vocab.unknown_idx: | |||
word = '[UNK]' | |||
word_pieces = self.tokenzier.wordpiece_tokenizer.tokenize(word) | |||
if len(word_pieces)==1: | |||
if len(word_pieces) == 1: | |||
if not vocab._is_word_no_create_entry(word): # 如果是train中的值, 但是却没有找到 | |||
if index!=vocab.unknown_idx and word_pieces[0]=='[UNK]': # 说明这个词不在原始的word里面 | |||
if vocab.word_count[word]>=min_freq and not vocab._is_word_no_create_entry(word): #出现次数大于这个次数才新增 | |||
if index != vocab.unknown_idx and word_pieces[0] == '[UNK]': # 说明这个词不在原始的word里面 | |||
if vocab.word_count[word] >= min_freq and not vocab._is_word_no_create_entry( | |||
word): # 出现次数大于这个次数才新增 | |||
word_piece_dict[word] = 1 # 新增一个值 | |||
continue | |||
for word_piece in word_pieces: | |||
@@ -327,7 +340,7 @@ class _WordBertModel(nn.Module): | |||
new_word_piece_vocab[token] = len(new_word_piece_vocab) | |||
self.tokenzier._reinit_on_new_vocab(new_word_piece_vocab) | |||
self.encoder.embeddings.word_embeddings = embed | |||
word_to_wordpieces = [] | |||
word_pieces_lengths = [] | |||
for word, index in vocab: | |||
@@ -347,7 +360,7 @@ class _WordBertModel(nn.Module): | |||
self.word_to_wordpieces = np.array(word_to_wordpieces) | |||
self.word_pieces_lengths = nn.Parameter(torch.LongTensor(word_pieces_lengths), requires_grad=False) | |||
print("Successfully generate word pieces.") | |||
def forward(self, words): | |||
""" | |||
@@ -358,34 +371,37 @@ class _WordBertModel(nn.Module): | |||
batch_size, max_word_len = words.size() | |||
word_mask = words.ne(self._word_pad_index) # 为1的地方有word | |||
seq_len = word_mask.sum(dim=-1) | |||
batch_word_pieces_length = self.word_pieces_lengths[words].masked_fill(word_mask.eq(0), 0) # batch_size x max_len | |||
batch_word_pieces_length = self.word_pieces_lengths[words].masked_fill(word_mask.eq(0), | |||
0) # batch_size x max_len | |||
word_pieces_lengths = batch_word_pieces_length.sum(dim=-1) # batch_size | |||
word_piece_length = batch_word_pieces_length.sum(dim=-1).max().item() # 表示word piece的长度(包括padding) | |||
if word_piece_length+2>self._max_position_embeddings: | |||
if word_piece_length + 2 > self._max_position_embeddings: | |||
if self.auto_truncate: | |||
word_pieces_lengths = word_pieces_lengths.masked_fill(word_pieces_lengths+2>self._max_position_embeddings, | |||
self._max_position_embeddings-2) | |||
word_pieces_lengths = word_pieces_lengths.masked_fill( | |||
word_pieces_lengths + 2 > self._max_position_embeddings, | |||
self._max_position_embeddings - 2) | |||
else: | |||
raise RuntimeError("After split words into word pieces, the lengths of word pieces are longer than the " | |||
f"maximum allowed sequence length:{self._max_position_embeddings} of bert.") | |||
raise RuntimeError( | |||
"After split words into word pieces, the lengths of word pieces are longer than the " | |||
f"maximum allowed sequence length:{self._max_position_embeddings} of bert.") | |||
# +2是由于需要加入[CLS]与[SEP] | |||
word_pieces = words.new_full((batch_size, min(word_piece_length+2, self._max_position_embeddings)), | |||
word_pieces = words.new_full((batch_size, min(word_piece_length + 2, self._max_position_embeddings)), | |||
fill_value=self._wordpiece_pad_index) | |||
attn_masks = torch.zeros_like(word_pieces) | |||
# 1. 获取words的word_pieces的id,以及对应的span范围 | |||
word_indexes = words.cpu().numpy() | |||
for i in range(batch_size): | |||
word_pieces_i = list(chain(*self.word_to_wordpieces[word_indexes[i, :seq_len[i]]])) | |||
if self.auto_truncate and len(word_pieces_i)>self._max_position_embeddings-2: | |||
word_pieces_i = word_pieces_i[:self._max_position_embeddings-2] | |||
word_pieces[i, 1:word_pieces_lengths[i]+1] = torch.LongTensor(word_pieces_i) | |||
attn_masks[i, :word_pieces_lengths[i]+2].fill_(1) | |||
if self.auto_truncate and len(word_pieces_i) > self._max_position_embeddings - 2: | |||
word_pieces_i = word_pieces_i[:self._max_position_embeddings - 2] | |||
word_pieces[i, 1:word_pieces_lengths[i] + 1] = torch.LongTensor(word_pieces_i) | |||
attn_masks[i, :word_pieces_lengths[i] + 2].fill_(1) | |||
# 添加[cls]和[sep] | |||
word_pieces[:, 0].fill_(self._cls_index) | |||
batch_indexes = torch.arange(batch_size).to(words) | |||
word_pieces[batch_indexes, word_pieces_lengths+1] = self._sep_index | |||
if self._has_sep_in_vocab: #但[SEP]在vocab中出现应该才会需要token_ids | |||
word_pieces[batch_indexes, word_pieces_lengths + 1] = self._sep_index | |||
if self._has_sep_in_vocab: # 但[SEP]在vocab中出现应该才会需要token_ids | |||
sep_mask = word_pieces.eq(self._sep_index) # batch_size x max_len | |||
sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1]) | |||
token_type_ids = sep_mask_cumsum.fmod(2) | |||
@@ -396,9 +412,9 @@ class _WordBertModel(nn.Module): | |||
# 2. 获取hidden的结果,根据word_pieces进行对应的pool计算 | |||
# all_outputs: [batch_size x max_len x hidden_size, batch_size x max_len x hidden_size, ...] | |||
bert_outputs, pooled_cls = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks, | |||
output_all_encoded_layers=True) | |||
output_all_encoded_layers=True) | |||
# output_layers = [self.layers] # len(self.layers) x batch_size x real_word_piece_length x hidden_size | |||
if self.include_cls_sep: | |||
outputs = bert_outputs[-1].new_zeros(len(self.layers), batch_size, max_word_len + 2, | |||
bert_outputs[-1].size(-1)) | |||
@@ -414,7 +430,7 @@ class _WordBertModel(nn.Module): | |||
real_word_piece_length = output_layer.size(1) - 2 | |||
if word_piece_length > real_word_piece_length: # 如果实际上是截取出来的 | |||
paddings = output_layer.new_zeros(batch_size, | |||
word_piece_length-real_word_piece_length, | |||
word_piece_length - real_word_piece_length, | |||
output_layer.size(2)) | |||
output_layer = torch.cat((output_layer, paddings), dim=1).contiguous() | |||
# 从word_piece collapse到word的表示 | |||
@@ -423,27 +439,27 @@ class _WordBertModel(nn.Module): | |||
if self.pool_method == 'first': | |||
for i in range(batch_size): | |||
i_word_pieces_cum_length = batch_word_pieces_cum_length[i, :seq_len[i]] # 每个word的start位置 | |||
outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length] # num_layer x batch_size x len x hidden_size | |||
outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[ | |||
i, i_word_pieces_cum_length] # num_layer x batch_size x len x hidden_size | |||
elif self.pool_method == 'last': | |||
for i in range(batch_size): | |||
i_word_pieces_cum_length = batch_word_pieces_cum_length[i, 1:seq_len[i]+1] - 1 # 每个word的end | |||
i_word_pieces_cum_length = batch_word_pieces_cum_length[i, 1:seq_len[i] + 1] - 1 # 每个word的end | |||
outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length] | |||
elif self.pool_method == 'max': | |||
for i in range(batch_size): | |||
for j in range(seq_len[i]): | |||
start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1] | |||
outputs[l_index, i, j+s_shift], _ = torch.max(truncate_output_layer[i, start:end], dim=-2) | |||
start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j + 1] | |||
outputs[l_index, i, j + s_shift], _ = torch.max(truncate_output_layer[i, start:end], dim=-2) | |||
else: | |||
for i in range(batch_size): | |||
for j in range(seq_len[i]): | |||
start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1] | |||
outputs[l_index, i, j+s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2) | |||
start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j + 1] | |||
outputs[l_index, i, j + s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2) | |||
if self.include_cls_sep: | |||
if l in (len(bert_outputs)-1, -1) and self.pooled_cls: | |||
if l in (len(bert_outputs) - 1, -1) and self.pooled_cls: | |||
outputs[l_index, :, 0] = pooled_cls | |||
else: | |||
outputs[l_index, :, 0] = output_layer[:, 0] | |||
outputs[l_index, batch_indexes, seq_len+s_shift] = output_layer[batch_indexes, seq_len+s_shift] | |||
outputs[l_index, batch_indexes, seq_len + s_shift] = output_layer[batch_indexes, seq_len + s_shift] | |||
# 3. 最终的embedding结果 | |||
return outputs | |||
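To make the pool_method branches above concrete, a self-contained toy rendering of 'first' and 'max' pooling over word pieces (hypothetical tensors and names, not the module's internal variables):
import torch

piece_reprs = torch.arange(6 * 4, dtype=torch.float).view(6, 4)  # 6 word pieces, hidden size 4
piece_lengths = torch.LongTensor([1, 2, 3])                      # a 3-word sentence
cum = torch.zeros(4, dtype=torch.long)
cum[1:] = piece_lengths.cumsum(dim=0)                            # word j spans pieces [cum[j], cum[j+1])

first_pool = piece_reprs[cum[:-1]]                               # pool_method='first'
max_pool = torch.stack([piece_reprs[s:e].max(dim=0)[0]           # pool_method='max'
                        for s, e in zip(cum[:-1].tolist(), cum[1:].tolist())])
print(first_pool.size(), max_pool.size())                        # torch.Size([3, 4]) twice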
@@ -3,6 +3,10 @@ | |||
词的index而不需要使用词语中的char的index来获取表达。 | |||
""" | |||
__all__ = [ | |||
"CNNCharEmbedding", | |||
"LSTMCharEmbedding" | |||
] | |||
import torch | |||
import torch.nn as nn | |||
@@ -16,6 +20,7 @@ from .embedding import TokenEmbedding | |||
from .utils import _construct_char_vocab_from_vocab | |||
from .utils import get_embeddings | |||
class CNNCharEmbedding(TokenEmbedding): | |||
""" | |||
别名::class:`fastNLP.embeddings.CNNCharEmbedding` :class:`fastNLP.embeddings.char_embedding.CNNCharEmbedding` | |||
@@ -49,14 +54,15 @@ class CNNCharEmbedding(TokenEmbedding): | |||
(文件夹下应该只有一个以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型, | |||
没有的话将自动下载。如果输入为None则使用embedding_dim的维度随机初始化一个embedding. | |||
""" | |||
def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0, | |||
dropout:float=0, filter_nums: List[int]=(40, 30, 20), kernel_sizes: List[int]=(5, 3, 1), | |||
pool_method: str='max', activation='relu', min_char_freq: int=2, pre_train_char_embed: str=None): | |||
def __init__(self, vocab: Vocabulary, embed_size: int = 50, char_emb_size: int = 50, word_dropout: float = 0, | |||
dropout: float = 0, filter_nums: List[int] = (40, 30, 20), kernel_sizes: List[int] = (5, 3, 1), | |||
pool_method: str = 'max', activation='relu', min_char_freq: int = 2, pre_train_char_embed: str = None): | |||
super(CNNCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) | |||
for kernel in kernel_sizes: | |||
assert kernel % 2 == 1, "Only odd kernel is allowed." | |||
assert pool_method in ('max', 'avg') | |||
self.pool_method = pool_method | |||
# activation function | |||
@@ -74,7 +80,7 @@ class CNNCharEmbedding(TokenEmbedding): | |||
else: | |||
raise Exception( | |||
"Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]") | |||
print("Start constructing character vocabulary.") | |||
# 建立char的词表 | |||
self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq) | |||
@@ -96,14 +102,14 @@ class CNNCharEmbedding(TokenEmbedding): | |||
self.char_embedding = StaticEmbedding(self.char_vocab, model_dir_or_name=pre_train_char_embed) | |||
else: | |||
self.char_embedding = get_embeddings((len(self.char_vocab), char_emb_size)) | |||
self.convs = nn.ModuleList([nn.Conv1d( | |||
char_emb_size, filter_nums[i], kernel_size=kernel_sizes[i], bias=True, padding=kernel_sizes[i] // 2) | |||
for i in range(len(kernel_sizes))]) | |||
self._embed_size = embed_size | |||
self.fc = nn.Linear(sum(filter_nums), embed_size) | |||
self.reset_parameters() | |||
def forward(self, words): | |||
""" | |||
输入words的index后,生成对应的words的表示。 | |||
@@ -114,14 +120,14 @@ class CNNCharEmbedding(TokenEmbedding): | |||
words = self.drop_word(words) | |||
batch_size, max_len = words.size() | |||
chars = self.words_to_chars_embedding[words] # batch_size x max_len x max_word_len | |||
word_lengths = self.word_lengths[words] # batch_size x max_len | |||
word_lengths = self.word_lengths[words] # batch_size x max_len | |||
max_word_len = word_lengths.max() | |||
chars = chars[:, :, :max_word_len] | |||
# 为1的地方为mask | |||
chars_masks = chars.eq(self.char_pad_index) # batch_size x max_len x max_word_len 如果为0, 说明是padding的位置了 | |||
chars = self.char_embedding(chars) # batch_size x max_len x max_word_len x embed_size | |||
chars = self.dropout(chars) | |||
reshaped_chars = chars.reshape(batch_size*max_len, max_word_len, -1) | |||
reshaped_chars = chars.reshape(batch_size * max_len, max_word_len, -1) | |||
reshaped_chars = reshaped_chars.transpose(1, 2) # B' x E x M | |||
conv_chars = [conv(reshaped_chars).transpose(1, 2).reshape(batch_size, max_len, max_word_len, -1) | |||
for conv in self.convs] | |||
@@ -129,13 +135,13 @@ class CNNCharEmbedding(TokenEmbedding): | |||
conv_chars = self.activation(conv_chars) | |||
if self.pool_method == 'max': | |||
conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf')) | |||
chars, _ = torch.max(conv_chars, dim=-2) # batch_size x max_len x sum(filters) | |||
chars, _ = torch.max(conv_chars, dim=-2) # batch_size x max_len x sum(filters) | |||
else: | |||
conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), 0) | |||
chars = torch.sum(conv_chars, dim=-2)/chars_masks.eq(0).sum(dim=-1, keepdim=True).float() | |||
chars = torch.sum(conv_chars, dim=-2) / chars_masks.eq(0).sum(dim=-1, keepdim=True).float() | |||
chars = self.fc(chars) | |||
return self.dropout(chars) | |||
@property | |||
def requires_grad(self): | |||
""" | |||
@@ -151,21 +157,21 @@ class CNNCharEmbedding(TokenEmbedding): | |||
return requires_grads.pop() | |||
else: | |||
return None | |||
@requires_grad.setter | |||
def requires_grad(self, value): | |||
for name, param in self.named_parameters(): | |||
if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能加入到requires_grad中 | |||
continue | |||
param.requires_grad = value | |||
def reset_parameters(self): | |||
for name, param in self.named_parameters(): | |||
if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能reset | |||
continue | |||
if 'char_embedding' in name: | |||
continue | |||
if param.data.dim()>1: | |||
if param.data.dim() > 1: | |||
nn.init.xavier_uniform_(param, 1) | |||
else: | |||
nn.init.uniform_(param, -1, 1) | |||
@@ -203,13 +209,15 @@ class LSTMCharEmbedding(TokenEmbedding): | |||
(文件夹下应该只有一个以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型, | |||
没有的话将自动下载。如果输入为None则使用embedding_dim的维度随机初始化一个embedding. | |||
""" | |||
def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0, | |||
dropout:float=0, hidden_size=50,pool_method: str='max', activation='relu', min_char_freq: int=2, | |||
bidirectional=True, pre_train_char_embed: str=None): | |||
def __init__(self, vocab: Vocabulary, embed_size: int = 50, char_emb_size: int = 50, word_dropout: float = 0, | |||
dropout: float = 0, hidden_size=50, pool_method: str = 'max', activation='relu', | |||
min_char_freq: int = 2, | |||
bidirectional=True, pre_train_char_embed: str = None): | |||
super(LSTMCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) | |||
assert hidden_size % 2 == 0, "Only even hidden_size is allowed." | |||
assert pool_method in ('max', 'avg') | |||
self.pool_method = pool_method | |||
# activation function | |||
@@ -227,7 +235,7 @@ class LSTMCharEmbedding(TokenEmbedding): | |||
else: | |||
raise Exception( | |||
"Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]") | |||
print("Start constructing character vocabulary.") | |||
# 建立char的词表 | |||
self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq) | |||
@@ -249,14 +257,14 @@ class LSTMCharEmbedding(TokenEmbedding): | |||
self.char_embedding = StaticEmbedding(self.char_vocab, pre_train_char_embed) | |||
else: | |||
self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) | |||
self.fc = nn.Linear(hidden_size, embed_size) | |||
hidden_size = hidden_size // 2 if bidirectional else hidden_size | |||
self.lstm = LSTM(char_emb_size, hidden_size, bidirectional=bidirectional, batch_first=True) | |||
self._embed_size = embed_size | |||
self.bidirectional = bidirectional | |||
def forward(self, words): | |||
""" | |||
输入words的index后,生成对应的words的表示。 | |||
@@ -278,7 +286,7 @@ class LSTMCharEmbedding(TokenEmbedding): | |||
char_seq_len = chars_masks.eq(0).sum(dim=-1).reshape(batch_size * max_len) | |||
lstm_chars = self.lstm(reshaped_chars, char_seq_len)[0].reshape(batch_size, max_len, max_word_len, -1) | |||
# B x M x M x H | |||
lstm_chars = self.activation(lstm_chars) | |||
if self.pool_method == 'max': | |||
lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf')) | |||
@@ -286,11 +294,11 @@ class LSTMCharEmbedding(TokenEmbedding): | |||
else: | |||
lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), 0) | |||
chars = torch.sum(lstm_chars, dim=-2) / chars_masks.eq(0).sum(dim=-1, keepdim=True).float() | |||
chars = self.fc(chars) | |||
return self.dropout(chars) | |||
@property | |||
def requires_grad(self): | |||
""" | |||
@@ -307,7 +315,7 @@ class LSTMCharEmbedding(TokenEmbedding): | |||
return requires_grads.pop() | |||
else: | |||
return None | |||
@requires_grad.setter | |||
def requires_grad(self, value): | |||
for name, param in self.named_parameters(): | |||
@@ -1,3 +1,12 @@ | |||
""" | |||
.. todo:: | |||
doc | |||
""" | |||
__all__ = [ | |||
"ContextualEmbedding" | |||
] | |||
from abc import abstractmethod | |||
import torch | |||
@@ -8,16 +17,12 @@ from ..core.sampler import SequentialSampler | |||
from ..core.utils import _move_model_to_device, _get_model_device | |||
from .embedding import TokenEmbedding | |||
__all__ = [ | |||
"ContextualEmbedding" | |||
] | |||
class ContextualEmbedding(TokenEmbedding): | |||
def __init__(self, vocab: Vocabulary, word_dropout:float=0.0, dropout:float=0.0): | |||
def __init__(self, vocab: Vocabulary, word_dropout: float = 0.0, dropout: float = 0.0): | |||
super(ContextualEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) | |||
def add_sentence_cache(self, *datasets, batch_size=32, device='cpu', delete_weights: bool=True): | |||
def add_sentence_cache(self, *datasets, batch_size=32, device='cpu', delete_weights: bool = True): | |||
""" | |||
由于动态embedding生成比较耗时,所以可以把每句话embedding缓存下来,这样就不需要每次都运行生成过程。 | |||
@@ -34,7 +39,7 @@ class ContextualEmbedding(TokenEmbedding): | |||
except Exception as e: | |||
print(f"Exception happens at {index} dataset.") | |||
raise e | |||
sent_embeds = {} | |||
_move_model_to_device(self, device=device) | |||
device = _get_model_device(self) | |||
@@ -54,7 +59,7 @@ class ContextualEmbedding(TokenEmbedding): | |||
word_embeds = self(words).detach().cpu().numpy() | |||
for b in range(words.size(0)): | |||
length = seq_len_from_behind[b] | |||
if length==0: | |||
if length == 0: | |||
sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b] | |||
else: | |||
sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b, :-length] | |||
@@ -65,7 +70,7 @@ class ContextualEmbedding(TokenEmbedding): | |||
self.sent_embeds = sent_embeds | |||
if delete_weights: | |||
self._delete_model_weights() | |||
def _get_sent_reprs(self, words): | |||
""" | |||
获取sentence的表示,如果有缓存,则返回缓存的值; 没有缓存则返回None | |||
@@ -88,12 +93,12 @@ class ContextualEmbedding(TokenEmbedding): | |||
embeds[i, :len(embed)] = torch.FloatTensor(embed).to(words.device) | |||
return embeds | |||
return None | |||
@abstractmethod | |||
def _delete_model_weights(self): | |||
"""删除计算表示的模型以节省资源""" | |||
raise NotImplementedError | |||
def remove_sentence_cache(self): | |||
""" | |||
删除缓存的句子表示. 删除之后如果模型权重没有被删除,将开始使用动态计算权重。 | |||
@@ -1,6 +1,13 @@ | |||
""" | |||
.. todo:: | |||
doc | |||
""" | |||
import os | |||
__all__ = [ | |||
"ElmoEmbedding" | |||
] | |||
import os | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
@@ -49,11 +56,11 @@ class ElmoEmbedding(ContextualEmbedding): | |||
:param cache_word_reprs: 可以选择对word的表示进行cache; 设置为True的话,将在初始化的时候为每个word生成对应的embedding, | |||
并删除character encoder,之后将直接使用cache的embedding。默认为False。 | |||
""" | |||
def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en', layers: str = '2', requires_grad: bool = False, | |||
word_dropout=0.0, dropout=0.0, cache_word_reprs: bool = False): | |||
super(ElmoEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) | |||
# 根据model_dir_or_name检查是否存在并下载 | |||
if model_dir_or_name.lower() in PRETRAINED_ELMO_MODEL_DIR: | |||
model_url = _get_embedding_url('elmo', model_dir_or_name.lower()) | |||
@@ -64,7 +71,7 @@ class ElmoEmbedding(ContextualEmbedding): | |||
else: | |||
raise ValueError(f"Cannot recognize {model_dir_or_name}.") | |||
self.model = _ElmoModel(model_dir, vocab, cache_word_reprs=cache_word_reprs) | |||
if layers == 'mix': | |||
self.layer_weights = nn.Parameter(torch.zeros(self.model.config['lstm']['n_layers'] + 1), | |||
requires_grad=requires_grad) | |||
@@ -79,16 +86,16 @@ class ElmoEmbedding(ContextualEmbedding): | |||
self.layers = layers | |||
self._get_outputs = self._get_layer_outputs | |||
self._embed_size = len(self.layers) * self.model.config['lstm']['projection_dim'] * 2 | |||
self.requires_grad = requires_grad | |||
def _get_mixed_outputs(self, outputs): | |||
# outputs: num_layers x batch_size x max_len x hidden_size | |||
# return: batch_size x max_len x hidden_size | |||
weights = F.softmax(self.layer_weights + 1 / len(outputs), dim=0).to(outputs) | |||
outputs = torch.einsum('l,lbij->bij', weights, outputs) | |||
return self.gamma.to(outputs) * outputs | |||
def set_mix_weights_requires_grad(self, flag=True): | |||
""" | |||
当初始化ElmoEmbedding时layers被设置为mix时,可以通过调用该方法设置mix weights是否可训练。如果layers不是mix,调用 | |||
@@ -100,15 +107,15 @@ class ElmoEmbedding(ContextualEmbedding): | |||
if hasattr(self, 'layer_weights'): | |||
self.layer_weights.requires_grad = flag | |||
self.gamma.requires_grad = flag | |||
def _get_layer_outputs(self, outputs): | |||
if len(self.layers) == 1: | |||
outputs = outputs[self.layers[0]] | |||
else: | |||
outputs = torch.cat(tuple([*outputs[self.layers]]), dim=-1) | |||
return outputs | |||
def forward(self, words: torch.LongTensor): | |||
""" | |||
计算words的elmo embedding表示。根据elmo文章中介绍的ELMO实际上是有2L+1层结果,但是为了让结果比较容易拆分,token的 | |||
@@ -125,12 +132,12 @@ class ElmoEmbedding(ContextualEmbedding): | |||
outputs = self.model(words) | |||
outputs = self._get_outputs(outputs) | |||
return self.dropout(outputs) | |||
def _delete_model_weights(self): | |||
for name in ['layers', 'model', 'layer_weights', 'gamma']: | |||
if hasattr(self, name): | |||
delattr(self, name) | |||
@property | |||
def requires_grad(self): | |||
""" | |||
@@ -144,7 +151,7 @@ class ElmoEmbedding(ContextualEmbedding): | |||
return requires_grads.pop() | |||
else: | |||
return None | |||
@requires_grad.setter | |||
def requires_grad(self, value): | |||
for name, param in self.named_parameters(): | |||
@@ -162,7 +169,7 @@ class _ElmoModel(nn.Module): | |||
(4) 设计一个保存token的embedding,允许缓存word的表示。 | |||
""" | |||
def __init__(self, model_dir: str, vocab: Vocabulary = None, cache_word_reprs: bool = False): | |||
super(_ElmoModel, self).__init__() | |||
self.model_dir = model_dir | |||
@@ -187,14 +194,14 @@ class _ElmoModel(nn.Module): | |||
config = json.load(config_f) | |||
self.weight_file = os.path.join(model_dir, weight_file) | |||
self.config = config | |||
OOV_TAG = '<oov>' | |||
PAD_TAG = '<pad>' | |||
BOS_TAG = '<bos>' | |||
EOS_TAG = '<eos>' | |||
BOW_TAG = '<bow>' | |||
EOW_TAG = '<eow>' | |||
# For the model trained with character-based word encoder. | |||
char_lexicon = {} | |||
with codecs.open(os.path.join(model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi: | |||
@@ -204,29 +211,29 @@ class _ElmoModel(nn.Module): | |||
tokens.insert(0, '\u3000') | |||
token, i = tokens | |||
char_lexicon[token] = int(i) | |||
# 做一些sanity check | |||
for special_word in [PAD_TAG, OOV_TAG, BOW_TAG, EOW_TAG]: | |||
assert special_word in char_lexicon, f"{special_word} not found in char.dic." | |||
# 从vocab中构建char_vocab | |||
char_vocab = Vocabulary(unknown=OOV_TAG, padding=PAD_TAG) | |||
# 需要保证<bow>与<eow>在里面 | |||
char_vocab.add_word_lst([BOW_TAG, EOW_TAG, BOS_TAG, EOS_TAG]) | |||
for word, index in vocab: | |||
char_vocab.add_word_lst(list(word)) | |||
self.bos_index, self.eos_index, self._pad_index = len(vocab), len(vocab) + 1, vocab.padding_idx | |||
# 根据char_lexicon调整, 多设置一位,是预留给word padding的(该位置的char表示为全0表示) | |||
char_emb_layer = nn.Embedding(len(char_vocab) + 1, int(config['char_cnn']['embedding']['dim']), | |||
padding_idx=len(char_vocab)) | |||
# 读入预训练权重 这里的elmo_model 包含char_cnn和 lstm 的 state_dict | |||
elmo_model = torch.load(os.path.join(self.model_dir, weight_file), map_location='cpu') | |||
char_embed_weights = elmo_model["char_cnn"]['char_emb_layer.weight'] | |||
found_char_count = 0 | |||
for char, index in char_vocab: # 调整character embedding | |||
if char in char_lexicon: | |||
@@ -235,11 +242,11 @@ class _ElmoModel(nn.Module): | |||
else: | |||
index_in_pre = char_lexicon[OOV_TAG] | |||
char_emb_layer.weight.data[index] = char_embed_weights[index_in_pre] | |||
print(f"{found_char_count} out of {len(char_vocab)} characters were found in pretrained elmo embedding.") | |||
# 生成words到chars的映射 | |||
max_chars = config['char_cnn']['max_characters_per_token'] | |||
self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab) + 2, max_chars), | |||
fill_value=len(char_vocab), | |||
dtype=torch.long), | |||
@@ -258,20 +265,20 @@ class _ElmoModel(nn.Module): | |||
char_vocab.to_index(EOW_TAG)] | |||
char_ids += [char_vocab.to_index(PAD_TAG)] * (max_chars - len(char_ids)) | |||
self.words_to_chars_embedding[index] = torch.LongTensor(char_ids) | |||
self.char_vocab = char_vocab | |||
self.token_embedder = ConvTokenEmbedder( | |||
config, self.weight_file, None, char_emb_layer) | |||
elmo_model["char_cnn"]['char_emb_layer.weight'] = char_emb_layer.weight | |||
self.token_embedder.load_state_dict(elmo_model["char_cnn"]) | |||
self.output_dim = config['lstm']['projection_dim'] | |||
# lstm encoder | |||
self.encoder = ElmobiLm(config) | |||
self.encoder.load_state_dict(elmo_model["lstm"]) | |||
if cache_word_reprs: | |||
if config['char_cnn']['embedding']['dim'] > 0: # 只有在使用了chars的情况下有用 | |||
print("Start to generate cache word representations.") | |||
@@ -280,7 +287,7 @@ class _ElmoModel(nn.Module): | |||
word_size = self.words_to_chars_embedding.size(0) | |||
num_batches = word_size // batch_size + \ | |||
int(word_size % batch_size != 0) | |||
self.cached_word_embedding = nn.Embedding(word_size, | |||
config['lstm']['projection_dim']) | |||
with torch.no_grad(): | |||
@@ -291,12 +298,12 @@ class _ElmoModel(nn.Module): | |||
word_reprs = self.token_embedder(words.unsqueeze(1), | |||
chars).detach() # batch_size x 1 x config['encoder']['projection_dim'] | |||
self.cached_word_embedding.weight.data[words] = word_reprs.squeeze(1) | |||
print("Finish generating cached word representations. Going to delete the character encoder.") | |||
del self.token_embedder, self.words_to_chars_embedding | |||
else: | |||
print("There is no need to cache word representations, since no character information is used.") | |||
def forward(self, words): | |||
""" | |||
@@ -321,7 +328,7 @@ class _ElmoModel(nn.Module): | |||
else: | |||
chars = None | |||
token_embedding = self.token_embedder(expanded_words, chars) # batch_size x max_len x embed_dim | |||
encoder_output = self.encoder(token_embedding, seq_len) | |||
if encoder_output.size(2) < max_len + 2: | |||
num_layers, _, output_len, hidden_size = encoder_output.size() | |||
@@ -332,7 +339,7 @@ class _ElmoModel(nn.Module): | |||
token_embedding = token_embedding.masked_fill(mask, 0) | |||
token_embedding = torch.cat((token_embedding, token_embedding), dim=2).view(1, sz[1], sz[2], sz[3]) | |||
encoder_output = torch.cat((token_embedding, encoder_output), dim=0) | |||
# 删除<eos>, <bos>. 这里没有精确地删除,但应该也不会影响最后的结果了。 | |||
encoder_output = encoder_output[:, :, 1:-1] | |||
return encoder_output |
@@ -3,6 +3,10 @@ | |||
""" | |||
__all__ = [ | |||
"Embedding", | |||
"TokenEmbedding" | |||
] | |||
import torch.nn as nn | |||
from abc import abstractmethod | |||
@@ -33,11 +37,11 @@ class Embedding(nn.Module): | |||
:param float dropout: 对Embedding的输出的dropout。 | |||
:param int unk_index: drop word时替换为的index。fastNLP的Vocabulary的unk_index默认为1。 | |||
""" | |||
def __init__(self, init_embed, word_dropout=0, dropout=0.0, unk_index=None): | |||
super(Embedding, self).__init__() | |||
self.embed = get_embeddings(init_embed) | |||
self.dropout = nn.Dropout(dropout) | |||
@@ -48,44 +52,44 @@ class Embedding(nn.Module): | |||
self._embed_size = self.embed.embedding_dim | |||
else: | |||
self._embed_size = self.embed.weight.size(1) | |||
if word_dropout>0 and not isinstance(unk_index, int): | |||
if word_dropout > 0 and not isinstance(unk_index, int): | |||
raise ValueError("When drop word is set, you need to pass in the unk_index.") | |||
else: | |||
self._embed_size = self.embed.embed_size | |||
unk_index = self.embed.get_word_vocab().unknown_idx | |||
self.unk_index = unk_index | |||
self.word_dropout = word_dropout | |||
def forward(self, words): | |||
""" | |||
:param torch.LongTensor words: [batch, seq_len] | |||
:return: torch.Tensor : [batch, seq_len, embed_dim] | |||
""" | |||
if self.word_dropout>0 and self.training: | |||
if self.word_dropout > 0 and self.training: | |||
mask = torch.ones_like(words).float() * self.word_dropout | |||
mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 | |||
words = words.masked_fill(mask, self.unk_index) | |||
words = self.embed(words) | |||
return self.dropout(words) | |||
@property | |||
def num_embedding(self)->int: | |||
def num_embedding(self) -> int: | |||
if isinstance(self.embed, nn.Embedding): | |||
return self.embed.weight.size(0) | |||
else: | |||
return self.embed.num_embedding | |||
def __len__(self): | |||
return len(self.embed) | |||
@property | |||
def embed_size(self) -> int: | |||
return self._embed_size | |||
@property | |||
def embedding_dim(self) -> int: | |||
return self._embed_size | |||
@property | |||
def requires_grad(self): | |||
""" | |||
@@ -96,14 +100,14 @@ class Embedding(nn.Module): | |||
return self.embed.weight.requires_grad | |||
else: | |||
return self.embed.requires_grad | |||
@requires_grad.setter | |||
def requires_grad(self, value): | |||
if not isinstance(self.embed, TokenEmbedding): | |||
self.embed.weight.requires_grad = value | |||
else: | |||
self.embed.requires_grad = value | |||
@property | |||
def size(self): | |||
if isinstance(self.embed, TokenEmbedding): | |||
@@ -120,12 +124,12 @@ class TokenEmbedding(nn.Module): | |||
assert vocab.padding is not None, "Vocabulary must have a padding entry." | |||
self._word_vocab = vocab | |||
self._word_pad_index = vocab.padding_idx | |||
if word_dropout>0: | |||
if word_dropout > 0: | |||
assert vocab.unknown is not None, "Vocabulary must have unknown entry when you want to drop a word." | |||
self.word_dropout = word_dropout | |||
self._word_unk_index = vocab.unknown_idx | |||
self.dropout_layer = nn.Dropout(dropout) | |||
def drop_word(self, words): | |||
""" | |||
按照设定随机将words设置为unknown_index。 | |||
@@ -138,7 +142,7 @@ class TokenEmbedding(nn.Module): | |||
mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 | |||
words = words.masked_fill(mask, self._word_unk_index) | |||
return words | |||
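The bernoulli masking above is easy to check on its own; a standalone sketch with made-up indices and a hypothetical dropout rate and unk index:
import torch

torch.manual_seed(0)
words = torch.LongTensor([[4, 5, 6, 7, 8, 9, 10, 11]])
word_dropout, unk_index = 0.3, 1
mask = torch.ones_like(words).float() * word_dropout
mask = torch.bernoulli(mask).eq(1)  # each position is independently selected with p=0.3
print(words.masked_fill(mask, unk_index))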
def dropout(self, words): | |||
""" | |||
对embedding后的word表示进行drop。 | |||
@@ -147,7 +151,7 @@ class TokenEmbedding(nn.Module): | |||
:return: | |||
""" | |||
return self.dropout_layer(words) | |||
@property | |||
def requires_grad(self): | |||
""" | |||
@@ -159,23 +163,23 @@ class TokenEmbedding(nn.Module): | |||
return requires_grads.pop() | |||
else: | |||
return None | |||
@requires_grad.setter | |||
def requires_grad(self, value): | |||
for param in self.parameters(): | |||
param.requires_grad = value | |||
def __len__(self): | |||
return len(self._word_vocab) | |||
@property | |||
def embed_size(self) -> int: | |||
return self._embed_size | |||
@property | |||
def embedding_dim(self) -> int: | |||
return self._embed_size | |||
@property | |||
def num_embedding(self) -> int: | |||
""" | |||
@@ -183,7 +187,7 @@ class TokenEmbedding(nn.Module): | |||
:return: | |||
""" | |||
return len(self._word_vocab) | |||
def get_word_vocab(self): | |||
""" | |||
返回embedding的词典。 | |||
@@ -191,11 +195,11 @@ class TokenEmbedding(nn.Module): | |||
:return: Vocabulary | |||
""" | |||
return self._word_vocab | |||
@property | |||
def size(self): | |||
return torch.Size(self.num_embedding, self._embed_size) | |||
@abstractmethod | |||
def forward(self, words): | |||
raise NotImplementedError |
@@ -1,3 +1,12 @@ | |||
""" | |||
.. todo:: | |||
doc | |||
""" | |||
__all__ = [ | |||
"StackEmbedding", | |||
] | |||
from typing import List | |||
import torch | |||
@@ -26,6 +35,7 @@ class StackEmbedding(TokenEmbedding): | |||
:param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 | |||
""" | |||
def __init__(self, embeds: List[TokenEmbedding], word_dropout=0, dropout=0): | |||
vocabs = [] | |||
for embed in embeds: | |||
@@ -34,14 +44,14 @@ class StackEmbedding(TokenEmbedding): | |||
_vocab = vocabs[0] | |||
for vocab in vocabs[1:]: | |||
assert vocab == _vocab, "All embeddings in StackEmbedding should use the same word vocabulary." | |||
super(StackEmbedding, self).__init__(_vocab, word_dropout=word_dropout, dropout=dropout) | |||
assert isinstance(embeds, list) | |||
for embed in embeds: | |||
assert isinstance(embed, TokenEmbedding), "Only TokenEmbedding type is supported." | |||
self.embeds = nn.ModuleList(embeds) | |||
self._embed_size = sum([embed.embed_size for embed in self.embeds]) | |||
def append(self, embed: TokenEmbedding): | |||
""" | |||
添加一个embedding到结尾。 | |||
@@ -50,18 +60,18 @@ class StackEmbedding(TokenEmbedding): | |||
""" | |||
assert isinstance(embed, TokenEmbedding) | |||
self.embeds.append(embed) | |||
def pop(self): | |||
""" | |||
弹出最后一个embed | |||
:return: | |||
""" | |||
return self.embeds.pop() | |||
@property | |||
def embed_size(self): | |||
return self._embed_size | |||
@property | |||
def requires_grad(self): | |||
""" | |||
@@ -73,12 +83,12 @@ class StackEmbedding(TokenEmbedding): | |||
return requires_grads.pop() | |||
else: | |||
return None | |||
@requires_grad.setter | |||
def requires_grad(self, value): | |||
for embed in self.embeds(): | |||
embed.requires_grad = value | |||
def forward(self, words): | |||
""" | |||
得到多个embedding的结果,并把结果按照顺序concat起来。 | |||
@@ -1,4 +1,11 @@ | |||
""" | |||
.. todo:: | |||
doc | |||
""" | |||
__all__ = [ | |||
"StaticEmbedding" | |||
] | |||
import os | |||
import torch | |||
@@ -13,6 +20,7 @@ from ..modules.utils import _get_file_name_base_on_postfix | |||
from copy import deepcopy | |||
from collections import defaultdict | |||
class StaticEmbedding(TokenEmbedding): | |||
""" | |||
别名::class:`fastNLP.embeddings.StaticEmbedding` :class:`fastNLP.embeddings.static_embedding.StaticEmbedding` | |||
@@ -55,15 +63,16 @@ class StaticEmbedding(TokenEmbedding): | |||
:param bool normalize: 是否对vector进行normalize,使得每个vector的norm为1。 | |||
:param int min_freq: Vocabulary词频数小于这个数量的word将被指向unk。 | |||
""" | |||
def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', embedding_dim=-1, requires_grad: bool=True, | |||
def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en', embedding_dim=-1, requires_grad: bool = True, | |||
init_method=None, lower=False, dropout=0, word_dropout=0, normalize=False, min_freq=1, **kwargs): | |||
super(StaticEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) | |||
if embedding_dim>0: | |||
if embedding_dim > 0: | |||
model_dir_or_name = None | |||
# 得到cache_path | |||
if model_dir_or_name is None: | |||
assert embedding_dim>=1, "The dimension of embedding should be larger than 1." | |||
assert embedding_dim >= 1, "The dimension of embedding should be larger than 1." | |||
embedding_dim = int(embedding_dim) | |||
model_path = None | |||
elif model_dir_or_name.lower() in PRETRAIN_STATIC_FILES: | |||
@@ -76,9 +85,9 @@ class StaticEmbedding(TokenEmbedding): | |||
model_path = _get_file_name_base_on_postfix(os.path.abspath(os.path.expanduser(model_dir_or_name)), '.txt') | |||
else: | |||
raise ValueError(f"Cannot recognize {model_dir_or_name}.") | |||
# 根据min_freq缩小vocab | |||
truncate_vocab = (vocab.min_freq is None and min_freq>1) or (vocab.min_freq and vocab.min_freq<min_freq) | |||
truncate_vocab = (vocab.min_freq is None and min_freq > 1) or (vocab.min_freq and vocab.min_freq < min_freq) | |||
if truncate_vocab: | |||
truncated_vocab = deepcopy(vocab) | |||
truncated_vocab.min_freq = min_freq | |||
@@ -89,14 +98,14 @@ class StaticEmbedding(TokenEmbedding): | |||
lowered_word_count[word.lower()] += count | |||
for word in truncated_vocab.word_count.keys(): | |||
word_count = truncated_vocab.word_count[word] | |||
if lowered_word_count[word.lower()]>=min_freq and word_count<min_freq: | |||
truncated_vocab.add_word_lst([word]*(min_freq-word_count), | |||
if lowered_word_count[word.lower()] >= min_freq and word_count < min_freq: | |||
truncated_vocab.add_word_lst([word] * (min_freq - word_count), | |||
no_create_entry=truncated_vocab._is_word_no_create_entry(word)) | |||
# 只限制在train里面的词语使用min_freq筛选 | |||
if kwargs.get('only_train_min_freq', False) and model_dir_or_name is not None: | |||
for word in truncated_vocab.word_count.keys(): | |||
if truncated_vocab._is_word_no_create_entry(word) and truncated_vocab.word_count[word]<min_freq: | |||
if truncated_vocab._is_word_no_create_entry(word) and truncated_vocab.word_count[word] < min_freq: | |||
truncated_vocab.add_word_lst([word] * (min_freq - truncated_vocab.word_count[word]), | |||
no_create_entry=True) | |||
truncated_vocab.build_vocab() | |||
@@ -105,7 +114,7 @@ class StaticEmbedding(TokenEmbedding): | |||
truncated_words_to_words[index] = truncated_vocab.to_index(word) | |||
print(f"{len(vocab) - len(truncated_vocab)} out of {len(vocab)} words have frequency less than {min_freq}.") | |||
vocab = truncated_vocab | |||
self.only_norm_found_vector = kwargs.get('only_norm_found_vector', False) | |||
# 读取embedding | |||
if lower: | |||
@@ -145,21 +154,21 @@ class StaticEmbedding(TokenEmbedding): | |||
self.words_to_words = nn.Parameter(torch.arange(len(vocab)).long(), requires_grad=False) | |||
if not self.only_norm_found_vector and normalize: | |||
embedding /= (torch.norm(embedding, dim=1, keepdim=True) + 1e-12) | |||
if truncate_vocab: | |||
for i in range(len(truncated_words_to_words)): | |||
index_in_truncated_vocab = truncated_words_to_words[i] | |||
truncated_words_to_words[i] = self.words_to_words[index_in_truncated_vocab] | |||
del self.words_to_words | |||
self.words_to_words = nn.Parameter(truncated_words_to_words, requires_grad=False) | |||
self.embedding = nn.Embedding(num_embeddings=embedding.shape[0], embedding_dim=embedding.shape[1], | |||
padding_idx=vocab.padding_idx, | |||
max_norm=None, norm_type=2, scale_grad_by_freq=False, | |||
sparse=False, _weight=embedding) | |||
self._embed_size = self.embedding.weight.size(1) | |||
self.requires_grad = requires_grad | |||
def _randomly_init_embed(self, num_embedding, embedding_dim, init_embed=None): | |||
""" | |||
@@ -169,14 +178,14 @@ class StaticEmbedding(TokenEmbedding): | |||
:return: torch.FloatTensor | |||
""" | |||
embed = torch.zeros(num_embedding, embedding_dim) | |||
if init_embed is None: | |||
nn.init.uniform_(embed, -np.sqrt(3/embedding_dim), np.sqrt(3/embedding_dim)) | |||
nn.init.uniform_(embed, -np.sqrt(3 / embedding_dim), np.sqrt(3 / embedding_dim)) | |||
else: | |||
init_embed(embed) | |||
return embed | |||
@property | |||
def requires_grad(self): | |||
""" | |||
@@ -190,14 +199,14 @@ class StaticEmbedding(TokenEmbedding): | |||
return requires_grads.pop() | |||
else: | |||
return None | |||
@requires_grad.setter | |||
def requires_grad(self, value): | |||
for name, param in self.named_parameters(): | |||
if 'words_to_words' in name: | |||
continue | |||
param.requires_grad = value | |||
def _load_with_vocab(self, embed_filepath, vocab, dtype=np.float32, padding='<pad>', unknown='<unk>', | |||
error='ignore', init_method=None): | |||
""" | |||
@@ -250,7 +259,7 @@ class StaticEmbedding(TokenEmbedding): | |||
index = vocab.to_index(word) | |||
matrix[index] = torch.from_numpy(np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim)) | |||
if self.only_norm_found_vector: | |||
matrix[index] = matrix[index]/np.linalg.norm(matrix[index]) | |||
matrix[index] = matrix[index] / np.linalg.norm(matrix[index]) | |||
found_count += 1 | |||
except Exception as e: | |||
if error == 'ignore': | |||
@@ -267,22 +276,22 @@ class StaticEmbedding(TokenEmbedding): | |||
matrix[index] = None | |||
# matrix中代表是需要建立entry的词 | |||
vectors = self._randomly_init_embed(len(matrix), dim, init_method) | |||
if vocab.unknown is None: # 创建一个专门的unknown | |||
unknown_idx = len(matrix) | |||
vectors = torch.cat((vectors, torch.zeros(1, dim)), dim=0).contiguous() | |||
else: | |||
unknown_idx = vocab.unknown_idx | |||
self.words_to_words = nn.Parameter(torch.full((len(vocab), ), fill_value=unknown_idx).long(), | |||
self.words_to_words = nn.Parameter(torch.full((len(vocab),), fill_value=unknown_idx).long(), | |||
requires_grad=False) | |||
for index, (index_in_vocab, vec) in enumerate(matrix.items()): | |||
if vec is not None: | |||
vectors[index] = vec | |||
self.words_to_words[index_in_vocab] = index | |||
return vectors | |||
def forward(self, words): | |||
""" | |||
传入words的index | |||
@@ -1,13 +1,19 @@ | |||
""" | |||
.. todo:: | |||
doc | |||
""" | |||
import numpy as np | |||
import torch | |||
from torch import nn as nn | |||
from ..core.vocabulary import Vocabulary | |||
__all__ = ['get_embeddings'] | |||
__all__ = [ | |||
'get_embeddings' | |||
] | |||
def _construct_char_vocab_from_vocab(vocab:Vocabulary, min_freq:int=1): | |||
def _construct_char_vocab_from_vocab(vocab: Vocabulary, min_freq: int = 1): | |||
""" | |||
给定一个word的vocabulary生成character的vocabulary. | |||
@@ -36,8 +42,8 @@ def get_embeddings(init_embed): | |||
if isinstance(init_embed, tuple): | |||
res = nn.Embedding( | |||
num_embeddings=init_embed[0], embedding_dim=init_embed[1]) | |||
nn.init.uniform_(res.weight.data, a=-np.sqrt(3/res.weight.data.size(1)), | |||
b=np.sqrt(3/res.weight.data.size(1))) | |||
nn.init.uniform_(res.weight.data, a=-np.sqrt(3 / res.weight.data.size(1)), | |||
b=np.sqrt(3 / res.weight.data.size(1))) | |||
elif isinstance(init_embed, nn.Module): | |||
res = init_embed | |||
elif isinstance(init_embed, torch.Tensor): | |||
@@ -48,4 +54,4 @@ def get_embeddings(init_embed): | |||
else: | |||
raise TypeError( | |||
'invalid init_embed type: {}'.format((type(init_embed)))) | |||
return res | |||
return res |
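A short sketch of the dispatch above (a tuple builds a fresh nn.Embedding, an nn.Module is returned unchanged, and a torch.Tensor is also accepted):
import torch.nn as nn
from fastNLP.embeddings import get_embeddings

embed = get_embeddings((1000, 50))             # tuple -> randomly initialised nn.Embedding(1000, 50)
same = get_embeddings(nn.Embedding(1000, 50))  # nn.Module -> returned as-is
print(isinstance(embed, nn.Embedding), embed.weight.size())  # True torch.Size([1000, 50])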