@@ -18,7 +18,6 @@ __all__ = [
     "get_embeddings",
 ]
 from .embedding import Embedding, TokenEmbedding
 from .static_embedding import StaticEmbedding
 from .elmo_embedding import ElmoEmbedding
@@ -1,3 +1,12 @@
+"""
+.. todo::
+    doc
+"""
+__all__ = [
+    "BertEmbedding",
+    "BertWordPieceEncoder"
+]
 import os
 import collections
@@ -13,6 +22,7 @@ from ..modules.encoder.bert import _WordPieceBertModel, BertModel, BertTokenizer
 from .contextual_embedding import ContextualEmbedding
 import warnings
 class BertEmbedding(ContextualEmbedding):
     """
     Alias: :class:`fastNLP.embeddings.BertEmbedding`  :class:`fastNLP.embeddings.bert_embedding.BertEmbedding`
@@ -54,11 +64,12 @@ class BertEmbedding(ContextualEmbedding):
         word pieces beyond that point and set the 512th word piece to [SEP]; the encoded results of the part beyond this
         length are simply set to zero. Generally, only tasks that classify using just [CLS] should set auto_truncate to True.
     """
-    def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en-base-uncased', layers: str='-1',
-                 pool_method: str='first', word_dropout=0, dropout=0, include_cls_sep: bool=False,
-                 pooled_cls=True, requires_grad: bool=False, auto_truncate:bool=False):
+    def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en-base-uncased', layers: str = '-1',
+                 pool_method: str = 'first', word_dropout=0, dropout=0, include_cls_sep: bool = False,
+                 pooled_cls=True, requires_grad: bool = False, auto_truncate: bool = False):
         super(BertEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
         # check model_dir_or_name and download the pretrained weights if necessary
         if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR:
             if 'cn' in model_dir_or_name.lower() and pool_method not in ('first', 'last'):
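For orientation, here is a minimal usage sketch of the constructor being reformatted above. It is an illustration, not part of the patch, and assumes fastNLP is installed, a toy vocabulary, and that the 'en-base-uncased' weights can be downloaded or are already cached.

```python
# Minimal sketch only; the vocabulary and sentences are placeholders.
import torch
from fastNLP import Vocabulary
from fastNLP.embeddings import BertEmbedding

vocab = Vocabulary()
vocab.add_word_lst("this is an example sentence".split())

# pool_method='first' keeps the first word piece of each word;
# include_cls_sep=False returns one vector per input word only.
embed = BertEmbedding(vocab, model_dir_or_name='en-base-uncased', layers='-1',
                      pool_method='first', include_cls_sep=False)

words = torch.LongTensor([[vocab.to_index(w) for w in "this is an example".split()]])
reprs = embed(words)  # shape: (1, 4, embed.embed_size)
```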
@@ -71,21 +82,21 @@ class BertEmbedding(ContextualEmbedding):
             model_dir = os.path.abspath(os.path.expanduser(model_dir_or_name))
         else:
             raise ValueError(f"Cannot recognize {model_dir_or_name}.")
         self._word_sep_index = None
         if '[SEP]' in vocab:
             self._word_sep_index = vocab['[SEP]']
         self.model = _WordBertModel(model_dir=model_dir, vocab=vocab, layers=layers,
                                     pool_method=pool_method, include_cls_sep=include_cls_sep,
                                     pooled_cls=pooled_cls, auto_truncate=auto_truncate, min_freq=2)
         self.requires_grad = requires_grad
-        self._embed_size = len(self.model.layers)*self.model.encoder.hidden_size
+        self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size
     def _delete_model_weights(self):
         del self.model
     def forward(self, words):
         """
         Compute the BERT embedding representation of words. Before computing, [CLS] is added at the start and [SEP] at
         the end of each sentence, and include_cls_sep decides whether to
@@ -100,9 +111,9 @@ class BertEmbedding(ContextualEmbedding):
             return self.dropout(outputs)
         outputs = self.model(words)
         outputs = torch.cat([*outputs], dim=-1)
         return self.dropout(outputs)
     def drop_word(self, words):
         """
         Randomly replace words with unknown_index according to the configured rate.
@@ -120,7 +131,7 @@ class BertEmbedding(ContextualEmbedding):
                 if self._word_sep_index:
                     words.masked_fill_(sep_mask, self._word_sep_index)
         return words
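The context above is the tail of `drop_word`, which restores [SEP] positions after word-level dropout. Below is a small standalone sketch of that pattern in plain PyTorch; the function name and arguments (`word_dropout`, `unk_index`, `sep_index`) are stand-ins for the module's attributes, not a library API.

```python
import torch

def drop_word(words: torch.LongTensor, word_dropout: float, unk_index: int, sep_index: int = None):
    """Randomly replace word indices with unk_index, but never touch [SEP]."""
    if word_dropout <= 0:
        return words
    with torch.no_grad():
        sep_mask = words.eq(sep_index) if sep_index is not None else None
        drop_mask = torch.bernoulli(torch.full_like(words, word_dropout, dtype=torch.float)).eq(1)
        words = words.masked_fill(drop_mask, unk_index)
        if sep_mask is not None:
            words = words.masked_fill(sep_mask, sep_index)  # restore [SEP] positions
    return words
```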
     @property
     def requires_grad(self):
         """
@@ -129,12 +140,12 @@ class BertEmbedding(ContextualEmbedding):
         :return:
         """
         requires_grads = set([param.requires_grad for name, param in self.named_parameters()
-                              if 'word_pieces_lengths' not in name])
+                              if 'word_pieces_lengths' not in name])
         if len(requires_grads) == 1:
             return requires_grads.pop()
         else:
             return None
     @requires_grad.setter
     def requires_grad(self, value):
         for name, param in self.named_parameters():
@@ -155,10 +166,11 @@ class BertWordPieceEncoder(nn.Module):
     :param float dropout: probability of applying dropout to the embedding output. 0.1 means 10% of the values are randomly set to 0.
     :param bool requires_grad: whether gradients are required.
     """
-    def __init__(self, model_dir_or_name: str='en-base-uncased', layers: str='-1', pooled_cls: bool = False,
-                 word_dropout=0, dropout=0, requires_grad: bool=False):
+    def __init__(self, model_dir_or_name: str = 'en-base-uncased', layers: str = '-1', pooled_cls: bool = False,
+                 word_dropout=0, dropout=0, requires_grad: bool = False):
         super().__init__()
         if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR:
             model_url = _get_embedding_url('bert', model_dir_or_name.lower())
             model_dir = cached_path(model_url, name='embedding')
@@ -167,7 +179,7 @@ class BertWordPieceEncoder(nn.Module):
             model_dir = model_dir_or_name
         else:
             raise ValueError(f"Cannot recognize {model_dir_or_name}.")
         self.model = _WordPieceBertModel(model_dir=model_dir, layers=layers, pooled_cls=pooled_cls)
         self._sep_index = self.model._sep_index
         self._wordpiece_unk_index = self.model._wordpiece_unknown_index
@@ -175,7 +187,7 @@ class BertWordPieceEncoder(nn.Module):
         self.requires_grad = requires_grad
         self.word_dropout = word_dropout
         self.dropout_layer = nn.Dropout(dropout)
     @property
     def requires_grad(self):
         """
@@ -187,24 +199,24 @@ class BertWordPieceEncoder(nn.Module):
             return requires_grads.pop()
         else:
             return None
     @requires_grad.setter
     def requires_grad(self, value):
         for name, param in self.named_parameters():
             param.requires_grad = value
     @property
     def embed_size(self):
         return self._embed_size
     @property
     def embedding_dim(self):
         return self._embed_size
     @property
     def num_embedding(self):
         return self.model.encoder.config.vocab_size
     def index_datasets(self, *datasets, field_name, add_cls_sep=True):
         """
         Use the BERT tokenizer to generate a new word_pieces field, add it to the datasets and set it as input, and set
         the pad value of the word_pieces field to
@@ -216,7 +228,7 @@ class BertWordPieceEncoder(nn.Module):
         :return:
         """
         self.model.index_dataset(*datasets, field_name=field_name, add_cls_sep=add_cls_sep)
     def forward(self, word_pieces, token_type_ids=None):
         """
         Compute the BERT embedding representation of words. The input words should already contain the [CLS] and [SEP] tags.
@@ -233,13 +245,13 @@ class BertWordPieceEncoder(nn.Module):
                 token_type_ids = sep_mask_cumsum.fmod(2)
                 if token_type_ids[0, 0].item():  # if the first position is odd, flip the result: the first segment must be 0
                     token_type_ids = token_type_ids.eq(0).long()
         word_pieces = self.drop_word(word_pieces)
         outputs = self.model(word_pieces, token_type_ids)
         outputs = torch.cat([*outputs], dim=-1)
         return self.dropout_layer(outputs)
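The reversed cumulative sum above derives segment ids directly from the [SEP] positions. Here is a small, self-contained illustration of that trick in plain PyTorch; the indices are toy values (`102` stands in for the [SEP] id).

```python
import torch

sep_index = 102  # example [SEP] id
# [CLS] a b [SEP] c d [SEP]  ->  segment 0 for the first sentence, 1 for the second
word_pieces = torch.tensor([[101, 7, 8, 102, 9, 10, 102]])

sep_mask = word_pieces.eq(sep_index)                                       # positions of [SEP]
sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1])  # count of SEPs at or after each position
token_type_ids = sep_mask_cumsum.fmod(2)
if token_type_ids[0, 0].item():        # first position must belong to segment 0
    token_type_ids = token_type_ids.eq(0).long()

print(token_type_ids)  # tensor([[0, 0, 0, 0, 1, 1, 1]])
```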
     def drop_word(self, words):
         """
         Randomly replace words with unknown_index according to the configured rate.
@@ -260,10 +272,10 @@ class BertWordPieceEncoder(nn.Module):
 class _WordBertModel(nn.Module):
-    def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1', pool_method:str='first',
-                 include_cls_sep:bool=False, pooled_cls:bool=False, auto_truncate:bool=False, min_freq=2):
+    def __init__(self, model_dir: str, vocab: Vocabulary, layers: str = '-1', pool_method: str = 'first',
+                 include_cls_sep: bool = False, pooled_cls: bool = False, auto_truncate: bool = False, min_freq=2):
         super().__init__()
         self.tokenzier = BertTokenizer.from_pretrained(model_dir)
         self.encoder = BertModel.from_pretrained(model_dir)
         self._max_position_embeddings = self.encoder.config.max_position_embeddings
@@ -271,23 +283,23 @@ class _WordBertModel(nn.Module):
         encoder_layer_number = len(self.encoder.encoder.layer)
         self.layers = list(map(int, layers.split(',')))
         for layer in self.layers:
-            if layer<0:
-                assert -layer<=encoder_layer_number, f"The layer index:{layer} is out of scope for " \
-                    f"a bert model with {encoder_layer_number} layers."
+            if layer < 0:
+                assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
+                    f"a bert model with {encoder_layer_number} layers."
             else:
-                assert layer<encoder_layer_number, f"The layer index:{layer} is out of scope for " \
-                    f"a bert model with {encoder_layer_number} layers."
+                assert layer < encoder_layer_number, f"The layer index:{layer} is out of scope for " \
+                    f"a bert model with {encoder_layer_number} layers."
         assert pool_method in ('avg', 'max', 'first', 'last')
         self.pool_method = pool_method
         self.include_cls_sep = include_cls_sep
         self.pooled_cls = pooled_cls
         self.auto_truncate = auto_truncate
         # compute the word pieces of every word in the vocab; [CLS] and [SEP] need extra handling
         print("Start to generating word pieces for word.")
         # step 1: collect the word pieces that are needed, then build the new embed and word_piece_vocab and fill in the values
-        word_piece_dict = {'[CLS]':1, '[SEP]':1}  # word pieces in use plus newly added ones
+        word_piece_dict = {'[CLS]': 1, '[SEP]': 1}  # word pieces in use plus newly added ones
         found_count = 0
         self._has_sep_in_vocab = '[SEP]' in vocab  # used to decide whether the input data needs token_type_ids
         if '[sep]' in vocab:
@@ -302,10 +314,11 @@ class _WordBertModel(nn.Module):
             elif index == vocab.unknown_idx:
                 word = '[UNK]'
             word_pieces = self.tokenzier.wordpiece_tokenizer.tokenize(word)
-            if len(word_pieces)==1:
+            if len(word_pieces) == 1:
                 if not vocab._is_word_no_create_entry(word):  # the word appears in train, but was not found
-                    if index!=vocab.unknown_idx and word_pieces[0]=='[UNK]':  # the word is not in the original BERT vocab
-                        if vocab.word_count[word]>=min_freq and not vocab._is_word_no_create_entry(word):  # only add it if it occurs at least min_freq times
+                    if index != vocab.unknown_idx and word_pieces[0] == '[UNK]':  # the word is not in the original BERT vocab
+                        if vocab.word_count[word] >= min_freq and not vocab._is_word_no_create_entry(
+                                word):  # only add it if it occurs at least min_freq times
                             word_piece_dict[word] = 1  # add a new entry
                             continue
             for word_piece in word_pieces:
@@ -327,7 +340,7 @@ class _WordBertModel(nn.Module):
             new_word_piece_vocab[token] = len(new_word_piece_vocab)
         self.tokenzier._reinit_on_new_vocab(new_word_piece_vocab)
         self.encoder.embeddings.word_embeddings = embed
         word_to_wordpieces = []
         word_pieces_lengths = []
         for word, index in vocab:
@@ -347,7 +360,7 @@ class _WordBertModel(nn.Module):
         self.word_to_wordpieces = np.array(word_to_wordpieces)
         self.word_pieces_lengths = nn.Parameter(torch.LongTensor(word_pieces_lengths), requires_grad=False)
         print("Successfully generate word pieces.")
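The loop summarized above records, for every vocabulary word, its word-piece ids and its word-piece length. A toy illustration of that mapping follows; `tokenize_to_ids` is a hypothetical stand-in for the bundled wordpiece tokenizer plus vocab lookup, and the id values are made up.

```python
import numpy as np
import torch

def tokenize_to_ids(word):
    # hypothetical stand-in for wordpiece tokenization + vocab lookup
    table = {'the': [5], 'unhappiness': [12, 87, 93]}   # e.g. 'un', '##happi', '##ness'
    return table.get(word, [100])                       # 100 = [UNK]

vocab_words = ['the', 'unhappiness', 'zzzzz']
word_to_wordpieces = [tokenize_to_ids(w) for w in vocab_words]
word_pieces_lengths = torch.LongTensor([len(wp) for wp in word_to_wordpieces])

word_to_wordpieces = np.array(word_to_wordpieces, dtype=object)  # ragged rows, one entry per word
print(word_pieces_lengths)  # tensor([1, 3, 1])
```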
     def forward(self, words):
         """
@@ -358,34 +371,37 @@ class _WordBertModel(nn.Module):
         batch_size, max_word_len = words.size()
         word_mask = words.ne(self._word_pad_index)  # positions with 1 contain a word
         seq_len = word_mask.sum(dim=-1)
-        batch_word_pieces_length = self.word_pieces_lengths[words].masked_fill(word_mask.eq(0), 0)  # batch_size x max_len
+        batch_word_pieces_length = self.word_pieces_lengths[words].masked_fill(word_mask.eq(0),
+                                                                               0)  # batch_size x max_len
         word_pieces_lengths = batch_word_pieces_length.sum(dim=-1)  # batch_size
         word_piece_length = batch_word_pieces_length.sum(dim=-1).max().item()  # length of the word pieces (including padding)
-        if word_piece_length+2>self._max_position_embeddings:
+        if word_piece_length + 2 > self._max_position_embeddings:
             if self.auto_truncate:
-                word_pieces_lengths = word_pieces_lengths.masked_fill(word_pieces_lengths+2>self._max_position_embeddings,
-                                                                      self._max_position_embeddings-2)
+                word_pieces_lengths = word_pieces_lengths.masked_fill(
+                    word_pieces_lengths + 2 > self._max_position_embeddings,
+                    self._max_position_embeddings - 2)
             else:
-                raise RuntimeError("After split words into word pieces, the lengths of word pieces are longer than the "
-                                   f"maximum allowed sequence length:{self._max_position_embeddings} of bert.")
+                raise RuntimeError(
+                    "After split words into word pieces, the lengths of word pieces are longer than the "
+                    f"maximum allowed sequence length:{self._max_position_embeddings} of bert.")
         # +2 because [CLS] and [SEP] need to be added
-        word_pieces = words.new_full((batch_size, min(word_piece_length+2, self._max_position_embeddings)),
+        word_pieces = words.new_full((batch_size, min(word_piece_length + 2, self._max_position_embeddings)),
                                      fill_value=self._wordpiece_pad_index)
         attn_masks = torch.zeros_like(word_pieces)
         # 1. get the word_piece ids of words and the corresponding span ranges
         word_indexes = words.cpu().numpy()
         for i in range(batch_size):
             word_pieces_i = list(chain(*self.word_to_wordpieces[word_indexes[i, :seq_len[i]]]))
-            if self.auto_truncate and len(word_pieces_i)>self._max_position_embeddings-2:
-                word_pieces_i = word_pieces_i[:self._max_position_embeddings-2]
-            word_pieces[i, 1:word_pieces_lengths[i]+1] = torch.LongTensor(word_pieces_i)
-            attn_masks[i, :word_pieces_lengths[i]+2].fill_(1)
+            if self.auto_truncate and len(word_pieces_i) > self._max_position_embeddings - 2:
+                word_pieces_i = word_pieces_i[:self._max_position_embeddings - 2]
+            word_pieces[i, 1:word_pieces_lengths[i] + 1] = torch.LongTensor(word_pieces_i)
+            attn_masks[i, :word_pieces_lengths[i] + 2].fill_(1)
         # add [cls] and [sep]
         word_pieces[:, 0].fill_(self._cls_index)
         batch_indexes = torch.arange(batch_size).to(words)
-        word_pieces[batch_indexes, word_pieces_lengths+1] = self._sep_index
-        if self._has_sep_in_vocab:  # token_type_ids should only be needed when [SEP] appears in the vocab
+        word_pieces[batch_indexes, word_pieces_lengths + 1] = self._sep_index
+        if self._has_sep_in_vocab:  # token_type_ids should only be needed when [SEP] appears in the vocab
             sep_mask = word_pieces.eq(self._sep_index)  # batch_size x max_len
             sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1])
             token_type_ids = sep_mask_cumsum.fmod(2)
@@ -396,9 +412,9 @@ class _WordBertModel(nn.Module):
         # 2. get the hidden results and pool over word_pieces accordingly
         # all_outputs: [batch_size x max_len x hidden_size, batch_size x max_len x hidden_size, ...]
         bert_outputs, pooled_cls = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks,
-                                                output_all_encoded_layers=True)
+                                                output_all_encoded_layers=True)
         # output_layers = [self.layers]  # len(self.layers) x batch_size x real_word_piece_length x hidden_size
         if self.include_cls_sep:
             outputs = bert_outputs[-1].new_zeros(len(self.layers), batch_size, max_word_len + 2,
                                                  bert_outputs[-1].size(-1))
@@ -414,7 +430,7 @@ class _WordBertModel(nn.Module):
             real_word_piece_length = output_layer.size(1) - 2
             if word_piece_length > real_word_piece_length:  # the sequence was actually truncated
                 paddings = output_layer.new_zeros(batch_size,
-                                                  word_piece_length-real_word_piece_length,
+                                                  word_piece_length - real_word_piece_length,
                                                   output_layer.size(2))
                 output_layer = torch.cat((output_layer, paddings), dim=1).contiguous()
             # collapse from word_piece representations to word representations
@@ -423,27 +439,27 @@ class _WordBertModel(nn.Module):
             if self.pool_method == 'first':
                 for i in range(batch_size):
                     i_word_pieces_cum_length = batch_word_pieces_cum_length[i, :seq_len[i]]  # start position of each word
-                    outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length]  # num_layer x batch_size x len x hidden_size
+                    outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[
+                        i, i_word_pieces_cum_length]  # num_layer x batch_size x len x hidden_size
             elif self.pool_method == 'last':
                 for i in range(batch_size):
-                    i_word_pieces_cum_length = batch_word_pieces_cum_length[i, 1:seq_len[i]+1] - 1  # end of each word
+                    i_word_pieces_cum_length = batch_word_pieces_cum_length[i, 1:seq_len[i] + 1] - 1  # end of each word
                     outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length]
             elif self.pool_method == 'max':
                 for i in range(batch_size):
                     for j in range(seq_len[i]):
-                        start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1]
-                        outputs[l_index, i, j+s_shift], _ = torch.max(truncate_output_layer[i, start:end], dim=-2)
+                        start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j + 1]
+                        outputs[l_index, i, j + s_shift], _ = torch.max(truncate_output_layer[i, start:end], dim=-2)
             else:
                 for i in range(batch_size):
                     for j in range(seq_len[i]):
-                        start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1]
-                        outputs[l_index, i, j+s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2)
+                        start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j + 1]
+                        outputs[l_index, i, j + s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2)
             if self.include_cls_sep:
-                if l in (len(bert_outputs)-1, -1) and self.pooled_cls:
+                if l in (len(bert_outputs) - 1, -1) and self.pooled_cls:
                     outputs[l_index, :, 0] = pooled_cls
                 else:
                     outputs[l_index, :, 0] = output_layer[:, 0]
-                outputs[l_index, batch_indexes, seq_len+s_shift] = output_layer[batch_indexes, seq_len+s_shift]
+                outputs[l_index, batch_indexes, seq_len + s_shift] = output_layer[batch_indexes, seq_len + s_shift]
         # 3. final embedding result
         return outputs
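To make the cumulative-length indexing above concrete, here is a small standalone sketch of 'first' and 'last' pooling: given per-word word-piece lengths, the cumulative sum gives each word's start (and end) position inside the piece sequence after [CLS]/[SEP] have been stripped. Toy numbers only.

```python
import torch

# one sentence: 3 words split into 1, 3 and 2 word pieces respectively
word_piece_lengths = torch.tensor([1, 3, 2])
piece_reprs = torch.arange(6).float().unsqueeze(-1)   # 6 word pieces, 1-dim "hidden" states

# cumulative start offsets: word k starts at the sum of lengths of words < k
cum_length = torch.zeros(len(word_piece_lengths) + 1, dtype=torch.long)
cum_length[1:] = word_piece_lengths.cumsum(dim=0)

first_pool = piece_reprs[cum_length[:-1]]              # pieces 0, 1, 4
last_pool = piece_reprs[cum_length[1:] - 1]            # pieces 0, 3, 5
print(first_pool.squeeze(-1))                          # tensor([0., 1., 4.])
print(last_pool.squeeze(-1))                           # tensor([0., 3., 5.])
```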
@@ -3,6 +3,10 @@
 the index of the word rather than the indices of the characters inside the word to obtain the representation.
 """
+__all__ = [
+    "CNNCharEmbedding",
+    "LSTMCharEmbedding"
+]
 import torch
 import torch.nn as nn
@@ -16,6 +20,7 @@ from .embedding import TokenEmbedding
 from .utils import _construct_char_vocab_from_vocab
 from .utils import get_embeddings
 class CNNCharEmbedding(TokenEmbedding):
     """
     Alias: :class:`fastNLP.embeddings.CNNCharEmbedding`  :class:`fastNLP.embeddings.char_embedding.CNNCharEmbedding`
@@ -49,14 +54,15 @@ class CNNCharEmbedding(TokenEmbedding):
     (the folder should contain exactly one file with the .txt suffix) or a file path; the second is to pass the name of
         the embedding, in which case the cache is checked for the model and it is downloaded automatically if absent.
         If None, an embedding is randomly initialized with dimension embedding_dim.
     """
-    def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0,
-                 dropout:float=0, filter_nums: List[int]=(40, 30, 20), kernel_sizes: List[int]=(5, 3, 1),
-                 pool_method: str='max', activation='relu', min_char_freq: int=2, pre_train_char_embed: str=None):
+    def __init__(self, vocab: Vocabulary, embed_size: int = 50, char_emb_size: int = 50, word_dropout: float = 0,
+                 dropout: float = 0, filter_nums: List[int] = (40, 30, 20), kernel_sizes: List[int] = (5, 3, 1),
+                 pool_method: str = 'max', activation='relu', min_char_freq: int = 2, pre_train_char_embed: str = None):
         super(CNNCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
         for kernel in kernel_sizes:
             assert kernel % 2 == 1, "Only odd kernel is allowed."
         assert pool_method in ('max', 'avg')
         self.pool_method = pool_method
         # activation function
@@ -74,7 +80,7 @@ class CNNCharEmbedding(TokenEmbedding):
         else:
             raise Exception(
                 "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]")
         print("Start constructing character vocabulary.")
         # build the character vocabulary
         self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq)
@@ -96,14 +102,14 @@ class CNNCharEmbedding(TokenEmbedding):
             self.char_embedding = StaticEmbedding(self.char_vocab, model_dir_or_name=pre_train_char_embed)
         else:
             self.char_embedding = get_embeddings((len(self.char_vocab), char_emb_size))
         self.convs = nn.ModuleList([nn.Conv1d(
             char_emb_size, filter_nums[i], kernel_size=kernel_sizes[i], bias=True, padding=kernel_sizes[i] // 2)
             for i in range(len(kernel_sizes))])
         self._embed_size = embed_size
         self.fc = nn.Linear(sum(filter_nums), embed_size)
         self.reset_parameters()
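As a quick orientation for the constructor above, here is a hedged usage sketch. It assumes a fastNLP Vocabulary built from training words; the sentences are placeholders and the filter/kernel settings are simply the defaults from the signature.

```python
import torch
from fastNLP import Vocabulary
from fastNLP.embeddings import CNNCharEmbedding

vocab = Vocabulary()
vocab.add_word_lst("the quick brown fox".split())

# characters of each word are embedded, run through CNNs with kernel sizes (5, 3, 1),
# max-pooled over character positions and projected to embed_size
char_embed = CNNCharEmbedding(vocab, embed_size=50, char_emb_size=50, pool_method='max')

words = torch.LongTensor([[vocab.to_index(w) for w in "the quick fox".split()]])
out = char_embed(words)  # shape: (1, 3, 50)
```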
     def forward(self, words):
         """
         Given the indices of words, produce the corresponding word representations.
@@ -114,14 +120,14 @@ class CNNCharEmbedding(TokenEmbedding):
         words = self.drop_word(words)
         batch_size, max_len = words.size()
         chars = self.words_to_chars_embedding[words]  # batch_size x max_len x max_word_len
-        word_lengths = self.word_lengths[words] # batch_size x max_len
+        word_lengths = self.word_lengths[words]  # batch_size x max_len
         max_word_len = word_lengths.max()
         chars = chars[:, :, :max_word_len]
         # positions with 1 are masked
         chars_masks = chars.eq(self.char_pad_index)  # batch_size x max_len x max_word_len; if 0, the position is padding
         chars = self.char_embedding(chars)  # batch_size x max_len x max_word_len x embed_size
         chars = self.dropout(chars)
-        reshaped_chars = chars.reshape(batch_size*max_len, max_word_len, -1)
+        reshaped_chars = chars.reshape(batch_size * max_len, max_word_len, -1)
         reshaped_chars = reshaped_chars.transpose(1, 2)  # B' x E x M
         conv_chars = [conv(reshaped_chars).transpose(1, 2).reshape(batch_size, max_len, max_word_len, -1)
                       for conv in self.convs]
@@ -129,13 +135,13 @@ class CNNCharEmbedding(TokenEmbedding):
         conv_chars = self.activation(conv_chars)
         if self.pool_method == 'max':
             conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf'))
-            chars, _ = torch.max(conv_chars, dim=-2) # batch_size x max_len x sum(filters)
+            chars, _ = torch.max(conv_chars, dim=-2)  # batch_size x max_len x sum(filters)
         else:
             conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), 0)
-            chars = torch.sum(conv_chars, dim=-2)/chars_masks.eq(0).sum(dim=-1, keepdim=True).float()
+            chars = torch.sum(conv_chars, dim=-2) / chars_masks.eq(0).sum(dim=-1, keepdim=True).float()
         chars = self.fc(chars)
         return self.dropout(chars)
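The forward pass above folds the word dimension into the batch so that a 1-d convolution can run over characters. A compact illustration of that reshape pattern with dummy tensors (shapes only; the channel counts are arbitrary):

```python
import torch
import torch.nn as nn

batch_size, max_len, max_word_len, char_emb_size = 2, 5, 7, 16
chars = torch.randn(batch_size, max_len, max_word_len, char_emb_size)

conv = nn.Conv1d(char_emb_size, 30, kernel_size=3, padding=1)     # padding keeps length M

reshaped = chars.reshape(batch_size * max_len, max_word_len, -1)  # (B*L, M, E)
reshaped = reshaped.transpose(1, 2)                               # (B*L, E, M) for Conv1d
conv_out = conv(reshaped)                                         # (B*L, 30, M)
conv_out = conv_out.transpose(1, 2).reshape(batch_size, max_len, max_word_len, -1)
print(conv_out.shape)  # torch.Size([2, 5, 7, 30])
```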
     @property
     def requires_grad(self):
         """
@@ -151,21 +157,21 @@ class CNNCharEmbedding(TokenEmbedding):
             return requires_grads.pop()
         else:
             return None
     @requires_grad.setter
     def requires_grad(self, value):
         for name, param in self.named_parameters():
             if 'words_to_chars_embedding' in name or 'word_lengths' in name:  # these must not be included in requires_grad
                 continue
             param.requires_grad = value
     def reset_parameters(self):
         for name, param in self.named_parameters():
             if 'words_to_chars_embedding' in name or 'word_lengths' in name:  # these must not be reset
                 continue
             if 'char_embedding' in name:
                 continue
-            if param.data.dim()>1:
+            if param.data.dim() > 1:
                 nn.init.xavier_uniform_(param, 1)
             else:
                 nn.init.uniform_(param, -1, 1)
@@ -203,13 +209,15 @@ class LSTMCharEmbedding(TokenEmbedding):
     (the folder should contain exactly one file with the .txt suffix) or a file path; the second is to pass the name of
         the embedding, in which case the cache is checked for the model and it is downloaded automatically if absent.
         If None, an embedding is randomly initialized with dimension embedding_dim.
     """
-    def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0,
-                 dropout:float=0, hidden_size=50,pool_method: str='max', activation='relu', min_char_freq: int=2,
-                 bidirectional=True, pre_train_char_embed: str=None):
+    def __init__(self, vocab: Vocabulary, embed_size: int = 50, char_emb_size: int = 50, word_dropout: float = 0,
+                 dropout: float = 0, hidden_size=50, pool_method: str = 'max', activation='relu',
+                 min_char_freq: int = 2,
+                 bidirectional=True, pre_train_char_embed: str = None):
         super(LSTMCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
         assert hidden_size % 2 == 0, "Only even kernel is allowed."
         assert pool_method in ('max', 'avg')
         self.pool_method = pool_method
         # activation function
@@ -227,7 +235,7 @@ class LSTMCharEmbedding(TokenEmbedding):
         else:
             raise Exception(
                 "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]")
         print("Start constructing character vocabulary.")
         # build the character vocabulary
         self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq)
@@ -249,14 +257,14 @@ class LSTMCharEmbedding(TokenEmbedding):
             self.char_embedding = StaticEmbedding(self.char_vocab, pre_train_char_embed)
         else:
             self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size)
         self.fc = nn.Linear(hidden_size, embed_size)
         hidden_size = hidden_size // 2 if bidirectional else hidden_size
         self.lstm = LSTM(char_emb_size, hidden_size, bidirectional=bidirectional, batch_first=True)
         self._embed_size = embed_size
         self.bidirectional = bidirectional
     def forward(self, words):
         """
         Given the indices of words, produce the corresponding word representations.
@@ -278,7 +286,7 @@ class LSTMCharEmbedding(TokenEmbedding):
         char_seq_len = chars_masks.eq(0).sum(dim=-1).reshape(batch_size * max_len)
         lstm_chars = self.lstm(reshaped_chars, char_seq_len)[0].reshape(batch_size, max_len, max_word_len, -1)
         # B x M x M x H
         lstm_chars = self.activation(lstm_chars)
         if self.pool_method == 'max':
             lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf'))
@@ -286,11 +294,11 @@ class LSTMCharEmbedding(TokenEmbedding):
         else:
             lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), 0)
             chars = torch.sum(lstm_chars, dim=-2) / chars_masks.eq(0).sum(dim=-1, keepdim=True).float()
         chars = self.fc(chars)
         return self.dropout(chars)
     @property
     def requires_grad(self):
         """
@@ -307,7 +315,7 @@ class LSTMCharEmbedding(TokenEmbedding):
             return requires_grads.pop()
         else:
             return None
     @requires_grad.setter
     def requires_grad(self, value):
         for name, param in self.named_parameters():
@@ -1,3 +1,12 @@
+"""
+.. todo::
+    doc
+"""
+__all__ = [
+    "ContextualEmbedding"
+]
 from abc import abstractmethod
 import torch
@@ -8,16 +17,12 @@ from ..core.sampler import SequentialSampler
 from ..core.utils import _move_model_to_device, _get_model_device
 from .embedding import TokenEmbedding
-__all__ = [
-    "ContextualEmbedding"
-]
 class ContextualEmbedding(TokenEmbedding):
-    def __init__(self, vocab: Vocabulary, word_dropout:float=0.0, dropout:float=0.0):
+    def __init__(self, vocab: Vocabulary, word_dropout: float = 0.0, dropout: float = 0.0):
         super(ContextualEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
-    def add_sentence_cache(self, *datasets, batch_size=32, device='cpu', delete_weights: bool=True):
+    def add_sentence_cache(self, *datasets, batch_size=32, device='cpu', delete_weights: bool = True):
         """
         Since generating dynamic embeddings is time-consuming, the embedding of every sentence can be cached so that
         the generation process does not have to run each time.
@@ -34,7 +39,7 @@ class ContextualEmbedding(TokenEmbedding):
             except Exception as e:
                 print(f"Exception happens at {index} dataset.")
                 raise e
         sent_embeds = {}
         _move_model_to_device(self, device=device)
         device = _get_model_device(self)
@@ -54,7 +59,7 @@ class ContextualEmbedding(TokenEmbedding):
                     word_embeds = self(words).detach().cpu().numpy()
                     for b in range(words.size(0)):
                         length = seq_len_from_behind[b]
-                        if length==0:
+                        if length == 0:
                             sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b]
                         else:
                             sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b, :-length]
@@ -65,7 +70,7 @@ class ContextualEmbedding(TokenEmbedding):
         self.sent_embeds = sent_embeds
         if delete_weights:
             self._delete_model_weights()
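A short usage sketch of the caching flow described in the docstring above. It is schematic: the dataset construction is illustrative, the 'words' field must be indexed and set as input, and the 'en' ELMo weights must be downloadable or cached.

```python
# Sketch: cache sentence representations once, then reuse them during training.
from fastNLP import DataSet, Vocabulary
from fastNLP.embeddings import ElmoEmbedding

dataset = DataSet({'raw_words': ["this is a test", "another sentence"]})
dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')

vocab = Vocabulary()
vocab.from_dataset(dataset, field_name='words')
vocab.index_dataset(dataset, field_name='words')
dataset.set_input('words')

embed = ElmoEmbedding(vocab, model_dir_or_name='en', layers='2')
# pre-compute and cache every sentence's representation; delete_weights frees the backbone
embed.add_sentence_cache(dataset, batch_size=32, device='cpu', delete_weights=True)
```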
     def _get_sent_reprs(self, words):
         """
         Get the sentence representations. If a cache exists, return the cached values; otherwise return None.
@@ -88,12 +93,12 @@ class ContextualEmbedding(TokenEmbedding):
                     embeds[i, :len(embed)] = torch.FloatTensor(embed).to(words.device)
                 return embeds
         return None
     @abstractmethod
     def _delete_model_weights(self):
         """Delete the model that computes the representations, to save resources."""
         raise NotImplementedError
     def remove_sentence_cache(self):
         """
         Delete the cached sentence representations. Afterwards, if the model weights have not been deleted, the
         representations will be computed dynamically again.
@@ -1,6 +1,13 @@
+"""
+.. todo::
+    doc
+"""
-import os
+__all__ = [
+    "ElmoEmbedding"
+]
+import os
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -49,11 +56,11 @@ class ElmoEmbedding(ContextualEmbedding):
     :param cache_word_reprs: whether to cache the word representations. If True, an embedding is generated for every
         word at initialization time and the character encoder is deleted; the cached embeddings are used directly
         afterwards. Defaults to False.
     """
     def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en', layers: str = '2', requires_grad: bool = False,
                  word_dropout=0.0, dropout=0.0, cache_word_reprs: bool = False):
         super(ElmoEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
         # check model_dir_or_name and download the pretrained weights if necessary
         if model_dir_or_name.lower() in PRETRAINED_ELMO_MODEL_DIR:
             model_url = _get_embedding_url('elmo', model_dir_or_name.lower())
@@ -64,7 +71,7 @@ class ElmoEmbedding(ContextualEmbedding):
         else:
             raise ValueError(f"Cannot recognize {model_dir_or_name}.")
         self.model = _ElmoModel(model_dir, vocab, cache_word_reprs=cache_word_reprs)
         if layers == 'mix':
             self.layer_weights = nn.Parameter(torch.zeros(self.model.config['lstm']['n_layers'] + 1),
                                               requires_grad=requires_grad)
@@ -79,16 +86,16 @@ class ElmoEmbedding(ContextualEmbedding):
             self.layers = layers
             self._get_outputs = self._get_layer_outputs
             self._embed_size = len(self.layers) * self.model.config['lstm']['projection_dim'] * 2
         self.requires_grad = requires_grad
     def _get_mixed_outputs(self, outputs):
         # outputs: num_layers x batch_size x max_len x hidden_size
         # return: batch_size x max_len x hidden_size
         weights = F.softmax(self.layer_weights + 1 / len(outputs), dim=0).to(outputs)
         outputs = torch.einsum('l,lbij->bij', weights, outputs)
         return self.gamma.to(outputs) * outputs
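When layers='mix', the method above combines all ELMo layers with softmax-normalized scalar weights and a global gamma, in the spirit of the ELMo paper. Here is a standalone numeric sketch of that mixing with random tensors and three layers:

```python
import torch
import torch.nn.functional as F

num_layers, batch_size, max_len, hidden = 3, 2, 4, 8
outputs = torch.randn(num_layers, batch_size, max_len, hidden)

layer_weights = torch.zeros(num_layers, requires_grad=True)  # learnable scalars, initialized to 0
gamma = torch.ones(1, requires_grad=True)

weights = F.softmax(layer_weights + 1 / num_layers, dim=0)   # sums to 1 over layers
mixed = torch.einsum('l,lbij->bij', weights, outputs)        # weighted sum over the layer dimension
mixed = gamma * mixed
print(mixed.shape)  # torch.Size([2, 4, 8])
```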
     def set_mix_weights_requires_grad(self, flag=True):
         """
         When layers is set to 'mix' at initialization, this method can be called to set whether the mix weights are
         trainable. If layers is not 'mix', calling
@@ -100,15 +107,15 @@ class ElmoEmbedding(ContextualEmbedding):
         if hasattr(self, 'layer_weights'):
             self.layer_weights.requires_grad = flag
             self.gamma.requires_grad = flag
     def _get_layer_outputs(self, outputs):
         if len(self.layers) == 1:
             outputs = outputs[self.layers[0]]
         else:
             outputs = torch.cat(tuple([*outputs[self.layers]]), dim=-1)
         return outputs
     def forward(self, words: torch.LongTensor):
         """
         Compute the ELMo embedding representation of words. As described in the ELMo paper, ELMo actually produces
         2L+1 layers of results, but to make the results easier to split apart, the token
@@ -125,12 +132,12 @@ class ElmoEmbedding(ContextualEmbedding):
         outputs = self.model(words)
         outputs = self._get_outputs(outputs)
         return self.dropout(outputs)
     def _delete_model_weights(self):
         for name in ['layers', 'model', 'layer_weights', 'gamma']:
             if hasattr(self, name):
                 delattr(self, name)
     @property
     def requires_grad(self):
         """
@@ -144,7 +151,7 @@ class ElmoEmbedding(ContextualEmbedding):
             return requires_grads.pop()
         else:
             return None
     @requires_grad.setter
     def requires_grad(self, value):
         for name, param in self.named_parameters():
@@ -162,7 +169,7 @@ class _ElmoModel(nn.Module):
     (4) design an embedding that stores tokens and allows word representations to be cached.
     """
     def __init__(self, model_dir: str, vocab: Vocabulary = None, cache_word_reprs: bool = False):
         super(_ElmoModel, self).__init__()
         self.model_dir = model_dir
@@ -187,14 +194,14 @@ class _ElmoModel(nn.Module):
             config = json.load(config_f)
         self.weight_file = os.path.join(model_dir, weight_file)
         self.config = config
         OOV_TAG = '<oov>'
         PAD_TAG = '<pad>'
         BOS_TAG = '<bos>'
         EOS_TAG = '<eos>'
         BOW_TAG = '<bow>'
         EOW_TAG = '<eow>'
         # For the model trained with character-based word encoder.
         char_lexicon = {}
         with codecs.open(os.path.join(model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi:
@@ -204,29 +211,29 @@ class _ElmoModel(nn.Module):
                     tokens.insert(0, '\u3000')
                 token, i = tokens
                 char_lexicon[token] = int(i)
         # run some sanity checks
         for special_word in [PAD_TAG, OOV_TAG, BOW_TAG, EOW_TAG]:
             assert special_word in char_lexicon, f"{special_word} not found in char.dic."
         # build char_vocab from vocab
         char_vocab = Vocabulary(unknown=OOV_TAG, padding=PAD_TAG)
         # make sure <bow> and <eow> are included
         char_vocab.add_word_lst([BOW_TAG, EOW_TAG, BOS_TAG, EOS_TAG])
         for word, index in vocab:
             char_vocab.add_word_lst(list(word))
         self.bos_index, self.eos_index, self._pad_index = len(vocab), len(vocab) + 1, vocab.padding_idx
         # adjusted according to char_lexicon; one extra slot is reserved for word padding (its char representation is all zeros)
         char_emb_layer = nn.Embedding(len(char_vocab) + 1, int(config['char_cnn']['embedding']['dim']),
                                       padding_idx=len(char_vocab))
         # load the pretrained weights; elmo_model here contains the state_dicts of char_cnn and lstm
         elmo_model = torch.load(os.path.join(self.model_dir, weight_file), map_location='cpu')
         char_embed_weights = elmo_model["char_cnn"]['char_emb_layer.weight']
         found_char_count = 0
         for char, index in char_vocab:  # adjust the character embedding
             if char in char_lexicon:
@@ -235,11 +242,11 @@ class _ElmoModel(nn.Module):
             else:
                 index_in_pre = char_lexicon[OOV_TAG]
             char_emb_layer.weight.data[index] = char_embed_weights[index_in_pre]
         print(f"{found_char_count} out of {len(char_vocab)} characters were found in pretrained elmo embedding.")
         # build the mapping from words to chars
         max_chars = config['char_cnn']['max_characters_per_token']
         self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab) + 2, max_chars),
                                                                 fill_value=len(char_vocab),
                                                                 dtype=torch.long),
@@ -258,20 +265,20 @@ class _ElmoModel(nn.Module):
                                                                         char_vocab.to_index(EOW_TAG)]
             char_ids += [char_vocab.to_index(PAD_TAG)] * (max_chars - len(char_ids))
             self.words_to_chars_embedding[index] = torch.LongTensor(char_ids)
         self.char_vocab = char_vocab
         self.token_embedder = ConvTokenEmbedder(
             config, self.weight_file, None, char_emb_layer)
         elmo_model["char_cnn"]['char_emb_layer.weight'] = char_emb_layer.weight
         self.token_embedder.load_state_dict(elmo_model["char_cnn"])
         self.output_dim = config['lstm']['projection_dim']
         # lstm encoder
         self.encoder = ElmobiLm(config)
         self.encoder.load_state_dict(elmo_model["lstm"])
         if cache_word_reprs:
             if config['char_cnn']['embedding']['dim'] > 0:  # only useful when chars are used
                 print("Start to generate cache word representations.")
@@ -280,7 +287,7 @@ class _ElmoModel(nn.Module):
                 word_size = self.words_to_chars_embedding.size(0)
                 num_batches = word_size // batch_size + \
                               int(word_size % batch_size != 0)
                 self.cached_word_embedding = nn.Embedding(word_size,
                                                           config['lstm']['projection_dim'])
                 with torch.no_grad():
@@ -291,12 +298,12 @@ class _ElmoModel(nn.Module):
                         word_reprs = self.token_embedder(words.unsqueeze(1),
                                                          chars).detach()  # batch_size x 1 x config['encoder']['projection_dim']
                         self.cached_word_embedding.weight.data[words] = word_reprs.squeeze(1)
                 print("Finish generating cached word representations. Going to delete the character encoder.")
                 del self.token_embedder, self.words_to_chars_embedding
             else:
                 print("There is no need to cache word representations, since no character information is used.")
def forward(self, words): | def forward(self, words): | ||||
""" | """ | ||||
@@ -321,7 +328,7 @@ class _ElmoModel(nn.Module): | |||||
else: | else: | ||||
chars = None | chars = None | ||||
token_embedding = self.token_embedder(expanded_words, chars) # batch_size x max_len x embed_dim | token_embedding = self.token_embedder(expanded_words, chars) # batch_size x max_len x embed_dim | ||||
encoder_output = self.encoder(token_embedding, seq_len) | encoder_output = self.encoder(token_embedding, seq_len) | ||||
if encoder_output.size(2) < max_len + 2: | if encoder_output.size(2) < max_len + 2: | ||||
num_layers, _, output_len, hidden_size = encoder_output.size() | num_layers, _, output_len, hidden_size = encoder_output.size() | ||||
@@ -332,7 +339,7 @@ class _ElmoModel(nn.Module): | |||||
token_embedding = token_embedding.masked_fill(mask, 0) | token_embedding = token_embedding.masked_fill(mask, 0) | ||||
token_embedding = torch.cat((token_embedding, token_embedding), dim=2).view(1, sz[1], sz[2], sz[3]) | token_embedding = torch.cat((token_embedding, token_embedding), dim=2).view(1, sz[1], sz[2], sz[3]) | ||||
encoder_output = torch.cat((token_embedding, encoder_output), dim=0) | encoder_output = torch.cat((token_embedding, encoder_output), dim=0) | ||||
# Remove <eos> and <bos>. The removal here is not exact, but it should not affect the final result. | # Remove <eos> and <bos>. The removal here is not exact, but it should not affect the final result. | ||||
encoder_output = encoder_output[:, :, 1:-1] | encoder_output = encoder_output[:, :, 1:-1] | ||||
return encoder_output | return encoder_output |
@@ -3,6 +3,10 @@ | |||||
""" | """ | ||||
__all__ = [ | |||||
"Embedding", | |||||
"TokenEmbedding" | |||||
] | |||||
import torch.nn as nn | import torch.nn as nn | ||||
from abc import abstractmethod | from abc import abstractmethod | ||||
@@ -33,11 +37,11 @@ class Embedding(nn.Module): | |||||
:param float dropout: dropout applied to the output of the Embedding. | :param float dropout: dropout applied to the output of the Embedding. | ||||
:param int unk_index: the index a word is replaced with when it is dropped. The unk_index of fastNLP's Vocabulary defaults to 1. | :param int unk_index: the index a word is replaced with when it is dropped. The unk_index of fastNLP's Vocabulary defaults to 1. | ||||
""" | """ | ||||
def __init__(self, init_embed, word_dropout=0, dropout=0.0, unk_index=None): | def __init__(self, init_embed, word_dropout=0, dropout=0.0, unk_index=None): | ||||
super(Embedding, self).__init__() | super(Embedding, self).__init__() | ||||
self.embed = get_embeddings(init_embed) | self.embed = get_embeddings(init_embed) | ||||
self.dropout = nn.Dropout(dropout) | self.dropout = nn.Dropout(dropout) | ||||
@@ -48,44 +52,44 @@ class Embedding(nn.Module): | |||||
self._embed_size = self.embed.embedding_dim | self._embed_size = self.embed.embedding_dim | ||||
else: | else: | ||||
self._embed_size = self.embed.weight.size(1) | self._embed_size = self.embed.weight.size(1) | ||||
if word_dropout>0 and not isinstance(unk_index, int): | |||||
if word_dropout > 0 and not isinstance(unk_index, int): | |||||
raise ValueError("When drop word is set, you need to pass in the unk_index.") | raise ValueError("When drop word is set, you need to pass in the unk_index.") | ||||
else: | else: | ||||
self._embed_size = self.embed.embed_size | self._embed_size = self.embed.embed_size | ||||
unk_index = self.embed.get_word_vocab().unknown_idx | unk_index = self.embed.get_word_vocab().unknown_idx | ||||
self.unk_index = unk_index | self.unk_index = unk_index | ||||
self.word_dropout = word_dropout | self.word_dropout = word_dropout | ||||
def forward(self, words): | def forward(self, words): | ||||
""" | """ | ||||
:param torch.LongTensor words: [batch, seq_len] | :param torch.LongTensor words: [batch, seq_len] | ||||
:return: torch.Tensor : [batch, seq_len, embed_dim] | :return: torch.Tensor : [batch, seq_len, embed_dim] | ||||
""" | """ | ||||
if self.word_dropout>0 and self.training: | |||||
if self.word_dropout > 0 and self.training: | |||||
mask = torch.ones_like(words).float() * self.word_dropout | mask = torch.ones_like(words).float() * self.word_dropout | ||||
mask = torch.bernoulli(mask).eq(1) # the larger word_dropout is, the more positions are set to 1 | mask = torch.bernoulli(mask).eq(1) # the larger word_dropout is, the more positions are set to 1 | ||||
words = words.masked_fill(mask, self.unk_index) | words = words.masked_fill(mask, self.unk_index) | ||||
words = self.embed(words) | words = self.embed(words) | ||||
return self.dropout(words) | return self.dropout(words) | ||||
@property | @property | ||||
def num_embedding(self)->int: | |||||
def num_embedding(self) -> int: | |||||
if isinstance(self.embed, nn.Embedding): | if isinstance(self.embed, nn.Embedding): | ||||
return self.embed.weight.size(0) | return self.embed.weight.size(0) | ||||
else: | else: | ||||
return self.embed.num_embedding | return self.embed.num_embedding | ||||
def __len__(self): | def __len__(self): | ||||
return len(self.embed) | return len(self.embed) | ||||
@property | @property | ||||
def embed_size(self) -> int: | def embed_size(self) -> int: | ||||
return self._embed_size | return self._embed_size | ||||
@property | @property | ||||
def embedding_dim(self) -> int: | def embedding_dim(self) -> int: | ||||
return self._embed_size | return self._embed_size | ||||
@property | @property | ||||
def requires_grad(self): | def requires_grad(self): | ||||
""" | """ | ||||
@@ -96,14 +100,14 @@ class Embedding(nn.Module): | |||||
return self.embed.weight.requires_grad | return self.embed.weight.requires_grad | ||||
else: | else: | ||||
return self.embed.requires_grad | return self.embed.requires_grad | ||||
@requires_grad.setter | @requires_grad.setter | ||||
def requires_grad(self, value): | def requires_grad(self, value): | ||||
if not isinstance(self.embed, TokenEmbedding): | if not isinstance(self.embed, TokenEmbedding): | ||||
self.embed.weight.requires_grad = value | self.embed.weight.requires_grad = value | ||||
else: | else: | ||||
self.embed.requires_grad = value | self.embed.requires_grad = value | ||||
@property | @property | ||||
def size(self): | def size(self): | ||||
if isinstance(self.embed, TokenEmbedding): | if isinstance(self.embed, TokenEmbedding): | ||||
@@ -120,12 +124,12 @@ class TokenEmbedding(nn.Module): | |||||
assert vocab.padding is not None, "Vocabulary must have a padding entry." | assert vocab.padding is not None, "Vocabulary must have a padding entry." | ||||
self._word_vocab = vocab | self._word_vocab = vocab | ||||
self._word_pad_index = vocab.padding_idx | self._word_pad_index = vocab.padding_idx | ||||
if word_dropout>0: | |||||
if word_dropout > 0: | |||||
assert vocab.unknown is not None, "Vocabulary must have unknown entry when you want to drop a word." | assert vocab.unknown is not None, "Vocabulary must have unknown entry when you want to drop a word." | ||||
self.word_dropout = word_dropout | self.word_dropout = word_dropout | ||||
self._word_unk_index = vocab.unknown_idx | self._word_unk_index = vocab.unknown_idx | ||||
self.dropout_layer = nn.Dropout(dropout) | self.dropout_layer = nn.Dropout(dropout) | ||||
def drop_word(self, words): | def drop_word(self, words): | ||||
""" | """ | ||||
Randomly replaces words with unknown_index according to the configured probability. | Randomly replaces words with unknown_index according to the configured probability. | ||||
@@ -138,7 +142,7 @@ class TokenEmbedding(nn.Module): | |||||
mask = torch.bernoulli(mask).eq(1) # the larger word_dropout is, the more positions are set to 1 | mask = torch.bernoulli(mask).eq(1) # the larger word_dropout is, the more positions are set to 1 | ||||
words = words.masked_fill(mask, self._word_unk_index) | words = words.masked_fill(mask, self._word_unk_index) | ||||
return words | return words | ||||
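The method above draws a Bernoulli mask with probability word_dropout per position and overwrites the hit positions with the unknown index, so the model occasionally sees <unk> in place of real words. A standalone illustration of the same trick (the indices below are made up):

import torch

words = torch.tensor([[4, 9, 2, 7]])
unk_index = 1
mask = torch.ones_like(words).float() * 0.3   # word_dropout = 0.3
mask = torch.bernoulli(mask).eq(1)            # roughly 30% of positions become True
dropped = words.masked_fill(mask, unk_index)  # those positions now point at <unk>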
def dropout(self, words): | def dropout(self, words): | ||||
""" | """ | ||||
Applies dropout to the embedded word representations. | Applies dropout to the embedded word representations. | ||||
@@ -147,7 +151,7 @@ class TokenEmbedding(nn.Module): | |||||
:return: | :return: | ||||
""" | """ | ||||
return self.dropout_layer(words) | return self.dropout_layer(words) | ||||
@property | @property | ||||
def requires_grad(self): | def requires_grad(self): | ||||
""" | """ | ||||
@@ -159,23 +163,23 @@ class TokenEmbedding(nn.Module): | |||||
return requires_grads.pop() | return requires_grads.pop() | ||||
else: | else: | ||||
return None | return None | ||||
@requires_grad.setter | @requires_grad.setter | ||||
def requires_grad(self, value): | def requires_grad(self, value): | ||||
for param in self.parameters(): | for param in self.parameters(): | ||||
param.requires_grad = value | param.requires_grad = value | ||||
def __len__(self): | def __len__(self): | ||||
return len(self._word_vocab) | return len(self._word_vocab) | ||||
@property | @property | ||||
def embed_size(self) -> int: | def embed_size(self) -> int: | ||||
return self._embed_size | return self._embed_size | ||||
@property | @property | ||||
def embedding_dim(self) -> int: | def embedding_dim(self) -> int: | ||||
return self._embed_size | return self._embed_size | ||||
@property | @property | ||||
def num_embedding(self) -> int: | def num_embedding(self) -> int: | ||||
""" | """ | ||||
@@ -183,7 +187,7 @@ class TokenEmbedding(nn.Module): | |||||
:return: | :return: | ||||
""" | """ | ||||
return len(self._word_vocab) | return len(self._word_vocab) | ||||
def get_word_vocab(self): | def get_word_vocab(self): | ||||
""" | """ | ||||
Returns the vocabulary used by the embedding. | Returns the vocabulary used by the embedding. | ||||
@@ -191,11 +195,11 @@ class TokenEmbedding(nn.Module): | |||||
:return: Vocabulary | :return: Vocabulary | ||||
""" | """ | ||||
return self._word_vocab | return self._word_vocab | ||||
@property | @property | ||||
def size(self): | def size(self): | ||||
return torch.Size([self.num_embedding, self._embed_size]) | return torch.Size([self.num_embedding, self._embed_size]) | ||||
@abstractmethod | @abstractmethod | ||||
def forward(self, words): | def forward(self, words): | ||||
raise NotImplementedError | raise NotImplementedError |
@@ -1,3 +1,12 @@ | |||||
""" | |||||
.. todo:: | |||||
doc | |||||
""" | |||||
__all__ = [ | |||||
"StackEmbedding", | |||||
] | |||||
from typing import List | from typing import List | ||||
import torch | import torch | ||||
@@ -26,6 +35,7 @@ class StackEmbedding(TokenEmbedding): | |||||
:param float dropout: the probability of applying Dropout to the embedding output; 0.1 means 10% of the values are randomly set to 0. | :param float dropout: the probability of applying Dropout to the embedding output; 0.1 means 10% of the values are randomly set to 0. | ||||
""" | """ | ||||
def __init__(self, embeds: List[TokenEmbedding], word_dropout=0, dropout=0): | def __init__(self, embeds: List[TokenEmbedding], word_dropout=0, dropout=0): | ||||
vocabs = [] | vocabs = [] | ||||
for embed in embeds: | for embed in embeds: | ||||
@@ -34,14 +44,14 @@ class StackEmbedding(TokenEmbedding): | |||||
_vocab = vocabs[0] | _vocab = vocabs[0] | ||||
for vocab in vocabs[1:]: | for vocab in vocabs[1:]: | ||||
assert vocab == _vocab, "All embeddings in StackEmbedding should use the same word vocabulary." | assert vocab == _vocab, "All embeddings in StackEmbedding should use the same word vocabulary." | ||||
super(StackEmbedding, self).__init__(_vocab, word_dropout=word_dropout, dropout=dropout) | super(StackEmbedding, self).__init__(_vocab, word_dropout=word_dropout, dropout=dropout) | ||||
assert isinstance(embeds, list) | assert isinstance(embeds, list) | ||||
for embed in embeds: | for embed in embeds: | ||||
assert isinstance(embed, TokenEmbedding), "Only TokenEmbedding type is supported." | assert isinstance(embed, TokenEmbedding), "Only TokenEmbedding type is supported." | ||||
self.embeds = nn.ModuleList(embeds) | self.embeds = nn.ModuleList(embeds) | ||||
self._embed_size = sum([embed.embed_size for embed in self.embeds]) | self._embed_size = sum([embed.embed_size for embed in self.embeds]) | ||||
def append(self, embed: TokenEmbedding): | def append(self, embed: TokenEmbedding): | ||||
""" | """ | ||||
Appends an embedding to the end. | Appends an embedding to the end. | ||||
@@ -50,18 +60,18 @@ class StackEmbedding(TokenEmbedding): | |||||
""" | """ | ||||
assert isinstance(embed, TokenEmbedding) | assert isinstance(embed, TokenEmbedding) | ||||
self.embeds.append(embed) | self.embeds.append(embed) | ||||
def pop(self): | def pop(self): | ||||
""" | """ | ||||
Pops the last embedding. | Pops the last embedding. | ||||
:return: | :return: | ||||
""" | """ | ||||
return self.embeds.pop() | return self.embeds.pop() | ||||
@property | @property | ||||
def embed_size(self): | def embed_size(self): | ||||
return self._embed_size | return self._embed_size | ||||
@property | @property | ||||
def requires_grad(self): | def requires_grad(self): | ||||
""" | """ | ||||
@@ -73,12 +83,12 @@ class StackEmbedding(TokenEmbedding): | |||||
return requires_grads.pop() | return requires_grads.pop() | ||||
else: | else: | ||||
return None | return None | ||||
@requires_grad.setter | @requires_grad.setter | ||||
def requires_grad(self, value): | def requires_grad(self, value): | ||||
for embed in self.embeds: | for embed in self.embeds: | ||||
embed.requires_grad = value | embed.requires_grad = value | ||||
def forward(self, words): | def forward(self, words): | ||||
""" | """ | ||||
Computes the output of every stacked embedding and concatenates the results in order. | Computes the output of every stacked embedding and concatenates the results in order. | ||||
@@ -1,4 +1,11 @@ | |||||
""" | |||||
.. todo:: | |||||
doc | |||||
""" | |||||
__all__ = [ | |||||
"StaticEmbedding" | |||||
] | |||||
import os | import os | ||||
import torch | import torch | ||||
@@ -13,6 +20,7 @@ from ..modules.utils import _get_file_name_base_on_postfix | |||||
from copy import deepcopy | from copy import deepcopy | ||||
from collections import defaultdict | from collections import defaultdict | ||||
class StaticEmbedding(TokenEmbedding): | class StaticEmbedding(TokenEmbedding): | ||||
""" | """ | ||||
Alias: :class:`fastNLP.embeddings.StaticEmbedding` :class:`fastNLP.embeddings.static_embedding.StaticEmbedding` | Alias: :class:`fastNLP.embeddings.StaticEmbedding` :class:`fastNLP.embeddings.static_embedding.StaticEmbedding` | ||||
@@ -55,15 +63,16 @@ class StaticEmbedding(TokenEmbedding): | |||||
:param bool normalize: whether to normalize each vector so that its norm equals 1. | :param bool normalize: whether to normalize each vector so that its norm equals 1. | ||||
:param int min_freq: words whose frequency in the Vocabulary is lower than this value are mapped to unk. | :param int min_freq: words whose frequency in the Vocabulary is lower than this value are mapped to unk. | ||||
""" | """ | ||||
def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', embedding_dim=-1, requires_grad: bool=True, | |||||
def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en', embedding_dim=-1, requires_grad: bool = True, | |||||
init_method=None, lower=False, dropout=0, word_dropout=0, normalize=False, min_freq=1, **kwargs): | init_method=None, lower=False, dropout=0, word_dropout=0, normalize=False, min_freq=1, **kwargs): | ||||
super(StaticEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) | super(StaticEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) | ||||
if embedding_dim>0: | |||||
if embedding_dim > 0: | |||||
model_dir_or_name = None | model_dir_or_name = None | ||||
# determine the cache path | # determine the cache path | ||||
if model_dir_or_name is None: | if model_dir_or_name is None: | ||||
assert embedding_dim>=1, "The dimension of embedding should be larger than 1." | |||||
assert embedding_dim >= 1, "The dimension of embedding should be larger than 1." | |||||
embedding_dim = int(embedding_dim) | embedding_dim = int(embedding_dim) | ||||
model_path = None | model_path = None | ||||
elif model_dir_or_name.lower() in PRETRAIN_STATIC_FILES: | elif model_dir_or_name.lower() in PRETRAIN_STATIC_FILES: | ||||
@@ -76,9 +85,9 @@ class StaticEmbedding(TokenEmbedding): | |||||
model_path = _get_file_name_base_on_postfix(os.path.abspath(os.path.expanduser(model_dir_or_name)), '.txt') | model_path = _get_file_name_base_on_postfix(os.path.abspath(os.path.expanduser(model_dir_or_name)), '.txt') | ||||
else: | else: | ||||
raise ValueError(f"Cannot recognize {model_dir_or_name}.") | raise ValueError(f"Cannot recognize {model_dir_or_name}.") | ||||
# shrink the vocab according to min_freq | # shrink the vocab according to min_freq | ||||
truncate_vocab = (vocab.min_freq is None and min_freq>1) or (vocab.min_freq and vocab.min_freq<min_freq) | |||||
truncate_vocab = (vocab.min_freq is None and min_freq > 1) or (vocab.min_freq and vocab.min_freq < min_freq) | |||||
if truncate_vocab: | if truncate_vocab: | ||||
truncated_vocab = deepcopy(vocab) | truncated_vocab = deepcopy(vocab) | ||||
truncated_vocab.min_freq = min_freq | truncated_vocab.min_freq = min_freq | ||||
@@ -89,14 +98,14 @@ class StaticEmbedding(TokenEmbedding): | |||||
lowered_word_count[word.lower()] += count | lowered_word_count[word.lower()] += count | ||||
for word in truncated_vocab.word_count.keys(): | for word in truncated_vocab.word_count.keys(): | ||||
word_count = truncated_vocab.word_count[word] | word_count = truncated_vocab.word_count[word] | ||||
if lowered_word_count[word.lower()]>=min_freq and word_count<min_freq: | |||||
truncated_vocab.add_word_lst([word]*(min_freq-word_count), | |||||
if lowered_word_count[word.lower()] >= min_freq and word_count < min_freq: | |||||
truncated_vocab.add_word_lst([word] * (min_freq - word_count), | |||||
no_create_entry=truncated_vocab._is_word_no_create_entry(word)) | no_create_entry=truncated_vocab._is_word_no_create_entry(word)) | ||||
# apply the min_freq filter only to words that occur in the training data | # apply the min_freq filter only to words that occur in the training data | ||||
if kwargs.get('only_train_min_freq', False) and model_dir_or_name is not None: | if kwargs.get('only_train_min_freq', False) and model_dir_or_name is not None: | ||||
for word in truncated_vocab.word_count.keys(): | for word in truncated_vocab.word_count.keys(): | ||||
if truncated_vocab._is_word_no_create_entry(word) and truncated_vocab.word_count[word]<min_freq: | |||||
if truncated_vocab._is_word_no_create_entry(word) and truncated_vocab.word_count[word] < min_freq: | |||||
truncated_vocab.add_word_lst([word] * (min_freq - truncated_vocab.word_count[word]), | truncated_vocab.add_word_lst([word] * (min_freq - truncated_vocab.word_count[word]), | ||||
no_create_entry=True) | no_create_entry=True) | ||||
truncated_vocab.build_vocab() | truncated_vocab.build_vocab() | ||||
@@ -105,7 +114,7 @@ class StaticEmbedding(TokenEmbedding): | |||||
truncated_words_to_words[index] = truncated_vocab.to_index(word) | truncated_words_to_words[index] = truncated_vocab.to_index(word) | ||||
print(f"{len(vocab) - len(truncated_vocab)} out of {len(vocab)} words have frequency less than {min_freq}.") | print(f"{len(vocab) - len(truncated_vocab)} out of {len(vocab)} words have frequency less than {min_freq}.") | ||||
vocab = truncated_vocab | vocab = truncated_vocab | ||||
self.only_norm_found_vector = kwargs.get('only_norm_found_vector', False) | self.only_norm_found_vector = kwargs.get('only_norm_found_vector', False) | ||||
# load the embedding | # load the embedding | ||||
if lower: | if lower: | ||||
@@ -145,21 +154,21 @@ class StaticEmbedding(TokenEmbedding): | |||||
self.words_to_words = nn.Parameter(torch.arange(len(vocab)).long(), requires_grad=False) | self.words_to_words = nn.Parameter(torch.arange(len(vocab)).long(), requires_grad=False) | ||||
if not self.only_norm_found_vector and normalize: | if not self.only_norm_found_vector and normalize: | ||||
embedding /= (torch.norm(embedding, dim=1, keepdim=True) + 1e-12) | embedding /= (torch.norm(embedding, dim=1, keepdim=True) + 1e-12) | ||||
if truncate_vocab: | if truncate_vocab: | ||||
for i in range(len(truncated_words_to_words)): | for i in range(len(truncated_words_to_words)): | ||||
index_in_truncated_vocab = truncated_words_to_words[i] | index_in_truncated_vocab = truncated_words_to_words[i] | ||||
truncated_words_to_words[i] = self.words_to_words[index_in_truncated_vocab] | truncated_words_to_words[i] = self.words_to_words[index_in_truncated_vocab] | ||||
del self.words_to_words | del self.words_to_words | ||||
self.words_to_words = nn.Parameter(truncated_words_to_words, requires_grad=False) | self.words_to_words = nn.Parameter(truncated_words_to_words, requires_grad=False) | ||||
self.embedding = nn.Embedding(num_embeddings=embedding.shape[0], embedding_dim=embedding.shape[1], | self.embedding = nn.Embedding(num_embeddings=embedding.shape[0], embedding_dim=embedding.shape[1], | ||||
padding_idx=vocab.padding_idx, | padding_idx=vocab.padding_idx, | ||||
max_norm=None, norm_type=2, scale_grad_by_freq=False, | max_norm=None, norm_type=2, scale_grad_by_freq=False, | ||||
sparse=False, _weight=embedding) | sparse=False, _weight=embedding) | ||||
self._embed_size = self.embedding.weight.size(1) | self._embed_size = self.embedding.weight.size(1) | ||||
self.requires_grad = requires_grad | self.requires_grad = requires_grad | ||||
def _randomly_init_embed(self, num_embedding, embedding_dim, init_embed=None): | def _randomly_init_embed(self, num_embedding, embedding_dim, init_embed=None): | ||||
""" | """ | ||||
@@ -169,14 +178,14 @@ class StaticEmbedding(TokenEmbedding): | |||||
:return: torch.FloatTensor | :return: torch.FloatTensor | ||||
""" | """ | ||||
embed = torch.zeros(num_embedding, embedding_dim) | embed = torch.zeros(num_embedding, embedding_dim) | ||||
if init_embed is None: | if init_embed is None: | ||||
nn.init.uniform_(embed, -np.sqrt(3/embedding_dim), np.sqrt(3/embedding_dim)) | |||||
nn.init.uniform_(embed, -np.sqrt(3 / embedding_dim), np.sqrt(3 / embedding_dim)) | |||||
else: | else: | ||||
init_embed(embed) | init_embed(embed) | ||||
return embed | return embed | ||||
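The ±sqrt(3 / embedding_dim) bound gives each component variance 1 / embedding_dim, since a uniform variable on [-a, a] has variance a**2 / 3, so the expected squared norm of a randomly initialized vector is about 1. A quick numeric check, assuming a 50-dimensional embedding:

import math

dim = 50
a = math.sqrt(3 / dim)   # bound used by _randomly_init_embed
var = a ** 2 / 3         # equals 1 / dim; summed over dim components, E[||v||^2] is roughly 1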
@property | @property | ||||
def requires_grad(self): | def requires_grad(self): | ||||
""" | """ | ||||
@@ -190,14 +199,14 @@ class StaticEmbedding(TokenEmbedding): | |||||
return requires_grads.pop() | return requires_grads.pop() | ||||
else: | else: | ||||
return None | return None | ||||
@requires_grad.setter | @requires_grad.setter | ||||
def requires_grad(self, value): | def requires_grad(self, value): | ||||
for name, param in self.named_parameters(): | for name, param in self.named_parameters(): | ||||
if 'words_to_words' in name: | if 'words_to_words' in name: | ||||
continue | continue | ||||
param.requires_grad = value | param.requires_grad = value | ||||
def _load_with_vocab(self, embed_filepath, vocab, dtype=np.float32, padding='<pad>', unknown='<unk>', | def _load_with_vocab(self, embed_filepath, vocab, dtype=np.float32, padding='<pad>', unknown='<unk>', | ||||
error='ignore', init_method=None): | error='ignore', init_method=None): | ||||
""" | """ | ||||
@@ -250,7 +259,7 @@ class StaticEmbedding(TokenEmbedding): | |||||
index = vocab.to_index(word) | index = vocab.to_index(word) | ||||
matrix[index] = torch.from_numpy(np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim)) | matrix[index] = torch.from_numpy(np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim)) | ||||
if self.only_norm_found_vector: | if self.only_norm_found_vector: | ||||
matrix[index] = matrix[index]/np.linalg.norm(matrix[index]) | |||||
matrix[index] = matrix[index] / np.linalg.norm(matrix[index]) | |||||
found_count += 1 | found_count += 1 | ||||
except Exception as e: | except Exception as e: | ||||
if error == 'ignore': | if error == 'ignore': | ||||
@@ -267,22 +276,22 @@ class StaticEmbedding(TokenEmbedding): | |||||
matrix[index] = None | matrix[index] = None | ||||
# the entries in matrix correspond to words that need their own vector | # the entries in matrix correspond to words that need their own vector | ||||
vectors = self._randomly_init_embed(len(matrix), dim, init_method) | vectors = self._randomly_init_embed(len(matrix), dim, init_method) | ||||
if vocab.unknown is None: # create a dedicated unknown entry | if vocab.unknown is None: # create a dedicated unknown entry | ||||
unknown_idx = len(matrix) | unknown_idx = len(matrix) | ||||
vectors = torch.cat((vectors, torch.zeros(1, dim)), dim=0).contiguous() | vectors = torch.cat((vectors, torch.zeros(1, dim)), dim=0).contiguous() | ||||
else: | else: | ||||
unknown_idx = vocab.unknown_idx | unknown_idx = vocab.unknown_idx | ||||
self.words_to_words = nn.Parameter(torch.full((len(vocab), ), fill_value=unknown_idx).long(), | |||||
self.words_to_words = nn.Parameter(torch.full((len(vocab),), fill_value=unknown_idx).long(), | |||||
requires_grad=False) | requires_grad=False) | ||||
for index, (index_in_vocab, vec) in enumerate(matrix.items()): | for index, (index_in_vocab, vec) in enumerate(matrix.items()): | ||||
if vec is not None: | if vec is not None: | ||||
vectors[index] = vec | vectors[index] = vec | ||||
self.words_to_words[index_in_vocab] = index | self.words_to_words[index_in_vocab] = index | ||||
return vectors | return vectors | ||||
def forward(self, words): | def forward(self, words): | ||||
""" | """ | ||||
Takes word indices as input. | Takes word indices as input. | ||||
@@ -1,13 +1,19 @@ | |||||
""" | |||||
.. todo:: | |||||
doc | |||||
""" | |||||
import numpy as np | import numpy as np | ||||
import torch | import torch | ||||
from torch import nn as nn | from torch import nn as nn | ||||
from ..core.vocabulary import Vocabulary | from ..core.vocabulary import Vocabulary | ||||
__all__ = ['get_embeddings'] | |||||
__all__ = [ | |||||
'get_embeddings' | |||||
] | |||||
def _construct_char_vocab_from_vocab(vocab:Vocabulary, min_freq:int=1): | |||||
def _construct_char_vocab_from_vocab(vocab: Vocabulary, min_freq: int = 1): | |||||
""" | """ | ||||
Builds a character vocabulary from a given word vocabulary. | Builds a character vocabulary from a given word vocabulary. | ||||
@@ -36,8 +42,8 @@ def get_embeddings(init_embed): | |||||
if isinstance(init_embed, tuple): | if isinstance(init_embed, tuple): | ||||
res = nn.Embedding( | res = nn.Embedding( | ||||
num_embeddings=init_embed[0], embedding_dim=init_embed[1]) | num_embeddings=init_embed[0], embedding_dim=init_embed[1]) | ||||
nn.init.uniform_(res.weight.data, a=-np.sqrt(3/res.weight.data.size(1)), | |||||
b=np.sqrt(3/res.weight.data.size(1))) | |||||
nn.init.uniform_(res.weight.data, a=-np.sqrt(3 / res.weight.data.size(1)), | |||||
b=np.sqrt(3 / res.weight.data.size(1))) | |||||
elif isinstance(init_embed, nn.Module): | elif isinstance(init_embed, nn.Module): | ||||
res = init_embed | res = init_embed | ||||
elif isinstance(init_embed, torch.Tensor): | elif isinstance(init_embed, torch.Tensor): | ||||
@@ -48,4 +54,4 @@ def get_embeddings(init_embed): | |||||
else: | else: | ||||
raise TypeError( | raise TypeError( | ||||
'invalid init_embed type: {}'.format(type(init_embed))) | 'invalid init_embed type: {}'.format(type(init_embed))) | ||||
return res | |||||
return res |
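To summarize the branches above, get_embeddings accepts a (num_embeddings, embedding_dim) tuple, an existing nn.Module, or a torch.Tensor of pretrained weights. A short sketch, assuming the function is exported at package level:

import torch
import torch.nn as nn
from fastNLP.embeddings import get_embeddings

e1 = get_embeddings((100, 32))              # tuple -> randomly initialized nn.Embedding
e2 = get_embeddings(nn.Embedding(100, 32))  # nn.Module -> returned unchanged
e3 = get_embeddings(torch.randn(100, 32))   # tensor -> an nn.Embedding built from the given weights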