
add __doc__ & __all__ in module 'embeddings'

tags/v0.4.10
ChenXin 5 years ago
commit d6c597d32e
9 changed files with 277 additions and 213 deletions
1. fastNLP/embeddings/__init__.py (+0, -1)
2. fastNLP/embeddings/bert_embedding.py (+87, -71)
3. fastNLP/embeddings/char_embedding.py (+38, -30)
4. fastNLP/embeddings/contextual_embedding.py (+17, -12)
5. fastNLP/embeddings/elmo_embedding.py (+42, -35)
6. fastNLP/embeddings/embedding.py (+30, -26)
7. fastNLP/embeddings/stack_embedding.py (+17, -7)
8. fastNLP/embeddings/static_embedding.py (+35, -26)
9. fastNLP/embeddings/utils.py (+11, -5)

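Every file touched here follows the same pattern: a module-level docstring (a `.. todo:: doc` placeholder for now) plus an explicit `__all__` that declares the module's public classes, which fastNLP/embeddings/__init__.py then re-exports. A condensed sketch of that pattern (the module and class names below are placeholders, not taken from this commit):

"""
.. todo::
    doc
"""

__all__ = [
    "SomeEmbedding",  # placeholder: each submodule lists only the classes it exports
]

# ... class definitions follow ...
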
fastNLP/embeddings/__init__.py (+0, -1)

@@ -18,7 +18,6 @@ __all__ = [
     "get_embeddings",
 ]
-
 
 from .embedding import Embedding, TokenEmbedding
 from .static_embedding import StaticEmbedding
 from .elmo_embedding import ElmoEmbedding


fastNLP/embeddings/bert_embedding.py (+87, -71)

@@ -1,3 +1,12 @@
+"""
+.. todo::
+    doc
+"""
+
+__all__ = [
+    "BertEmbedding",
+    "BertWordPieceEncoder"
+]
+
 import os
 import collections
@@ -13,6 +22,7 @@ from ..modules.encoder.bert import _WordPieceBertModel, BertModel, BertTokenizer
 from .contextual_embedding import ContextualEmbedding
 import warnings
 
+
 class BertEmbedding(ContextualEmbedding):
     """
     别名::class:`fastNLP.embeddings.BertEmbedding` :class:`fastNLP.embeddings.bert_embedding.BertEmbedding`
@@ -54,11 +64,12 @@ class BertEmbedding(ContextualEmbedding):
         word pieces后的内容,并将第512个word piece置为[SEP]。超过长度的部分的encode结果直接全部置零。一般仅有只使用[CLS]
         来进行分类的任务将auto_truncate置为True。
     """
-    def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en-base-uncased', layers: str='-1',
-                 pool_method: str='first', word_dropout=0, dropout=0, include_cls_sep: bool=False,
-                 pooled_cls=True, requires_grad: bool=False, auto_truncate:bool=False):
+    def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en-base-uncased', layers: str = '-1',
+                 pool_method: str = 'first', word_dropout=0, dropout=0, include_cls_sep: bool = False,
+                 pooled_cls=True, requires_grad: bool = False, auto_truncate: bool = False):
         super(BertEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
         # 根据model_dir_or_name检查是否存在并下载
         if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR:
             if 'cn' in model_dir_or_name.lower() and pool_method not in ('first', 'last'):
@@ -71,21 +82,21 @@ class BertEmbedding(ContextualEmbedding):
             model_dir = os.path.abspath(os.path.expanduser(model_dir_or_name))
         else:
             raise ValueError(f"Cannot recognize {model_dir_or_name}.")
         self._word_sep_index = None
         if '[SEP]' in vocab:
             self._word_sep_index = vocab['[SEP]']
         self.model = _WordBertModel(model_dir=model_dir, vocab=vocab, layers=layers,
                                     pool_method=pool_method, include_cls_sep=include_cls_sep,
                                     pooled_cls=pooled_cls, auto_truncate=auto_truncate, min_freq=2)
         self.requires_grad = requires_grad
-        self._embed_size = len(self.model.layers)*self.model.encoder.hidden_size
+        self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size
 
     def _delete_model_weights(self):
         del self.model
 
     def forward(self, words):
         """
         计算words的bert embedding表示。计算之前会在每句话的开始增加[CLS]在结束增加[SEP], 并根据include_cls_sep判断要不要
@@ -100,9 +111,9 @@ class BertEmbedding(ContextualEmbedding):
                 return self.dropout(outputs)
         outputs = self.model(words)
         outputs = torch.cat([*outputs], dim=-1)
         return self.dropout(outputs)
 
     def drop_word(self, words):
         """
         按照设定随机将words设置为unknown_index。
@@ -120,7 +131,7 @@ class BertEmbedding(ContextualEmbedding):
                 if self._word_sep_index:
                     words.masked_fill_(sep_mask, self._word_sep_index)
         return words
 
     @property
     def requires_grad(self):
         """
@@ -129,12 +140,12 @@ class BertEmbedding(ContextualEmbedding):
         :return:
         """
         requires_grads = set([param.requires_grad for name, param in self.named_parameters()
                               if 'word_pieces_lengths' not in name])
         if len(requires_grads) == 1:
             return requires_grads.pop()
         else:
             return None
 
     @requires_grad.setter
     def requires_grad(self, value):
         for name, param in self.named_parameters():
@@ -155,10 +166,11 @@ class BertWordPieceEncoder(nn.Module):
     :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。
     :param bool requires_grad: 是否需要gradient。
     """
-    def __init__(self, model_dir_or_name: str='en-base-uncased', layers: str='-1', pooled_cls: bool = False,
-                 word_dropout=0, dropout=0, requires_grad: bool=False):
+    def __init__(self, model_dir_or_name: str = 'en-base-uncased', layers: str = '-1', pooled_cls: bool = False,
+                 word_dropout=0, dropout=0, requires_grad: bool = False):
         super().__init__()
         if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR:
             model_url = _get_embedding_url('bert', model_dir_or_name.lower())
             model_dir = cached_path(model_url, name='embedding')
@@ -167,7 +179,7 @@ class BertWordPieceEncoder(nn.Module):
             model_dir = model_dir_or_name
         else:
             raise ValueError(f"Cannot recognize {model_dir_or_name}.")
         self.model = _WordPieceBertModel(model_dir=model_dir, layers=layers, pooled_cls=pooled_cls)
         self._sep_index = self.model._sep_index
         self._wordpiece_unk_index = self.model._wordpiece_unknown_index
@@ -175,7 +187,7 @@ class BertWordPieceEncoder(nn.Module):
         self.requires_grad = requires_grad
         self.word_dropout = word_dropout
         self.dropout_layer = nn.Dropout(dropout)
 
     @property
     def requires_grad(self):
         """
@@ -187,24 +199,24 @@ class BertWordPieceEncoder(nn.Module):
             return requires_grads.pop()
         else:
             return None
 
     @requires_grad.setter
     def requires_grad(self, value):
         for name, param in self.named_parameters():
             param.requires_grad = value
 
     @property
     def embed_size(self):
         return self._embed_size
 
     @property
     def embedding_dim(self):
         return self._embed_size
 
     @property
     def num_embedding(self):
         return self.model.encoder.config.vocab_size
 
     def index_datasets(self, *datasets, field_name, add_cls_sep=True):
         """
         使用bert的tokenizer新生成word_pieces列加入到datasets中,并将他们设置为input,且将word_pieces这一列的pad value设置为了
@@ -216,7 +228,7 @@ class BertWordPieceEncoder(nn.Module):
         :return:
         """
         self.model.index_dataset(*datasets, field_name=field_name, add_cls_sep=add_cls_sep)
 
     def forward(self, word_pieces, token_type_ids=None):
         """
         计算words的bert embedding表示。传入的words中应该自行包含[CLS]与[SEP]的tag。
@@ -233,13 +245,13 @@ class BertWordPieceEncoder(nn.Module):
             token_type_ids = sep_mask_cumsum.fmod(2)
             if token_type_ids[0, 0].item(): # 如果开头是奇数,则需要flip一下结果,因为需要保证开头为0
                 token_type_ids = token_type_ids.eq(0).long()
         word_pieces = self.drop_word(word_pieces)
         outputs = self.model(word_pieces, token_type_ids)
         outputs = torch.cat([*outputs], dim=-1)
         return self.dropout_layer(outputs)
 
     def drop_word(self, words):
         """
         按照设定随机将words设置为unknown_index。
@@ -260,10 +272,10 @@ class BertWordPieceEncoder(nn.Module):
 
 
 class _WordBertModel(nn.Module):
-    def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1', pool_method:str='first',
-                 include_cls_sep:bool=False, pooled_cls:bool=False, auto_truncate:bool=False, min_freq=2):
+    def __init__(self, model_dir: str, vocab: Vocabulary, layers: str = '-1', pool_method: str = 'first',
+                 include_cls_sep: bool = False, pooled_cls: bool = False, auto_truncate: bool = False, min_freq=2):
         super().__init__()
         self.tokenzier = BertTokenizer.from_pretrained(model_dir)
         self.encoder = BertModel.from_pretrained(model_dir)
         self._max_position_embeddings = self.encoder.config.max_position_embeddings
@@ -271,23 +283,23 @@ class _WordBertModel(nn.Module):
         encoder_layer_number = len(self.encoder.encoder.layer)
         self.layers = list(map(int, layers.split(',')))
         for layer in self.layers:
-            if layer<0:
-                assert -layer<=encoder_layer_number, f"The layer index:{layer} is out of scope for " \
-                    f"a bert model with {encoder_layer_number} layers."
+            if layer < 0:
+                assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
+                    f"a bert model with {encoder_layer_number} layers."
             else:
-                assert layer<encoder_layer_number, f"The layer index:{layer} is out of scope for " \
-                    f"a bert model with {encoder_layer_number} layers."
+                assert layer < encoder_layer_number, f"The layer index:{layer} is out of scope for " \
+                    f"a bert model with {encoder_layer_number} layers."
         assert pool_method in ('avg', 'max', 'first', 'last')
         self.pool_method = pool_method
         self.include_cls_sep = include_cls_sep
         self.pooled_cls = pooled_cls
         self.auto_truncate = auto_truncate
         # 将所有vocab中word的wordpiece计算出来, 需要额外考虑[CLS]和[SEP]
         print("Start to generating word pieces for word.")
         # 第一步统计出需要的word_piece, 然后创建新的embed和word_piece_vocab, 然后填入值
-        word_piece_dict = {'[CLS]':1, '[SEP]':1} # 用到的word_piece以及新增的
+        word_piece_dict = {'[CLS]': 1, '[SEP]': 1} # 用到的word_piece以及新增的
         found_count = 0
         self._has_sep_in_vocab = '[SEP]' in vocab # 用来判断传入的数据是否需要生成token_ids
         if '[sep]' in vocab:
@@ -302,10 +314,11 @@ class _WordBertModel(nn.Module):
             elif index == vocab.unknown_idx:
                 word = '[UNK]'
             word_pieces = self.tokenzier.wordpiece_tokenizer.tokenize(word)
-            if len(word_pieces)==1:
+            if len(word_pieces) == 1:
                 if not vocab._is_word_no_create_entry(word): # 如果是train中的值, 但是却没有找到
-                    if index!=vocab.unknown_idx and word_pieces[0]=='[UNK]': # 说明这个词不在原始的word里面
-                        if vocab.word_count[word]>=min_freq and not vocab._is_word_no_create_entry(word): #出现次数大于这个次数才新增
+                    if index != vocab.unknown_idx and word_pieces[0] == '[UNK]': # 说明这个词不在原始的word里面
+                        if vocab.word_count[word] >= min_freq and not vocab._is_word_no_create_entry(
+                                word): # 出现次数大于这个次数才新增
                             word_piece_dict[word] = 1 # 新增一个值
                 continue
             for word_piece in word_pieces:
@@ -327,7 +340,7 @@ class _WordBertModel(nn.Module):
                 new_word_piece_vocab[token] = len(new_word_piece_vocab)
         self.tokenzier._reinit_on_new_vocab(new_word_piece_vocab)
         self.encoder.embeddings.word_embeddings = embed
         word_to_wordpieces = []
         word_pieces_lengths = []
         for word, index in vocab:
@@ -347,7 +360,7 @@ class _WordBertModel(nn.Module):
         self.word_to_wordpieces = np.array(word_to_wordpieces)
         self.word_pieces_lengths = nn.Parameter(torch.LongTensor(word_pieces_lengths), requires_grad=False)
         print("Successfully generate word pieces.")
 
     def forward(self, words):
         """
 
@@ -358,34 +371,37 @@ class _WordBertModel(nn.Module):
         batch_size, max_word_len = words.size()
         word_mask = words.ne(self._word_pad_index) # 为1的地方有word
         seq_len = word_mask.sum(dim=-1)
-        batch_word_pieces_length = self.word_pieces_lengths[words].masked_fill(word_mask.eq(0), 0) # batch_size x max_len
+        batch_word_pieces_length = self.word_pieces_lengths[words].masked_fill(word_mask.eq(0),
+                                                                               0) # batch_size x max_len
         word_pieces_lengths = batch_word_pieces_length.sum(dim=-1) # batch_size
         word_piece_length = batch_word_pieces_length.sum(dim=-1).max().item() # 表示word piece的长度(包括padding)
-        if word_piece_length+2>self._max_position_embeddings:
+        if word_piece_length + 2 > self._max_position_embeddings:
             if self.auto_truncate:
-                word_pieces_lengths = word_pieces_lengths.masked_fill(word_pieces_lengths+2>self._max_position_embeddings,
-                                                                      self._max_position_embeddings-2)
+                word_pieces_lengths = word_pieces_lengths.masked_fill(
+                    word_pieces_lengths + 2 > self._max_position_embeddings,
+                    self._max_position_embeddings - 2)
             else:
-                raise RuntimeError("After split words into word pieces, the lengths of word pieces are longer than the "
-                                   f"maximum allowed sequence length:{self._max_position_embeddings} of bert.")
+                raise RuntimeError(
+                    "After split words into word pieces, the lengths of word pieces are longer than the "
+                    f"maximum allowed sequence length:{self._max_position_embeddings} of bert.")
 
         # +2是由于需要加入[CLS]与[SEP]
-        word_pieces = words.new_full((batch_size, min(word_piece_length+2, self._max_position_embeddings)),
+        word_pieces = words.new_full((batch_size, min(word_piece_length + 2, self._max_position_embeddings)),
                                      fill_value=self._wordpiece_pad_index)
         attn_masks = torch.zeros_like(word_pieces)
         # 1. 获取words的word_pieces的id,以及对应的span范围
         word_indexes = words.cpu().numpy()
         for i in range(batch_size):
             word_pieces_i = list(chain(*self.word_to_wordpieces[word_indexes[i, :seq_len[i]]]))
-            if self.auto_truncate and len(word_pieces_i)>self._max_position_embeddings-2:
-                word_pieces_i = word_pieces_i[:self._max_position_embeddings-2]
-            word_pieces[i, 1:word_pieces_lengths[i]+1] = torch.LongTensor(word_pieces_i)
-            attn_masks[i, :word_pieces_lengths[i]+2].fill_(1)
+            if self.auto_truncate and len(word_pieces_i) > self._max_position_embeddings - 2:
+                word_pieces_i = word_pieces_i[:self._max_position_embeddings - 2]
+            word_pieces[i, 1:word_pieces_lengths[i] + 1] = torch.LongTensor(word_pieces_i)
+            attn_masks[i, :word_pieces_lengths[i] + 2].fill_(1)
         # 添加[cls]和[sep]
         word_pieces[:, 0].fill_(self._cls_index)
         batch_indexes = torch.arange(batch_size).to(words)
-        word_pieces[batch_indexes, word_pieces_lengths+1] = self._sep_index
-        if self._has_sep_in_vocab: #但[SEP]在vocab中出现应该才会需要token_ids
+        word_pieces[batch_indexes, word_pieces_lengths + 1] = self._sep_index
+        if self._has_sep_in_vocab: # 但[SEP]在vocab中出现应该才会需要token_ids
             sep_mask = word_pieces.eq(self._sep_index) # batch_size x max_len
             sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1])
             token_type_ids = sep_mask_cumsum.fmod(2)
@@ -396,9 +412,9 @@ class _WordBertModel(nn.Module):
         # 2. 获取hidden的结果,根据word_pieces进行对应的pool计算
         # all_outputs: [batch_size x max_len x hidden_size, batch_size x max_len x hidden_size, ...]
         bert_outputs, pooled_cls = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks,
                                                 output_all_encoded_layers=True)
         # output_layers = [self.layers] # len(self.layers) x batch_size x real_word_piece_length x hidden_size
         if self.include_cls_sep:
             outputs = bert_outputs[-1].new_zeros(len(self.layers), batch_size, max_word_len + 2,
                                                  bert_outputs[-1].size(-1))
@@ -414,7 +430,7 @@ class _WordBertModel(nn.Module):
             real_word_piece_length = output_layer.size(1) - 2
             if word_piece_length > real_word_piece_length: # 如果实际上是截取出来的
                 paddings = output_layer.new_zeros(batch_size,
-                                                  word_piece_length-real_word_piece_length,
+                                                  word_piece_length - real_word_piece_length,
                                                   output_layer.size(2))
                 output_layer = torch.cat((output_layer, paddings), dim=1).contiguous()
             # 从word_piece collapse到word的表示
@@ -423,27 +439,27 @@ class _WordBertModel(nn.Module):
             if self.pool_method == 'first':
                 for i in range(batch_size):
                     i_word_pieces_cum_length = batch_word_pieces_cum_length[i, :seq_len[i]] # 每个word的start位置
-                    outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length] # num_layer x batch_size x len x hidden_size
+                    outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[
+                        i, i_word_pieces_cum_length] # num_layer x batch_size x len x hidden_size
             elif self.pool_method == 'last':
                 for i in range(batch_size):
-                    i_word_pieces_cum_length = batch_word_pieces_cum_length[i, 1:seq_len[i]+1] - 1 # 每个word的end
+                    i_word_pieces_cum_length = batch_word_pieces_cum_length[i, 1:seq_len[i] + 1] - 1 # 每个word的end
                     outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length]
             elif self.pool_method == 'max':
                 for i in range(batch_size):
                     for j in range(seq_len[i]):
-                        start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1]
-                        outputs[l_index, i, j+s_shift], _ = torch.max(truncate_output_layer[i, start:end], dim=-2)
+                        start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j + 1]
+                        outputs[l_index, i, j + s_shift], _ = torch.max(truncate_output_layer[i, start:end], dim=-2)
             else:
                 for i in range(batch_size):
                     for j in range(seq_len[i]):
-                        start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1]
-                        outputs[l_index, i, j+s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2)
+                        start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j + 1]
+                        outputs[l_index, i, j + s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2)
             if self.include_cls_sep:
-                if l in (len(bert_outputs)-1, -1) and self.pooled_cls:
+                if l in (len(bert_outputs) - 1, -1) and self.pooled_cls:
                     outputs[l_index, :, 0] = pooled_cls
                 else:
                     outputs[l_index, :, 0] = output_layer[:, 0]
-                outputs[l_index, batch_indexes, seq_len+s_shift] = output_layer[batch_indexes, seq_len+s_shift]
+                outputs[l_index, batch_indexes, seq_len + s_shift] = output_layer[batch_indexes, seq_len + s_shift]
         # 3. 最终的embedding结果
         return outputs

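A minimal usage sketch for the class reformatted above, based only on the signature shown in this diff; it assumes fastNLP is installed and that 'en-base-uncased' can be downloaded or resolved locally:

import torch
from fastNLP import Vocabulary
from fastNLP.embeddings import BertEmbedding

vocab = Vocabulary()
vocab.add_word_lst("this is an example .".split())

# pool_method='first' keeps the first word piece of each word; include_cls_sep
# controls whether the [CLS]/[SEP] positions are kept in the output.
embed = BertEmbedding(vocab, model_dir_or_name='en-base-uncased', layers='-1',
                      pool_method='first', include_cls_sep=False, requires_grad=False)

words = torch.LongTensor([[vocab.to_index(w) for w in "this is an example .".split()]])
reprs = embed(words)  # shape: [1, 5, embed.embed_size]
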

fastNLP/embeddings/char_embedding.py (+38, -30)

@@ -3,6 +3,10 @@
 词的index而不需要使用词语中的char的index来获取表达。
 """
 
+__all__ = [
+    "CNNCharEmbedding",
+    "LSTMCharEmbedding"
+]
 
 import torch
 import torch.nn as nn
@@ -16,6 +20,7 @@ from .embedding import TokenEmbedding
 from .utils import _construct_char_vocab_from_vocab
 from .utils import get_embeddings
 
+
 class CNNCharEmbedding(TokenEmbedding):
     """
     别名::class:`fastNLP.embeddings.CNNCharEmbedding` :class:`fastNLP.embeddings.char_embedding.CNNCharEmbedding`
@@ -49,14 +54,15 @@ class CNNCharEmbedding(TokenEmbedding):
     (文件夹下应该只有一个以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型,
     没有的话将自动下载。如果输入为None则使用embedding_dim的维度随机初始化一个embedding.
     """
-    def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0,
-                 dropout:float=0, filter_nums: List[int]=(40, 30, 20), kernel_sizes: List[int]=(5, 3, 1),
-                 pool_method: str='max', activation='relu', min_char_freq: int=2, pre_train_char_embed: str=None):
+    def __init__(self, vocab: Vocabulary, embed_size: int = 50, char_emb_size: int = 50, word_dropout: float = 0,
+                 dropout: float = 0, filter_nums: List[int] = (40, 30, 20), kernel_sizes: List[int] = (5, 3, 1),
+                 pool_method: str = 'max', activation='relu', min_char_freq: int = 2, pre_train_char_embed: str = None):
         super(CNNCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
         for kernel in kernel_sizes:
             assert kernel % 2 == 1, "Only odd kernel is allowed."
         assert pool_method in ('max', 'avg')
         self.pool_method = pool_method
         # activation function
@@ -74,7 +80,7 @@ class CNNCharEmbedding(TokenEmbedding):
         else:
             raise Exception(
                 "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]")
         print("Start constructing character vocabulary.")
         # 建立char的词表
         self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq)
@@ -96,14 +102,14 @@ class CNNCharEmbedding(TokenEmbedding):
             self.char_embedding = StaticEmbedding(self.char_vocab, model_dir_or_name=pre_train_char_embed)
         else:
             self.char_embedding = get_embeddings((len(self.char_vocab), char_emb_size))
         self.convs = nn.ModuleList([nn.Conv1d(
             char_emb_size, filter_nums[i], kernel_size=kernel_sizes[i], bias=True, padding=kernel_sizes[i] // 2)
             for i in range(len(kernel_sizes))])
         self._embed_size = embed_size
         self.fc = nn.Linear(sum(filter_nums), embed_size)
         self.reset_parameters()
 
     def forward(self, words):
         """
         输入words的index后,生成对应的words的表示。
@@ -114,14 +120,14 @@ class CNNCharEmbedding(TokenEmbedding):
         words = self.drop_word(words)
         batch_size, max_len = words.size()
         chars = self.words_to_chars_embedding[words] # batch_size x max_len x max_word_len
         word_lengths = self.word_lengths[words] # batch_size x max_len
         max_word_len = word_lengths.max()
         chars = chars[:, :, :max_word_len]
         # 为1的地方为mask
         chars_masks = chars.eq(self.char_pad_index) # batch_size x max_len x max_word_len 如果为0, 说明是padding的位置了
         chars = self.char_embedding(chars) # batch_size x max_len x max_word_len x embed_size
         chars = self.dropout(chars)
-        reshaped_chars = chars.reshape(batch_size*max_len, max_word_len, -1)
+        reshaped_chars = chars.reshape(batch_size * max_len, max_word_len, -1)
         reshaped_chars = reshaped_chars.transpose(1, 2) # B' x E x M
         conv_chars = [conv(reshaped_chars).transpose(1, 2).reshape(batch_size, max_len, max_word_len, -1)
                       for conv in self.convs]
@@ -129,13 +135,13 @@ class CNNCharEmbedding(TokenEmbedding):
         conv_chars = self.activation(conv_chars)
         if self.pool_method == 'max':
             conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf'))
             chars, _ = torch.max(conv_chars, dim=-2) # batch_size x max_len x sum(filters)
         else:
             conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), 0)
-            chars = torch.sum(conv_chars, dim=-2)/chars_masks.eq(0).sum(dim=-1, keepdim=True).float()
+            chars = torch.sum(conv_chars, dim=-2) / chars_masks.eq(0).sum(dim=-1, keepdim=True).float()
         chars = self.fc(chars)
         return self.dropout(chars)
 
     @property
     def requires_grad(self):
         """
@@ -151,21 +157,21 @@ class CNNCharEmbedding(TokenEmbedding):
             return requires_grads.pop()
         else:
             return None
 
     @requires_grad.setter
     def requires_grad(self, value):
         for name, param in self.named_parameters():
             if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能加入到requires_grad中
                 continue
             param.requires_grad = value
 
     def reset_parameters(self):
         for name, param in self.named_parameters():
             if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能reset
                 continue
             if 'char_embedding' in name:
                 continue
-            if param.data.dim()>1:
+            if param.data.dim() > 1:
                 nn.init.xavier_uniform_(param, 1)
             else:
                 nn.init.uniform_(param, -1, 1)
@@ -203,13 +209,15 @@ class LSTMCharEmbedding(TokenEmbedding):
     (文件夹下应该只有一个以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型,
     没有的话将自动下载。如果输入为None则使用embedding_dim的维度随机初始化一个embedding.
     """
-    def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0,
-                 dropout:float=0, hidden_size=50,pool_method: str='max', activation='relu', min_char_freq: int=2,
-                 bidirectional=True, pre_train_char_embed: str=None):
+    def __init__(self, vocab: Vocabulary, embed_size: int = 50, char_emb_size: int = 50, word_dropout: float = 0,
+                 dropout: float = 0, hidden_size=50, pool_method: str = 'max', activation='relu',
+                 min_char_freq: int = 2,
+                 bidirectional=True, pre_train_char_embed: str = None):
         super(LSTMCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
 
         assert hidden_size % 2 == 0, "Only even kernel is allowed."
         assert pool_method in ('max', 'avg')
         self.pool_method = pool_method
         # activation function
@@ -227,7 +235,7 @@ class LSTMCharEmbedding(TokenEmbedding):
         else:
             raise Exception(
                 "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]")
         print("Start constructing character vocabulary.")
         # 建立char的词表
         self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq)
@@ -249,14 +257,14 @@ class LSTMCharEmbedding(TokenEmbedding):
             self.char_embedding = StaticEmbedding(self.char_vocab, pre_train_char_embed)
         else:
             self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size)
         self.fc = nn.Linear(hidden_size, embed_size)
         hidden_size = hidden_size // 2 if bidirectional else hidden_size
         self.lstm = LSTM(char_emb_size, hidden_size, bidirectional=bidirectional, batch_first=True)
         self._embed_size = embed_size
         self.bidirectional = bidirectional
 
     def forward(self, words):
         """
         输入words的index后,生成对应的words的表示。
@@ -278,7 +286,7 @@ class LSTMCharEmbedding(TokenEmbedding):
         char_seq_len = chars_masks.eq(0).sum(dim=-1).reshape(batch_size * max_len)
         lstm_chars = self.lstm(reshaped_chars, char_seq_len)[0].reshape(batch_size, max_len, max_word_len, -1)
         # B x M x M x H
         lstm_chars = self.activation(lstm_chars)
         if self.pool_method == 'max':
             lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf'))
@@ -286,11 +294,11 @@ class LSTMCharEmbedding(TokenEmbedding):
         else:
             lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), 0)
             chars = torch.sum(lstm_chars, dim=-2) / chars_masks.eq(0).sum(dim=-1, keepdim=True).float()
         chars = self.fc(chars)
         return self.dropout(chars)
 
     @property
     def requires_grad(self):
         """
@@ -307,7 +315,7 @@ class LSTMCharEmbedding(TokenEmbedding):
             return requires_grads.pop()
         else:
             return None
 
     @requires_grad.setter
     def requires_grad(self, value):
         for name, param in self.named_parameters():

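As with BertEmbedding above, a short hedged sketch of how the two classes in this file are constructed, using only the signatures visible in the diff (vocabulary construction as assumed earlier):

from fastNLP import Vocabulary
from fastNLP.embeddings import CNNCharEmbedding, LSTMCharEmbedding

vocab = Vocabulary()
vocab.add_word_lst("this is an example .".split())

# CNN over characters: one Conv1d per (filter_num, kernel_size) pair, pooled per word.
cnn_char = CNNCharEmbedding(vocab, embed_size=50, char_emb_size=50,
                            filter_nums=(40, 30, 20), kernel_sizes=(5, 3, 1),
                            pool_method='max')

# LSTM over characters: hidden_size must be even because it is split across directions.
lstm_char = LSTMCharEmbedding(vocab, embed_size=50, char_emb_size=50,
                              hidden_size=50, bidirectional=True)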

fastNLP/embeddings/contextual_embedding.py (+17, -12)

@@ -1,3 +1,12 @@
+"""
+.. todo::
+    doc
+"""
+
+__all__ = [
+    "ContextualEmbedding"
+]
+
 from abc import abstractmethod
 import torch
 
@@ -8,16 +17,12 @@ from ..core.sampler import SequentialSampler
 from ..core.utils import _move_model_to_device, _get_model_device
 from .embedding import TokenEmbedding
 
-__all__ = [
-    "ContextualEmbedding"
-]
-
 
 class ContextualEmbedding(TokenEmbedding):
-    def __init__(self, vocab: Vocabulary, word_dropout:float=0.0, dropout:float=0.0):
+    def __init__(self, vocab: Vocabulary, word_dropout: float = 0.0, dropout: float = 0.0):
         super(ContextualEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
 
-    def add_sentence_cache(self, *datasets, batch_size=32, device='cpu', delete_weights: bool=True):
+    def add_sentence_cache(self, *datasets, batch_size=32, device='cpu', delete_weights: bool = True):
         """
         由于动态embedding生成比较耗时,所以可以把每句话embedding缓存下来,这样就不需要每次都运行生成过程。
 
@@ -34,7 +39,7 @@ class ContextualEmbedding(TokenEmbedding):
             except Exception as e:
                 print(f"Exception happens at {index} dataset.")
                 raise e
         sent_embeds = {}
         _move_model_to_device(self, device=device)
         device = _get_model_device(self)
@@ -54,7 +59,7 @@ class ContextualEmbedding(TokenEmbedding):
                 word_embeds = self(words).detach().cpu().numpy()
                 for b in range(words.size(0)):
                     length = seq_len_from_behind[b]
-                    if length==0:
+                    if length == 0:
                         sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b]
                     else:
                         sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b, :-length]
@@ -65,7 +70,7 @@ class ContextualEmbedding(TokenEmbedding):
         self.sent_embeds = sent_embeds
         if delete_weights:
             self._delete_model_weights()
 
     def _get_sent_reprs(self, words):
         """
         获取sentence的表示,如果有缓存,则返回缓存的值; 没有缓存则返回None
@@ -88,12 +93,12 @@ class ContextualEmbedding(TokenEmbedding):
                 embeds[i, :len(embed)] = torch.FloatTensor(embed).to(words.device)
             return embeds
         return None
 
     @abstractmethod
     def _delete_model_weights(self):
         """删除计算表示的模型以节省资源"""
         raise NotImplementedError
 
     def remove_sentence_cache(self):
         """
         删除缓存的句子表示. 删除之后如果模型权重没有被删除,将开始使用动态计算权重。

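The add_sentence_cache / _get_sent_reprs pair above trades memory for speed: every sentence in the given datasets is embedded once and later looked up instead of re-encoded. A hedged sketch of the intended call pattern (the dataset objects and their 'words' field are assumptions, not shown in this diff):

# embed is any ContextualEmbedding subclass, e.g. the ElmoEmbedding below;
# train_data / dev_data are assumed to be fastNLP DataSet objects with a 'words' input field.
embed.add_sentence_cache(train_data, dev_data, batch_size=32, device='cpu',
                         delete_weights=True)  # optionally drop the heavy encoder afterwards
# later forward() calls can return cached representations via _get_sent_reprs();
# remove_sentence_cache() clears the cache again.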

fastNLP/embeddings/elmo_embedding.py (+42, -35)

@@ -1,6 +1,13 @@
+"""
+.. todo::
+    doc
+"""
+
+__all__ = [
+    "ElmoEmbedding"
+]
+
 import os
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -49,11 +56,11 @@ class ElmoEmbedding(ContextualEmbedding):
     :param cache_word_reprs: 可以选择对word的表示进行cache; 设置为True的话,将在初始化的时候为每个word生成对应的embedding,
         并删除character encoder,之后将直接使用cache的embedding。默认为False。
     """
 
     def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en', layers: str = '2', requires_grad: bool = False,
                  word_dropout=0.0, dropout=0.0, cache_word_reprs: bool = False):
         super(ElmoEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
         # 根据model_dir_or_name检查是否存在并下载
         if model_dir_or_name.lower() in PRETRAINED_ELMO_MODEL_DIR:
             model_url = _get_embedding_url('elmo', model_dir_or_name.lower())
@@ -64,7 +71,7 @@ class ElmoEmbedding(ContextualEmbedding):
         else:
             raise ValueError(f"Cannot recognize {model_dir_or_name}.")
         self.model = _ElmoModel(model_dir, vocab, cache_word_reprs=cache_word_reprs)
         if layers == 'mix':
             self.layer_weights = nn.Parameter(torch.zeros(self.model.config['lstm']['n_layers'] + 1),
                                               requires_grad=requires_grad)
@@ -79,16 +86,16 @@ class ElmoEmbedding(ContextualEmbedding):
             self.layers = layers
             self._get_outputs = self._get_layer_outputs
             self._embed_size = len(self.layers) * self.model.config['lstm']['projection_dim'] * 2
         self.requires_grad = requires_grad
 
     def _get_mixed_outputs(self, outputs):
         # outputs: num_layers x batch_size x max_len x hidden_size
         # return: batch_size x max_len x hidden_size
         weights = F.softmax(self.layer_weights + 1 / len(outputs), dim=0).to(outputs)
         outputs = torch.einsum('l,lbij->bij', weights, outputs)
         return self.gamma.to(outputs) * outputs
 
     def set_mix_weights_requires_grad(self, flag=True):
         """
         当初始化ElmoEmbedding时layers被设置为mix时,可以通过调用该方法设置mix weights是否可训练。如果layers不是mix,调用
@@ -100,15 +107,15 @@ class ElmoEmbedding(ContextualEmbedding):
         if hasattr(self, 'layer_weights'):
             self.layer_weights.requires_grad = flag
             self.gamma.requires_grad = flag
 
     def _get_layer_outputs(self, outputs):
         if len(self.layers) == 1:
             outputs = outputs[self.layers[0]]
         else:
             outputs = torch.cat(tuple([*outputs[self.layers]]), dim=-1)
         return outputs
 
     def forward(self, words: torch.LongTensor):
         """
         计算words的elmo embedding表示。根据elmo文章中介绍的ELMO实际上是有2L+1层结果,但是为了让结果比较容易拆分,token的
@@ -125,12 +132,12 @@ class ElmoEmbedding(ContextualEmbedding):
         outputs = self.model(words)
         outputs = self._get_outputs(outputs)
         return self.dropout(outputs)
 
     def _delete_model_weights(self):
         for name in ['layers', 'model', 'layer_weights', 'gamma']:
             if hasattr(self, name):
                 delattr(self, name)
 
     @property
     def requires_grad(self):
         """
@@ -144,7 +151,7 @@ class ElmoEmbedding(ContextualEmbedding):
             return requires_grads.pop()
         else:
             return None
 
     @requires_grad.setter
     def requires_grad(self, value):
         for name, param in self.named_parameters():
@@ -162,7 +169,7 @@ class _ElmoModel(nn.Module):
     (4) 设计一个保存token的embedding,允许缓存word的表示。
 
     """
 
     def __init__(self, model_dir: str, vocab: Vocabulary = None, cache_word_reprs: bool = False):
         super(_ElmoModel, self).__init__()
         self.model_dir = model_dir
@@ -187,14 +194,14 @@ class _ElmoModel(nn.Module):
             config = json.load(config_f)
         self.weight_file = os.path.join(model_dir, weight_file)
         self.config = config
         OOV_TAG = '<oov>'
         PAD_TAG = '<pad>'
         BOS_TAG = '<bos>'
         EOS_TAG = '<eos>'
         BOW_TAG = '<bow>'
         EOW_TAG = '<eow>'
         # For the model trained with character-based word encoder.
         char_lexicon = {}
         with codecs.open(os.path.join(model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi:
@@ -204,29 +211,29 @@ class _ElmoModel(nn.Module):
                     tokens.insert(0, '\u3000')
                 token, i = tokens
                 char_lexicon[token] = int(i)
         # 做一些sanity check
         for special_word in [PAD_TAG, OOV_TAG, BOW_TAG, EOW_TAG]:
             assert special_word in char_lexicon, f"{special_word} not found in char.dic."
         # 从vocab中构建char_vocab
         char_vocab = Vocabulary(unknown=OOV_TAG, padding=PAD_TAG)
         # 需要保证<bow>与<eow>在里面
         char_vocab.add_word_lst([BOW_TAG, EOW_TAG, BOS_TAG, EOS_TAG])
         for word, index in vocab:
             char_vocab.add_word_lst(list(word))
         self.bos_index, self.eos_index, self._pad_index = len(vocab), len(vocab) + 1, vocab.padding_idx
         # 根据char_lexicon调整, 多设置一位,是预留给word padding的(该位置的char表示为全0表示)
         char_emb_layer = nn.Embedding(len(char_vocab) + 1, int(config['char_cnn']['embedding']['dim']),
                                       padding_idx=len(char_vocab))
         # 读入预训练权重 这里的elmo_model 包含char_cnn和 lstm 的 state_dict
         elmo_model = torch.load(os.path.join(self.model_dir, weight_file), map_location='cpu')
         char_embed_weights = elmo_model["char_cnn"]['char_emb_layer.weight']
         found_char_count = 0
         for char, index in char_vocab: # 调整character embedding
             if char in char_lexicon:
@@ -235,11 +242,11 @@ class _ElmoModel(nn.Module):
             else:
                 index_in_pre = char_lexicon[OOV_TAG]
             char_emb_layer.weight.data[index] = char_embed_weights[index_in_pre]
         print(f"{found_char_count} out of {len(char_vocab)} characters were found in pretrained elmo embedding.")
         # 生成words到chars的映射
         max_chars = config['char_cnn']['max_characters_per_token']
         self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab) + 2, max_chars),
                                                                 fill_value=len(char_vocab),
                                                                 dtype=torch.long),
@@ -258,20 +265,20 @@ class _ElmoModel(nn.Module):
                                                  char_vocab.to_index(EOW_TAG)]
                 char_ids += [char_vocab.to_index(PAD_TAG)] * (max_chars - len(char_ids))
                 self.words_to_chars_embedding[index] = torch.LongTensor(char_ids)
         self.char_vocab = char_vocab
         self.token_embedder = ConvTokenEmbedder(
             config, self.weight_file, None, char_emb_layer)
         elmo_model["char_cnn"]['char_emb_layer.weight'] = char_emb_layer.weight
         self.token_embedder.load_state_dict(elmo_model["char_cnn"])
         self.output_dim = config['lstm']['projection_dim']
         # lstm encoder
         self.encoder = ElmobiLm(config)
         self.encoder.load_state_dict(elmo_model["lstm"])
         if cache_word_reprs:
             if config['char_cnn']['embedding']['dim'] > 0: # 只有在使用了chars的情况下有用
                 print("Start to generate cache word representations.")
@@ -280,7 +287,7 @@ class _ElmoModel(nn.Module):
                 word_size = self.words_to_chars_embedding.size(0)
                 num_batches = word_size // batch_size + \
                               int(word_size % batch_size != 0)
                 self.cached_word_embedding = nn.Embedding(word_size,
                                                           config['lstm']['projection_dim'])
                 with torch.no_grad():
@@ -291,12 +298,12 @@ class _ElmoModel(nn.Module):
                         word_reprs = self.token_embedder(words.unsqueeze(1),
                                                          chars).detach() # batch_size x 1 x config['encoder']['projection_dim']
                         self.cached_word_embedding.weight.data[words] = word_reprs.squeeze(1)
                 print("Finish generating cached word representations. Going to delete the character encoder.")
                 del self.token_embedder, self.words_to_chars_embedding
             else:
                 print("There is no need to cache word representations, since no character information is used.")
 
     def forward(self, words):
         """
 
@@ -321,7 +328,7 @@ class _ElmoModel(nn.Module):
         else:
             chars = None
         token_embedding = self.token_embedder(expanded_words, chars) # batch_size x max_len x embed_dim
         encoder_output = self.encoder(token_embedding, seq_len)
         if encoder_output.size(2) < max_len + 2:
             num_layers, _, output_len, hidden_size = encoder_output.size()
@@ -332,7 +339,7 @@ class _ElmoModel(nn.Module):
             token_embedding = token_embedding.masked_fill(mask, 0)
             token_embedding = torch.cat((token_embedding, token_embedding), dim=2).view(1, sz[1], sz[2], sz[3])
             encoder_output = torch.cat((token_embedding, encoder_output), dim=0)
         # 删除<eos>, <bos>. 这里没有精确地删除,但应该也不会影响最后的结果了。
         encoder_output = encoder_output[:, :, 1:-1]
         return encoder_output

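When layers='mix', ElmoEmbedding learns one softmax-normalized weight per ELMo layer plus a global gamma, as in _get_mixed_outputs above. A standalone sketch of that weighted sum (the tensor shapes follow the comments in the diff; the concrete sizes here are made up for illustration):

import torch
import torch.nn.functional as F

num_layers, batch_size, max_len, hidden = 3, 2, 5, 8
outputs = torch.randn(num_layers, batch_size, max_len, hidden)
layer_weights = torch.zeros(num_layers, requires_grad=True)  # learned, initialized uniform
gamma = torch.ones(1, requires_grad=True)

weights = F.softmax(layer_weights + 1 / num_layers, dim=0)
mixed = torch.einsum('l,lbij->bij', weights, outputs)  # batch_size x max_len x hidden
mixed = gamma * mixed
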
fastNLP/embeddings/embedding.py (+30, -26)

@@ -3,6 +3,10 @@


""" """


__all__ = [
"Embedding",
"TokenEmbedding"
]


import torch.nn as nn import torch.nn as nn
from abc import abstractmethod from abc import abstractmethod
@@ -33,11 +37,11 @@ class Embedding(nn.Module):
:param float dropout: 对Embedding的输出的dropout。 :param float dropout: 对Embedding的输出的dropout。
:param int unk_index: drop word时替换为的index。fastNLP的Vocabulary的unk_index默认为1。 :param int unk_index: drop word时替换为的index。fastNLP的Vocabulary的unk_index默认为1。
""" """
def __init__(self, init_embed, word_dropout=0, dropout=0.0, unk_index=None): def __init__(self, init_embed, word_dropout=0, dropout=0.0, unk_index=None):
super(Embedding, self).__init__() super(Embedding, self).__init__()
self.embed = get_embeddings(init_embed) self.embed = get_embeddings(init_embed)
self.dropout = nn.Dropout(dropout) self.dropout = nn.Dropout(dropout)
@@ -48,44 +52,44 @@ class Embedding(nn.Module):
self._embed_size = self.embed.embedding_dim self._embed_size = self.embed.embedding_dim
else: else:
self._embed_size = self.embed.weight.size(1) self._embed_size = self.embed.weight.size(1)
if word_dropout>0 and not isinstance(unk_index, int):
if word_dropout > 0 and not isinstance(unk_index, int):
raise ValueError("When drop word is set, you need to pass in the unk_index.") raise ValueError("When drop word is set, you need to pass in the unk_index.")
else: else:
self._embed_size = self.embed.embed_size self._embed_size = self.embed.embed_size
unk_index = self.embed.get_word_vocab().unknown_idx unk_index = self.embed.get_word_vocab().unknown_idx
self.unk_index = unk_index self.unk_index = unk_index
self.word_dropout = word_dropout self.word_dropout = word_dropout
def forward(self, words): def forward(self, words):
""" """
:param torch.LongTensor words: [batch, seq_len] :param torch.LongTensor words: [batch, seq_len]
:return: torch.Tensor : [batch, seq_len, embed_dim] :return: torch.Tensor : [batch, seq_len, embed_dim]
""" """
if self.word_dropout>0 and self.training:
if self.word_dropout > 0 and self.training:
mask = torch.ones_like(words).float() * self.word_dropout mask = torch.ones_like(words).float() * self.word_dropout
mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1
words = words.masked_fill(mask, self.unk_index) words = words.masked_fill(mask, self.unk_index)
words = self.embed(words) words = self.embed(words)
return self.dropout(words) return self.dropout(words)
@property @property
def num_embedding(self)->int:
def num_embedding(self) -> int:
if isinstance(self.embed, nn.Embedding): if isinstance(self.embed, nn.Embedding):
return self.embed.weight.size(0) return self.embed.weight.size(0)
else: else:
return self.embed.num_embedding return self.embed.num_embedding
def __len__(self): def __len__(self):
return len(self.embed) return len(self.embed)
@property @property
def embed_size(self) -> int: def embed_size(self) -> int:
return self._embed_size return self._embed_size
@property @property
def embedding_dim(self) -> int: def embedding_dim(self) -> int:
return self._embed_size return self._embed_size
@property @property
def requires_grad(self): def requires_grad(self):
""" """
@@ -96,14 +100,14 @@ class Embedding(nn.Module):
return self.embed.weight.requires_grad
else:
return self.embed.requires_grad
@requires_grad.setter
def requires_grad(self, value):
if not isinstance(self.embed, TokenEmbedding):
self.embed.weight.requires_grad = value
else:
self.embed.requires_grad = value
@property
def size(self):
if isinstance(self.embed, TokenEmbedding):
@@ -120,12 +124,12 @@ class TokenEmbedding(nn.Module):
assert vocab.padding is not None, "Vocabulary must have a padding entry."
self._word_vocab = vocab
self._word_pad_index = vocab.padding_idx
if word_dropout>0:
if word_dropout > 0:
assert vocab.unknown is not None, "Vocabulary must have unknown entry when you want to drop a word."
self.word_dropout = word_dropout
self._word_unk_index = vocab.unknown_idx
self.dropout_layer = nn.Dropout(dropout)
def drop_word(self, words):
"""
Randomly set words to unknown_index according to the configured probability.
@@ -138,7 +142,7 @@ class TokenEmbedding(nn.Module):
mask = torch.bernoulli(mask).eq(1)  # the larger dropout_word is, the more positions are set to 1
words = words.masked_fill(mask, self._word_unk_index)
return words
def dropout(self, words):
"""
Apply dropout to the embedded word representations.
@@ -147,7 +151,7 @@ class TokenEmbedding(nn.Module):
:return:
"""
return self.dropout_layer(words)
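The Bernoulli trick used by drop_word can be reproduced in isolation; the snippet below mirrors those three lines outside of the class so the effect is easy to inspect (the vocabulary size, unknown index and dropout rate are made up for the demonstration).

```python
import torch

torch.manual_seed(0)
word_dropout, unk_index = 0.3, 1
words = torch.randint(2, 1000, (2, 8))            # [batch, seq_len] word indices

mask = torch.ones_like(words).float() * word_dropout
mask = torch.bernoulli(mask).eq(1)                # True with probability word_dropout
dropped = words.masked_fill(mask, unk_index)      # masked positions now point at <unk>

print(words)
print(dropped)                                    # roughly 30% of the entries are now 1
```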
@property
def requires_grad(self):
"""
@@ -159,23 +163,23 @@ class TokenEmbedding(nn.Module):
return requires_grads.pop()
else:
return None
@requires_grad.setter
def requires_grad(self, value):
for param in self.parameters():
param.requires_grad = value
def __len__(self):
return len(self._word_vocab)
@property
def embed_size(self) -> int:
return self._embed_size
@property
def embedding_dim(self) -> int:
return self._embed_size
@property
def num_embedding(self) -> int:
"""
@@ -183,7 +187,7 @@ class TokenEmbedding(nn.Module):
:return:
"""
return len(self._word_vocab)
def get_word_vocab(self):
"""
Return the vocabulary used by this embedding.
@@ -191,11 +195,11 @@ class TokenEmbedding(nn.Module):
:return: Vocabulary
"""
return self._word_vocab
@property
def size(self):
return torch.Size(self.num_embedding, self._embed_size)
@abstractmethod
def forward(self, words):
raise NotImplementedError
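Because forward is abstract here, every concrete TokenEmbedding is expected to call drop_word on the incoming indices and dropout on the vectors it produces. The minimal subclass below is a hedged sketch of that contract, not part of fastNLP itself; it simply backs the vocabulary with a randomly initialised nn.Embedding, and it assumes the TokenEmbedding constructor takes (vocab, word_dropout, dropout) as used elsewhere in this commit.

```python
import torch
import torch.nn as nn
from fastNLP import Vocabulary
from fastNLP.embeddings import TokenEmbedding


class ToyTokenEmbedding(TokenEmbedding):
    """Illustration only: index words with a plain, randomly initialised nn.Embedding."""
    def __init__(self, vocab: Vocabulary, embed_dim=16, word_dropout=0, dropout=0):
        super().__init__(vocab, word_dropout=word_dropout, dropout=dropout)
        self._embed_size = embed_dim
        self.embed_layer = nn.Embedding(len(vocab), embed_dim, padding_idx=vocab.padding_idx)

    def forward(self, words):
        words = self.drop_word(words)      # randomly remap indices to <unk> while training
        vectors = self.embed_layer(words)
        return self.dropout(vectors)       # dropout on the embedded representation


vocab = Vocabulary()
vocab.add_word_lst("the quick brown fox".split())
embed = ToyTokenEmbedding(vocab, embed_dim=8, word_dropout=0.1, dropout=0.1)
words = torch.LongTensor([[vocab.to_index("fox"), vocab.to_index("the")]])
print(embed(words).shape)                  # torch.Size([1, 2, 8])
```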

+ 17
- 7
fastNLP/embeddings/stack_embedding.py View File

@@ -1,3 +1,12 @@
"""
.. todo::
doc
"""

__all__ = [
"StackEmbedding",
]

from typing import List


import torch
@@ -26,6 +35,7 @@ class StackEmbedding(TokenEmbedding):
:param float dropout: the probability with which dropout is applied to the embedding representation; 0.1 means 10% of the values are randomly set to 0.


""" """
def __init__(self, embeds: List[TokenEmbedding], word_dropout=0, dropout=0): def __init__(self, embeds: List[TokenEmbedding], word_dropout=0, dropout=0):
vocabs = [] vocabs = []
for embed in embeds: for embed in embeds:
@@ -34,14 +44,14 @@ class StackEmbedding(TokenEmbedding):
_vocab = vocabs[0]
for vocab in vocabs[1:]:
assert vocab == _vocab, "All embeddings in StackEmbedding should use the same word vocabulary."
super(StackEmbedding, self).__init__(_vocab, word_dropout=word_dropout, dropout=dropout)
assert isinstance(embeds, list)
for embed in embeds:
assert isinstance(embed, TokenEmbedding), "Only TokenEmbedding type is supported."
self.embeds = nn.ModuleList(embeds)
self._embed_size = sum([embed.embed_size for embed in self.embeds])
def append(self, embed: TokenEmbedding):
"""
Append an embedding at the end.
@@ -50,18 +60,18 @@ class StackEmbedding(TokenEmbedding):
""" """
assert isinstance(embed, TokenEmbedding) assert isinstance(embed, TokenEmbedding)
self.embeds.append(embed) self.embeds.append(embed)
def pop(self): def pop(self):
""" """
弹出最后一个embed 弹出最后一个embed
:return: :return:
""" """
return self.embeds.pop() return self.embeds.pop()
@property @property
def embed_size(self): def embed_size(self):
return self._embed_size return self._embed_size
@property @property
def requires_grad(self): def requires_grad(self):
""" """
@@ -73,12 +83,12 @@ class StackEmbedding(TokenEmbedding):
return requires_grads.pop()
else:
return None
@requires_grad.setter
def requires_grad(self, value):
for embed in self.embeds():
embed.requires_grad = value
def forward(self, words):
"""
Get the results of the individual embeddings and concatenate them in order.
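As a usage sketch (toy vocabulary and dimensions, nothing downloaded), two embeddings built over the same Vocabulary can be concatenated by StackEmbedding; embed_size is then the sum of the parts. Randomly initialised StaticEmbeddings are used so the example stays self-contained.

```python
import torch
from fastNLP import Vocabulary
from fastNLP.embeddings import StaticEmbedding, StackEmbedding

vocab = Vocabulary()
vocab.add_word_lst("this is a tiny example vocabulary".split())

# embedding_dim > 0 with no model name yields randomly initialised vectors (see static_embedding.py above)
embed_a = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=30)
embed_b = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=20)

stack = StackEmbedding([embed_a, embed_b], word_dropout=0, dropout=0.1)
print(stack.embed_size)          # 50 = 30 + 20

words = torch.LongTensor([[vocab.to_index(w) for w in "this is a example".split()]])
print(stack(words).shape)        # torch.Size([1, 4, 50])
```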


+ 35
- 26
fastNLP/embeddings/static_embedding.py View File

@@ -1,4 +1,11 @@
"""
.. todo::
doc
"""


__all__ = [
"StaticEmbedding"
]
import os


import torch
@@ -13,6 +20,7 @@ from ..modules.utils import _get_file_name_base_on_postfix
from copy import deepcopy
from collections import defaultdict



class StaticEmbedding(TokenEmbedding):
"""
Alias: :class:`fastNLP.embeddings.StaticEmbedding` :class:`fastNLP.embeddings.static_embedding.StaticEmbedding`
@@ -55,15 +63,16 @@ class StaticEmbedding(TokenEmbedding):
:param bool normalize: whether to normalize each vector so that its norm is 1.
:param int min_freq: words whose frequency in the Vocabulary is lower than this value are mapped to unk.
"""
def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', embedding_dim=-1, requires_grad: bool=True,
def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en', embedding_dim=-1, requires_grad: bool = True,
init_method=None, lower=False, dropout=0, word_dropout=0, normalize=False, min_freq=1, **kwargs):
super(StaticEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
if embedding_dim>0:
if embedding_dim > 0:
model_dir_or_name = None
# determine the cache_path
if model_dir_or_name is None:
assert embedding_dim>=1, "The dimension of embedding should be larger than 1."
assert embedding_dim >= 1, "The dimension of embedding should be larger than 1."
embedding_dim = int(embedding_dim)
model_path = None
elif model_dir_or_name.lower() in PRETRAIN_STATIC_FILES:
@@ -76,9 +85,9 @@ class StaticEmbedding(TokenEmbedding):
model_path = _get_file_name_base_on_postfix(os.path.abspath(os.path.expanduser(model_dir_or_name)), '.txt')
else:
raise ValueError(f"Cannot recognize {model_dir_or_name}.")
# shrink the vocab according to min_freq
truncate_vocab = (vocab.min_freq is None and min_freq>1) or (vocab.min_freq and vocab.min_freq<min_freq)
truncate_vocab = (vocab.min_freq is None and min_freq > 1) or (vocab.min_freq and vocab.min_freq < min_freq)
if truncate_vocab:
truncated_vocab = deepcopy(vocab)
truncated_vocab.min_freq = min_freq
@@ -89,14 +98,14 @@ class StaticEmbedding(TokenEmbedding):
lowered_word_count[word.lower()] += count
for word in truncated_vocab.word_count.keys():
word_count = truncated_vocab.word_count[word]
if lowered_word_count[word.lower()]>=min_freq and word_count<min_freq:
truncated_vocab.add_word_lst([word]*(min_freq-word_count),
if lowered_word_count[word.lower()] >= min_freq and word_count < min_freq:
truncated_vocab.add_word_lst([word] * (min_freq - word_count),
no_create_entry=truncated_vocab._is_word_no_create_entry(word))
# apply the min_freq filter only to words that appear in the training data
if kwargs.get('only_train_min_freq', False) and model_dir_or_name is not None:
for word in truncated_vocab.word_count.keys():
if truncated_vocab._is_word_no_create_entry(word) and truncated_vocab.word_count[word]<min_freq:
if truncated_vocab._is_word_no_create_entry(word) and truncated_vocab.word_count[word] < min_freq:
truncated_vocab.add_word_lst([word] * (min_freq - truncated_vocab.word_count[word]),
no_create_entry=True)
truncated_vocab.build_vocab()
@@ -105,7 +114,7 @@ class StaticEmbedding(TokenEmbedding):
truncated_words_to_words[index] = truncated_vocab.to_index(word)
print(f"{len(vocab) - len(truncated_vocab)} out of {len(vocab)} words have frequency less than {min_freq}.")
vocab = truncated_vocab
self.only_norm_found_vector = kwargs.get('only_norm_found_vector', False)
# load the embedding
if lower:
@@ -145,21 +154,21 @@ class StaticEmbedding(TokenEmbedding):
self.words_to_words = nn.Parameter(torch.arange(len(vocab)).long(), requires_grad=False)
if not self.only_norm_found_vector and normalize:
embedding /= (torch.norm(embedding, dim=1, keepdim=True) + 1e-12)
if truncate_vocab:
for i in range(len(truncated_words_to_words)):
index_in_truncated_vocab = truncated_words_to_words[i]
truncated_words_to_words[i] = self.words_to_words[index_in_truncated_vocab]
del self.words_to_words
self.words_to_words = nn.Parameter(truncated_words_to_words, requires_grad=False)
self.embedding = nn.Embedding(num_embeddings=embedding.shape[0], embedding_dim=embedding.shape[1],
padding_idx=vocab.padding_idx,
max_norm=None, norm_type=2, scale_grad_by_freq=False,
sparse=False, _weight=embedding)
self._embed_size = self.embedding.weight.size(1)
self.requires_grad = requires_grad
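Pulling the constructor logic above together, the hedged example below builds a StaticEmbedding over a toy word list with embedding_dim > 0 (so no pretrained file is read) and min_freq=2, so that words seen only once are redirected to <unk> through the words_to_words remapping. All names and sizes are invented for illustration.

```python
import torch
from fastNLP import Vocabulary
from fastNLP.embeddings import StaticEmbedding

corpus = "the cat sat on the mat the cat slept".split()
vocab = Vocabulary()
vocab.add_word_lst(corpus)

# Randomly initialised 25-dimensional vectors; words with frequency < 2 fall back to <unk>.
embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=25, min_freq=2)

words = torch.LongTensor([[vocab.to_index("cat"), vocab.to_index("slept")]])
print(embed(words).shape)        # torch.Size([1, 2, 25])
print(embed.requires_grad)       # True by default: the embedding matrix is trainable
```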
def _randomly_init_embed(self, num_embedding, embedding_dim, init_embed=None):
"""


@@ -169,14 +178,14 @@ class StaticEmbedding(TokenEmbedding):
:return: torch.FloatTensor
"""
embed = torch.zeros(num_embedding, embedding_dim)
if init_embed is None:
nn.init.uniform_(embed, -np.sqrt(3/embedding_dim), np.sqrt(3/embedding_dim))
nn.init.uniform_(embed, -np.sqrt(3 / embedding_dim), np.sqrt(3 / embedding_dim))
else:
init_embed(embed)
return embed
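The ±sqrt(3/embedding_dim) range used by the random initialisation above keeps each component's variance at 1/embedding_dim (a uniform variable on [-a, a] has variance a²/3), so the expected squared norm of every vector is about 1. A quick numerical check, with arbitrary sizes:

```python
import numpy as np
import torch
import torch.nn as nn

num_embedding, embedding_dim = 10000, 300
embed = torch.zeros(num_embedding, embedding_dim)
nn.init.uniform_(embed, -np.sqrt(3 / embedding_dim), np.sqrt(3 / embedding_dim))

print(embed.var().item())                 # ~ 1/300 ≈ 0.0033 per component
print(embed.norm(dim=1).mean().item())    # ~ 1.0 average vector norm
```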
@property
def requires_grad(self):
"""
@@ -190,14 +199,14 @@ class StaticEmbedding(TokenEmbedding):
return requires_grads.pop()
else:
return None
@requires_grad.setter
def requires_grad(self, value):
for name, param in self.named_parameters():
if 'words_to_words' in name:
continue
param.requires_grad = value
def _load_with_vocab(self, embed_filepath, vocab, dtype=np.float32, padding='<pad>', unknown='<unk>',
error='ignore', init_method=None):
"""
@@ -250,7 +259,7 @@ class StaticEmbedding(TokenEmbedding):
index = vocab.to_index(word)
matrix[index] = torch.from_numpy(np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim))
if self.only_norm_found_vector:
matrix[index] = matrix[index]/np.linalg.norm(matrix[index])
matrix[index] = matrix[index] / np.linalg.norm(matrix[index])
found_count += 1
except Exception as e:
if error == 'ignore':
@@ -267,22 +276,22 @@ class StaticEmbedding(TokenEmbedding):
matrix[index] = None
# the words left in matrix are those for which an entry needs to be created
vectors = self._randomly_init_embed(len(matrix), dim, init_method)
if vocab.unknown is None:  # create a dedicated unknown entry
unknown_idx = len(matrix)
vectors = torch.cat((vectors, torch.zeros(1, dim)), dim=0).contiguous()
else:
unknown_idx = vocab.unknown_idx
self.words_to_words = nn.Parameter(torch.full((len(vocab), ), fill_value=unknown_idx).long(),
self.words_to_words = nn.Parameter(torch.full((len(vocab),), fill_value=unknown_idx).long(),
requires_grad=False)
for index, (index_in_vocab, vec) in enumerate(matrix.items()):
if vec is not None:
vectors[index] = vec
self.words_to_words[index_in_vocab] = index
return vectors
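The loading loop above boils down to: parse each line of a word2vec/GloVe-style text file, keep only words present in the vocabulary, and later hand the misses to _randomly_init_embed. A simplified stand-alone sketch of that parsing step (not the fastNLP method itself, and with a toy in-memory file) looks like this:

```python
import io
import numpy as np
import torch

pretrained_txt = io.StringIO(
    "hello 0.1 0.2 0.3\n"
    "world 0.4 0.5 0.6\n"
)
word2idx = {"<pad>": 0, "<unk>": 1, "hello": 2, "nlp": 3}   # toy vocabulary
dim = 3
matrix = torch.zeros(len(word2idx), dim)

found_count = 0
for line in pretrained_txt:
    parts = line.rstrip().split(" ")
    word, nums = parts[0], parts[1:]
    if word in word2idx:
        matrix[word2idx[word]] = torch.from_numpy(
            np.fromstring(" ".join(nums), sep=" ", dtype=np.float32, count=dim))
        found_count += 1

print(f"Found {found_count} out of {len(word2idx)} words in the pretrained file.")
print(matrix)   # rows for <pad>, <unk> and "nlp" stay zero and would be randomly initialised
```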
def forward(self, words):
"""
Pass in the indices of the words.


+ 11
- 5
fastNLP/embeddings/utils.py View File

@@ -1,13 +1,19 @@
"""
.. todo::
doc
"""
import numpy as np
import torch
from torch import nn as nn


from ..core.vocabulary import Vocabulary


__all__ = ['get_embeddings']
__all__ = [
'get_embeddings'
]




def _construct_char_vocab_from_vocab(vocab:Vocabulary, min_freq:int=1):
def _construct_char_vocab_from_vocab(vocab: Vocabulary, min_freq: int = 1):
""" """
给定一个word的vocabulary生成character的vocabulary. 给定一个word的vocabulary生成character的vocabulary.
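The rest of _construct_char_vocab_from_vocab is outside this hunk. As a hedged sketch of what such a helper typically does — counting the characters of every word in the word vocabulary and keeping those that reach min_freq — one could write something like the following; apart from the idea, nothing here is taken from fastNLP's implementation:

```python
from collections import Counter

def construct_char_vocab(words, min_freq=1):
    """Count characters over a list of words and keep those seen at least min_freq times."""
    counter = Counter(ch for word in words for ch in word)
    chars = ["<pad>", "<unk>"] + sorted(ch for ch, c in counter.items() if c >= min_freq)
    return {ch: idx for idx, ch in enumerate(chars)}

char2idx = construct_char_vocab(["hello", "world", "nlp"], min_freq=1)
print(len(char2idx), char2idx["o"])
```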


@@ -36,8 +42,8 @@ def get_embeddings(init_embed):
if isinstance(init_embed, tuple):
res = nn.Embedding(
num_embeddings=init_embed[0], embedding_dim=init_embed[1])
nn.init.uniform_(res.weight.data, a=-np.sqrt(3/res.weight.data.size(1)),
b=np.sqrt(3/res.weight.data.size(1)))
nn.init.uniform_(res.weight.data, a=-np.sqrt(3 / res.weight.data.size(1)),
b=np.sqrt(3 / res.weight.data.size(1)))
elif isinstance(init_embed, nn.Module):
res = init_embed
elif isinstance(init_embed, torch.Tensor):
@@ -48,4 +54,4 @@ def get_embeddings(init_embed):
else:
raise TypeError(
'invalid init_embed type: {}'.format((type(init_embed))))
return res
return res
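For reference, a short hedged sketch of the two init_embed forms fully visible in this hunk — a (num_embeddings, embedding_dim) tuple and an existing nn.Module; a raw torch.Tensor is also accepted via the branch elided above:

```python
import torch.nn as nn
from fastNLP.embeddings import get_embeddings

# tuple -> a freshly created, uniformly initialised nn.Embedding
emb1 = get_embeddings((100, 50))
print(type(emb1).__name__, tuple(emb1.weight.shape))   # Embedding (100, 50)

# nn.Module -> returned unchanged
module = nn.Embedding(10, 5)
emb2 = get_embeddings(module)
print(emb2 is module)                                  # True
```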
