
add __doc__ & __all__ in module 'embeddings'

tags/v0.4.10
ChenXin 5 years ago
commit d6c597d32e
9 changed files with 277 additions and 213 deletions
  1. fastNLP/embeddings/__init__.py (+0, -1)
  2. fastNLP/embeddings/bert_embedding.py (+87, -71)
  3. fastNLP/embeddings/char_embedding.py (+38, -30)
  4. fastNLP/embeddings/contextual_embedding.py (+17, -12)
  5. fastNLP/embeddings/elmo_embedding.py (+42, -35)
  6. fastNLP/embeddings/embedding.py (+30, -26)
  7. fastNLP/embeddings/stack_embedding.py (+17, -7)
  8. fastNLP/embeddings/static_embedding.py (+35, -26)
  9. fastNLP/embeddings/utils.py (+11, -5)

fastNLP/embeddings/__init__.py (+0, -1)

@@ -18,7 +18,6 @@ __all__ = [
"get_embeddings",
]


from .embedding import Embedding, TokenEmbedding
from .static_embedding import StaticEmbedding
from .elmo_embedding import ElmoEmbedding
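A quick illustration (not part of the diff) of what the new package-level __all__ buys: the public embedding classes and helpers can be imported directly from fastNLP.embeddings, and star-imports stay limited to the documented API.

# Illustration only, not from the commit.
from fastNLP.embeddings import Embedding, TokenEmbedding, StaticEmbedding, ElmoEmbedding, get_embeddings

print([obj.__name__ for obj in (Embedding, TokenEmbedding, StaticEmbedding, ElmoEmbedding, get_embeddings)])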


fastNLP/embeddings/bert_embedding.py (+87, -71)

@@ -1,3 +1,12 @@
"""
.. todo::
doc
"""

__all__ = [
"BertEmbedding",
"BertWordPieceEncoder"
]

import os
import collections
@@ -13,6 +22,7 @@ from ..modules.encoder.bert import _WordPieceBertModel, BertModel, BertTokenizer
from .contextual_embedding import ContextualEmbedding
import warnings


class BertEmbedding(ContextualEmbedding):
"""
别名::class:`fastNLP.embeddings.BertEmbedding` :class:`fastNLP.embeddings.bert_embedding.BertEmbedding`
@@ -54,11 +64,12 @@ class BertEmbedding(ContextualEmbedding):
word pieces后的内容,并将第512个word piece置为[SEP]。超过长度的部分的encode结果直接全部置零。一般仅有只使用[CLS]
来进行分类的任务将auto_truncate置为True。
"""
def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en-base-uncased', layers: str='-1',
pool_method: str='first', word_dropout=0, dropout=0, include_cls_sep: bool=False,
pooled_cls=True, requires_grad: bool=False, auto_truncate:bool=False):
def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en-base-uncased', layers: str = '-1',
pool_method: str = 'first', word_dropout=0, dropout=0, include_cls_sep: bool = False,
pooled_cls=True, requires_grad: bool = False, auto_truncate: bool = False):
super(BertEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
# 根据model_dir_or_name检查是否存在并下载
if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR:
if 'cn' in model_dir_or_name.lower() and pool_method not in ('first', 'last'):
@@ -71,21 +82,21 @@ class BertEmbedding(ContextualEmbedding):
model_dir = os.path.abspath(os.path.expanduser(model_dir_or_name))
else:
raise ValueError(f"Cannot recognize {model_dir_or_name}.")
self._word_sep_index = None
if '[SEP]' in vocab:
self._word_sep_index = vocab['[SEP]']
self.model = _WordBertModel(model_dir=model_dir, vocab=vocab, layers=layers,
pool_method=pool_method, include_cls_sep=include_cls_sep,
pooled_cls=pooled_cls, auto_truncate=auto_truncate, min_freq=2)
self.requires_grad = requires_grad
self._embed_size = len(self.model.layers)*self.model.encoder.hidden_size
self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size
def _delete_model_weights(self):
del self.model
def forward(self, words):
"""
计算words的bert embedding表示。计算之前会在每句话的开始增加[CLS]在结束增加[SEP], 并根据include_cls_sep判断要不要
@@ -100,9 +111,9 @@ class BertEmbedding(ContextualEmbedding):
return self.dropout(outputs)
outputs = self.model(words)
outputs = torch.cat([*outputs], dim=-1)
return self.dropout(outputs)
def drop_word(self, words):
"""
按照设定随机将words设置为unknown_index。
@@ -120,7 +131,7 @@ class BertEmbedding(ContextualEmbedding):
if self._word_sep_index:
words.masked_fill_(sep_mask, self._word_sep_index)
return words
@property
def requires_grad(self):
"""
@@ -129,12 +140,12 @@ class BertEmbedding(ContextualEmbedding):
:return:
"""
requires_grads = set([param.requires_grad for name, param in self.named_parameters()
if 'word_pieces_lengths' not in name])
if 'word_pieces_lengths' not in name])
if len(requires_grads) == 1:
return requires_grads.pop()
else:
return None
@requires_grad.setter
def requires_grad(self, value):
for name, param in self.named_parameters():
@@ -155,10 +166,11 @@ class BertWordPieceEncoder(nn.Module):
:param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。
:param bool requires_grad: 是否需要gradient。
"""
def __init__(self, model_dir_or_name: str='en-base-uncased', layers: str='-1', pooled_cls: bool = False,
word_dropout=0, dropout=0, requires_grad: bool=False):
def __init__(self, model_dir_or_name: str = 'en-base-uncased', layers: str = '-1', pooled_cls: bool = False,
word_dropout=0, dropout=0, requires_grad: bool = False):
super().__init__()
if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR:
model_url = _get_embedding_url('bert', model_dir_or_name.lower())
model_dir = cached_path(model_url, name='embedding')
@@ -167,7 +179,7 @@ class BertWordPieceEncoder(nn.Module):
model_dir = model_dir_or_name
else:
raise ValueError(f"Cannot recognize {model_dir_or_name}.")
self.model = _WordPieceBertModel(model_dir=model_dir, layers=layers, pooled_cls=pooled_cls)
self._sep_index = self.model._sep_index
self._wordpiece_unk_index = self.model._wordpiece_unknown_index
@@ -175,7 +187,7 @@ class BertWordPieceEncoder(nn.Module):
self.requires_grad = requires_grad
self.word_dropout = word_dropout
self.dropout_layer = nn.Dropout(dropout)
@property
def requires_grad(self):
"""
@@ -187,24 +199,24 @@ class BertWordPieceEncoder(nn.Module):
return requires_grads.pop()
else:
return None
@requires_grad.setter
def requires_grad(self, value):
for name, param in self.named_parameters():
param.requires_grad = value
@property
def embed_size(self):
return self._embed_size
@property
def embedding_dim(self):
return self._embed_size
@property
def num_embedding(self):
return self.model.encoder.config.vocab_size
def index_datasets(self, *datasets, field_name, add_cls_sep=True):
"""
使用bert的tokenizer新生成word_pieces列加入到datasets中,并将他们设置为input,且将word_pieces这一列的pad value设置为了
@@ -216,7 +228,7 @@ class BertWordPieceEncoder(nn.Module):
:return:
"""
self.model.index_dataset(*datasets, field_name=field_name, add_cls_sep=add_cls_sep)
def forward(self, word_pieces, token_type_ids=None):
"""
计算words的bert embedding表示。传入的words中应该自行包含[CLS]与[SEP]的tag。
@@ -233,13 +245,13 @@ class BertWordPieceEncoder(nn.Module):
token_type_ids = sep_mask_cumsum.fmod(2)
if token_type_ids[0, 0].item(): # 如果开头是奇数,则需要flip一下结果,因为需要保证开头为0
token_type_ids = token_type_ids.eq(0).long()
word_pieces = self.drop_word(word_pieces)
outputs = self.model(word_pieces, token_type_ids)
outputs = torch.cat([*outputs], dim=-1)
return self.dropout_layer(outputs)
def drop_word(self, words):
"""
按照设定随机将words设置为unknown_index。
@@ -260,10 +272,10 @@ class BertWordPieceEncoder(nn.Module):


class _WordBertModel(nn.Module):
def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1', pool_method:str='first',
include_cls_sep:bool=False, pooled_cls:bool=False, auto_truncate:bool=False, min_freq=2):
def __init__(self, model_dir: str, vocab: Vocabulary, layers: str = '-1', pool_method: str = 'first',
include_cls_sep: bool = False, pooled_cls: bool = False, auto_truncate: bool = False, min_freq=2):
super().__init__()
self.tokenzier = BertTokenizer.from_pretrained(model_dir)
self.encoder = BertModel.from_pretrained(model_dir)
self._max_position_embeddings = self.encoder.config.max_position_embeddings
@@ -271,23 +283,23 @@ class _WordBertModel(nn.Module):
encoder_layer_number = len(self.encoder.encoder.layer)
self.layers = list(map(int, layers.split(',')))
for layer in self.layers:
if layer<0:
assert -layer<=encoder_layer_number, f"The layer index:{layer} is out of scope for " \
f"a bert model with {encoder_layer_number} layers."
if layer < 0:
assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
f"a bert model with {encoder_layer_number} layers."
else:
assert layer<encoder_layer_number, f"The layer index:{layer} is out of scope for " \
f"a bert model with {encoder_layer_number} layers."
assert layer < encoder_layer_number, f"The layer index:{layer} is out of scope for " \
f"a bert model with {encoder_layer_number} layers."
assert pool_method in ('avg', 'max', 'first', 'last')
self.pool_method = pool_method
self.include_cls_sep = include_cls_sep
self.pooled_cls = pooled_cls
self.auto_truncate = auto_truncate
# 将所有vocab中word的wordpiece计算出来, 需要额外考虑[CLS]和[SEP]
print("Start to generating word pieces for word.")
# 第一步统计出需要的word_piece, 然后创建新的embed和word_piece_vocab, 然后填入值
word_piece_dict = {'[CLS]':1, '[SEP]':1} # 用到的word_piece以及新增的
word_piece_dict = {'[CLS]': 1, '[SEP]': 1} # 用到的word_piece以及新增的
found_count = 0
self._has_sep_in_vocab = '[SEP]' in vocab # 用来判断传入的数据是否需要生成token_ids
if '[sep]' in vocab:
@@ -302,10 +314,11 @@ class _WordBertModel(nn.Module):
elif index == vocab.unknown_idx:
word = '[UNK]'
word_pieces = self.tokenzier.wordpiece_tokenizer.tokenize(word)
if len(word_pieces)==1:
if len(word_pieces) == 1:
if not vocab._is_word_no_create_entry(word): # 如果是train中的值, 但是却没有找到
if index!=vocab.unknown_idx and word_pieces[0]=='[UNK]': # 说明这个词不在原始的word里面
if vocab.word_count[word]>=min_freq and not vocab._is_word_no_create_entry(word): #出现次数大于这个次数才新增
if index != vocab.unknown_idx and word_pieces[0] == '[UNK]': # 说明这个词不在原始的word里面
if vocab.word_count[word] >= min_freq and not vocab._is_word_no_create_entry(
word): # 出现次数大于这个次数才新增
word_piece_dict[word] = 1 # 新增一个值
continue
for word_piece in word_pieces:
@@ -327,7 +340,7 @@ class _WordBertModel(nn.Module):
new_word_piece_vocab[token] = len(new_word_piece_vocab)
self.tokenzier._reinit_on_new_vocab(new_word_piece_vocab)
self.encoder.embeddings.word_embeddings = embed
word_to_wordpieces = []
word_pieces_lengths = []
for word, index in vocab:
@@ -347,7 +360,7 @@ class _WordBertModel(nn.Module):
self.word_to_wordpieces = np.array(word_to_wordpieces)
self.word_pieces_lengths = nn.Parameter(torch.LongTensor(word_pieces_lengths), requires_grad=False)
print("Successfully generate word pieces.")
def forward(self, words):
"""

@@ -358,34 +371,37 @@ class _WordBertModel(nn.Module):
batch_size, max_word_len = words.size()
word_mask = words.ne(self._word_pad_index) # 为1的地方有word
seq_len = word_mask.sum(dim=-1)
batch_word_pieces_length = self.word_pieces_lengths[words].masked_fill(word_mask.eq(0), 0) # batch_size x max_len
batch_word_pieces_length = self.word_pieces_lengths[words].masked_fill(word_mask.eq(0),
0) # batch_size x max_len
word_pieces_lengths = batch_word_pieces_length.sum(dim=-1) # batch_size
word_piece_length = batch_word_pieces_length.sum(dim=-1).max().item() # 表示word piece的长度(包括padding)
if word_piece_length+2>self._max_position_embeddings:
if word_piece_length + 2 > self._max_position_embeddings:
if self.auto_truncate:
word_pieces_lengths = word_pieces_lengths.masked_fill(word_pieces_lengths+2>self._max_position_embeddings,
self._max_position_embeddings-2)
word_pieces_lengths = word_pieces_lengths.masked_fill(
word_pieces_lengths + 2 > self._max_position_embeddings,
self._max_position_embeddings - 2)
else:
raise RuntimeError("After split words into word pieces, the lengths of word pieces are longer than the "
f"maximum allowed sequence length:{self._max_position_embeddings} of bert.")

raise RuntimeError(
"After split words into word pieces, the lengths of word pieces are longer than the "
f"maximum allowed sequence length:{self._max_position_embeddings} of bert.")
# +2是由于需要加入[CLS]与[SEP]
word_pieces = words.new_full((batch_size, min(word_piece_length+2, self._max_position_embeddings)),
word_pieces = words.new_full((batch_size, min(word_piece_length + 2, self._max_position_embeddings)),
fill_value=self._wordpiece_pad_index)
attn_masks = torch.zeros_like(word_pieces)
# 1. 获取words的word_pieces的id,以及对应的span范围
word_indexes = words.cpu().numpy()
for i in range(batch_size):
word_pieces_i = list(chain(*self.word_to_wordpieces[word_indexes[i, :seq_len[i]]]))
if self.auto_truncate and len(word_pieces_i)>self._max_position_embeddings-2:
word_pieces_i = word_pieces_i[:self._max_position_embeddings-2]
word_pieces[i, 1:word_pieces_lengths[i]+1] = torch.LongTensor(word_pieces_i)
attn_masks[i, :word_pieces_lengths[i]+2].fill_(1)
if self.auto_truncate and len(word_pieces_i) > self._max_position_embeddings - 2:
word_pieces_i = word_pieces_i[:self._max_position_embeddings - 2]
word_pieces[i, 1:word_pieces_lengths[i] + 1] = torch.LongTensor(word_pieces_i)
attn_masks[i, :word_pieces_lengths[i] + 2].fill_(1)
# 添加[cls]和[sep]
word_pieces[:, 0].fill_(self._cls_index)
batch_indexes = torch.arange(batch_size).to(words)
word_pieces[batch_indexes, word_pieces_lengths+1] = self._sep_index
if self._has_sep_in_vocab: #但[SEP]在vocab中出现应该才会需要token_ids
word_pieces[batch_indexes, word_pieces_lengths + 1] = self._sep_index
if self._has_sep_in_vocab: # 但[SEP]在vocab中出现应该才会需要token_ids
sep_mask = word_pieces.eq(self._sep_index) # batch_size x max_len
sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1])
token_type_ids = sep_mask_cumsum.fmod(2)
@@ -396,9 +412,9 @@ class _WordBertModel(nn.Module):
# 2. 获取hidden的结果,根据word_pieces进行对应的pool计算
# all_outputs: [batch_size x max_len x hidden_size, batch_size x max_len x hidden_size, ...]
bert_outputs, pooled_cls = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks,
output_all_encoded_layers=True)
output_all_encoded_layers=True)
# output_layers = [self.layers] # len(self.layers) x batch_size x real_word_piece_length x hidden_size
if self.include_cls_sep:
outputs = bert_outputs[-1].new_zeros(len(self.layers), batch_size, max_word_len + 2,
bert_outputs[-1].size(-1))
@@ -414,7 +430,7 @@ class _WordBertModel(nn.Module):
real_word_piece_length = output_layer.size(1) - 2
if word_piece_length > real_word_piece_length: # 如果实际上是截取出来的
paddings = output_layer.new_zeros(batch_size,
word_piece_length-real_word_piece_length,
word_piece_length - real_word_piece_length,
output_layer.size(2))
output_layer = torch.cat((output_layer, paddings), dim=1).contiguous()
# 从word_piece collapse到word的表示
@@ -423,27 +439,27 @@ class _WordBertModel(nn.Module):
if self.pool_method == 'first':
for i in range(batch_size):
i_word_pieces_cum_length = batch_word_pieces_cum_length[i, :seq_len[i]] # 每个word的start位置
outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length] # num_layer x batch_size x len x hidden_size
outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[
i, i_word_pieces_cum_length] # num_layer x batch_size x len x hidden_size
elif self.pool_method == 'last':
for i in range(batch_size):
i_word_pieces_cum_length = batch_word_pieces_cum_length[i, 1:seq_len[i]+1] - 1 # 每个word的end
i_word_pieces_cum_length = batch_word_pieces_cum_length[i, 1:seq_len[i] + 1] - 1 # 每个word的end
outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length]
elif self.pool_method == 'max':
for i in range(batch_size):
for j in range(seq_len[i]):
start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1]
outputs[l_index, i, j+s_shift], _ = torch.max(truncate_output_layer[i, start:end], dim=-2)
start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j + 1]
outputs[l_index, i, j + s_shift], _ = torch.max(truncate_output_layer[i, start:end], dim=-2)
else:
for i in range(batch_size):
for j in range(seq_len[i]):
start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1]
outputs[l_index, i, j+s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2)
start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j + 1]
outputs[l_index, i, j + s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2)
if self.include_cls_sep:
if l in (len(bert_outputs)-1, -1) and self.pooled_cls:
if l in (len(bert_outputs) - 1, -1) and self.pooled_cls:
outputs[l_index, :, 0] = pooled_cls
else:
outputs[l_index, :, 0] = output_layer[:, 0]
outputs[l_index, batch_indexes, seq_len+s_shift] = output_layer[batch_indexes, seq_len+s_shift]
outputs[l_index, batch_indexes, seq_len + s_shift] = output_layer[batch_indexes, seq_len + s_shift]
# 3. 最终的embedding结果
return outputs
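Before the next file, a minimal usage sketch of BertEmbedding under the signature shown above (illustrative only, not part of the diff; the toy sentence and vocabulary are invented, and the 'en-base-uncased' weights must be cached locally or downloadable):

import torch
from fastNLP import Vocabulary
from fastNLP.embeddings import BertEmbedding

# Toy vocabulary; in practice it comes from the training data.
vocab = Vocabulary()
vocab.add_word_lst("this is a demo sentence".split())

# layers='-1' keeps only the last encoder layer; pool_method='first' represents each
# word by its first word piece; auto_truncate=True zeroes out anything beyond BERT's
# maximum word-piece length instead of raising a RuntimeError.
embed = BertEmbedding(vocab, model_dir_or_name='en-base-uncased',
                      layers='-1', pool_method='first',
                      include_cls_sep=False, auto_truncate=True)

words = torch.LongTensor([[vocab.to_index(w) for w in "this is a demo sentence".split()]])
print(embed(words).size())  # torch.Size([1, 5, embed.embed_size])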


fastNLP/embeddings/char_embedding.py (+38, -30)

@@ -3,6 +3,10 @@
词的index而不需要使用词语中的char的index来获取表达。
"""

__all__ = [
"CNNCharEmbedding",
"LSTMCharEmbedding"
]

import torch
import torch.nn as nn
@@ -16,6 +20,7 @@ from .embedding import TokenEmbedding
from .utils import _construct_char_vocab_from_vocab
from .utils import get_embeddings


class CNNCharEmbedding(TokenEmbedding):
"""
别名::class:`fastNLP.embeddings.CNNCharEmbedding` :class:`fastNLP.embeddings.char_embedding.CNNCharEmbedding`
@@ -49,14 +54,15 @@ class CNNCharEmbedding(TokenEmbedding):
(文件夹下应该只有一个以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型,
没有的话将自动下载。如果输入为None则使用embedding_dim的维度随机初始化一个embedding.
"""
def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0,
dropout:float=0, filter_nums: List[int]=(40, 30, 20), kernel_sizes: List[int]=(5, 3, 1),
pool_method: str='max', activation='relu', min_char_freq: int=2, pre_train_char_embed: str=None):
def __init__(self, vocab: Vocabulary, embed_size: int = 50, char_emb_size: int = 50, word_dropout: float = 0,
dropout: float = 0, filter_nums: List[int] = (40, 30, 20), kernel_sizes: List[int] = (5, 3, 1),
pool_method: str = 'max', activation='relu', min_char_freq: int = 2, pre_train_char_embed: str = None):
super(CNNCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
for kernel in kernel_sizes:
assert kernel % 2 == 1, "Only odd kernel is allowed."
assert pool_method in ('max', 'avg')
self.pool_method = pool_method
# activation function
@@ -74,7 +80,7 @@ class CNNCharEmbedding(TokenEmbedding):
else:
raise Exception(
"Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]")
print("Start constructing character vocabulary.")
# 建立char的词表
self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq)
@@ -96,14 +102,14 @@ class CNNCharEmbedding(TokenEmbedding):
self.char_embedding = StaticEmbedding(self.char_vocab, model_dir_or_name=pre_train_char_embed)
else:
self.char_embedding = get_embeddings((len(self.char_vocab), char_emb_size))
self.convs = nn.ModuleList([nn.Conv1d(
char_emb_size, filter_nums[i], kernel_size=kernel_sizes[i], bias=True, padding=kernel_sizes[i] // 2)
for i in range(len(kernel_sizes))])
self._embed_size = embed_size
self.fc = nn.Linear(sum(filter_nums), embed_size)
self.reset_parameters()
def forward(self, words):
"""
输入words的index后,生成对应的words的表示。
@@ -114,14 +120,14 @@ class CNNCharEmbedding(TokenEmbedding):
words = self.drop_word(words)
batch_size, max_len = words.size()
chars = self.words_to_chars_embedding[words] # batch_size x max_len x max_word_len
word_lengths = self.word_lengths[words] # batch_size x max_len
word_lengths = self.word_lengths[words] # batch_size x max_len
max_word_len = word_lengths.max()
chars = chars[:, :, :max_word_len]
# 为1的地方为mask
chars_masks = chars.eq(self.char_pad_index) # batch_size x max_len x max_word_len 如果为0, 说明是padding的位置了
chars = self.char_embedding(chars) # batch_size x max_len x max_word_len x embed_size
chars = self.dropout(chars)
reshaped_chars = chars.reshape(batch_size*max_len, max_word_len, -1)
reshaped_chars = chars.reshape(batch_size * max_len, max_word_len, -1)
reshaped_chars = reshaped_chars.transpose(1, 2) # B' x E x M
conv_chars = [conv(reshaped_chars).transpose(1, 2).reshape(batch_size, max_len, max_word_len, -1)
for conv in self.convs]
@@ -129,13 +135,13 @@ class CNNCharEmbedding(TokenEmbedding):
conv_chars = self.activation(conv_chars)
if self.pool_method == 'max':
conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf'))
chars, _ = torch.max(conv_chars, dim=-2) # batch_size x max_len x sum(filters)
chars, _ = torch.max(conv_chars, dim=-2) # batch_size x max_len x sum(filters)
else:
conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), 0)
chars = torch.sum(conv_chars, dim=-2)/chars_masks.eq(0).sum(dim=-1, keepdim=True).float()
chars = torch.sum(conv_chars, dim=-2) / chars_masks.eq(0).sum(dim=-1, keepdim=True).float()
chars = self.fc(chars)
return self.dropout(chars)
@property
def requires_grad(self):
"""
@@ -151,21 +157,21 @@ class CNNCharEmbedding(TokenEmbedding):
return requires_grads.pop()
else:
return None
@requires_grad.setter
def requires_grad(self, value):
for name, param in self.named_parameters():
if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能加入到requires_grad中
continue
param.requires_grad = value
def reset_parameters(self):
for name, param in self.named_parameters():
if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能reset
continue
if 'char_embedding' in name:
continue
if param.data.dim()>1:
if param.data.dim() > 1:
nn.init.xavier_uniform_(param, 1)
else:
nn.init.uniform_(param, -1, 1)
@@ -203,13 +209,15 @@ class LSTMCharEmbedding(TokenEmbedding):
(文件夹下应该只有一个以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型,
没有的话将自动下载。如果输入为None则使用embedding_dim的维度随机初始化一个embedding.
"""
def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0,
dropout:float=0, hidden_size=50,pool_method: str='max', activation='relu', min_char_freq: int=2,
bidirectional=True, pre_train_char_embed: str=None):
def __init__(self, vocab: Vocabulary, embed_size: int = 50, char_emb_size: int = 50, word_dropout: float = 0,
dropout: float = 0, hidden_size=50, pool_method: str = 'max', activation='relu',
min_char_freq: int = 2,
bidirectional=True, pre_train_char_embed: str = None):
super(LSTMCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)

assert hidden_size % 2 == 0, "Only even kernel is allowed."
assert pool_method in ('max', 'avg')
self.pool_method = pool_method
# activation function
@@ -227,7 +235,7 @@ class LSTMCharEmbedding(TokenEmbedding):
else:
raise Exception(
"Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]")
print("Start constructing character vocabulary.")
# 建立char的词表
self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq)
@@ -249,14 +257,14 @@ class LSTMCharEmbedding(TokenEmbedding):
self.char_embedding = StaticEmbedding(self.char_vocab, pre_train_char_embed)
else:
self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size)
self.fc = nn.Linear(hidden_size, embed_size)
hidden_size = hidden_size // 2 if bidirectional else hidden_size
self.lstm = LSTM(char_emb_size, hidden_size, bidirectional=bidirectional, batch_first=True)
self._embed_size = embed_size
self.bidirectional = bidirectional
def forward(self, words):
"""
输入words的index后,生成对应的words的表示。
@@ -278,7 +286,7 @@ class LSTMCharEmbedding(TokenEmbedding):
char_seq_len = chars_masks.eq(0).sum(dim=-1).reshape(batch_size * max_len)
lstm_chars = self.lstm(reshaped_chars, char_seq_len)[0].reshape(batch_size, max_len, max_word_len, -1)
# B x M x M x H
lstm_chars = self.activation(lstm_chars)
if self.pool_method == 'max':
lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf'))
@@ -286,11 +294,11 @@ class LSTMCharEmbedding(TokenEmbedding):
else:
lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), 0)
chars = torch.sum(lstm_chars, dim=-2) / chars_masks.eq(0).sum(dim=-1, keepdim=True).float()
chars = self.fc(chars)
return self.dropout(chars)
@property
def requires_grad(self):
"""
@@ -307,7 +315,7 @@ class LSTMCharEmbedding(TokenEmbedding):
return requires_grads.pop()
else:
return None
@requires_grad.setter
def requires_grad(self, value):
for name, param in self.named_parameters():
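A short sketch of CNNCharEmbedding with the defaults shown above (illustrative only, not part of the diff; the vocabulary is invented):

import torch
from fastNLP import Vocabulary
from fastNLP.embeddings import CNNCharEmbedding

vocab = Vocabulary()
vocab.add_word_lst("this is a demo sentence".split())

# A character-level CNN per word: parallel convolutions with (odd) kernel sizes 5/3/1,
# max-pooled over characters, then projected from sum(filter_nums) down to embed_size.
char_embed = CNNCharEmbedding(vocab, embed_size=50, char_emb_size=50,
                              filter_nums=[40, 30, 20], kernel_sizes=[5, 3, 1],
                              pool_method='max')

words = torch.LongTensor([[vocab.to_index(w) for w in "this is a demo sentence".split()]])
print(char_embed(words).size())  # torch.Size([1, 5, 50])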


fastNLP/embeddings/contextual_embedding.py (+17, -12)

@@ -1,3 +1,12 @@
"""
.. todo::
doc
"""

__all__ = [
"ContextualEmbedding"
]

from abc import abstractmethod
import torch

@@ -8,16 +17,12 @@ from ..core.sampler import SequentialSampler
from ..core.utils import _move_model_to_device, _get_model_device
from .embedding import TokenEmbedding

__all__ = [
"ContextualEmbedding"
]


class ContextualEmbedding(TokenEmbedding):
def __init__(self, vocab: Vocabulary, word_dropout:float=0.0, dropout:float=0.0):
def __init__(self, vocab: Vocabulary, word_dropout: float = 0.0, dropout: float = 0.0):
super(ContextualEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
def add_sentence_cache(self, *datasets, batch_size=32, device='cpu', delete_weights: bool=True):
def add_sentence_cache(self, *datasets, batch_size=32, device='cpu', delete_weights: bool = True):
"""
由于动态embedding生成比较耗时,所以可以把每句话embedding缓存下来,这样就不需要每次都运行生成过程。

@@ -34,7 +39,7 @@ class ContextualEmbedding(TokenEmbedding):
except Exception as e:
print(f"Exception happens at {index} dataset.")
raise e
sent_embeds = {}
_move_model_to_device(self, device=device)
device = _get_model_device(self)
@@ -54,7 +59,7 @@ class ContextualEmbedding(TokenEmbedding):
word_embeds = self(words).detach().cpu().numpy()
for b in range(words.size(0)):
length = seq_len_from_behind[b]
if length==0:
if length == 0:
sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b]
else:
sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b, :-length]
@@ -65,7 +70,7 @@ class ContextualEmbedding(TokenEmbedding):
self.sent_embeds = sent_embeds
if delete_weights:
self._delete_model_weights()
def _get_sent_reprs(self, words):
"""
获取sentence的表示,如果有缓存,则返回缓存的值; 没有缓存则返回None
@@ -88,12 +93,12 @@ class ContextualEmbedding(TokenEmbedding):
embeds[i, :len(embed)] = torch.FloatTensor(embed).to(words.device)
return embeds
return None
@abstractmethod
def _delete_model_weights(self):
"""删除计算表示的模型以节省资源"""
raise NotImplementedError
def remove_sentence_cache(self):
"""
删除缓存的句子表示. 删除之后如果模型权重没有被删除,将开始使用动态计算权重。
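The caching API is easiest to see from the subclass side. A minimal sketch (illustrative only, not part of the diff) of what a ContextualEmbedding subclass provides so that add_sentence_cache(..., delete_weights=True) can drop the heavy encoder once the sentence representations are cached:

import torch.nn as nn
from fastNLP import Vocabulary
from fastNLP.embeddings.contextual_embedding import ContextualEmbedding

class ToyContextualEmbedding(ContextualEmbedding):
    """Illustrative subclass; a real one wraps an expensive encoder such as BERT or ELMo."""

    def __init__(self, vocab: Vocabulary, embed_dim: int = 8):
        super().__init__(vocab, word_dropout=0.0, dropout=0.0)
        self.model = nn.Embedding(len(vocab), embed_dim)  # stand-in for a heavy encoder
        self._embed_size = embed_dim

    def forward(self, words):
        outputs = self._get_sent_reprs(words)  # cached representations, if add_sentence_cache ran
        if outputs is not None:
            return self.dropout(outputs)
        return self.dropout(self.model(words))

    def _delete_model_weights(self):
        # called by add_sentence_cache(..., delete_weights=True) after the cache is built
        del self.model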


fastNLP/embeddings/elmo_embedding.py (+42, -35)

@@ -1,6 +1,13 @@
"""
.. todo::
doc
"""

import os
__all__ = [
"ElmoEmbedding"
]

import os
import torch
import torch.nn as nn
import torch.nn.functional as F
@@ -49,11 +56,11 @@ class ElmoEmbedding(ContextualEmbedding):
:param cache_word_reprs: 可以选择对word的表示进行cache; 设置为True的话,将在初始化的时候为每个word生成对应的embedding,
并删除character encoder,之后将直接使用cache的embedding。默认为False。
"""
def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en', layers: str = '2', requires_grad: bool = False,
word_dropout=0.0, dropout=0.0, cache_word_reprs: bool = False):
super(ElmoEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
# 根据model_dir_or_name检查是否存在并下载
if model_dir_or_name.lower() in PRETRAINED_ELMO_MODEL_DIR:
model_url = _get_embedding_url('elmo', model_dir_or_name.lower())
@@ -64,7 +71,7 @@ class ElmoEmbedding(ContextualEmbedding):
else:
raise ValueError(f"Cannot recognize {model_dir_or_name}.")
self.model = _ElmoModel(model_dir, vocab, cache_word_reprs=cache_word_reprs)
if layers == 'mix':
self.layer_weights = nn.Parameter(torch.zeros(self.model.config['lstm']['n_layers'] + 1),
requires_grad=requires_grad)
@@ -79,16 +86,16 @@ class ElmoEmbedding(ContextualEmbedding):
self.layers = layers
self._get_outputs = self._get_layer_outputs
self._embed_size = len(self.layers) * self.model.config['lstm']['projection_dim'] * 2
self.requires_grad = requires_grad
def _get_mixed_outputs(self, outputs):
# outputs: num_layers x batch_size x max_len x hidden_size
# return: batch_size x max_len x hidden_size
weights = F.softmax(self.layer_weights + 1 / len(outputs), dim=0).to(outputs)
outputs = torch.einsum('l,lbij->bij', weights, outputs)
return self.gamma.to(outputs) * outputs
def set_mix_weights_requires_grad(self, flag=True):
"""
当初始化ElmoEmbedding时layers被设置为mix时,可以通过调用该方法设置mix weights是否可训练。如果layers不是mix,调用
@@ -100,15 +107,15 @@ class ElmoEmbedding(ContextualEmbedding):
if hasattr(self, 'layer_weights'):
self.layer_weights.requires_grad = flag
self.gamma.requires_grad = flag
def _get_layer_outputs(self, outputs):
if len(self.layers) == 1:
outputs = outputs[self.layers[0]]
else:
outputs = torch.cat(tuple([*outputs[self.layers]]), dim=-1)
return outputs
def forward(self, words: torch.LongTensor):
"""
计算words的elmo embedding表示。根据elmo文章中介绍的ELMO实际上是有2L+1层结果,但是为了让结果比较容易拆分,token的
@@ -125,12 +132,12 @@ class ElmoEmbedding(ContextualEmbedding):
outputs = self.model(words)
outputs = self._get_outputs(outputs)
return self.dropout(outputs)
def _delete_model_weights(self):
for name in ['layers', 'model', 'layer_weights', 'gamma']:
if hasattr(self, name):
delattr(self, name)
@property
def requires_grad(self):
"""
@@ -144,7 +151,7 @@ class ElmoEmbedding(ContextualEmbedding):
return requires_grads.pop()
else:
return None
@requires_grad.setter
def requires_grad(self, value):
for name, param in self.named_parameters():
@@ -162,7 +169,7 @@ class _ElmoModel(nn.Module):
(4) 设计一个保存token的embedding,允许缓存word的表示。

"""
def __init__(self, model_dir: str, vocab: Vocabulary = None, cache_word_reprs: bool = False):
super(_ElmoModel, self).__init__()
self.model_dir = model_dir
@@ -187,14 +194,14 @@ class _ElmoModel(nn.Module):
config = json.load(config_f)
self.weight_file = os.path.join(model_dir, weight_file)
self.config = config
OOV_TAG = '<oov>'
PAD_TAG = '<pad>'
BOS_TAG = '<bos>'
EOS_TAG = '<eos>'
BOW_TAG = '<bow>'
EOW_TAG = '<eow>'
# For the model trained with character-based word encoder.
char_lexicon = {}
with codecs.open(os.path.join(model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi:
@@ -204,29 +211,29 @@ class _ElmoModel(nn.Module):
tokens.insert(0, '\u3000')
token, i = tokens
char_lexicon[token] = int(i)
# 做一些sanity check
for special_word in [PAD_TAG, OOV_TAG, BOW_TAG, EOW_TAG]:
assert special_word in char_lexicon, f"{special_word} not found in char.dic."
# 从vocab中构建char_vocab
char_vocab = Vocabulary(unknown=OOV_TAG, padding=PAD_TAG)
# 需要保证<bow>与<eow>在里面
char_vocab.add_word_lst([BOW_TAG, EOW_TAG, BOS_TAG, EOS_TAG])
for word, index in vocab:
char_vocab.add_word_lst(list(word))
self.bos_index, self.eos_index, self._pad_index = len(vocab), len(vocab) + 1, vocab.padding_idx
# 根据char_lexicon调整, 多设置一位,是预留给word padding的(该位置的char表示为全0表示)
char_emb_layer = nn.Embedding(len(char_vocab) + 1, int(config['char_cnn']['embedding']['dim']),
padding_idx=len(char_vocab))
# 读入预训练权重 这里的elmo_model 包含char_cnn和 lstm 的 state_dict
elmo_model = torch.load(os.path.join(self.model_dir, weight_file), map_location='cpu')
char_embed_weights = elmo_model["char_cnn"]['char_emb_layer.weight']
found_char_count = 0
for char, index in char_vocab: # 调整character embedding
if char in char_lexicon:
@@ -235,11 +242,11 @@ class _ElmoModel(nn.Module):
else:
index_in_pre = char_lexicon[OOV_TAG]
char_emb_layer.weight.data[index] = char_embed_weights[index_in_pre]
print(f"{found_char_count} out of {len(char_vocab)} characters were found in pretrained elmo embedding.")
# 生成words到chars的映射
max_chars = config['char_cnn']['max_characters_per_token']
self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab) + 2, max_chars),
fill_value=len(char_vocab),
dtype=torch.long),
@@ -258,20 +265,20 @@ class _ElmoModel(nn.Module):
char_vocab.to_index(EOW_TAG)]
char_ids += [char_vocab.to_index(PAD_TAG)] * (max_chars - len(char_ids))
self.words_to_chars_embedding[index] = torch.LongTensor(char_ids)
self.char_vocab = char_vocab
self.token_embedder = ConvTokenEmbedder(
config, self.weight_file, None, char_emb_layer)
elmo_model["char_cnn"]['char_emb_layer.weight'] = char_emb_layer.weight
self.token_embedder.load_state_dict(elmo_model["char_cnn"])
self.output_dim = config['lstm']['projection_dim']
# lstm encoder
self.encoder = ElmobiLm(config)
self.encoder.load_state_dict(elmo_model["lstm"])
if cache_word_reprs:
if config['char_cnn']['embedding']['dim'] > 0: # 只有在使用了chars的情况下有用
print("Start to generate cache word representations.")
@@ -280,7 +287,7 @@ class _ElmoModel(nn.Module):
word_size = self.words_to_chars_embedding.size(0)
num_batches = word_size // batch_size + \
int(word_size % batch_size != 0)
self.cached_word_embedding = nn.Embedding(word_size,
config['lstm']['projection_dim'])
with torch.no_grad():
@@ -291,12 +298,12 @@ class _ElmoModel(nn.Module):
word_reprs = self.token_embedder(words.unsqueeze(1),
chars).detach() # batch_size x 1 x config['encoder']['projection_dim']
self.cached_word_embedding.weight.data[words] = word_reprs.squeeze(1)
print("Finish generating cached word representations. Going to delete the character encoder.")
del self.token_embedder, self.words_to_chars_embedding
else:
print("There is no need to cache word representations, since no character information is used.")
def forward(self, words):
"""

@@ -321,7 +328,7 @@ class _ElmoModel(nn.Module):
else:
chars = None
token_embedding = self.token_embedder(expanded_words, chars) # batch_size x max_len x embed_dim
encoder_output = self.encoder(token_embedding, seq_len)
if encoder_output.size(2) < max_len + 2:
num_layers, _, output_len, hidden_size = encoder_output.size()
@@ -332,7 +339,7 @@ class _ElmoModel(nn.Module):
token_embedding = token_embedding.masked_fill(mask, 0)
token_embedding = torch.cat((token_embedding, token_embedding), dim=2).view(1, sz[1], sz[2], sz[3])
encoder_output = torch.cat((token_embedding, encoder_output), dim=0)
# 删除<eos>, <bos>. 这里没有精确地删除,但应该也不会影响最后的结果了。
encoder_output = encoder_output[:, :, 1:-1]
return encoder_output
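A minimal usage sketch of ElmoEmbedding as declared above (illustrative only, not part of the diff; the 'en' weights are downloaded on first use and the toy vocabulary is invented):

import torch
from fastNLP import Vocabulary
from fastNLP.embeddings import ElmoEmbedding

vocab = Vocabulary()
vocab.add_word_lst("this is a demo sentence".split())

# layers='2' keeps a single ELMo layer; layers='mix' would instead learn a softmax-weighted
# mixture of all layers (see set_mix_weights_requires_grad above).
embed = ElmoEmbedding(vocab, model_dir_or_name='en', layers='2', requires_grad=False)

words = torch.LongTensor([[vocab.to_index(w) for w in "this is a demo sentence".split()]])
print(embed(words).size())  # torch.Size([1, 5, embed.embed_size])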

fastNLP/embeddings/embedding.py (+30, -26)

@@ -3,6 +3,10 @@

"""

__all__ = [
"Embedding",
"TokenEmbedding"
]

import torch.nn as nn
from abc import abstractmethod
@@ -33,11 +37,11 @@ class Embedding(nn.Module):
:param float dropout: 对Embedding的输出的dropout。
:param int unk_index: drop word时替换为的index。fastNLP的Vocabulary的unk_index默认为1。
"""
def __init__(self, init_embed, word_dropout=0, dropout=0.0, unk_index=None):
super(Embedding, self).__init__()
self.embed = get_embeddings(init_embed)
self.dropout = nn.Dropout(dropout)
@@ -48,44 +52,44 @@ class Embedding(nn.Module):
self._embed_size = self.embed.embedding_dim
else:
self._embed_size = self.embed.weight.size(1)
if word_dropout>0 and not isinstance(unk_index, int):
if word_dropout > 0 and not isinstance(unk_index, int):
raise ValueError("When drop word is set, you need to pass in the unk_index.")
else:
self._embed_size = self.embed.embed_size
unk_index = self.embed.get_word_vocab().unknown_idx
self.unk_index = unk_index
self.word_dropout = word_dropout
def forward(self, words):
"""
:param torch.LongTensor words: [batch, seq_len]
:return: torch.Tensor : [batch, seq_len, embed_dim]
"""
if self.word_dropout>0 and self.training:
if self.word_dropout > 0 and self.training:
mask = torch.ones_like(words).float() * self.word_dropout
mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1
words = words.masked_fill(mask, self.unk_index)
words = self.embed(words)
return self.dropout(words)
@property
def num_embedding(self)->int:
def num_embedding(self) -> int:
if isinstance(self.embed, nn.Embedding):
return self.embed.weight.size(0)
else:
return self.embed.num_embedding
def __len__(self):
return len(self.embed)
@property
def embed_size(self) -> int:
return self._embed_size
@property
def embedding_dim(self) -> int:
return self._embed_size
@property
def requires_grad(self):
"""
@@ -96,14 +100,14 @@ class Embedding(nn.Module):
return self.embed.weight.requires_grad
else:
return self.embed.requires_grad
@requires_grad.setter
def requires_grad(self, value):
if not isinstance(self.embed, TokenEmbedding):
self.embed.weight.requires_grad = value
else:
self.embed.requires_grad = value
@property
def size(self):
if isinstance(self.embed, TokenEmbedding):
@@ -120,12 +124,12 @@ class TokenEmbedding(nn.Module):
assert vocab.padding is not None, "Vocabulary must have a padding entry."
self._word_vocab = vocab
self._word_pad_index = vocab.padding_idx
if word_dropout>0:
if word_dropout > 0:
assert vocab.unknown is not None, "Vocabulary must have unknown entry when you want to drop a word."
self.word_dropout = word_dropout
self._word_unk_index = vocab.unknown_idx
self.dropout_layer = nn.Dropout(dropout)
def drop_word(self, words):
"""
按照设定随机将words设置为unknown_index。
@@ -138,7 +142,7 @@ class TokenEmbedding(nn.Module):
mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1
words = words.masked_fill(mask, self._word_unk_index)
return words
def dropout(self, words):
"""
对embedding后的word表示进行drop。
@@ -147,7 +151,7 @@ class TokenEmbedding(nn.Module):
:return:
"""
return self.dropout_layer(words)
@property
def requires_grad(self):
"""
@@ -159,23 +163,23 @@ class TokenEmbedding(nn.Module):
return requires_grads.pop()
else:
return None
@requires_grad.setter
def requires_grad(self, value):
for param in self.parameters():
param.requires_grad = value
def __len__(self):
return len(self._word_vocab)
@property
def embed_size(self) -> int:
return self._embed_size
@property
def embedding_dim(self) -> int:
return self._embed_size
@property
def num_embedding(self) -> int:
"""
@@ -183,7 +187,7 @@ class TokenEmbedding(nn.Module):
:return:
"""
return len(self._word_vocab)
def get_word_vocab(self):
"""
返回embedding的词典。
@@ -191,11 +195,11 @@ class TokenEmbedding(nn.Module):
:return: Vocabulary
"""
return self._word_vocab
@property
def size(self):
return torch.Size(self.num_embedding, self._embed_size)
@abstractmethod
def forward(self, words):
raise NotImplementedError
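A short sketch of the plain Embedding wrapper (illustrative only, not part of the diff). A (num_embeddings, embedding_dim) tuple is turned into a randomly initialized nn.Embedding by get_embeddings; since a bare tuple carries no vocabulary, word_dropout needs an explicit unk_index:

import torch
from fastNLP.embeddings import Embedding

embed = Embedding((1000, 50), word_dropout=0.1, dropout=0.1, unk_index=1)

words = torch.randint(0, 1000, (4, 7))  # [batch, seq_len]
print(embed(words).size())              # torch.Size([4, 7, 50])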

fastNLP/embeddings/stack_embedding.py (+17, -7)

@@ -1,3 +1,12 @@
"""
.. todo::
doc
"""

__all__ = [
"StackEmbedding",
]

from typing import List

import torch
@@ -26,6 +35,7 @@ class StackEmbedding(TokenEmbedding):
:param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。

"""
def __init__(self, embeds: List[TokenEmbedding], word_dropout=0, dropout=0):
vocabs = []
for embed in embeds:
@@ -34,14 +44,14 @@ class StackEmbedding(TokenEmbedding):
_vocab = vocabs[0]
for vocab in vocabs[1:]:
assert vocab == _vocab, "All embeddings in StackEmbedding should use the same word vocabulary."
super(StackEmbedding, self).__init__(_vocab, word_dropout=word_dropout, dropout=dropout)
assert isinstance(embeds, list)
for embed in embeds:
assert isinstance(embed, TokenEmbedding), "Only TokenEmbedding type is supported."
self.embeds = nn.ModuleList(embeds)
self._embed_size = sum([embed.embed_size for embed in self.embeds])
def append(self, embed: TokenEmbedding):
"""
添加一个embedding到结尾。
@@ -50,18 +60,18 @@ class StackEmbedding(TokenEmbedding):
"""
assert isinstance(embed, TokenEmbedding)
self.embeds.append(embed)
def pop(self):
"""
弹出最后一个embed
:return:
"""
return self.embeds.pop()
@property
def embed_size(self):
return self._embed_size
@property
def requires_grad(self):
"""
@@ -73,12 +83,12 @@ class StackEmbedding(TokenEmbedding):
return requires_grads.pop()
else:
return None
@requires_grad.setter
def requires_grad(self, value):
for embed in self.embeds():
embed.requires_grad = value
def forward(self, words):
"""
得到多个embedding的结果,并把结果按照顺序concat起来。
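A minimal sketch of stacking two embeddings over the same vocabulary (illustrative only, not part of the diff); the output is the concatenation of the parts, so embed_size is 50 + 30 here:

import torch
from fastNLP import Vocabulary
from fastNLP.embeddings import StaticEmbedding, CNNCharEmbedding, StackEmbedding

vocab = Vocabulary()
vocab.add_word_lst("this is a demo sentence".split())

word_embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=50)  # random 50-dim vectors
char_embed = CNNCharEmbedding(vocab, embed_size=30)
embed = StackEmbedding([word_embed, char_embed])

words = torch.LongTensor([[vocab.to_index(w) for w in "this is a demo sentence".split()]])
print(embed(words).size())  # torch.Size([1, 5, 80])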


fastNLP/embeddings/static_embedding.py (+35, -26)

@@ -1,4 +1,11 @@
"""
.. todo::
doc
"""

__all__ = [
"StaticEmbedding"
]
import os

import torch
@@ -13,6 +20,7 @@ from ..modules.utils import _get_file_name_base_on_postfix
from copy import deepcopy
from collections import defaultdict


class StaticEmbedding(TokenEmbedding):
"""
别名::class:`fastNLP.embeddings.StaticEmbedding` :class:`fastNLP.embeddings.static_embedding.StaticEmbedding`
@@ -55,15 +63,16 @@ class StaticEmbedding(TokenEmbedding):
:param bool normalize: 是否对vector进行normalize,使得每个vector的norm为1。
:param int min_freq: Vocabulary词频数小于这个数量的word将被指向unk。
"""
def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', embedding_dim=-1, requires_grad: bool=True,
def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en', embedding_dim=-1, requires_grad: bool = True,
init_method=None, lower=False, dropout=0, word_dropout=0, normalize=False, min_freq=1, **kwargs):
super(StaticEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
if embedding_dim>0:
if embedding_dim > 0:
model_dir_or_name = None
# 得到cache_path
if model_dir_or_name is None:
assert embedding_dim>=1, "The dimension of embedding should be larger than 1."
assert embedding_dim >= 1, "The dimension of embedding should be larger than 1."
embedding_dim = int(embedding_dim)
model_path = None
elif model_dir_or_name.lower() in PRETRAIN_STATIC_FILES:
@@ -76,9 +85,9 @@ class StaticEmbedding(TokenEmbedding):
model_path = _get_file_name_base_on_postfix(os.path.abspath(os.path.expanduser(model_dir_or_name)), '.txt')
else:
raise ValueError(f"Cannot recognize {model_dir_or_name}.")
# 根据min_freq缩小vocab
truncate_vocab = (vocab.min_freq is None and min_freq>1) or (vocab.min_freq and vocab.min_freq<min_freq)
truncate_vocab = (vocab.min_freq is None and min_freq > 1) or (vocab.min_freq and vocab.min_freq < min_freq)
if truncate_vocab:
truncated_vocab = deepcopy(vocab)
truncated_vocab.min_freq = min_freq
@@ -89,14 +98,14 @@ class StaticEmbedding(TokenEmbedding):
lowered_word_count[word.lower()] += count
for word in truncated_vocab.word_count.keys():
word_count = truncated_vocab.word_count[word]
if lowered_word_count[word.lower()]>=min_freq and word_count<min_freq:
truncated_vocab.add_word_lst([word]*(min_freq-word_count),
if lowered_word_count[word.lower()] >= min_freq and word_count < min_freq:
truncated_vocab.add_word_lst([word] * (min_freq - word_count),
no_create_entry=truncated_vocab._is_word_no_create_entry(word))
# 只限制在train里面的词语使用min_freq筛选
if kwargs.get('only_train_min_freq', False) and model_dir_or_name is not None:
for word in truncated_vocab.word_count.keys():
if truncated_vocab._is_word_no_create_entry(word) and truncated_vocab.word_count[word]<min_freq:
if truncated_vocab._is_word_no_create_entry(word) and truncated_vocab.word_count[word] < min_freq:
truncated_vocab.add_word_lst([word] * (min_freq - truncated_vocab.word_count[word]),
no_create_entry=True)
truncated_vocab.build_vocab()
@@ -105,7 +114,7 @@ class StaticEmbedding(TokenEmbedding):
truncated_words_to_words[index] = truncated_vocab.to_index(word)
print(f"{len(vocab) - len(truncated_vocab)} out of {len(vocab)} words have frequency less than {min_freq}.")
vocab = truncated_vocab
self.only_norm_found_vector = kwargs.get('only_norm_found_vector', False)
# 读取embedding
if lower:
@@ -145,21 +154,21 @@ class StaticEmbedding(TokenEmbedding):
self.words_to_words = nn.Parameter(torch.arange(len(vocab)).long(), requires_grad=False)
if not self.only_norm_found_vector and normalize:
embedding /= (torch.norm(embedding, dim=1, keepdim=True) + 1e-12)
if truncate_vocab:
for i in range(len(truncated_words_to_words)):
index_in_truncated_vocab = truncated_words_to_words[i]
truncated_words_to_words[i] = self.words_to_words[index_in_truncated_vocab]
del self.words_to_words
self.words_to_words = nn.Parameter(truncated_words_to_words, requires_grad=False)
self.embedding = nn.Embedding(num_embeddings=embedding.shape[0], embedding_dim=embedding.shape[1],
padding_idx=vocab.padding_idx,
max_norm=None, norm_type=2, scale_grad_by_freq=False,
sparse=False, _weight=embedding)
self._embed_size = self.embedding.weight.size(1)
self.requires_grad = requires_grad
def _randomly_init_embed(self, num_embedding, embedding_dim, init_embed=None):
"""

@@ -169,14 +178,14 @@ class StaticEmbedding(TokenEmbedding):
:return: torch.FloatTensor
"""
embed = torch.zeros(num_embedding, embedding_dim)
if init_embed is None:
nn.init.uniform_(embed, -np.sqrt(3/embedding_dim), np.sqrt(3/embedding_dim))
nn.init.uniform_(embed, -np.sqrt(3 / embedding_dim), np.sqrt(3 / embedding_dim))
else:
init_embed(embed)
return embed
@property
def requires_grad(self):
"""
@@ -190,14 +199,14 @@ class StaticEmbedding(TokenEmbedding):
return requires_grads.pop()
else:
return None
@requires_grad.setter
def requires_grad(self, value):
for name, param in self.named_parameters():
if 'words_to_words' in name:
continue
param.requires_grad = value
def _load_with_vocab(self, embed_filepath, vocab, dtype=np.float32, padding='<pad>', unknown='<unk>',
error='ignore', init_method=None):
"""
@@ -250,7 +259,7 @@ class StaticEmbedding(TokenEmbedding):
index = vocab.to_index(word)
matrix[index] = torch.from_numpy(np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim))
if self.only_norm_found_vector:
matrix[index] = matrix[index]/np.linalg.norm(matrix[index])
matrix[index] = matrix[index] / np.linalg.norm(matrix[index])
found_count += 1
except Exception as e:
if error == 'ignore':
@@ -267,22 +276,22 @@ class StaticEmbedding(TokenEmbedding):
matrix[index] = None
# matrix中代表是需要建立entry的词
vectors = self._randomly_init_embed(len(matrix), dim, init_method)
if vocab.unknown is None: # 创建一个专门的unknown
unknown_idx = len(matrix)
vectors = torch.cat((vectors, torch.zeros(1, dim)), dim=0).contiguous()
else:
unknown_idx = vocab.unknown_idx
self.words_to_words = nn.Parameter(torch.full((len(vocab), ), fill_value=unknown_idx).long(),
self.words_to_words = nn.Parameter(torch.full((len(vocab),), fill_value=unknown_idx).long(),
requires_grad=False)
for index, (index_in_vocab, vec) in enumerate(matrix.items()):
if vec is not None:
vectors[index] = vec
self.words_to_words[index_in_vocab] = index
return vectors
def forward(self, words):
"""
传入words的index
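A minimal usage sketch of StaticEmbedding (illustrative only, not part of the diff). Passing embedding_dim > 0 overrides model_dir_or_name and yields a randomly initialized table; passing a pretrained name or path loads (and caches) those vectors instead:

import torch
from fastNLP import Vocabulary
from fastNLP.embeddings import StaticEmbedding

vocab = Vocabulary()
vocab.add_word_lst("this is a demo sentence".split())

embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=50)

words = torch.LongTensor([[vocab.to_index(w) for w in "this is a demo sentence".split()]])
print(embed(words).size())  # torch.Size([1, 5, 50])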


fastNLP/embeddings/utils.py (+11, -5)

@@ -1,13 +1,19 @@
"""
.. todo::
doc
"""
import numpy as np
import torch
from torch import nn as nn

from ..core.vocabulary import Vocabulary

__all__ = ['get_embeddings']
__all__ = [
'get_embeddings'
]


def _construct_char_vocab_from_vocab(vocab:Vocabulary, min_freq:int=1):
def _construct_char_vocab_from_vocab(vocab: Vocabulary, min_freq: int = 1):
"""
给定一个word的vocabulary生成character的vocabulary.

@@ -36,8 +42,8 @@ def get_embeddings(init_embed):
if isinstance(init_embed, tuple):
res = nn.Embedding(
num_embeddings=init_embed[0], embedding_dim=init_embed[1])
nn.init.uniform_(res.weight.data, a=-np.sqrt(3/res.weight.data.size(1)),
b=np.sqrt(3/res.weight.data.size(1)))
nn.init.uniform_(res.weight.data, a=-np.sqrt(3 / res.weight.data.size(1)),
b=np.sqrt(3 / res.weight.data.size(1)))
elif isinstance(init_embed, nn.Module):
res = init_embed
elif isinstance(init_embed, torch.Tensor):
@@ -48,4 +54,4 @@ def get_embeddings(init_embed):
else:
raise TypeError(
'invalid init_embed type: {}'.format((type(init_embed))))
return res
return res
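The three init_embed forms accepted by get_embeddings above, in one short sketch (illustrative only, not part of the diff):

import torch
import torch.nn as nn
from fastNLP.embeddings import get_embeddings

e1 = get_embeddings((100, 20))              # tuple -> freshly initialized nn.Embedding
e2 = get_embeddings(nn.Embedding(100, 20))  # an nn.Module is returned unchanged
e3 = get_embeddings(torch.randn(100, 20))   # a Tensor is wrapped as pretrained weights
print(type(e1), type(e2), type(e3))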
