
Fix bugs in Embedding

tags/v0.4.10
yh_cc 6 years ago
commit e876082567
3 changed files with 58 additions and 41 deletions
  1. fastNLP/modules/encoder/embedding.py (+11, -3)
  2. reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py (+12, -13)
  3. reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py (+35, -25)

fastNLP/modules/encoder/embedding.py (+11, -3)

@@ -135,7 +135,7 @@ class TokenEmbedding(nn.Module):
:param torch.LongTensor words: batch_size x max_len
:return:
"""
- if self.dropout_word > 0 and self.training:
+ if self.word_dropout > 0 and self.training:
mask = torch.ones_like(words).float() * self.word_dropout
mask = torch.bernoulli(mask).byte() # the larger dropout_word is, the more positions are set to 1
words = words.masked_fill(mask, self._word_unk_index)
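For readers of the hunk above: the renamed word_dropout attribute drives word-level dropout, where a token id is replaced by the unknown-word index with probability word_dropout via a Bernoulli mask. A standalone sketch of the same idea in plain PyTorch (unk_index is an arbitrary illustrative value, not taken from fastNLP):

```python
import torch

def word_dropout(words: torch.LongTensor, p: float, unk_index: int) -> torch.LongTensor:
    """Replace each token id with unk_index with probability p (training time only)."""
    if p <= 0:
        return words
    # 1 marks a position to drop; a larger p yields more ones, as the comment above says
    mask = torch.bernoulli(torch.full_like(words, p, dtype=torch.float)).bool()
    return words.masked_fill(mask, unk_index)

words = torch.tensor([[5, 8, 2, 0]])
print(word_dropout(words, p=0.1, unk_index=1))
```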
@@ -174,8 +174,16 @@ class TokenEmbedding(nn.Module):
def embed_size(self) -> int:
return self._embed_size

+ @property
+ def embedding_dim(self) -> int:
+ return self._embed_size
+
@property
def num_embedding(self) -> int:
+ """
+ This value may be larger than the actual size of the embedding matrix.
+ :return:
+ """
return len(self._word_vocab)

def get_word_vocab(self):
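The embedding_dim property mirrors the attribute name that nn.Embedding exposes, so a TokenEmbedding can be handed to code that sizes its layers from the embedding object, which is what the model change further down relies on. A minimal sketch of that pattern in plain PyTorch (TinyTagger is an illustrative stand-in, not fastNLP code):

```python
import torch.nn as nn

class TinyTagger(nn.Module):
    """Illustrative only: sizes its LSTM from whatever embedding module it is given."""
    def __init__(self, embed, hidden_size=200):
        super().__init__()
        self.embed = embed
        # .embedding_dim is the attribute nn.Embedding exposes; TokenEmbedding now matches it
        self.lstm = nn.LSTM(input_size=embed.embedding_dim, hidden_size=hidden_size // 2,
                            bidirectional=True, batch_first=True)

    def forward(self, words):
        return self.lstm(self.embed(words))[0]

model = TinyTagger(nn.Embedding(num_embeddings=100, embedding_dim=50))
```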
@@ -810,7 +818,7 @@ class CNNCharEmbedding(TokenEmbedding):
# positions that are 1 are the mask
chars_masks = chars.eq(self.char_pad_index) # batch_size x max_len x max_word_len; a value of 0 means a padding position
chars = self.char_embedding(chars) # batch_size x max_len x max_word_len x embed_size
- self.dropout(chars)
+ chars = self.dropout(chars)
reshaped_chars = chars.reshape(batch_size*max_len, max_word_len, -1)
reshaped_chars = reshaped_chars.transpose(1, 2) # B' x E x M
conv_chars = [conv(reshaped_chars).transpose(1, 2).reshape(batch_size, max_len, max_word_len, -1)
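The one-line fix above matters because nn.Dropout is not in-place: calling it and discarding the return value leaves the input untouched, so the old line silently skipped dropout. A short demonstration in plain PyTorch:

```python
import torch
import torch.nn as nn

drop = nn.Dropout(p=0.5)
drop.train()                 # dropout is only active in training mode
x = torch.ones(4)

drop(x)                      # return value discarded: x is unchanged, so no dropout happened
print(x)                     # tensor([1., 1., 1., 1.])

x = drop(x)                  # reassigning applies dropout (survivors are rescaled by 1/(1-p))
print(x)                     # e.g. tensor([0., 2., 2., 0.]) -- the zero pattern is random
```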
@@ -962,7 +970,7 @@ class LSTMCharEmbedding(TokenEmbedding):

chars = self.fc(chars)

- return self.dropout(words)
+ return self.dropout(chars)

@property
def requires_grad(self):


reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py (+12, -13)

@@ -11,9 +11,8 @@ from fastNLP import Const
class CNNBiLSTMCRF(nn.Module):
def __init__(self, embed, char_embed, hidden_size, num_layers, tag_vocab, dropout=0.5, encoding_type='bioes'):
super().__init__()

- self.embedding = Embedding(embed, dropout=0.5, dropout_word=0)
- self.char_embedding = Embedding(char_embed, dropout=0.5, dropout_word=0.01)
+ self.embedding = embed
+ self.char_embedding = char_embed
self.lstm = LSTM(input_size=self.embedding.embedding_dim+self.char_embedding.embedding_dim,
hidden_size=hidden_size//2, num_layers=num_layers,
bidirectional=True, batch_first=True)
@@ -33,24 +32,24 @@ class CNNBiLSTMCRF(nn.Module):
if 'crf' in name:
nn.init.zeros_(param)

- def _forward(self, words, cap_words, seq_len, target=None):
- words = self.embedding(words)
- chars = self.char_embedding(cap_words)
- words = torch.cat([words, chars], dim=-1)
+ def _forward(self, words, seq_len, target=None):
+ word_embeds = self.embedding(words)
+ char_embeds = self.char_embedding(words)
+ words = torch.cat((word_embeds, char_embeds), dim=-1)
outputs, _ = self.lstm(words, seq_len)
self.dropout(outputs)

logits = F.log_softmax(self.fc(outputs), dim=-1)

if target is not None:
- loss = self.crf(logits, target, seq_len_to_mask(seq_len))
+ loss = self.crf(logits, target, seq_len_to_mask(seq_len, max_len=logits.size(1))).mean()
return {Const.LOSS: loss}
else:
- pred, _ = self.crf.viterbi_decode(logits, seq_len_to_mask(seq_len))
+ pred, _ = self.crf.viterbi_decode(logits, seq_len_to_mask(seq_len, max_len=logits.size(1)))
return {Const.OUTPUT: pred}

- def forward(self, words, cap_words, seq_len, target):
- return self._forward(words, cap_words, seq_len, target)
+ def forward(self, words, seq_len, target):
+ return self._forward(words, seq_len, target)

- def predict(self, words, cap_words, seq_len):
- return self._forward(words, cap_words, seq_len, None)
+ def predict(self, words, seq_len):
+ return self._forward(words, seq_len, None)
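Two details in the CRF calls above are easy to miss: seq_len_to_mask builds a mask as wide as the longest value in seq_len, which can differ from logits.size(1) if the batch was padded to another width, so max_len pins the mask to the logits; and the CRF returns a per-sample loss, so .mean() reduces it to the scalar the trainer expects. A small shape sketch (the import path is assumed from fastNLP's utils module):

```python
import torch
from fastNLP.core.utils import seq_len_to_mask  # import path assumed

seq_len = torch.tensor([3, 5])
print(seq_len_to_mask(seq_len).shape)             # torch.Size([2, 5]): width = max(seq_len)
print(seq_len_to_mask(seq_len, max_len=8).shape)  # torch.Size([2, 8]): width pinned to, e.g., logits.size(1)
```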

reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py (+35, -25)

@@ -1,6 +1,7 @@
import sys
sys.path.append('../../..')


- from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding, BertEmbedding, ElmoEmbedding, LSTMCharEmbedding
+ from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding, BertEmbedding, ElmoEmbedding, StackEmbedding
from fastNLP.core.vocabulary import VocabularyOption

from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF
@@ -12,7 +13,10 @@ from torch.optim import SGD, Adam
from fastNLP import GradientClipCallback
from fastNLP.core.callback import FitlogCallback, LRScheduler
from torch.optim.lr_scheduler import LambdaLR
- from reproduction.seqence_labelling.ner.model.swats import SWATS
+ from fastNLP.core.optimizer import AdamW
+ # from reproduction.seqence_labelling.ner.model.swats import SWATS
+ from reproduction.seqence_labelling.chinese_ner.callbacks import SaveModelCallback
+ from fastNLP import cache_results

import fitlog
fitlog.debug()
@@ -20,17 +24,20 @@ fitlog.debug()
from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003DataLoader

encoding_type = 'bioes'

- data = Conll2003DataLoader(encoding_type=encoding_type).process('../../../../others/data/conll2003',
- word_vocab_opt=VocabularyOption(min_freq=2),
- lower=False)
+ @cache_results('caches/upper_conll2003.pkl')
+ def load_data():
+ data = Conll2003DataLoader(encoding_type=encoding_type).process('../../../../others/data/conll2003',
+ word_vocab_opt=VocabularyOption(min_freq=1),
+ lower=False)
+ return data
+ data = load_data()
print(data)
- char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30],
- kernel_sizes=[3])
+ char_embed = CNNCharEmbedding(vocab=data.vocabs['words'], embed_size=30, char_emb_size=30, filter_nums=[30],
+ kernel_sizes=[3], word_dropout=0.01, dropout=0.5)
# char_embed = LSTMCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30 ,char_emb_size=30)
- word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
- model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/wiki_en_100_50_case_2.txt',
- requires_grad=True)
+ word_embed = StaticEmbedding(vocab=data.vocabs['words'],
+ model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt',
+ requires_grad=True, lower=True, word_dropout=0.01, dropout=0.5)
word_embed.embedding.weight.data = word_embed.embedding.weight.data/word_embed.embedding.weight.data.std()

# import joblib
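The @cache_results decorator used in the hunk above is fastNLP's on-disk memoization for expensive preprocessing: the first call runs the function and pickles its return value to the given path, and later calls load the pickle instead of recomputing. A minimal usage sketch (the _refresh keyword is an assumption based on fastNLP's documentation):

```python
from fastNLP import cache_results

@cache_results('caches/toy.pkl')
def build_dataset():
    # imagine slow tokenization / vocabulary building here
    return {'train': list(range(10))}

data = build_dataset()               # first run: computes the result and writes caches/toy.pkl
data = build_dataset()               # later runs: loads the pickle instead of recomputing
data = build_dataset(_refresh=True)  # force a rebuild and overwrite the cache (assumed keyword)
```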
@@ -46,25 +53,28 @@ word_embed.embedding.weight.data = word_embed.embedding.weight.data/word_embed.e
# for name, dataset in data.datasets.items():
# dataset.apply_field(convert_to_ids, field_name='raw_words', new_field_name=Const.INPUT)

- # word_embed = ElmoEmbedding(vocab=data.vocabs['cap_words'],
- # model_dir_or_name='/hdd/fudanNLP/fastNLP/others/pretrained_models/elmo_en',
- # requires_grad=True)
+ # elmo_embed = ElmoEmbedding(vocab=data.vocabs['cap_words'],
+ # model_dir_or_name='.',
+ # requires_grad=True, layers='mix')
+ # char_embed = StackEmbedding([elmo_embed, char_embed])

model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET],
encoding_type=encoding_type)

callbacks = [
- GradientClipCallback(clip_type='value', clip_value=5)
- , FitlogCallback({'test':data.datasets['test']}, verbose=1)
+ GradientClipCallback(clip_type='value', clip_value=5),
+ FitlogCallback({'test':data.datasets['test']}, verbose=1),
+ # SaveModelCallback('save_models/', top=3, only_param=False, save_on_exception=True)
]
- # optimizer = Adam(model.parameters(), lr=0.005)
- optimizer = SWATS(model.parameters(), verbose=True)
- # optimizer = SGD(model.parameters(), lr=0.008, momentum=0.9)
- # scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
- # callbacks.append(scheduler)
+ # optimizer = Adam(model.parameters(), lr=0.001)
+ # optimizer = SWATS(model.parameters(), verbose=True)
+ optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
+ scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
+ callbacks.append(scheduler)


- trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(),
- device=1, dev_data=data.datasets['dev'], batch_size=10,
+ trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(batch_size=20),
+ device=1, dev_data=data.datasets['dev'], batch_size=20,
metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
- callbacks=callbacks, num_workers=1, n_epochs=100)
+ callbacks=callbacks, num_workers=2, n_epochs=100)
trainer.train()
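For reference, the re-enabled scheduler decays the SGD learning rate as base_lr / (1 + 0.05 * epoch); with the new base_lr of 0.01 that works out to roughly:

```python
base_lr = 0.01
for epoch in (0, 10, 50, 99):
    print(epoch, round(base_lr / (1 + 0.05 * epoch), 5))
# 0  -> 0.01
# 10 -> 0.00667
# 50 -> 0.00286
# 99 -> 0.00168
```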
