diff --git a/fastNLP/modules/encoder/embedding.py b/fastNLP/modules/encoder/embedding.py
index b91e5502..f7d840ad 100644
--- a/fastNLP/modules/encoder/embedding.py
+++ b/fastNLP/modules/encoder/embedding.py
@@ -135,7 +135,7 @@ class TokenEmbedding(nn.Module):
         :param torch.LongTensor words: batch_size x max_len
         :return:
         """
-        if self.dropout_word > 0 and self.training:
+        if self.word_dropout > 0 and self.training:
             mask = torch.ones_like(words).float() * self.word_dropout
             mask = torch.bernoulli(mask).byte()  # dropout_word越大,越多位置为1
             words = words.masked_fill(mask, self._word_unk_index)
@@ -174,8 +174,16 @@ class TokenEmbedding(nn.Module):
     def embed_size(self) -> int:
         return self._embed_size
 
+    @property
+    def embedding_dim(self) -> int:
+        return self._embed_size
+
     @property
     def num_embedding(self) -> int:
+        """
+        这个值可能会大于实际的embedding矩阵的大小。
+        :return:
+        """
         return len(self._word_vocab)
 
     def get_word_vocab(self):
@@ -810,7 +818,7 @@ class CNNCharEmbedding(TokenEmbedding):
         # 为1的地方为mask
         chars_masks = chars.eq(self.char_pad_index)  # batch_size x max_len x max_word_len 如果为0, 说明是padding的位置了
         chars = self.char_embedding(chars)  # batch_size x max_len x max_word_len x embed_size
-        self.dropout(chars)
+        chars = self.dropout(chars)
         reshaped_chars = chars.reshape(batch_size*max_len, max_word_len, -1)
         reshaped_chars = reshaped_chars.transpose(1, 2)  # B' x E x M
         conv_chars = [conv(reshaped_chars).transpose(1, 2).reshape(batch_size, max_len, max_word_len, -1)
@@ -962,7 +970,7 @@ class LSTMCharEmbedding(TokenEmbedding):
 
         chars = self.fc(chars)
 
-        return self.dropout(words)
+        return self.dropout(chars)
 
     @property
     def requires_grad(self):
diff --git a/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py b/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py
index e8e7f6d2..79d704ba 100644
--- a/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py
+++ b/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py
@@ -11,9 +11,8 @@ from fastNLP import Const
 class CNNBiLSTMCRF(nn.Module):
     def __init__(self, embed, char_embed, hidden_size, num_layers, tag_vocab, dropout=0.5, encoding_type='bioes'):
         super().__init__()
-
-        self.embedding = Embedding(embed, dropout=0.5, dropout_word=0)
-        self.char_embedding = Embedding(char_embed, dropout=0.5, dropout_word=0.01)
+        self.embedding = embed
+        self.char_embedding = char_embed
         self.lstm = LSTM(input_size=self.embedding.embedding_dim+self.char_embedding.embedding_dim,
                          hidden_size=hidden_size//2, num_layers=num_layers,
                          bidirectional=True, batch_first=True)
@@ -33,24 +32,24 @@ class CNNBiLSTMCRF(nn.Module):
             if 'crf' in name:
                 nn.init.zeros_(param)
 
-    def _forward(self, words, cap_words, seq_len, target=None):
-        words = self.embedding(words)
-        chars = self.char_embedding(cap_words)
-        words = torch.cat([words, chars], dim=-1)
+    def _forward(self, words, seq_len, target=None):
+        word_embeds = self.embedding(words)
+        char_embeds = self.char_embedding(words)
+        words = torch.cat((word_embeds, char_embeds), dim=-1)
 
         outputs, _ = self.lstm(words, seq_len)
         self.dropout(outputs)
 
         logits = F.log_softmax(self.fc(outputs), dim=-1)
 
         if target is not None:
-            loss = self.crf(logits, target, seq_len_to_mask(seq_len))
+            loss = self.crf(logits, target, seq_len_to_mask(seq_len, max_len=logits.size(1))).mean()
             return {Const.LOSS: loss}
         else:
-            pred, _ = self.crf.viterbi_decode(logits, seq_len_to_mask(seq_len))
+            pred, _ = self.crf.viterbi_decode(logits, seq_len_to_mask(seq_len, max_len=logits.size(1)))
             return {Const.OUTPUT: pred}
 
-    def forward(self, words, cap_words, seq_len, target):
-        return self._forward(words, cap_words, seq_len, target)
+    def forward(self, words, seq_len, target):
+        return self._forward(words, seq_len, target)
 
-    def predict(self, words, cap_words, seq_len):
-        return self._forward(words, cap_words, seq_len, None)
+    def predict(self, words, seq_len):
+        return self._forward(words, seq_len, None)
diff --git a/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py b/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py
index cf491f3b..e9d18048 100644
--- a/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py
+++ b/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py
@@ -1,6 +1,7 @@
+import sys
+sys.path.append('../../..')
 
-
-from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding, BertEmbedding, ElmoEmbedding, LSTMCharEmbedding
+from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding, BertEmbedding, ElmoEmbedding, StackEmbedding
 from fastNLP.core.vocabulary import VocabularyOption
 
 from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF
@@ -12,7 +13,10 @@ from torch.optim import SGD, Adam
 from fastNLP import GradientClipCallback
 from fastNLP.core.callback import FitlogCallback, LRScheduler
 from torch.optim.lr_scheduler import LambdaLR
-from reproduction.seqence_labelling.ner.model.swats import SWATS
+from fastNLP.core.optimizer import AdamW
+# from reproduction.seqence_labelling.ner.model.swats import SWATS
+from reproduction.seqence_labelling.chinese_ner.callbacks import SaveModelCallback
+from fastNLP import cache_results
 
 import fitlog
 fitlog.debug()
@@ -20,17 +24,20 @@ fitlog.debug()
 from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003DataLoader
 
 encoding_type = 'bioes'
-
-data = Conll2003DataLoader(encoding_type=encoding_type).process('../../../../others/data/conll2003',
-                                                                word_vocab_opt=VocabularyOption(min_freq=2),
-                                                                lower=False)
+@cache_results('caches/upper_conll2003.pkl')
+def load_data():
+    data = Conll2003DataLoader(encoding_type=encoding_type).process('../../../../others/data/conll2003',
+                                                                    word_vocab_opt=VocabularyOption(min_freq=1),
+                                                                    lower=False)
+    return data
+data = load_data()
 print(data)
 
-char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30],
-                              kernel_sizes=[3])
+char_embed = CNNCharEmbedding(vocab=data.vocabs['words'], embed_size=30, char_emb_size=30, filter_nums=[30],
+                              kernel_sizes=[3], word_dropout=0.01, dropout=0.5)
 # char_embed = LSTMCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30 ,char_emb_size=30)
-word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
-                             model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/wiki_en_100_50_case_2.txt',
-                             requires_grad=True)
+word_embed = StaticEmbedding(vocab=data.vocabs['words'],
+                             model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt',
+                             requires_grad=True, lower=True, word_dropout=0.01, dropout=0.5)
 word_embed.embedding.weight.data = word_embed.embedding.weight.data/word_embed.embedding.weight.data.std()
 
 # import joblib
@@ -46,25 +53,28 @@ word_embed.embedding.weight.data = word_embed.embedding.weight.data/word_embed.e
 #     for name, dataset in data.datasets.items():
 #         dataset.apply_field(convert_to_ids, field_name='raw_words', new_field_name=Const.INPUT)
 
-# word_embed = ElmoEmbedding(vocab=data.vocabs['cap_words'],
-#                            model_dir_or_name='/hdd/fudanNLP/fastNLP/others/pretrained_models/elmo_en',
-#                            requires_grad=True)
+# elmo_embed = ElmoEmbedding(vocab=data.vocabs['cap_words'],
+#                            model_dir_or_name='.',
+#                            requires_grad=True, layers='mix')
+# char_embed = StackEmbedding([elmo_embed, char_embed])
 
 model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET],
                      encoding_type=encoding_type)
 
 callbacks = [
-    GradientClipCallback(clip_type='value', clip_value=5)
-    , FitlogCallback({'test':data.datasets['test']}, verbose=1)
+    GradientClipCallback(clip_type='value', clip_value=5),
+    FitlogCallback({'test':data.datasets['test']}, verbose=1),
+    # SaveModelCallback('save_models/', top=3, only_param=False, save_on_exception=True)
 ]
-# optimizer = Adam(model.parameters(), lr=0.005)
-optimizer = SWATS(model.parameters(), verbose=True)
-# optimizer = SGD(model.parameters(), lr=0.008, momentum=0.9)
-# scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
-# callbacks.append(scheduler)
+# optimizer = Adam(model.parameters(), lr=0.001)
+# optimizer = SWATS(model.parameters(), verbose=True)
+optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
+scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
+callbacks.append(scheduler)
+
 
-trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(),
-                  device=1, dev_data=data.datasets['dev'], batch_size=10,
+trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(batch_size=20),
+                  device=1, dev_data=data.datasets['dev'], batch_size=20,
                   metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET],
                                             encoding_type=encoding_type),
-                  callbacks=callbacks, num_workers=1, n_epochs=100)
+                  callbacks=callbacks, num_workers=2, n_epochs=100)
 
 trainer.train()
\ No newline at end of file
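
Note, not part of the patch: the first hunk is what makes word-level dropout actually fire, since TokenEmbedding.drop_word previously tested `self.dropout_word` while the rest of the method uses `self.word_dropout`. A minimal standalone sketch of the same masking logic; the `drop_words` helper, the toy indices and the 0.5 rate are illustrative only, not fastNLP API:

import torch

def drop_words(words: torch.LongTensor, word_dropout: float, unk_index: int) -> torch.LongTensor:
    # Mirrors TokenEmbedding.drop_word: each position is independently replaced
    # by the UNK index with probability word_dropout (only meaningful in training mode).
    mask = torch.bernoulli(torch.ones_like(words).float() * word_dropout).bool()
    return words.masked_fill(mask, unk_index)

words = torch.LongTensor([[4, 7, 9, 0], [5, 2, 0, 0]])   # toy batch of word ids
print(drop_words(words, word_dropout=0.5, unk_index=1))  # roughly half the ids become 1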
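
Likewise, a small sketch of what the added `max_len=logits.size(1)` arguments buy in the CRF calls: `seq_len_to_mask` sizes its mask from `seq_len.max()` by default, so if the padded time dimension of `logits` is ever longer than the longest true length in the batch, the mask and the logits stop lining up. The import path and the deliberately over-padded tensor below are assumptions for illustration:

import torch
from fastNLP.core.utils import seq_len_to_mask

seq_len = torch.LongTensor([3, 5])   # true lengths of a batch of two sequences
logits = torch.randn(2, 6, 4)        # batch padded to 6 time steps, 4 tags

mask_default = seq_len_to_mask(seq_len)                          # shape (2, 5), built from seq_len.max()
mask_aligned = seq_len_to_mask(seq_len, max_len=logits.size(1))  # shape (2, 6), matches logits along dim 1
print(mask_default.shape, mask_aligned.shape)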