diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py
index 66234ce7..62e7a8c8 100644
--- a/fastNLP/core/losses.py
+++ b/fastNLP/core/losses.py
@@ -232,12 +232,16 @@ class CrossEntropyLoss(LossBase):
     """
     
     def __init__(self, pred=None, target=None, padding_idx=-100):
-        # TODO Some checks are needed: when F.cross_entropy computes the loss, if pred is (16, 10, 4), target should in principle be (16, 10), but it actually has to be (16, 4)
         super(CrossEntropyLoss, self).__init__()
         self._init_param_map(pred=pred, target=target)
         self.padding_idx = padding_idx
     
     def get_loss(self, pred, target):
+        if pred.dim()>2:
+            if pred.size()[:2]==target.size():
+                # F.cross_entropy applies log_softmax over dimension 1, so if pred is (16, 10, 4) the class dimension has to be swapped into that position first
+                pred = pred.transpose(1, 2)
+
         return F.cross_entropy(input=pred, target=target,
                                ignore_index=self.padding_idx)
diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py
index 55bc4ee0..a303f742 100644
--- a/fastNLP/core/trainer.py
+++ b/fastNLP/core/trainer.py
@@ -451,9 +451,11 @@ class Trainer(object):
             self.data_iterator = train_data
         else:
             raise TypeError("train_data type {} not support".format(type(train_data)))
-        
+
+        self.model = _move_model_to_device(model, device=device)
+
         if check_code_level > -1 and isinstance(self.data_iterator, DataSetIter):
-            _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data,
+            _check_code(dataset=train_data, model=self.model, losser=losser, metrics=metrics, dev_data=dev_data,
                         metric_key=metric_key, check_level=check_code_level,
                         batch_size=min(batch_size, DEFAULT_CHECK_BATCH_SIZE))
             # _check_code is how fastNLP helps you verify that your code is correct. If you see this comment in an error traceback, please check your code carefully
@@ -474,9 +476,7 @@ class Trainer(object):
         self.best_dev_perf = None
         self.n_steps = (len(self.train_data) // self.batch_size + int(
             len(self.train_data) % self.batch_size != 0)) * self.n_epochs
-        
-        self.model = _move_model_to_device(self.model, device=device)
-        
+
         if isinstance(optimizer, torch.optim.Optimizer):
             self.optimizer = optimizer
         elif isinstance(optimizer, Optimizer):
diff --git a/fastNLP/modules/encoder/embedding.py b/fastNLP/modules/encoder/embedding.py
index aa7b399c..c6c95bb7 100644
--- a/fastNLP/modules/encoder/embedding.py
+++ b/fastNLP/modules/encoder/embedding.py
@@ -204,7 +204,7 @@ class StaticEmbedding(TokenEmbedding):
             model_url = PRETRAIN_URL + model_name
             model_path = cached_path(model_url)
             # check whether the file exists
-        elif os.path.isfile(model_dir_or_name):
+        elif os.path.isfile(os.path.expanduser(os.path.abspath(model_dir_or_name))):
             model_path = model_dir_or_name
         else:
             raise ValueError(f"Cannot recognize {model_dir_or_name}.")
@@ -455,7 +455,7 @@ class ElmoEmbedding(ContextualEmbedding):
             model_url = PRETRAIN_URL + model_name
             model_dir = cached_path(model_url)
             # check whether the directory exists
-        elif os.path.isdir(model_dir_or_name):
+        elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))):
             model_dir = model_dir_or_name
         else:
             raise ValueError(f"Cannot recognize {model_dir_or_name}.")
@@ -553,7 +553,7 @@ class BertEmbedding(ContextualEmbedding):
             model_url = PRETRAIN_URL + model_name
             model_dir = cached_path(model_url)
             # check whether the directory exists
-        elif os.path.isdir(model_dir_or_name):
+        elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))):
             model_dir = model_dir_or_name
         else:
             raise ValueError(f"Cannot recognize {model_dir_or_name}.")
diff --git a/reproduction/seqence_labelling/cws/train_shift_relay.py b/reproduction/seqence_labelling/cws/train_shift_relay.py
index 805521e7..55576575 100644
--- a/reproduction/seqence_labelling/cws/train_shift_relay.py
+++ b/reproduction/seqence_labelling/cws/train_shift_relay.py
@@ -57,12 +57,8 @@
 callbacks = [clipper]
 # if pretrain:
 #     fixer = FixEmbedding([model.char_embedding, model.bigram_embedding], fix_until=fix_until)
 #     callbacks.append(fixer)
-trainer = Trainer(data.datasets['train'], model, optimizer=optimizer, loss=None,
-                  batch_size=32, sampler=sampler, update_every=5,
-                  n_epochs=3, print_every=5,
-                  dev_data=data.datasets['dev'], metrics=RelayMetric(), metric_key='f',
-                  validate_every=-1, save_path=None,
-                  prefetch=True, use_tqdm=True, device=device,
-                  callbacks=callbacks,
+trainer = Trainer(data.datasets['train'], model, optimizer=optimizer, loss=None, batch_size=32, sampler=sampler,
+                  update_every=5, n_epochs=3, print_every=5, dev_data=data.datasets['dev'], metrics=RelayMetric(),
+                  metric_key='f', validate_every=-1, save_path=None, use_tqdm=True, device=device, callbacks=callbacks,
                   check_code_level=0)
 
 trainer.train()
\ No newline at end of file
diff --git a/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py b/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py
index 36d86651..e8e7f6d2 100644
--- a/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py
+++ b/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py
@@ -12,11 +12,11 @@ class CNNBiLSTMCRF(nn.Module):
     def __init__(self, embed, char_embed, hidden_size, num_layers,
                  tag_vocab, dropout=0.5, encoding_type='bioes'):
         super().__init__()
-        self.embedding = Embedding(embed, dropout=0.5)
-        self.char_embedding = Embedding(char_embed, dropout=0.5)
+        self.embedding = Embedding(embed, dropout=0.5, dropout_word=0)
+        self.char_embedding = Embedding(char_embed, dropout=0.5, dropout_word=0.01)
         self.lstm = LSTM(input_size=self.embedding.embedding_dim+self.char_embedding.embedding_dim,
-                         hidden_size=hidden_size//2, num_layers=num_layers,
-                         bidirectional=True, batch_first=True, dropout=dropout)
+                         hidden_size=hidden_size//2, num_layers=num_layers,
+                         bidirectional=True, batch_first=True)
         self.fc = nn.Linear(hidden_size, len(tag_vocab))
 
         transitions = allowed_transitions(tag_vocab.idx2word, encoding_type=encoding_type, include_start_end=True)
@@ -25,9 +25,9 @@ class CNNBiLSTMCRF(nn.Module):
         self.dropout = nn.Dropout(dropout, inplace=True)
 
         for name, param in self.named_parameters():
-            if 'ward_fc' in name:
+            if 'fc' in name:
                 if param.data.dim()>1:
-                    nn.init.xavier_normal_(param)
+                    nn.init.xavier_uniform_(param)
                 else:
                     nn.init.constant_(param, 0)
             if 'crf' in name:
diff --git a/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py b/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py
index 507be4f6..cf491f3b 100644
--- a/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py
+++ b/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py
@@ -1,6 +1,6 @@
 
-from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding, BertEmbedding
+from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding, BertEmbedding, ElmoEmbedding, LSTMCharEmbedding
 from fastNLP.core.vocabulary import VocabularyOption
 
 from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF
 
@@ -12,6 +12,8 @@ from torch.optim import SGD, Adam
 from fastNLP import GradientClipCallback
 from fastNLP.core.callback import FitlogCallback, LRScheduler
 from torch.optim.lr_scheduler import LambdaLR
+from reproduction.seqence_labelling.ner.model.swats import SWATS
+
 import fitlog
 fitlog.debug()
 
@@ -19,28 +21,50 @@ from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003Dat
 
 encoding_type = 'bioes'
 
-data = Conll2003DataLoader(encoding_type=encoding_type).process('/hdd/fudanNLP/fastNLP/others/data/conll2003',
-                                                                word_vocab_opt=VocabularyOption(min_freq=2))
+data = Conll2003DataLoader(encoding_type=encoding_type).process('../../../../others/data/conll2003',
+                                                                word_vocab_opt=VocabularyOption(min_freq=2),
+                                                                lower=False)
 print(data)
 
 char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30],
                               kernel_sizes=[3])
+# char_embed = LSTMCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30 ,char_emb_size=30)
 word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
-                             model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt',
+                             model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/wiki_en_100_50_case_2.txt',
                              requires_grad=True)
 word_embed.embedding.weight.data = word_embed.embedding.weight.data/word_embed.embedding.weight.data.std()
 
+# import joblib
+# raw_data = joblib.load('/hdd/fudanNLP/fastNLP/others/NER-with-LS/data/conll_with_data.joblib')
+# def convert_to_ids(raw_words):
+#     ids = []
+#     for word in raw_words:
+#         id = raw_data['word_to_id'][word]
+#         id = raw_data['id_to_emb_map'][id]
+#         ids.append(id)
+#     return ids
+# word_embed = raw_data['emb_matrix']
+# for name, dataset in data.datasets.items():
+#     dataset.apply_field(convert_to_ids, field_name='raw_words', new_field_name=Const.INPUT)
+
+# word_embed = ElmoEmbedding(vocab=data.vocabs['cap_words'],
+#                            model_dir_or_name='/hdd/fudanNLP/fastNLP/others/pretrained_models/elmo_en',
+#                            requires_grad=True)
+
 model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET],
                      encoding_type=encoding_type)
 
-optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
-scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
-
-callbacks = [GradientClipCallback(clip_type='value', clip_value=5), FitlogCallback({'test':data.datasets['test'],
-                                                                                    'train':data.datasets['train']}, verbose=1),
-             scheduler]
+callbacks = [
+    GradientClipCallback(clip_type='value', clip_value=5)
+    , FitlogCallback({'test':data.datasets['test']}, verbose=1)
+    ]
+# optimizer = Adam(model.parameters(), lr=0.005)
+optimizer = SWATS(model.parameters(), verbose=True)
+# optimizer = SGD(model.parameters(), lr=0.008, momentum=0.9)
+# scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
+# callbacks.append(scheduler)
 
 trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(),
-                  device=0, dev_data=data.datasets['dev'], batch_size=10,
+                  device=1, dev_data=data.datasets['dev'], batch_size=10,
                   metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
                   callbacks=callbacks, num_workers=1, n_epochs=100)
 
 trainer.train()
\ No newline at end of file
diff --git a/reproduction/seqence_labelling/ner/train_ontonote.py b/reproduction/seqence_labelling/ner/train_ontonote.py
index e2a4158a..6548cb9f 100644
--- a/reproduction/seqence_labelling/ner/train_ontonote.py
+++ b/reproduction/seqence_labelling/ner/train_ontonote.py
@@ -1,4 +1,6 @@
+import sys
+sys.path.append('../../..')
 
 from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding
 
 
@@ -8,8 +10,11 @@ from fastNLP import SpanFPreRecMetric
 from fastNLP import BucketSampler
 from fastNLP import Const
 from torch.optim import SGD, Adam
+from torch.optim.lr_scheduler import LambdaLR
 from fastNLP import GradientClipCallback
-from fastNLP.core.callback import FitlogCallback
+from fastNLP.core.callback import FitlogCallback, LRScheduler
+from reproduction.seqence_labelling.ner.model.swats import SWATS
+
 import fitlog
 fitlog.debug()
 
@@ -17,23 +22,44 @@ from reproduction.seqence_labelling.ner.data.OntoNoteLoader import OntoNoteNERDa
 
 encoding_type = 'bioes'
 
-data = OntoNoteNERDataLoader(encoding_type=encoding_type).process('/hdd/fudanNLP/fastNLP/others/data/v4/english')
+data = OntoNoteNERDataLoader(encoding_type=encoding_type).process('/hdd/fudanNLP/fastNLP/others/data/v4/english',
+                                                                  lower=True)
+
+import joblib
+raw_data = joblib.load('/hdd/fudanNLP/fastNLP/others/NER-with-LS/data/ontonotes_with_data.joblib')
+def convert_to_ids(raw_words):
+    ids = []
+    for word in raw_words:
+        id = raw_data['word_to_id'][word]
+        id = raw_data['id_to_emb_map'][id]
+        ids.append(id)
+    return ids
+word_embed = raw_data['emb_matrix']
+for name, dataset in data.datasets.items():
+    dataset.apply_field(convert_to_ids, field_name='raw_words', new_field_name=Const.INPUT)
+
 print(data)
 
 char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30],
                               kernel_sizes=[3])
-word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
-                             model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt',
-                             requires_grad=True)
+# word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
+#                              model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt',
+#                              requires_grad=True)
 
-model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=400, num_layers=2, tag_vocab=data.vocabs[Const.TARGET],
+model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=1200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET],
                      encoding_type=encoding_type)
 
-optimizer = SGD(model.parameters(), lr=0.015, momentum=0.9)
+callbacks = [GradientClipCallback(clip_value=5, clip_type='value'),
+             FitlogCallback(data.datasets['test'], verbose=1)]
+
+optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
+scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
+callbacks.append(scheduler)
+# optimizer = SWATS(model.parameters(), verbose=True)
+# optimizer = Adam(model.parameters(), lr=0.005)
 
-callbacks = [GradientClipCallback(), FitlogCallback(data.datasets['test'], verbose=1)]
-trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(),
-                  device=1, dev_data=data.datasets['dev'], batch_size=32,
+trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(num_buckets=100),
+                  device=0, dev_data=data.datasets['dev'], batch_size=10,
                   metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
                   callbacks=callbacks, num_workers=1, n_epochs=100)
 
 trainer.train()
\ No newline at end of file
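
A minimal standalone sketch of the shape contract that the new branch in CrossEntropyLoss.get_loss (losses.py above) relies on. The (16, 10, 4) / (16, 10) shapes are only the illustrative values taken from the comment in losses.py; torch.nn.functional.cross_entropy expects the class dimension in position 1, so a (batch, seq_len, num_classes) prediction is transposed before the call.

    import torch
    import torch.nn.functional as F

    pred = torch.randn(16, 10, 4)            # (batch, seq_len, num_classes); illustrative shapes only
    target = torch.randint(0, 4, (16, 10))   # (batch, seq_len)

    # F.cross_entropy applies log_softmax over dim 1, i.e. it expects (N, C, d1),
    # so a sequence-labelling output shaped (N, d1, C) is transposed first.
    if pred.dim() > 2 and pred.size()[:2] == target.size():
        pred = pred.transpose(1, 2)          # -> (16, 4, 10)

    loss = F.cross_entropy(input=pred, target=target, ignore_index=-100)
    print(loss.item())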