
1. Fix a counter-intuitive bug in CrossEntropyLoss; 2. Update sequence labelling

tags/v0.4.10
yh_cc, 5 years ago
commit 39dd086262
7 changed files with 93 additions and 43 deletions
  1. fastNLP/core/losses.py (+5, -1)
  2. fastNLP/core/trainer.py (+5, -5)
  3. fastNLP/modules/encoder/embedding.py (+3, -3)
  4. reproduction/seqence_labelling/cws/train_shift_relay.py (+3, -7)
  5. reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py (+6, -6)
  6. reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py (+35, -11)
  7. reproduction/seqence_labelling/ner/train_ontonote.py (+36, -10)

fastNLP/core/losses.py (+5, -1)

@@ -232,12 +232,16 @@ class CrossEntropyLoss(LossBase):
     """
     def __init__(self, pred=None, target=None, padding_idx=-100):
-        # TODO some checks are needed: when F.cross_entropy computes the loss, if pred is (16, 10, 4) the target should in principle be (16, 10), but it actually requires (16, 4)
         super(CrossEntropyLoss, self).__init__()
         self._init_param_map(pred=pred, target=target)
         self.padding_idx = padding_idx

     def get_loss(self, pred, target):
+        if pred.dim() > 2:
+            if pred.size()[:2] == target.size():
+                # F.cross_entropy takes the log_softmax over dim 1, so if pred is (16, 10, 4) the dimensions have to be swapped
+                pred = pred.transpose(1, 2)
+
         return F.cross_entropy(input=pred, target=target,
                                ignore_index=self.padding_idx)
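The fix encodes the shape convention of F.cross_entropy: for inputs with more than two dimensions the class scores must sit on dimension 1, i.e. pred of shape (batch, n_classes, seq_len) against a target of shape (batch, seq_len). A minimal standalone sketch of the behaviour the new branch restores (shapes and values are illustrative only):

import torch
import torch.nn.functional as F

batch_size, seq_len, n_classes = 16, 10, 4
pred = torch.randn(batch_size, seq_len, n_classes)            # model output: (batch, seq_len, n_classes)
target = torch.randint(0, n_classes, (batch_size, seq_len))   # gold labels:  (batch, seq_len)

# F.cross_entropy applies log_softmax over dim 1, so the class dimension
# has to be moved there before the loss can be computed for sequence labelling.
if pred.dim() > 2 and pred.size()[:2] == target.size():
    pred = pred.transpose(1, 2)                               # -> (batch, n_classes, seq_len)

loss = F.cross_entropy(pred, target, ignore_index=-100)
print(loss)                                                   # scalar tensor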




fastNLP/core/trainer.py (+5, -5)

@@ -451,9 +451,11 @@ class Trainer(object):
             self.data_iterator = train_data
         else:
             raise TypeError("train_data type {} not support".format(type(train_data)))

+        self.model = _move_model_to_device(model, device=device)
+
         if check_code_level > -1 and isinstance(self.data_iterator, DataSetIter):
-            _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data,
+            _check_code(dataset=train_data, model=self.model, losser=losser, metrics=metrics, dev_data=dev_data,
                         metric_key=metric_key, check_level=check_code_level,
                         batch_size=min(batch_size, DEFAULT_CHECK_BATCH_SIZE))
            # _check_code is how fastNLP checks whether your code is correct. If you see this comment in the error stack, please check your code carefully
@@ -474,9 +476,7 @@ class Trainer(object):
         self.best_dev_perf = None
         self.n_steps = (len(self.train_data) // self.batch_size + int(
             len(self.train_data) % self.batch_size != 0)) * self.n_epochs
-        self.model = _move_model_to_device(self.model, device=device)
-
         if isinstance(optimizer, torch.optim.Optimizer):
             self.optimizer = optimizer
         elif isinstance(optimizer, Optimizer):
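The trainer change only reorders existing steps: the model is moved to the target device before _check_code runs, so the sanity-check forward pass exercises the same self.model object (and device placement) that training will later use. A rough sketch of the ordering in plain PyTorch, with illustrative names (sanity_check is a stand-in, not fastNLP's _check_code):

import torch
import torch.nn as nn

def sanity_check(model, sample, device):
    # Illustrative stand-in for fastNLP's _check_code: run one small forward pass.
    return model(sample.to(device))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = nn.Linear(8, 2)

model = model.to(device)                          # move first ...
sanity_check(model, torch.randn(4, 8), device)    # ... then check the moved model, not the raw argument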


fastNLP/modules/encoder/embedding.py (+3, -3)

@@ -204,7 +204,7 @@ class StaticEmbedding(TokenEmbedding):
             model_url = PRETRAIN_URL + model_name
             model_path = cached_path(model_url)
             # check whether the path exists
-        elif os.path.isfile(model_dir_or_name):
+        elif os.path.isfile(os.path.expanduser(os.path.abspath(model_dir_or_name))):
             model_path = model_dir_or_name
         else:
             raise ValueError(f"Cannot recognize {model_dir_or_name}.")
@@ -455,7 +455,7 @@ class ElmoEmbedding(ContextualEmbedding):
             model_url = PRETRAIN_URL + model_name
             model_dir = cached_path(model_url)
             # check whether the path exists
-        elif os.path.isdir(model_dir_or_name):
+        elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))):
             model_dir = model_dir_or_name
         else:
             raise ValueError(f"Cannot recognize {model_dir_or_name}.")
@@ -553,7 +553,7 @@ class BertEmbedding(ContextualEmbedding):
             model_url = PRETRAIN_URL + model_name
             model_dir = cached_path(model_url)
             # check whether the path exists
-        elif os.path.isdir(model_dir_or_name):
+        elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))):
             model_dir = model_dir_or_name
         else:
             raise ValueError(f"Cannot recognize {model_dir_or_name}.")
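All three embedding classes get the same one-line change: the user-supplied model_dir_or_name is normalized with os.path.expanduser(os.path.abspath(...)) before the isfile/isdir existence check, so relative paths are resolved against the current working directory instead of being rejected. A quick illustration (the path below is made up):

import os

def normalize(path):
    # Same normalization the commit applies before checking for the file/directory.
    return os.path.expanduser(os.path.abspath(path))

p = 'pretrain_vectors/glove.6B.100d.txt'      # hypothetical relative path
print(normalize(p))                           # e.g. /home/user/project/pretrain_vectors/glove.6B.100d.txt
print(os.path.isfile(normalize(p)))           # True only if the file actually exists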


reproduction/seqence_labelling/cws/train_shift_relay.py (+3, -7)

@@ -57,12 +57,8 @@ callbacks = [clipper]
 # if pretrain:
 #     fixer = FixEmbedding([model.char_embedding, model.bigram_embedding], fix_until=fix_until)
 #     callbacks.append(fixer)
-trainer = Trainer(data.datasets['train'], model, optimizer=optimizer, loss=None,
-                  batch_size=32, sampler=sampler, update_every=5,
-                  n_epochs=3, print_every=5,
-                  dev_data=data.datasets['dev'], metrics=RelayMetric(), metric_key='f',
-                  validate_every=-1, save_path=None,
-                  prefetch=True, use_tqdm=True, device=device,
-                  callbacks=callbacks,
+trainer = Trainer(data.datasets['train'], model, optimizer=optimizer, loss=None, batch_size=32, sampler=sampler,
+                  update_every=5, n_epochs=3, print_every=5, dev_data=data.datasets['dev'], metrics=RelayMetric(),
+                  metric_key='f', validate_every=-1, save_path=None, use_tqdm=True, device=device, callbacks=callbacks,
                   check_code_level=0)
 trainer.train()

reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py (+6, -6)

@@ -12,11 +12,11 @@ class CNNBiLSTMCRF(nn.Module):
     def __init__(self, embed, char_embed, hidden_size, num_layers, tag_vocab, dropout=0.5, encoding_type='bioes'):
         super().__init__()

-        self.embedding = Embedding(embed, dropout=0.5)
-        self.char_embedding = Embedding(char_embed, dropout=0.5)
+        self.embedding = Embedding(embed, dropout=0.5, dropout_word=0)
+        self.char_embedding = Embedding(char_embed, dropout=0.5, dropout_word=0.01)
         self.lstm = LSTM(input_size=self.embedding.embedding_dim+self.char_embedding.embedding_dim,
-                         hidden_size=hidden_size//2, num_layers=num_layers,
-                         bidirectional=True, batch_first=True, dropout=dropout)
+                         hidden_size=hidden_size//2, num_layers=num_layers,
+                         bidirectional=True, batch_first=True)
         self.fc = nn.Linear(hidden_size, len(tag_vocab))

         transitions = allowed_transitions(tag_vocab.idx2word, encoding_type=encoding_type, include_start_end=True)
@@ -25,9 +25,9 @@ class CNNBiLSTMCRF(nn.Module):
         self.dropout = nn.Dropout(dropout, inplace=True)

         for name, param in self.named_parameters():
-            if 'ward_fc' in name:
+            if 'fc' in name:
                 if param.data.dim()>1:
-                    nn.init.xavier_normal_(param)
+                    nn.init.xavier_uniform_(param)
                 else:
                     nn.init.constant_(param, 0)
             if 'crf' in name:
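Two things change in the model: the Embedding wrappers now pass an explicit dropout_word (0 for words, 0.01 for characters), and the initialization loop matches the plain 'fc' layer name, which the old 'ward_fc' substring never did for self.fc, switching the weight init to xavier_uniform_. dropout_word is presumably token-level dropout, i.e. whole tokens are replaced rather than individual embedding entries being zeroed; a generic sketch of that idea (my interpretation, not fastNLP's implementation):

import torch

def word_dropout(token_ids, p=0.01, unk_idx=0):
    # Replace each token id with unk_idx with probability p (token-level dropout).
    if p <= 0:
        return token_ids
    drop_mask = torch.rand(token_ids.shape) < p
    return token_ids.masked_fill(drop_mask, unk_idx)

tokens = torch.randint(1, 100, (2, 8))
print(word_dropout(tokens, p=0.5))   # roughly half the ids replaced by 0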


reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py (+35, -11)

@@ -1,6 +1,6 @@


-from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding, BertEmbedding
+from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding, BertEmbedding, ElmoEmbedding, LSTMCharEmbedding
 from fastNLP.core.vocabulary import VocabularyOption

 from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF
@@ -12,6 +12,8 @@ from torch.optim import SGD, Adam
 from fastNLP import GradientClipCallback
 from fastNLP.core.callback import FitlogCallback, LRScheduler
 from torch.optim.lr_scheduler import LambdaLR
+from reproduction.seqence_labelling.ner.model.swats import SWATS

 import fitlog
 fitlog.debug()

@@ -19,28 +21,50 @@ from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003DataLoader

 encoding_type = 'bioes'

-data = Conll2003DataLoader(encoding_type=encoding_type).process('/hdd/fudanNLP/fastNLP/others/data/conll2003',
-                                                                word_vocab_opt=VocabularyOption(min_freq=2))
+data = Conll2003DataLoader(encoding_type=encoding_type).process('../../../../others/data/conll2003',
+                                                                word_vocab_opt=VocabularyOption(min_freq=2),
+                                                                lower=False)
 print(data)
 char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30],
                               kernel_sizes=[3])
+# char_embed = LSTMCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30 ,char_emb_size=30)
 word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
-                             model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt',
+                             model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/wiki_en_100_50_case_2.txt',
                              requires_grad=True)
 word_embed.embedding.weight.data = word_embed.embedding.weight.data/word_embed.embedding.weight.data.std()

+# import joblib
+# raw_data = joblib.load('/hdd/fudanNLP/fastNLP/others/NER-with-LS/data/conll_with_data.joblib')
+# def convert_to_ids(raw_words):
+#     ids = []
+#     for word in raw_words:
+#         id = raw_data['word_to_id'][word]
+#         id = raw_data['id_to_emb_map'][id]
+#         ids.append(id)
+#     return ids
+# word_embed = raw_data['emb_matrix']
+# for name, dataset in data.datasets.items():
+#     dataset.apply_field(convert_to_ids, field_name='raw_words', new_field_name=Const.INPUT)
+
+# word_embed = ElmoEmbedding(vocab=data.vocabs['cap_words'],
+#                            model_dir_or_name='/hdd/fudanNLP/fastNLP/others/pretrained_models/elmo_en',
+#                            requires_grad=True)
+
 model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET],
                      encoding_type=encoding_type)

-optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
-scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
-
-callbacks = [GradientClipCallback(clip_type='value', clip_value=5), FitlogCallback({'test':data.datasets['test'],
-                                                                                    'train':data.datasets['train']}, verbose=1),
-             scheduler]
+callbacks = [
+    GradientClipCallback(clip_type='value', clip_value=5)
+    , FitlogCallback({'test':data.datasets['test']}, verbose=1)
+    ]
+# optimizer = Adam(model.parameters(), lr=0.005)
+optimizer = SWATS(model.parameters(), verbose=True)
+# optimizer = SGD(model.parameters(), lr=0.008, momentum=0.9)
+# scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
+# callbacks.append(scheduler)

 trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(),
-                  device=0, dev_data=data.datasets['dev'], batch_size=10,
+                  device=1, dev_data=data.datasets['dev'], batch_size=10,
                   metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
                   callbacks=callbacks, num_workers=1, n_epochs=100)
 trainer.train()
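The CoNLL-2003 script switches the optimizer to SWATS and keeps the SGD + LambdaLR schedule around as comments. For reference, that schedule multiplies the base learning rate by 1 / (1 + 0.05 * epoch); a standalone sketch in plain PyTorch (the script's LRScheduler callback wraps a scheduler like this one), using the commented-out lr of 0.008 as the base:

import torch.nn as nn
from torch.optim import SGD
from torch.optim.lr_scheduler import LambdaLR

model = nn.Linear(10, 2)                      # placeholder model
optimizer = SGD(model.parameters(), lr=0.008, momentum=0.9)
scheduler = LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch))

for epoch in range(5):
    # ... one epoch of training would go here ...
    scheduler.step()
    print(epoch, optimizer.param_groups[0]['lr'])
# the lr decays as 0.008 / (1 + 0.05 * epoch): ~0.0076, ~0.0073, ...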

reproduction/seqence_labelling/ner/train_ontonote.py (+36, -10)

@@ -1,4 +1,6 @@
+import sys
+
+sys.path.append('../../..')

 from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding

@@ -8,8 +10,11 @@ from fastNLP import SpanFPreRecMetric
 from fastNLP import BucketSampler
 from fastNLP import Const
 from torch.optim import SGD, Adam
+from torch.optim.lr_scheduler import LambdaLR
 from fastNLP import GradientClipCallback
-from fastNLP.core.callback import FitlogCallback
+from fastNLP.core.callback import FitlogCallback, LRScheduler
+from reproduction.seqence_labelling.ner.model.swats import SWATS

 import fitlog
 fitlog.debug()

@@ -17,23 +22,44 @@ from reproduction.seqence_labelling.ner.data.OntoNoteLoader import OntoNoteNERDataLoader

 encoding_type = 'bioes'

-data = OntoNoteNERDataLoader(encoding_type=encoding_type).process('/hdd/fudanNLP/fastNLP/others/data/v4/english')
+data = OntoNoteNERDataLoader(encoding_type=encoding_type).process('/hdd/fudanNLP/fastNLP/others/data/v4/english',
+                                                                  lower=True)
+
+import joblib
+raw_data = joblib.load('/hdd/fudanNLP/fastNLP/others/NER-with-LS/data/ontonotes_with_data.joblib')
+def convert_to_ids(raw_words):
+    ids = []
+    for word in raw_words:
+        id = raw_data['word_to_id'][word]
+        id = raw_data['id_to_emb_map'][id]
+        ids.append(id)
+    return ids
+word_embed = raw_data['emb_matrix']
+for name, dataset in data.datasets.items():
+    dataset.apply_field(convert_to_ids, field_name='raw_words', new_field_name=Const.INPUT)
+
 print(data)
 char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30],
                               kernel_sizes=[3])
-word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
-                             model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt',
-                             requires_grad=True)
+# word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
+#                              model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt',
+#                              requires_grad=True)

-model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=400, num_layers=2, tag_vocab=data.vocabs[Const.TARGET],
+model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=1200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET],
                      encoding_type=encoding_type)

-optimizer = SGD(model.parameters(), lr=0.015, momentum=0.9)
-callbacks = [GradientClipCallback(clip_value=5, clip_type='value'),
-             FitlogCallback(data.datasets['test'], verbose=1)]
-
+optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
+scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
+callbacks.append(scheduler)
+# optimizer = SWATS(model.parameters(), verbose=True)
+# optimizer = Adam(model.parameters(), lr=0.005)

+callbacks = [GradientClipCallback(), FitlogCallback(data.datasets['test'], verbose=1)]

-trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(),
-                  device=1, dev_data=data.datasets['dev'], batch_size=32,
+trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(num_buckets=100),
+                  device=0, dev_data=data.datasets['dev'], batch_size=10,
                   metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
                   callbacks=callbacks, num_workers=1, n_epochs=100)
 trainer.train()
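The OntoNotes script temporarily swaps StaticEmbedding for a pre-built matrix loaded with joblib: each raw word is mapped to a vocabulary id and then to a row index of emb_matrix, and those row indices become the Const.INPUT field via apply_field. A toy version of the two-step lookup (the dictionaries below are made up for illustration):

# Toy stand-ins for raw_data['word_to_id'] and raw_data['id_to_emb_map'].
word_to_id = {'the': 0, 'cat': 1, 'sat': 2}
id_to_emb_map = {0: 7, 1: 3, 2: 12}           # vocabulary id -> row of the embedding matrix

def convert_to_ids(raw_words):
    ids = []
    for word in raw_words:
        vocab_id = word_to_id[word]           # word -> vocabulary id
        ids.append(id_to_emb_map[vocab_id])   # vocabulary id -> embedding-matrix row
    return ids

print(convert_to_ids(['the', 'cat', 'sat']))  # [7, 3, 12]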
