
1. Fix the counterintuitive bug in CrossEntropyLoss; 2. Update sequence labeling

tags/v0.4.10
yh_cc 5 years ago
parent
commit 39dd086262
7 changed files with 93 additions and 43 deletions
  1. +5  -1   fastNLP/core/losses.py
  2. +5  -5   fastNLP/core/trainer.py
  3. +3  -3   fastNLP/modules/encoder/embedding.py
  4. +3  -7   reproduction/seqence_labelling/cws/train_shift_relay.py
  5. +6  -6   reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py
  6. +35 -11  reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py
  7. +36 -10  reproduction/seqence_labelling/ner/train_ontonote.py

+5 -1  fastNLP/core/losses.py View File

@@ -232,12 +232,16 @@ class CrossEntropyLoss(LossBase):
"""
def __init__(self, pred=None, target=None, padding_idx=-100):
# TODO 需要做一些检查,F.cross_entropy在计算时,如果pred是(16, 10 ,4), target的形状按道理应该是(16, 10), 但实际需要(16,4)
super(CrossEntropyLoss, self).__init__()
self._init_param_map(pred=pred, target=target)
self.padding_idx = padding_idx
def get_loss(self, pred, target):
if pred.dim()>2:
if pred.size()[:2]==target.size():
# F.cross_entropy在计算时,如果pred是(16, 10 ,4), 会在第二维上去log_softmax, 所以需要交换一下位置
pred = pred.transpose(1, 2)

return F.cross_entropy(input=pred, target=target,
ignore_index=self.padding_idx)
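For reference, a minimal sketch (hypothetical shapes) of the counterintuitive behavior this hunk works around: F.cross_entropy treats dim 1 of a 3-d input as the class dimension, so a (16, 10, 4) pred would be paired with a (16, 4) target unless the last two dims are swapped first.

import torch
import torch.nn.functional as F

pred = torch.randn(16, 10, 4)             # (batch, seq_len, n_classes)
target = torch.randint(0, 4, (16, 10))    # (batch, seq_len)

# Without the transpose, F.cross_entropy would expect a target of shape (16, 4).
loss = F.cross_entropy(pred.transpose(1, 2), target)   # input becomes (16, 4, 10)
print(loss.item())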



+5 -5  fastNLP/core/trainer.py View File

@@ -451,9 +451,11 @@ class Trainer(object):
            self.data_iterator = train_data
        else:
            raise TypeError("train_data type {} not support".format(type(train_data)))

        self.model = _move_model_to_device(model, device=device)

        if check_code_level > -1 and isinstance(self.data_iterator, DataSetIter):
            _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data,
            _check_code(dataset=train_data, model=self.model, losser=losser, metrics=metrics, dev_data=dev_data,
                        metric_key=metric_key, check_level=check_code_level,
                        batch_size=min(batch_size, DEFAULT_CHECK_BATCH_SIZE))
        # _check_code is how fastNLP helps you check that your code is correct. If you see this comment in an error traceback, please check your code carefully
@@ -474,9 +476,7 @@ class Trainer(object):
        self.best_dev_perf = None
        self.n_steps = (len(self.train_data) // self.batch_size + int(
            len(self.train_data) % self.batch_size != 0)) * self.n_epochs
        self.model = _move_model_to_device(self.model, device=device)

        if isinstance(optimizer, torch.optim.Optimizer):
            self.optimizer = optimizer
        elif isinstance(optimizer, Optimizer):
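A minimal sketch (hypothetical toy model) of why the _move_model_to_device call is hoisted above _check_code here: the sanity-check forward pass should run with the model already on its target device, matching the device of the batches it will see.

import torch
import torch.nn as nn

model = nn.Linear(10, 2)                               # stand-in for the real model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)                               # analogous to _move_model_to_device
x = torch.randn(4, 10, device=device)                  # a check batch on the same device
print(model(x).shape)                                  # torch.Size([4, 2])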


+3 -3  fastNLP/modules/encoder/embedding.py View File

@@ -204,7 +204,7 @@ class StaticEmbedding(TokenEmbedding):
            model_url = PRETRAIN_URL + model_name
            model_path = cached_path(model_url)
            # check whether the path exists
        elif os.path.isfile(model_dir_or_name):
        elif os.path.isfile(os.path.expanduser(os.path.abspath(model_dir_or_name))):
            model_path = model_dir_or_name
        else:
            raise ValueError(f"Cannot recognize {model_dir_or_name}.")
@@ -455,7 +455,7 @@ class ElmoEmbedding(ContextualEmbedding):
            model_url = PRETRAIN_URL + model_name
            model_dir = cached_path(model_url)
            # check whether the path exists
        elif os.path.isdir(model_dir_or_name):
        elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))):
            model_dir = model_dir_or_name
        else:
            raise ValueError(f"Cannot recognize {model_dir_or_name}.")
@@ -553,7 +553,7 @@ class BertEmbedding(ContextualEmbedding):
            model_url = PRETRAIN_URL + model_name
            model_dir = cached_path(model_url)
            # check whether the path exists
        elif os.path.isdir(model_dir_or_name):
        elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))):
            model_dir = model_dir_or_name
        else:
            raise ValueError(f"Cannot recognize {model_dir_or_name}.")


+3 -7  reproduction/seqence_labelling/cws/train_shift_relay.py View File

@@ -57,12 +57,8 @@ callbacks = [clipper]
# if pretrain:
#     fixer = FixEmbedding([model.char_embedding, model.bigram_embedding], fix_until=fix_until)
#     callbacks.append(fixer)
trainer = Trainer(data.datasets['train'], model, optimizer=optimizer, loss=None,
                  batch_size=32, sampler=sampler, update_every=5,
                  n_epochs=3, print_every=5,
                  dev_data=data.datasets['dev'], metrics=RelayMetric(), metric_key='f',
                  validate_every=-1, save_path=None,
                  prefetch=True, use_tqdm=True, device=device,
                  callbacks=callbacks,
trainer = Trainer(data.datasets['train'], model, optimizer=optimizer, loss=None, batch_size=32, sampler=sampler,
                  update_every=5, n_epochs=3, print_every=5, dev_data=data.datasets['dev'], metrics=RelayMetric(),
                  metric_key='f', validate_every=-1, save_path=None, use_tqdm=True, device=device, callbacks=callbacks,
                  check_code_level=0)
trainer.train()

+6 -6  reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py View File

@@ -12,11 +12,11 @@ class CNNBiLSTMCRF(nn.Module):
    def __init__(self, embed, char_embed, hidden_size, num_layers, tag_vocab, dropout=0.5, encoding_type='bioes'):
        super().__init__()

        self.embedding = Embedding(embed, dropout=0.5)
        self.char_embedding = Embedding(char_embed, dropout=0.5)
        self.embedding = Embedding(embed, dropout=0.5, dropout_word=0)
        self.char_embedding = Embedding(char_embed, dropout=0.5, dropout_word=0.01)
        self.lstm = LSTM(input_size=self.embedding.embedding_dim+self.char_embedding.embedding_dim,
                         hidden_size=hidden_size//2, num_layers=num_layers,
                         bidirectional=True, batch_first=True, dropout=dropout)
                         hidden_size=hidden_size//2, num_layers=num_layers,
                         bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_size, len(tag_vocab))

        transitions = allowed_transitions(tag_vocab.idx2word, encoding_type=encoding_type, include_start_end=True)
@@ -25,9 +25,9 @@ class CNNBiLSTMCRF(nn.Module):
        self.dropout = nn.Dropout(dropout, inplace=True)

        for name, param in self.named_parameters():
            if 'ward_fc' in name:
            if 'fc' in name:
                if param.data.dim()>1:
                    nn.init.xavier_normal_(param)
                    nn.init.xavier_uniform_(param)
                else:
                    nn.init.constant_(param, 0)
            if 'crf' in name:
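A short sketch (hypothetical layer sizes) of the initialization this hunk settles on for the 'fc' parameters: Xavier-uniform for weight matrices, zeros for biases.

import torch.nn as nn

fc = nn.Linear(200, 17)                    # hypothetical hidden_size x num_tags
for name, param in fc.named_parameters():
    if param.data.dim() > 1:
        nn.init.xavier_uniform_(param)     # weight matrix
    else:
        nn.init.constant_(param, 0)        # bias vector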


+35 -11  reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py View File

@@ -1,6 +1,6 @@


from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding, BertEmbedding
from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding, BertEmbedding, ElmoEmbedding, LSTMCharEmbedding
from fastNLP.core.vocabulary import VocabularyOption

from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF
@@ -12,6 +12,8 @@ from torch.optim import SGD, Adam
from fastNLP import GradientClipCallback
from fastNLP.core.callback import FitlogCallback, LRScheduler
from torch.optim.lr_scheduler import LambdaLR
from reproduction.seqence_labelling.ner.model.swats import SWATS

import fitlog
fitlog.debug()

@@ -19,28 +21,50 @@ from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003Dat

encoding_type = 'bioes'

data = Conll2003DataLoader(encoding_type=encoding_type).process('/hdd/fudanNLP/fastNLP/others/data/conll2003',
                                                                word_vocab_opt=VocabularyOption(min_freq=2))
data = Conll2003DataLoader(encoding_type=encoding_type).process('../../../../others/data/conll2003',
                                                                word_vocab_opt=VocabularyOption(min_freq=2),
                                                                lower=False)
print(data)
char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30],
                              kernel_sizes=[3])
# char_embed = LSTMCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30)
word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
                             model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt',
                             model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/wiki_en_100_50_case_2.txt',
                             requires_grad=True)
word_embed.embedding.weight.data = word_embed.embedding.weight.data/word_embed.embedding.weight.data.std()

# import joblib
# raw_data = joblib.load('/hdd/fudanNLP/fastNLP/others/NER-with-LS/data/conll_with_data.joblib')
# def convert_to_ids(raw_words):
#     ids = []
#     for word in raw_words:
#         id = raw_data['word_to_id'][word]
#         id = raw_data['id_to_emb_map'][id]
#         ids.append(id)
#     return ids
# word_embed = raw_data['emb_matrix']
# for name, dataset in data.datasets.items():
#     dataset.apply_field(convert_to_ids, field_name='raw_words', new_field_name=Const.INPUT)

# word_embed = ElmoEmbedding(vocab=data.vocabs['cap_words'],
#                            model_dir_or_name='/hdd/fudanNLP/fastNLP/others/pretrained_models/elmo_en',
#                            requires_grad=True)

model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET],
                     encoding_type=encoding_type)

optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))

callbacks = [GradientClipCallback(clip_type='value', clip_value=5), FitlogCallback({'test':data.datasets['test'],
                                                                                    'train':data.datasets['train']}, verbose=1),
             scheduler]
callbacks = [
    GradientClipCallback(clip_type='value', clip_value=5)
    , FitlogCallback({'test':data.datasets['test']}, verbose=1)
]
# optimizer = Adam(model.parameters(), lr=0.005)
optimizer = SWATS(model.parameters(), verbose=True)
# optimizer = SGD(model.parameters(), lr=0.008, momentum=0.9)
# scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
# callbacks.append(scheduler)

trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(),
                  device=0, dev_data=data.datasets['dev'], batch_size=10,
                  device=1, dev_data=data.datasets['dev'], batch_size=10,
                  metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
                  callbacks=callbacks, num_workers=1, n_epochs=100)
trainer.train()
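A small sketch (toy tensor) of the rescaling applied to word_embed above: dividing the pretrained weights by their standard deviation normalizes their scale to roughly unit std.

import torch

weight = torch.randn(100, 100) * 0.3           # stand-in for pretrained vectors
weight = weight / weight.std()
print(weight.std())                            # ~1.0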

+36 -10  reproduction/seqence_labelling/ner/train_ontonote.py View File

@@ -1,4 +1,6 @@
import sys

sys.path.append('../../..')

from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding

@@ -8,8 +10,11 @@ from fastNLP import SpanFPreRecMetric
from fastNLP import BucketSampler
from fastNLP import Const
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import LambdaLR
from fastNLP import GradientClipCallback
from fastNLP.core.callback import FitlogCallback
from fastNLP.core.callback import FitlogCallback, LRScheduler
from reproduction.seqence_labelling.ner.model.swats import SWATS

import fitlog
fitlog.debug()

@@ -17,23 +22,44 @@ from reproduction.seqence_labelling.ner.data.OntoNoteLoader import OntoNoteNERDa

encoding_type = 'bioes'

data = OntoNoteNERDataLoader(encoding_type=encoding_type).process('/hdd/fudanNLP/fastNLP/others/data/v4/english')
data = OntoNoteNERDataLoader(encoding_type=encoding_type).process('/hdd/fudanNLP/fastNLP/others/data/v4/english',
                                                                  lower=True)

import joblib
raw_data = joblib.load('/hdd/fudanNLP/fastNLP/others/NER-with-LS/data/ontonotes_with_data.joblib')
def convert_to_ids(raw_words):
    ids = []
    for word in raw_words:
        id = raw_data['word_to_id'][word]
        id = raw_data['id_to_emb_map'][id]
        ids.append(id)
    return ids
word_embed = raw_data['emb_matrix']
for name, dataset in data.datasets.items():
    dataset.apply_field(convert_to_ids, field_name='raw_words', new_field_name=Const.INPUT)

print(data)
char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30],
                              kernel_sizes=[3])
word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
                             model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt',
                             requires_grad=True)
# word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
#                              model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt',
#                              requires_grad=True)

model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=400, num_layers=2, tag_vocab=data.vocabs[Const.TARGET],
model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=1200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET],
                     encoding_type=encoding_type)

optimizer = SGD(model.parameters(), lr=0.015, momentum=0.9)
callbacks = [GradientClipCallback(clip_value=5, clip_type='value'),
             FitlogCallback(data.datasets['test'], verbose=1)]

optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
callbacks.append(scheduler)
# optimizer = SWATS(model.parameters(), verbose=True)
# optimizer = Adam(model.parameters(), lr=0.005)

callbacks = [GradientClipCallback(), FitlogCallback(data.datasets['test'], verbose=1)]

trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(),
                  device=1, dev_data=data.datasets['dev'], batch_size=32,
trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(num_buckets=100),
                  device=0, dev_data=data.datasets['dev'], batch_size=10,
                  metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
                  callbacks=callbacks, num_workers=1, n_epochs=100)
trainer.train()
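A minimal sketch (stand-in parameters) of the schedule built above: LambdaLR scales the base lr by 1 / (1 + 0.05 * epoch), an inverse-time decay from 0.01 to roughly 0.0095, 0.0091, ... over the first epochs.

import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import LambdaLR

params = [torch.nn.Parameter(torch.zeros(1))]    # stand-in for model.parameters()
optimizer = SGD(params, lr=0.01, momentum=0.9)
scheduler = LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch))

for epoch in range(3):
    # ... optimizer.step() calls for the epoch would go here ...
    scheduler.step()
    print(epoch, optimizer.param_groups[0]["lr"])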
