@@ -232,12 +232,16 @@ class CrossEntropyLoss(LossBase):
     """
     def __init__(self, pred=None, target=None, padding_idx=-100):
         # TODO Add some checks here: with a pred of shape (16, 10, 4), target should logically be (16, 10), but F.cross_entropy actually expects (16, 4)
         super(CrossEntropyLoss, self).__init__()
         self._init_param_map(pred=pred, target=target)
         self.padding_idx = padding_idx
 
     def get_loss(self, pred, target):
+        if pred.dim()>2:
+            if pred.size()[:2]==target.size():
+                # F.cross_entropy applies log_softmax over the second dimension, so when
+                # pred is (16, 10, 4) the class dimension has to be moved there first
+                pred = pred.transpose(1, 2)
         return F.cross_entropy(input=pred, target=target,
                                ignore_index=self.padding_idx)
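As a sanity check on the transpose, here is a minimal, self-contained sketch of the shape convention involved: for inputs with more than two dimensions, F.cross_entropy expects the class scores on dim 1.

import torch
import torch.nn.functional as F

batch, seq_len, n_classes = 16, 10, 4
pred = torch.randn(batch, seq_len, n_classes)        # (N, seq_len, C)
target = torch.randint(n_classes, (batch, seq_len))  # (N, seq_len)

# F.cross_entropy wants class scores on dim 1, i.e. (N, C, seq_len),
# so the last two dimensions are swapped before the call.
loss = F.cross_entropy(pred.transpose(1, 2), target)
print(loss)  # scalar, averaged over all (N, seq_len) positions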
@@ -451,9 +451,11 @@ class Trainer(object):
             self.data_iterator = train_data
         else:
             raise TypeError("train_data type {} not support".format(type(train_data)))
 
+        self.model = _move_model_to_device(model, device=device)
+
         if check_code_level > -1 and isinstance(self.data_iterator, DataSetIter):
-            _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data,
+            _check_code(dataset=train_data, model=self.model, losser=losser, metrics=metrics, dev_data=dev_data,
                         metric_key=metric_key, check_level=check_code_level,
                         batch_size=min(batch_size, DEFAULT_CHECK_BATCH_SIZE))
            # _check_code is fastNLP's helper for verifying that your code is correct; if this
            # comment shows up in an error traceback, check your own code carefully
@@ -474,9 +476,7 @@ class Trainer(object):
         self.best_dev_perf = None
         self.n_steps = (len(self.train_data) // self.batch_size + int(
            len(self.train_data) % self.batch_size != 0)) * self.n_epochs
 
-        self.model = _move_model_to_device(self.model, device=device)
-
         if isinstance(optimizer, torch.optim.Optimizer):
             self.optimizer = optimizer
         elif isinstance(optimizer, Optimizer):
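Taken together, these two hunks hoist the device placement above the code check, so _check_code exercises the same placed model that training will use. A minimal sketch of the pattern, using a simplified stand-in for fastNLP's _move_model_to_device (the real helper also accepts int and list device specs):

import torch
from torch import nn

def move_model_to_device(model, device):
    # simplified stand-in; fastNLP's helper handles more device formats
    return model if device is None else model.to(torch.device(device))

model = move_model_to_device(nn.Linear(4, 2), "cpu")
# only now is it safe to run the trial forward/backward of the code check
assert next(model.parameters()).device.type == "cpu"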
@@ -204,7 +204,7 @@ class StaticEmbedding(TokenEmbedding):
             model_url = PRETRAIN_URL + model_name
             model_path = cached_path(model_url)
             # check whether the path exists
-        elif os.path.isfile(model_dir_or_name):
+        elif os.path.isfile(os.path.expanduser(os.path.abspath(model_dir_or_name))):
             model_path = model_dir_or_name
         else:
             raise ValueError(f"Cannot recognize {model_dir_or_name}.")
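A small illustration of why the user-supplied path is normalized before the isfile test (the path below is hypothetical; the same normalization is applied to ElmoEmbedding and BertEmbedding in the two hunks that follow). Note that os.path.expanduser only rewrites a leading "~", so it only has an effect when it runs before os.path.abspath:

import os

p = "~/pretrain_vectors/glove.6B.100d.txt"                     # hypothetical path
print(os.path.isfile(p))                                       # False: isfile never expands "~"
print(os.path.isfile(os.path.abspath(os.path.expanduser(p))))  # True when the file exists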
@@ -455,7 +455,7 @@ class ElmoEmbedding(ContextualEmbedding):
             model_url = PRETRAIN_URL + model_name
             model_dir = cached_path(model_url)
             # check whether the path exists
-        elif os.path.isdir(model_dir_or_name):
+        elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))):
             model_dir = model_dir_or_name
         else:
             raise ValueError(f"Cannot recognize {model_dir_or_name}.")
@@ -553,7 +553,7 @@ class BertEmbedding(ContextualEmbedding):
             model_url = PRETRAIN_URL + model_name
             model_dir = cached_path(model_url)
             # check whether the path exists
-        elif os.path.isdir(model_dir_or_name):
+        elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))):
             model_dir = model_dir_or_name
         else:
             raise ValueError(f"Cannot recognize {model_dir_or_name}.")
@@ -57,12 +57,8 @@ callbacks = [clipper]
 # if pretrain:
 #     fixer = FixEmbedding([model.char_embedding, model.bigram_embedding], fix_until=fix_until)
 #     callbacks.append(fixer)
-trainer = Trainer(data.datasets['train'], model, optimizer=optimizer, loss=None,
-                  batch_size=32, sampler=sampler, update_every=5,
-                  n_epochs=3, print_every=5,
-                  dev_data=data.datasets['dev'], metrics=RelayMetric(), metric_key='f',
-                  validate_every=-1, save_path=None,
-                  prefetch=True, use_tqdm=True, device=device,
-                  callbacks=callbacks,
+trainer = Trainer(data.datasets['train'], model, optimizer=optimizer, loss=None, batch_size=32, sampler=sampler,
+                  update_every=5, n_epochs=3, print_every=5, dev_data=data.datasets['dev'], metrics=RelayMetric(),
+                  metric_key='f', validate_every=-1, save_path=None, use_tqdm=True, device=device, callbacks=callbacks,
                   check_code_level=0)
 trainer.train()
@@ -12,11 +12,11 @@ class CNNBiLSTMCRF(nn.Module):
     def __init__(self, embed, char_embed, hidden_size, num_layers, tag_vocab, dropout=0.5, encoding_type='bioes'):
         super().__init__()
-        self.embedding = Embedding(embed, dropout=0.5)
-        self.char_embedding = Embedding(char_embed, dropout=0.5)
+        self.embedding = Embedding(embed, dropout=0.5, dropout_word=0)
+        self.char_embedding = Embedding(char_embed, dropout=0.5, dropout_word=0.01)
         self.lstm = LSTM(input_size=self.embedding.embedding_dim+self.char_embedding.embedding_dim,
-                         hidden_size=hidden_size//2, num_layers=num_layers,
-                         bidirectional=True, batch_first=True, dropout=dropout)
+                         hidden_size=hidden_size//2, num_layers=num_layers,
+                         bidirectional=True, batch_first=True)
         self.fc = nn.Linear(hidden_size, len(tag_vocab))
 
         transitions = allowed_transitions(tag_vocab.idx2word, encoding_type=encoding_type, include_start_end=True)
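Two things change here: dropout_word adds word-level dropout on the embedding input, and the LSTM's inter-layer dropout argument is dropped, consistent with the single-layer setup used in the CoNLL script below (PyTorch's nn.LSTM only applies dropout between stacked layers and warns when num_layers is 1). A sketch of what word-level dropout amounts to, assuming fastNLP implements it by randomly unking token ids (the names here are hypothetical):

import torch

def word_dropout(token_ids, p, unk_idx=1):
    # each id is independently replaced by <unk> with probability p,
    # discouraging the model from over-relying on individual words
    if p <= 0:
        return token_ids
    mask = torch.rand(token_ids.shape) < p
    return token_ids.masked_fill(mask, unk_idx)

ids = torch.randint(2, 100, (2, 8))   # (batch, seq_len) word ids
print(word_dropout(ids, p=0.01))      # ~1% of ids become unk_idx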
@@ -25,9 +25,9 @@ class CNNBiLSTMCRF(nn.Module):
         self.dropout = nn.Dropout(dropout, inplace=True)
 
         for name, param in self.named_parameters():
-            if 'ward_fc' in name:
+            if 'fc' in name:
                 if param.data.dim()>1:
-                    nn.init.xavier_normal_(param)
+                    nn.init.xavier_uniform_(param)
                 else:
                     nn.init.constant_(param, 0)
             if 'crf' in name:
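The initialization switch keeps the Glorot scale but samples uniformly: xavier_uniform_ draws from U(-a, a) with a = gain * sqrt(6 / (fan_in + fan_out)), while xavier_normal_ uses a Gaussian of matching variance. The broadened 'fc' match also reaches self.fc rather than only parameters whose names contain 'ward_fc'. Standalone, the loop looks like this:

import torch.nn as nn

fc = nn.Linear(200, 17)                 # hypothetical projection layer
for name, param in fc.named_parameters():
    if param.data.dim() > 1:
        nn.init.xavier_uniform_(param)  # U(-a, a), a = sqrt(6/(fan_in+fan_out))
    else:
        nn.init.constant_(param, 0)     # biases start at zero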
@@ -1,6 +1,6 @@
-from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding, BertEmbedding
+from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding, BertEmbedding, ElmoEmbedding, LSTMCharEmbedding
 from fastNLP.core.vocabulary import VocabularyOption
 
 from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF
@@ -12,6 +12,8 @@ from torch.optim import SGD, Adam
 from fastNLP import GradientClipCallback
 from fastNLP.core.callback import FitlogCallback, LRScheduler
+from torch.optim.lr_scheduler import LambdaLR
+from reproduction.seqence_labelling.ner.model.swats import SWATS
 
 import fitlog
 fitlog.debug()
@@ -19,28 +21,50 @@ from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003DataLoader
 encoding_type = 'bioes'
 
-data = Conll2003DataLoader(encoding_type=encoding_type).process('/hdd/fudanNLP/fastNLP/others/data/conll2003',
-                                                                word_vocab_opt=VocabularyOption(min_freq=2))
+data = Conll2003DataLoader(encoding_type=encoding_type).process('../../../../others/data/conll2003',
+                                                                word_vocab_opt=VocabularyOption(min_freq=2),
+                                                                lower=False)
 print(data)
 
 char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30],
                               kernel_sizes=[3])
+# char_embed = LSTMCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30 ,char_emb_size=30)
 word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
-                             model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt',
+                             model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/wiki_en_100_50_case_2.txt',
                              requires_grad=True)
+word_embed.embedding.weight.data = word_embed.embedding.weight.data/word_embed.embedding.weight.data.std()
+
+# import joblib
+# raw_data = joblib.load('/hdd/fudanNLP/fastNLP/others/NER-with-LS/data/conll_with_data.joblib')
+# def convert_to_ids(raw_words):
+#     ids = []
+#     for word in raw_words:
+#         id = raw_data['word_to_id'][word]
+#         id = raw_data['id_to_emb_map'][id]
+#         ids.append(id)
+#     return ids
+# word_embed = raw_data['emb_matrix']
+# for name, dataset in data.datasets.items():
+#     dataset.apply_field(convert_to_ids, field_name='raw_words', new_field_name=Const.INPUT)
+
+# word_embed = ElmoEmbedding(vocab=data.vocabs['cap_words'],
+#                            model_dir_or_name='/hdd/fudanNLP/fastNLP/others/pretrained_models/elmo_en',
+#                            requires_grad=True)
 
 model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET],
                      encoding_type=encoding_type)
 
-optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
-scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
-callbacks = [GradientClipCallback(clip_type='value', clip_value=5), FitlogCallback({'test':data.datasets['test'],
-                                                                                    'train':data.datasets['train']}, verbose=1),
-             scheduler]
+callbacks = [
+    GradientClipCallback(clip_type='value', clip_value=5)
+    , FitlogCallback({'test':data.datasets['test']}, verbose=1)
+]
+# optimizer = Adam(model.parameters(), lr=0.005)
+optimizer = SWATS(model.parameters(), verbose=True)
+# optimizer = SGD(model.parameters(), lr=0.008, momentum=0.9)
+# scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
+# callbacks.append(scheduler)
 
 trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(),
-                  device=0, dev_data=data.datasets['dev'], batch_size=10,
+                  device=1, dev_data=data.datasets['dev'], batch_size=10,
                   metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
                   callbacks=callbacks, num_workers=1, n_epochs=100)
 trainer.train()
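One addition worth noting in the hunk above is the rescaling of the pretrained matrix by its element-wise standard deviation, which brings the loaded vectors to unit scale, comparable in magnitude to freshly initialized layers. In isolation:

import torch

w = torch.randn(100, 100) * 0.3   # stand-in for a loaded embedding matrix
w = w / w.std()
print(w.std())                     # ~1.0 after rescaling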
@@ -1,4 +1,6 @@
+import sys
+sys.path.append('../../..')
+
 from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding
@@ -8,8 +10,11 @@ from fastNLP import SpanFPreRecMetric
 from fastNLP import BucketSampler
 from fastNLP import Const
 from torch.optim import SGD, Adam
+from torch.optim.lr_scheduler import LambdaLR
 from fastNLP import GradientClipCallback
-from fastNLP.core.callback import FitlogCallback
+from fastNLP.core.callback import FitlogCallback, LRScheduler
+from reproduction.seqence_labelling.ner.model.swats import SWATS
 
 import fitlog
 fitlog.debug()
@@ -17,23 +22,44 @@ from reproduction.seqence_labelling.ner.data.OntoNoteLoader import OntoNoteNERDataLoader
 encoding_type = 'bioes'
 
-data = OntoNoteNERDataLoader(encoding_type=encoding_type).process('/hdd/fudanNLP/fastNLP/others/data/v4/english')
+data = OntoNoteNERDataLoader(encoding_type=encoding_type).process('/hdd/fudanNLP/fastNLP/others/data/v4/english',
+                                                                  lower=True)
+
+import joblib
+raw_data = joblib.load('/hdd/fudanNLP/fastNLP/others/NER-with-LS/data/ontonotes_with_data.joblib')
+def convert_to_ids(raw_words):
+    ids = []
+    for word in raw_words:
+        id = raw_data['word_to_id'][word]
+        id = raw_data['id_to_emb_map'][id]
+        ids.append(id)
+    return ids
+word_embed = raw_data['emb_matrix']
+for name, dataset in data.datasets.items():
+    dataset.apply_field(convert_to_ids, field_name='raw_words', new_field_name=Const.INPUT)
 
 print(data)
 
 char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30],
                               kernel_sizes=[3])
-word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
-                             model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt',
-                             requires_grad=True)
+# word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
+#                              model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt',
+#                              requires_grad=True)
 
-model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=400, num_layers=2, tag_vocab=data.vocabs[Const.TARGET],
+model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=1200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET],
                      encoding_type=encoding_type)
 
-optimizer = SGD(model.parameters(), lr=0.015, momentum=0.9)
-callbacks = [GradientClipCallback(clip_value=5, clip_type='value'),
-             FitlogCallback(data.datasets['test'], verbose=1)]
+optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
+# optimizer = SWATS(model.parameters(), verbose=True)
+# optimizer = Adam(model.parameters(), lr=0.005)
+callbacks = [GradientClipCallback(), FitlogCallback(data.datasets['test'], verbose=1)]
+scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
+callbacks.append(scheduler)
 
-trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(),
-                  device=1, dev_data=data.datasets['dev'], batch_size=32,
+trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(num_buckets=100),
+                  device=0, dev_data=data.datasets['dev'], batch_size=10,
                   metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
                   callbacks=callbacks, num_workers=1, n_epochs=100)
 trainer.train()
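For reference, the LambdaLR schedule attached above multiplies the base lr of 0.01 by 1/(1 + 0.05*epoch), a gentle hyperbolic decay:

base_lr = 0.01
for epoch in (0, 1, 10, 50, 99):
    print(epoch, round(base_lr / (1 + 0.05 * epoch), 5))
# 0 -> 0.01, 1 -> 0.00952, 10 -> 0.00667, 50 -> 0.00286, 99 -> 0.00168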