
sequence labeling update

tags/v0.4.10
yh_cc 5 years ago
commit 4533427ea3
5 changed files with 24 additions and 23 deletions
1. reproduction/seqence_labelling/ner/data/Conll2003Loader.py (+4, -2)
2. reproduction/seqence_labelling/ner/data/OntoNoteLoader.py (+2, -1)
3. reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py (+6, -12)
4. reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py (+10, -6)
5. reproduction/seqence_labelling/ner/train_ontonote.py (+2, -2)

+4 -2  reproduction/seqence_labelling/ner/data/Conll2003Loader.py

@@ -63,8 +63,10 @@ class Conll2003DataLoader(DataSetLoader):
             data.datasets[name] = dataset

         # construct the vocab
-        word_vocab = Vocabulary(min_freq=3) if word_vocab_opt is None else Vocabulary(**word_vocab_opt)
-        word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT)
+        word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt)
+        # word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT)
+        # TODO this doesn't feel like the proper way to do it
+        word_vocab.from_dataset(*data.datasets.values(), field_name=Const.INPUT)
         word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT)
         data.vocabs[Const.INPUT] = word_vocab
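Both loaders now build the word vocabulary over every split (train/dev/test) instead of train only, presumably so dev/test words that have pretrained vectors keep their own ids rather than collapsing to <unk>, and the default cutoff drops from min_freq=3 to min_freq=2. A toy standalone sketch of what min_freq does, assuming fastNLP's Vocabulary API (add_word_lst / build_vocab / to_index):

# sketch only, not repository code
from fastNLP import Vocabulary

vocab = Vocabulary(min_freq=2)
vocab.add_word_lst(['the', 'the', 'cat', 'sat', 'sat'])
vocab.build_vocab()
print(vocab.to_index('the'))   # gets a real id: 'the' occurs twice
print(vocab.to_index('cat'))   # maps to the <unk> id: below min_freq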




+2 -1  reproduction/seqence_labelling/ner/data/OntoNoteLoader.py

@@ -87,7 +87,8 @@ class OntoNoteNERDataLoader(DataSetLoader):

         # construct the vocab
         word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt)
-        word_vocab.from_dataset(data.datasets['train'], field_name='raw_words')
+        # word_vocab.from_dataset(data.datasets['train'], field_name='raw_words')
+        word_vocab.from_dataset(*data.datasets.values(), field_name=Const.INPUT)
         word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name=Const.INPUT)
         data.vocabs[Const.INPUT] = word_vocab




+6 -12  reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py

@@ -4,7 +4,7 @@ from torch import nn
 from fastNLP import seq_len_to_mask
 from fastNLP.modules import Embedding
 from fastNLP.modules import LSTM
-from fastNLP.modules import ConditionalRandomField, allowed_transitions, TimestepDropout
+from fastNLP.modules import ConditionalRandomField, allowed_transitions
 import torch.nn.functional as F
 from fastNLP import Const

@@ -17,13 +17,12 @@ class CNNBiLSTMCRF(nn.Module):
         self.lstm = LSTM(input_size=self.embedding.embedding_dim+self.char_embedding.embedding_dim,
                          hidden_size=hidden_size//2, num_layers=num_layers,
                          bidirectional=True, batch_first=True, dropout=dropout)
-        self.forward_fc = nn.Linear(hidden_size//2, len(tag_vocab))
-        self.backward_fc = nn.Linear(hidden_size//2, len(tag_vocab))
+        self.fc = nn.Linear(hidden_size, len(tag_vocab))

-        transitions = allowed_transitions(tag_vocab.idx2word, encoding_type=encoding_type, include_start_end=False)
-        self.crf = ConditionalRandomField(len(tag_vocab), include_start_end_trans=False, allowed_transitions=transitions)
+        transitions = allowed_transitions(tag_vocab.idx2word, encoding_type=encoding_type, include_start_end=True)
+        self.crf = ConditionalRandomField(len(tag_vocab), include_start_end_trans=True, allowed_transitions=transitions)

-        self.dropout = TimestepDropout(dropout, inplace=True)
+        self.dropout = nn.Dropout(dropout, inplace=True)

         for name, param in self.named_parameters():
             if 'ward_fc' in name:

@@ -40,13 +39,8 @@ class CNNBiLSTMCRF(nn.Module):
         words = torch.cat([words, chars], dim=-1)
         outputs, _ = self.lstm(words, seq_len)
         self.dropout(outputs)
-        forwards, backwards = outputs.chunk(2, dim=-1)
-
-        # forward_logits = F.log_softmax(self.forward_fc(forwards), dim=-1)
-        # backward_logits = F.log_softmax(self.backward_fc(backwards), dim=-1)
-        logits = self.forward_fc(forwards) + self.backward_fc(backwards)
-        self.dropout(logits)
+        logits = F.log_softmax(self.fc(outputs), dim=-1)

         if target is not None:
             loss = self.crf(logits, target, seq_len_to_mask(seq_len))
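The forward pass no longer chunks the BiLSTM output into forward/backward halves with separate projections; a single shared linear layer now scores the concatenated states before the CRF. A standalone sketch of the new scoring path (illustrative shapes, not the repository code):

import torch
from torch import nn
import torch.nn.functional as F

batch, seq, hidden_size, n_tags = 2, 5, 200, 17
outputs = torch.randn(batch, seq, hidden_size)  # BiLSTM output, fwd/bwd halves concatenated
fc = nn.Linear(hidden_size, n_tags)             # one projection replaces forward_fc/backward_fc
logits = F.log_softmax(fc(outputs), dim=-1)     # (batch, seq, n_tags), handed to the CRF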


+10 -6  reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py

@@ -10,7 +10,8 @@ from fastNLP import BucketSampler
 from fastNLP import Const
 from torch.optim import SGD, Adam
 from fastNLP import GradientClipCallback
-from fastNLP.core.callback import FitlogCallback
+from fastNLP.core.callback import FitlogCallback, LRScheduler
+from torch.optim.lr_scheduler import LambdaLR
 import fitlog
 fitlog.debug()

@@ -19,7 +20,7 @@ from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003DataLoader
 encoding_type = 'bioes'

 data = Conll2003DataLoader(encoding_type=encoding_type).process('/hdd/fudanNLP/fastNLP/others/data/conll2003',
-                                                                word_vocab_opt=VocabularyOption(min_freq=3))
+                                                                word_vocab_opt=VocabularyOption(min_freq=2))
 print(data)
 char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30],
                               kernel_sizes=[3])

@@ -28,15 +29,18 @@ word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
                              requires_grad=True)
 word_embed.embedding.weight.data = word_embed.embedding.weight.data/word_embed.embedding.weight.data.std()

-model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=400, num_layers=1, tag_vocab=data.vocabs[Const.TARGET],
+model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET],
                      encoding_type=encoding_type)

-optimizer = Adam(model.parameters(), lr=0.001)
+optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
+scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))

-callbacks = [GradientClipCallback(clip_type='value'), FitlogCallback({'test':data.datasets['test']}, verbose=1)]
+callbacks = [GradientClipCallback(clip_type='value', clip_value=5), FitlogCallback({'test':data.datasets['test'],
+                                                                                    'train':data.datasets['train']}, verbose=1),
+             scheduler]

 trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(),
-                  device=0, dev_data=data.datasets['dev'], batch_size=32,
+                  device=0, dev_data=data.datasets['dev'], batch_size=10,
                   metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
                   callbacks=callbacks, num_workers=1, n_epochs=100)
 trainer.train()
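The new LRScheduler callback wraps PyTorch's LambdaLR, which multiplies the optimizer's base learning rate by lr_lambda(epoch) each time the scheduler steps. A quick standalone sketch of the resulting decay on top of lr=0.01:

# effective lr per epoch under lr_lambda = 1 / (1 + 0.05 * epoch)
base_lr = 0.01
for epoch in (0, 10, 50, 99):
    print(epoch, base_lr / (1 + 0.05 * epoch))
# 0  -> 0.01
# 10 -> ~0.0067
# 50 -> ~0.0029
# 99 -> ~0.0017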

+2 -2  reproduction/seqence_labelling/ner/train_ontonote.py

@@ -25,10 +25,10 @@ word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
                              model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt',
                              requires_grad=True)

-model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET],
+model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=400, num_layers=2, tag_vocab=data.vocabs[Const.TARGET],
                      encoding_type=encoding_type)

-optimizer = Adam(model.parameters(), lr=0.001)
+optimizer = SGD(model.parameters(), lr=0.015, momentum=0.9)

 callbacks = [GradientClipCallback(), FitlogCallback(data.datasets['test'], verbose=1)]
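Both training scripts swap Adam for SGD with momentum. As a reminder, a plain-Python sketch of the update PyTorch's SGD applies under its defaults (dampening=0, nesterov=False):

lr, momentum = 0.015, 0.9
buf = 0.0
for grad in (1.0, 1.0, 1.0):      # constant gradient, for illustration
    buf = momentum * buf + grad   # velocity accumulates past gradients
    step = -lr * buf              # parameter update applied each iteration
    print(step)                   # -0.015, -0.0285, -0.040650...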



