From 887c6fec70be4691a8f83f78433de70c87e89311 Mon Sep 17 00:00:00 2001
From: FengZiYjun
Date: Mon, 7 Jan 2019 14:22:53 +0800
Subject: [PATCH] add pos-tag training script

---
 fastNLP/api/api.py                            |  2 +-
 fastNLP/core/losses.py                        |  2 +-
 fastNLP/core/metrics.py                       |  9 ++-
 fastNLP/models/sequence_modeling.py           | 19 +++--
 .../{process => }/pos_processor.py            |  2 +
 .../pos_tag_model/{pos_io => }/pos_reader.py  |  0
 reproduction/pos_tag_model/train_pos_tag.py   | 71 +++++++++++++++++++
 7 files changed, 92 insertions(+), 13 deletions(-)
 rename reproduction/pos_tag_model/{process => }/pos_processor.py (99%)
 rename reproduction/pos_tag_model/{pos_io => }/pos_reader.py (100%)
 create mode 100644 reproduction/pos_tag_model/train_pos_tag.py

diff --git a/fastNLP/api/api.py b/fastNLP/api/api.py
index 0ac5c503..bd14197a 100644
--- a/fastNLP/api/api.py
+++ b/fastNLP/api/api.py
@@ -10,7 +10,7 @@ from fastNLP.core.dataset import DataSet
 from fastNLP.api.model_zoo import load_url
 from fastNLP.api.processor import ModelProcessor
 from reproduction.chinese_word_segment.cws_io.cws_reader import ConllCWSReader
-from reproduction.pos_tag_model.pos_io.pos_reader import ConllPOSReader
+from reproduction.pos_tag_model.pos_reader import ConllPOSReader
 from reproduction.Biaffine_parser.util import ConllxDataLoader, add_seg_tag
 from fastNLP.core.instance import Instance
 from fastNLP.core.sampler import SequentialSampler
diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py
index 04f8b73e..c11f538c 100644
--- a/fastNLP/core/losses.py
+++ b/fastNLP/core/losses.py
@@ -250,7 +250,7 @@ class LossInForward(LossBase):
 
         if not (isinstance(loss, torch.Tensor) and len(loss.size()) == 0):
             if not isinstance(loss, torch.Tensor):
-                raise TypeError(f"loss excepts to be a torch.Tensor, got {type(loss)}")
+                raise TypeError(f"Loss expected to be a torch.Tensor, got {type(loss)}")
             raise RuntimeError(f"The size of loss excepts to be torch.Size([]), got {loss.size()}")
 
         return loss
diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py
index 99637463..36f9ab90 100644
--- a/fastNLP/core/metrics.py
+++ b/fastNLP/core/metrics.py
@@ -436,15 +436,14 @@ class SpanFPreRecMetric(MetricBase):
             raise TypeError(f"`seq_lens` in {get_func_signature(self.evaluate)} must be torch.Tensor,"
                             f"got {type(seq_lens)}.")
 
-        num_classes = pred.size(-1)
-        if (target >= num_classes).any():
-            raise ValueError("A gold label passed to SpanBasedF1Metric contains an "
-                             "id >= {}, the number of classes.".format(num_classes))
-
         if pred.size() == target.size() and len(target.size()) == 2:
             pass
         elif len(pred.size()) == len(target.size()) + 1 and len(target.size()) == 2:
+            num_classes = pred.size(-1)
+            if (target >= num_classes).any():
+                raise ValueError("A gold label passed to SpanBasedF1Metric contains an "
+                                 "id >= {}, the number of classes.".format(num_classes))
             pred = pred.argmax(dim=-1)
         else:
             raise RuntimeError(f"In {get_func_signature(self.evaluate)}, when pred have "
                                f"size:{pred.size()}, target should have size: {pred.size()} or "
diff --git a/fastNLP/models/sequence_modeling.py b/fastNLP/models/sequence_modeling.py
index e911598c..cb9e9478 100644
--- a/fastNLP/models/sequence_modeling.py
+++ b/fastNLP/models/sequence_modeling.py
@@ -1,8 +1,8 @@
 import torch
-import numpy as np
 
 from fastNLP.models.base_model import BaseModel
 from fastNLP.modules import decoder, encoder
+from fastNLP.modules.decoder.CRF import allowed_transitions
 from fastNLP.modules.utils import seq_mask
 
 
@@ -93,7 +93,7 @@ class AdvSeqLabel(SeqLabeling):
     Advanced Sequence Labeling Model
     """
 
-    def __init__(self, args, emb=None):
+    def __init__(self, args, emb=None, id2words=None):
         super(AdvSeqLabel, self).__init__(args)
 
         vocab_size = args["vocab_size"]
@@ -105,7 +105,8 @@ class AdvSeqLabel(SeqLabeling):
         self.Embedding = encoder.embedding.Embedding(vocab_size, word_emb_dim, init_emb=emb)
         self.norm1 = torch.nn.LayerNorm(word_emb_dim)
         # self.Rnn = encoder.lstm.LSTM(word_emb_dim, hidden_dim, num_layers=2, dropout=dropout, bidirectional=True)
-        self.Rnn = torch.nn.LSTM(input_size=word_emb_dim, hidden_size=hidden_dim, num_layers=2, dropout=dropout, bidirectional=True, batch_first=True)
+        self.Rnn = torch.nn.LSTM(input_size=word_emb_dim, hidden_size=hidden_dim, num_layers=2, dropout=dropout,
+                                 bidirectional=True, batch_first=True)
         self.Linear1 = encoder.Linear(hidden_dim * 2, hidden_dim * 2 // 3)
         self.norm2 = torch.nn.LayerNorm(hidden_dim * 2 // 3)
         # self.batch_norm = torch.nn.BatchNorm1d(hidden_dim * 2 // 3)
@@ -113,7 +114,12 @@ class AdvSeqLabel(SeqLabeling):
         self.drop = torch.nn.Dropout(dropout)
         self.Linear2 = encoder.Linear(hidden_dim * 2 // 3, num_classes)
 
-        self.Crf = decoder.CRF.ConditionalRandomField(num_classes, include_start_end_trans=False)
+        if id2words is None:
+            self.Crf = decoder.CRF.ConditionalRandomField(num_classes, include_start_end_trans=False)
+        else:
+            self.Crf = decoder.CRF.ConditionalRandomField(num_classes, include_start_end_trans=False,
+                                                          allowed_transitions=allowed_transitions(id2words,
+                                                                                                  encoding_type="bmes"))
 
     def forward(self, word_seq, word_seq_origin_len, truth=None):
         """
@@ -178,6 +184,7 @@ class AdvSeqLabel(SeqLabeling):
         assert 'loss' in kwargs
         return kwargs['loss']
 
+
 if __name__ == '__main__':
     args = {
         'vocab_size': 20,
@@ -208,11 +215,11 @@ if __name__ == '__main__':
         res = model(word_seq, word_seq_len, truth)
         loss = res['loss']
         pred = res['predict']
-        print('loss: {} acc {}'.format(loss.item(), ((pred.data == truth).long().sum().float() / word_seq_len.sum().float())))
+        print('loss: {} acc {}'.format(loss.item(),
+                                       ((pred.data == truth).long().sum().float() / word_seq_len.sum().float())))
         optimizer.zero_grad()
         loss.backward()
         optimizer.step()
         curidx = endidx
         if curidx == len(data):
             curidx = 0
-
diff --git a/reproduction/pos_tag_model/process/pos_processor.py b/reproduction/pos_tag_model/pos_processor.py
similarity index 99%
rename from reproduction/pos_tag_model/process/pos_processor.py
rename to reproduction/pos_tag_model/pos_processor.py
index 5c03f9cd..7a1b8e01 100644
--- a/reproduction/pos_tag_model/process/pos_processor.py
+++ b/reproduction/pos_tag_model/pos_processor.py
@@ -4,6 +4,7 @@ from collections import Counter
 from fastNLP.api.processor import Processor
 from fastNLP.core.dataset import DataSet
 
+
 class CombineWordAndPosProcessor(Processor):
     def __init__(self, word_field_name, pos_field_name):
         super(CombineWordAndPosProcessor, self).__init__(None, None)
@@ -60,6 +61,7 @@ class CombineWordAndPosProcessor(Processor):
 
         return dataset
 
+
 class PosOutputStrProcessor(Processor):
     def __init__(self, word_field_name, pos_field_name):
         super(PosOutputStrProcessor, self).__init__(None, None)
diff --git a/reproduction/pos_tag_model/pos_io/pos_reader.py b/reproduction/pos_tag_model/pos_reader.py
similarity index 100%
rename from reproduction/pos_tag_model/pos_io/pos_reader.py
rename to reproduction/pos_tag_model/pos_reader.py
diff --git a/reproduction/pos_tag_model/train_pos_tag.py b/reproduction/pos_tag_model/train_pos_tag.py
new file mode 100644
index 00000000..e440b542
--- /dev/null
+++ b/reproduction/pos_tag_model/train_pos_tag.py
@@ -0,0 +1,71 @@
+import torch
+
+from fastNLP.api.pipeline import Pipeline
+from fastNLP.api.processor import SeqLenProcessor
+from fastNLP.core.metrics import SpanFPreRecMetric
+from fastNLP.core.trainer import Trainer
+from fastNLP.io.config_io import ConfigLoader, ConfigSection
+from fastNLP.models.sequence_modeling import AdvSeqLabel
+from reproduction.chinese_word_segment.process.cws_processor import VocabIndexerProcessor
+from reproduction.pos_tag_model.pos_reader import ZhConllPOSReader
+
+cfgfile = './pos_tag.cfg'
+pickle_path = "save"
+
+
+def train():
+    # load config
+    train_param = ConfigSection()
+    model_param = ConfigSection()
+    ConfigLoader().load_config(cfgfile, {"train": train_param, "model": model_param})
+    print("config loaded")
+
+    # Data Loader
+    dataset = ZhConllPOSReader().load("/home/hyan/train.conllx")
+    print(dataset)
+    print("dataset transformed")
+
+    vocab_proc = VocabIndexerProcessor("words")
+    tag_proc = VocabIndexerProcessor("tag")
+    seq_len_proc = SeqLenProcessor(field_name="words", new_added_field_name="word_seq_origin_len")
+
+    vocab_proc(dataset)
+    tag_proc(dataset)
+    seq_len_proc(dataset)
+
+    dataset.rename_field("words", "word_seq")
+    dataset.rename_field("tag", "truth")
+    dataset.set_input("word_seq", "word_seq_origin_len", "truth")
+    dataset.set_target("truth", "word_seq_origin_len")
+
+    print("processors defined")
+
+    # dataset.set_is_target(tag_ids=True)
+    model_param["vocab_size"] = vocab_proc.get_vocab_size()
+    model_param["num_classes"] = tag_proc.get_vocab_size()
+    print("vocab_size={} num_classes={}".format(model_param["vocab_size"], model_param["num_classes"]))
+
+    # define a model
+    model = AdvSeqLabel(model_param, id2words=tag_proc.vocab.idx2word)
+
+    # call trainer to train
+    trainer = Trainer(dataset, model, loss=None, metrics=SpanFPreRecMetric(tag_proc.vocab, pred="predict",
+                                                                           target="truth",
+                                                                           seq_lens="word_seq_origin_len"),
+                      dev_data=dataset, metric_key="f",
+                      use_tqdm=False, use_cuda=True, print_every=20)
+    trainer.train()
+
+    # save model & pipeline
+    pp = Pipeline([vocab_proc, seq_len_proc])
+    save_dict = {"pipeline": pp, "model": model, "tag_vocab": tag_proc.vocab}
+    torch.save(save_dict, "model_pp.pkl")
+    print("pipeline saved")
+
+
+def infer():
+    pass
+
+
+if __name__ == "__main__":
+    train()
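
Note on the AdvSeqLabel change: when id2words is given (train_pos_tag.py passes tag_proc.vocab.idx2word), allowed_transitions(id2words, encoding_type="bmes") builds the set of tag-to-tag moves that are legal under the BMES scheme, and the ConditionalRandomField masks out everything else, so Viterbi decoding cannot emit an ill-formed sequence such as B followed directly by S; with id2words=None the unconstrained behaviour is unchanged. Below is a toy illustration of the helper; the two-label tag set is made up, and it assumes the helper returns (from_id, to_id) index pairs, as its use as a ConditionalRandomField argument suggests.

from fastNLP.modules.decoder.CRF import allowed_transitions

# Hypothetical BMES tag vocabulary with two labels, "n" and "v"; the real
# script derives this mapping from the training data.
id2words = {0: "B-n", 1: "M-n", 2: "E-n", 3: "S-n",
            4: "B-v", 5: "M-v", 6: "E-v", 7: "S-v"}

# Prints pairs like (0, 1) for B-n -> M-n; a pair such as B-n -> S-v is
# absent, so the CRF excludes that transition and the decoder never takes it.
print(allowed_transitions(id2words, encoding_type="bmes"))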
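
The infer() entry point in train_pos_tag.py is left as a stub. Below is a minimal sketch of what it might contain, reusing the bundle that train() saves to model_pp.pkl; the sample sentence is illustrative, and the sketch assumes the fastNLP API of this period (DataSet.append(Instance(...)), calling a Pipeline on a DataSet, Vocabulary.to_word), so treat it as a starting point rather than part of the patch.

import torch

from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance


def infer():
    # Load the pipeline, model, and tag vocabulary saved at the end of train().
    save_dict = torch.load("model_pp.pkl")
    pp, model, tag_vocab = save_dict["pipeline"], save_dict["model"], save_dict["tag_vocab"]
    model.cpu().eval()

    # Wrap one pre-segmented sentence in a DataSet under the raw field name
    # "words" that the saved processors expect, then run the pipeline to
    # index the words and add "word_seq_origin_len".
    dataset = DataSet()
    dataset.append(Instance(words=["这", "是", "一个", "例子"]))
    pp(dataset)

    # Call the model the way the __main__ demo in sequence_modeling.py does,
    # then map the predicted tag ids back to BMES tag strings.
    word_seq = torch.LongTensor([dataset[0]["words"]])
    seq_len = torch.LongTensor([word_seq.size(1)])
    pred = model(word_seq, seq_len)["predict"]
    print([tag_vocab.to_word(int(i)) for i in pred.view(-1)])

Because the saved Pipeline holds only vocab_proc and seq_len_proc, the tag indexing and field renames done in train() are not replayed here; the sketch instead feeds tensors to the model directly, mirroring the __main__ demo in sequence_modeling.py.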