diff --git a/fastNLP/api/api.py b/fastNLP/api/api.py
index 4198fd2b..d927ae56 100644
--- a/fastNLP/api/api.py
+++ b/fastNLP/api/api.py
@@ -1,11 +1,7 @@
-
 import torch
 
 from fastNLP.core.dataset import DataSet
 from fastNLP.core.instance import Instance
-from fastNLP.core.predictor import Predictor
-
-from fastNLP.api.model_zoo import load_url
 
 model_urls = {
     'cws': "",
@@ -48,23 +44,13 @@ class POS_tagger(API):
         for example in query:
             data.append(Instance(words=example))
 
-        data = self.pipeline(data)
-
-        predictor = Predictor()
-        outputs = predictor.predict(self.model, data)
+        out = self.pipeline(data)
 
-        answers = []
-        for out in outputs:
-            out = out.numpy()
-            for sent in out:
-                answers.append([self.tag_vocab.to_word(tag) for tag in sent])
-        return answers
+        return [x["outputs"] for x in out]
 
     def load(self, name):
         _dict = torch.load(name)
         self.pipeline = _dict['pipeline']
-        self.model = _dict['model']
-        self.tag_vocab = _dict["tag_vocab"]
diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py
index 6fe47d72..73203b1c 100644
--- a/fastNLP/core/metrics.py
+++ b/fastNLP/core/metrics.py
@@ -38,18 +38,18 @@ class SeqLabelEvaluator(Evaluator):
 
     def __call__(self, predict, truth):
         """
-        :param predict: list of List, the network outputs from all batches.
+        :param predict: list of dict, the network outputs from all batches.
        :param truth: list of dict, the ground truths from all batch_y.
         :return accuracy:
         """
         truth = [item["truth"] for item in truth]
+        predict = [item["predict"] for item in predict]
         total_correct, total_count = 0., 0.
         for x, y in zip(predict, truth):
-            x = torch.tensor(x)
+            # x = torch.tensor(x)
             y = y.to(x)  # make sure they are in the same device
             mask = x.ge(1).long()
-            correct = torch.sum(x * mask == y * mask)
-            correct -= torch.sum(x.le(0))
+            correct = torch.sum(x * mask == y * mask) - torch.sum(x.le(0))
             total_correct += float(correct)
             total_count += float(torch.sum(mask))
         accuracy = total_correct / total_count
diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py
index e124ad11..aa2cd385 100644
--- a/fastNLP/core/trainer.py
+++ b/fastNLP/core/trainer.py
@@ -9,7 +9,7 @@ from fastNLP.core.batch import Batch
 from fastNLP.core.loss import Loss
 from fastNLP.core.metrics import Evaluator
 from fastNLP.core.optimizer import Optimizer
-from fastNLP.core.sampler import RandomSampler
+from fastNLP.core.sampler import BucketSampler
 from fastNLP.core.tester import SeqLabelTester, ClassificationTester, SNLITester
 from fastNLP.core.tester import Tester
 from fastNLP.saver.logger import create_logger
@@ -144,7 +144,8 @@ class Trainer(object):
             logger.info("training epoch {}".format(epoch))
 
             # prepare mini-batch iterator
-            data_iterator = Batch(train_data, batch_size=self.batch_size, sampler=RandomSampler(),
+            data_iterator = Batch(train_data, batch_size=self.batch_size,
+                                  sampler=BucketSampler(10, self.batch_size, "word_seq_origin_len"),
                                   use_cuda=self.use_cuda)
             logger.info("prepared data iterator")
 
@@ -170,15 +171,19 @@ class Trainer(object):
 
         for batch_x, batch_y in data_iterator:
             prediction = self.data_forward(network, batch_x)
-            loss = self.get_loss(prediction, batch_y)
+            # TODO: refactor self.get_loss
+            loss = prediction["loss"] if "loss" in prediction else self.get_loss(prediction, batch_y)
+            # acc = self._evaluator([{"predict": prediction["predict"]}], [{"truth": batch_x["truth"]}])
+
            self.grad_backward(loss)
             self.update()
             self._summary_writer.add_scalar("loss", loss.item(), global_step=step)
             for name, param in self._model.named_parameters():
                 if param.requires_grad:
self._summary_writer.add_scalar(name + "_mean", param.mean(), global_step=step) - self._summary_writer.add_scalar(name + "_std", param.std(), global_step=step) - self._summary_writer.add_scalar(name + "_grad_sum", param.sum(), global_step=step) + # self._summary_writer.add_scalar(name + "_mean", param.mean(), global_step=step) + # self._summary_writer.add_scalar(name + "_std", param.std(), global_step=step) + # self._summary_writer.add_scalar(name + "_grad_sum", param.sum(), global_step=step) + pass if kwargs["n_print"] > 0 and step % kwargs["n_print"] == 0: end = time.time() diff --git a/fastNLP/loader/dataset_loader.py b/fastNLP/loader/dataset_loader.py index e9a6dd75..bae3e143 100644 --- a/fastNLP/loader/dataset_loader.py +++ b/fastNLP/loader/dataset_loader.py @@ -361,10 +361,11 @@ class PeopleDailyCorpusLoader(DataSetLoader): pos_tag_examples = [] ner_examples = [] for sent in sents: + if len(sent) <= 2: + continue inside_ne = False sent_pos_tag = [] sent_words = [] - sent_word = [] sent_ner = [] words = sent.strip().split()[1:] for word in words: @@ -389,23 +390,10 @@ class PeopleDailyCorpusLoader(DataSetLoader): ner_tag = "O" tmp = word.split("/") token, pos = tmp[0], tmp[1] - - pos_tag = [] - for single_token in token: - if len(token) == 1: - single_pos = "S-" + pos - else: - single_pos = "M-" + pos - pos_tag.append(single_pos) - sent_word.append(single_token) - if len(token) > 1: - pos_tag[0] = "B-" + pos - pos_tag[-1] = "E-" + pos - sent_pos_tag += pos_tag - sent_ner.append(ner_tag) + sent_pos_tag.append(pos) sent_words.append(token) - pos_tag_examples.append([sent_word, sent_pos_tag]) + pos_tag_examples.append([sent_words, sent_pos_tag]) ner_examples.append([sent_words, sent_ner]) # List[List[List[str], List[str]]] return pos_tag_examples, ner_examples diff --git a/fastNLP/models/base_model.py b/fastNLP/models/base_model.py index 59605f4f..829f7c9c 100644 --- a/fastNLP/models/base_model.py +++ b/fastNLP/models/base_model.py @@ -14,5 +14,5 @@ class BaseModel(torch.nn.Module): trainer = Trainer(**train_args) trainer.train(self, train_data, dev_data) - def predict(self): - pass + def predict(self, *args, **kwargs): + raise NotImplementedError diff --git a/fastNLP/models/sequence_modeling.py b/fastNLP/models/sequence_modeling.py index 8b2375ae..2ba5b97f 100644 --- a/fastNLP/models/sequence_modeling.py +++ b/fastNLP/models/sequence_modeling.py @@ -1,3 +1,4 @@ +import numpy as np import torch from fastNLP.models.base_model import BaseModel @@ -55,10 +56,8 @@ class SeqLabeling(BaseModel): # [batch_size, max_len, hidden_size * direction] x = self.Linear(x) # [batch_size, max_len, num_classes] - if truth is not None: - return self._internal_loss(x, truth) - else: - return self.decode(x) + return {"loss": self._internal_loss(x, truth) if truth is not None else None, + "predict": self.decode(x)} def loss(self, x, y): """ Since the loss has been computed in forward(), this function simply returns x.""" @@ -116,7 +115,7 @@ class AdvSeqLabel(SeqLabeling): num_classes = args["num_classes"] self.Embedding = encoder.embedding.Embedding(vocab_size, word_emb_dim, init_emb=emb) - self.Rnn = encoder.lstm.LSTM(word_emb_dim, hidden_dim, num_layers=3, dropout=0.5, bidirectional=True) + self.Rnn = encoder.lstm.LSTM(word_emb_dim, hidden_dim, num_layers=1, dropout=0.5, bidirectional=True) self.Linear1 = encoder.Linear(hidden_dim * 2, hidden_dim * 2 // 3) self.batch_norm = torch.nn.BatchNorm1d(hidden_dim * 2 // 3) self.relu = torch.nn.ReLU() @@ -128,32 +127,56 @@ class AdvSeqLabel(SeqLabeling): def 
     def forward(self, word_seq, word_seq_origin_len, truth=None):
         """
         :param word_seq: LongTensor, [batch_size, mex_len]
-        :param word_seq_origin_len: list of int.
+        :param word_seq_origin_len: LongTensor, [batch_size, ]
         :param truth: LongTensor, [batch_size, max_len]
         :return y: If truth is None, return list of [decode path(list)]. Used in testing and predicting.
                    If truth is not None, return loss, a scalar. Used in training.
         """
+
         word_seq = word_seq.long()
-        word_seq_origin_len = word_seq_origin_len.long()
-        truth = truth.long() if truth is not None else None
         self.mask = self.make_mask(word_seq, word_seq_origin_len)
+        word_seq_origin_len = word_seq_origin_len.cpu().numpy()
+        sent_len, idx_sort = np.sort(word_seq_origin_len)[::-1], np.argsort(-word_seq_origin_len)
+        idx_unsort = np.argsort(idx_sort)
+        idx_sort = torch.from_numpy(idx_sort)
+        idx_unsort = torch.from_numpy(idx_unsort)
+
+        # word_seq_origin_len = word_seq_origin_len.long()
+        truth = truth.long() if truth is not None else None
 
         batch_size = word_seq.size(0)
         max_len = word_seq.size(1)
+        if next(self.parameters()).is_cuda:
+            word_seq = word_seq.cuda()
+            idx_sort = idx_sort.cuda()
+            idx_unsort = idx_unsort.cuda()
+            self.mask = self.mask.cuda()
+            truth = truth.cuda() if truth is not None else None
+
         x = self.Embedding(word_seq)
         # [batch_size, max_len, word_emb_dim]
-        x = self.Rnn(x)
+
+        sent_variable = x.index_select(0, idx_sort)
+        sent_packed = torch.nn.utils.rnn.pack_padded_sequence(sent_variable, sent_len, batch_first=True)
+
+        x = self.Rnn(sent_packed)
         # [batch_size, max_len, hidden_size * direction]
+
+        sent_output = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True)[0]
+        x = sent_output.index_select(0, idx_unsort)
+        x = x.contiguous()
         x = x.view(batch_size * max_len, -1)
         x = self.Linear1(x)
-        x = self.batch_norm(x)
+        # x = self.batch_norm(x)
         x = self.relu(x)
         x = self.drop(x)
         x = self.Linear2(x)
         x = x.view(batch_size, max_len, -1)
         # [batch_size, max_len, num_classes]
-        if truth is not None:
-            return self._internal_loss(x, truth)
-        else:
-            return self.decode(x)
+        return {"loss": self._internal_loss(x, truth) if truth is not None else None,
+                "predict": self.decode(x)}
+
+    def predict(self, **x):
+        out = self.forward(**x)
+        return {"predict": out["predict"]}
diff --git a/reproduction/pos_tag_model/pos_tag.cfg b/reproduction/pos_tag_model/pos_tag.cfg
index 40639d7b..366b8bb8 100644
--- a/reproduction/pos_tag_model/pos_tag.cfg
+++ b/reproduction/pos_tag_model/pos_tag.cfg
@@ -1,6 +1,6 @@
 [train]
-epochs = 5
-batch_size = 64
+epochs = 300
+batch_size = 32
 pickle_path = "./save/"
 validate = false
 save_best_dev = true
diff --git a/reproduction/pos_tag_model/train_pos_tag.py b/reproduction/pos_tag_model/train_pos_tag.py
index 6b8b1d7f..497c5dc8 100644
--- a/reproduction/pos_tag_model/train_pos_tag.py
+++ b/reproduction/pos_tag_model/train_pos_tag.py
@@ -1,11 +1,14 @@
 import copy
 import os
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
+print(sys.path)
 
 import torch
 
-from fastNLP.api.pipeline import Pipeline
-from fastNLP.api.processor import VocabProcessor, IndexerProcessor, SeqLenProcessor
 from fastNLP.core.dataset import DataSet
+from fastNLP.api.pipeline import Pipeline
+from fastNLP.api.processor import VocabProcessor, IndexerProcessor, SeqLenProcessor, ModelProcessor, Index2WordProcessor
 from fastNLP.core.instance import Instance
 from fastNLP.core.metrics import SeqLabelEvaluator
 from fastNLP.core.optimizer import Optimizer
@@ -14,11 +17,12 @@ from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
 from fastNLP.loader.dataset_loader import PeopleDailyCorpusLoader
 from fastNLP.models.sequence_modeling import AdvSeqLabel
 
+
 cfgfile = './pos_tag.cfg'
-# datadir = "/home/zyfeng/data/"
-# data_name = "POS_PD_1998.txt"
-datadir = "/home/zyfeng/fastnlp_0.2.0/test/data_for_tests/"
-data_name = "people_daily_raw.txt"
+datadir = "/home/zyfeng/data/"
+data_name = "CWS_POS_TAG_NER_people_daily.txt"
+# datadir = "/home/zyfeng/env/fastnlp_v_2/test/data_for_tests"
+# data_name = "people_daily_raw.txt"
 
 pos_tag_data_path = os.path.join(datadir, data_name)
@@ -58,6 +62,7 @@ def train():
     tag_indexer(dataset)
     seq_len_proc = SeqLenProcessor("word_seq", "word_seq_origin_len")
     seq_len_proc(dataset)
+    #torch.save(dataset, "data_set.pkl")
 
     dev_set = copy.deepcopy(dataset)
     dev_set.set_is_target(truth=True)
@@ -75,14 +80,21 @@ def train():
     trainer = Trainer(epochs=train_param["epochs"],
                       batch_size=train_param["batch_size"],
                       validate=True,
-                      optimizer=Optimizer("SGD", lr=0.01, momentum=0.9),
-                      evaluator=SeqLabelEvaluator()
+                      optimizer=Optimizer("Adam", lr=0.01, weight_decay=0.9),
+                      evaluator=SeqLabelEvaluator(),
+                      use_cuda=True
                       )
     trainer.train(model, dataset, dev_set)
 
+    model_proc = ModelProcessor(model, "word_seq_origin_len")
+    dataset.set_is_target(truth=True)
+    res = model_proc.process(dataset)
+
+    decoder = Index2WordProcessor(tag_vocab_proc.get_vocab(), "predict", "outputs")
+
     # save model & pipeline
-    pp = Pipeline([word_indexer, seq_len_proc])
-    save_dict = {"pipeline": pp, "model": model, "tag_vocab": tag_vocab_proc.get_vocab()}
+    pp = Pipeline([word_indexer, seq_len_proc, model_proc, decoder])
+    save_dict = {"pipeline": pp}
     torch.save(save_dict, "model_pp.pkl")
diff --git a/test/model/test_seq_label.py b/test/model/test_seq_label.py
index 09d43008..83ae6e62 100644
--- a/test/model/test_seq_label.py
+++ b/test/model/test_seq_label.py
@@ -1,22 +1,22 @@
 import os
 
-from fastNLP.core.vocabulary import Vocabulary
-from fastNLP.loader.dataset_loader import TokenizeDataSetLoader
 from fastNLP.core.metrics import SeqLabelEvaluator
 from fastNLP.core.optimizer import Optimizer
 from fastNLP.core.preprocess import save_pickle
 from fastNLP.core.tester import SeqLabelTester
 from fastNLP.core.trainer import SeqLabelTrainer
+from fastNLP.core.vocabulary import Vocabulary
 from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
+from fastNLP.loader.dataset_loader import TokenizeDataSetLoader
 from fastNLP.loader.model_loader import ModelLoader
 from fastNLP.models.sequence_modeling import SeqLabeling
 from fastNLP.saver.model_saver import ModelSaver
 
 pickle_path = "./seq_label/"
 model_name = "seq_label_model.pkl"
-config_dir = "test/data_for_tests/config"
-data_path = "test/data_for_tests/people.txt"
-data_infer_path = "test/data_for_tests/people_infer.txt"
+config_dir = "../data_for_tests/config"
+data_path = "../data_for_tests/people.txt"
+data_infer_path = "../data_for_tests/people_infer.txt"
 
 
 def test_training():
@@ -84,3 +84,7 @@ def test_training():
     # Start testing with validation data
     data_dev.set_target(truth=True)
     tester.test(model, data_dev)
+
+
+if __name__ == "__main__":
+    test_training()