From f2fc98b5e6ea9e5209526c4b3f49bfb51d86a323 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Sun, 16 Sep 2018 14:08:52 +0800 Subject: [PATCH] add Field support in Predictor: - apply DataSet in Predictor; remove sub-predictors; add "task" argument to specify which task to predict, as how Trainer/Tester did. - remove Action class - add helper function for DataSet, to create DataSet easily - more code comments - clean up unnecessary codes - add unit tests for Batch, Predictor, Preprocessor, Trainer, Tester --- fastNLP/core/README.md | 1 - fastNLP/core/action.py | 83 +------------ fastNLP/core/dataset.py | 69 +++++++++++ fastNLP/core/field.py | 7 +- fastNLP/core/instance.py | 2 +- fastNLP/core/predictor.py | 177 ++++++++-------------------- fastNLP/core/tester.py | 13 +- fastNLP/core/trainer.py | 18 ++- fastNLP/models/sequence_modeling.py | 2 + test/core/test_action.py | 17 --- test/core/test_batch.py | 62 ++++++++++ test/core/test_predictor.py | 51 ++++++++ test/core/test_preprocess.py | 69 +++++++---- test/core/test_tester.py | 78 +++++++----- test/core/test_trainer.py | 51 +++++--- test/model/test_charlm.py | 8 -- test/model/test_seq_label.py | 96 +++++++++++++++ 17 files changed, 489 insertions(+), 315 deletions(-) delete mode 100644 fastNLP/core/README.md delete mode 100644 test/core/test_action.py create mode 100644 test/core/test_batch.py create mode 100644 test/core/test_predictor.py delete mode 100644 test/model/test_charlm.py create mode 100644 test/model/test_seq_label.py diff --git a/fastNLP/core/README.md b/fastNLP/core/README.md deleted file mode 100644 index 8b137891..00000000 --- a/fastNLP/core/README.md +++ /dev/null @@ -1 +0,0 @@ - diff --git a/fastNLP/core/action.py b/fastNLP/core/action.py index d36d350d..c6cf2d63 100644 --- a/fastNLP/core/action.py +++ b/fastNLP/core/action.py @@ -4,88 +4,6 @@ import numpy as np import torch -class Action(object): - """Operations shared by Trainer, Tester, or Inference. - - This is designed for reducing replicate codes. - - make_batch: produce a min-batch of data. @staticmethod - - pad: padding method used in sequence modeling. @staticmethod - - mode: change network mode for either train or test. (for PyTorch) @staticmethod - """ - - def __init__(self): - super(Action, self).__init__() - - @staticmethod - def make_batch(iterator, use_cuda, output_length=True, max_len=None): - """Batch and Pad data. - - :param iterator: an iterator, (object that implements __next__ method) which returns the next sample. - :param use_cuda: bool, whether to use GPU - :param output_length: bool, whether to output the original length of the sequence before padding. (default: True) - :param max_len: int, maximum sequence length. Longer sequences will be clipped. (default: None) - :return : - - if output_length is True, - (batch_x, seq_len): tuple of two elements - batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len] - seq_len: list. The length of the pre-padded sequence, if output_length is True. - batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels] - - if output_length is False, - batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len] - batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels] - """ - for batch in iterator: - batch_x = [sample[0] for sample in batch] - batch_y = [sample[1] for sample in batch] - - batch_x = Action.pad(batch_x) - # pad batch_y only if it is a 2-level list - if len(batch_y) > 0 and isinstance(batch_y[0], list): - batch_y = Action.pad(batch_y) - - # convert list to tensor - batch_x = convert_to_torch_tensor(batch_x, use_cuda) - batch_y = convert_to_torch_tensor(batch_y, use_cuda) - - # trim data to max_len - if max_len is not None and batch_x.size(1) > max_len: - batch_x = batch_x[:, :max_len] - - if output_length: - seq_len = [len(x) for x in batch_x] - yield (batch_x, seq_len), batch_y - else: - yield batch_x, batch_y - - @staticmethod - def pad(batch, fill=0): - """ Pad a mini-batch of sequence samples to maximum length of this batch. - - :param batch: list of list - :param fill: word index to pad, default 0. - :return batch: a padded mini-batch - """ - max_length = max([len(x) for x in batch]) - for idx, sample in enumerate(batch): - if len(sample) < max_length: - batch[idx] = sample + ([fill] * (max_length - len(sample))) - return batch - - @staticmethod - def mode(model, is_test=False): - """Train mode or Test mode. This is for PyTorch currently. - - :param model: a PyTorch model - :param is_test: bool, whether in test mode or not. - """ - if is_test: - model.eval() - else: - model.train() - - def convert_to_torch_tensor(data_list, use_cuda): """Convert lists into (cuda) Tensors. @@ -224,6 +142,7 @@ class BucketBatchifier(Batchifier): """Partition all samples into multiple buckets, each of which contains sentences of approximately the same length. In sampling, first random choose a bucket. Then sample data from it. The number of buckets is decided dynamically by the variance of sentence lengths. + TODO: merge it into Batch """ def __init__(self, data_set, batch_size, num_buckets, drop_last=True, sampler=None): diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 5f749795..bb1a1890 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -1,8 +1,77 @@ from collections import defaultdict +from fastNLP.core.field import TextField +from fastNLP.core.instance import Instance + + +def create_dataset_from_lists(str_lists: list, word_vocab: dict, has_target: bool = False, label_vocab: dict = None): + if has_target is True: + if label_vocab is None: + raise RuntimeError("Must provide label vocabulary to transform labels.") + return create_labeled_dataset_from_lists(str_lists, word_vocab, label_vocab) + else: + return create_unlabeled_dataset_from_lists(str_lists, word_vocab) + + +def create_labeled_dataset_from_lists(str_lists, word_vocab, label_vocab): + """Create an DataSet instance that contains labels. + + :param str_lists: list of list of strings, [num_examples, 2, *]. + :: + [ + [[word_11, word_12, ...], [label_11, label_12, ...]], + ... + ] + + :param word_vocab: dict of (str: int), which means (word: index). + :param label_vocab: dict of (str: int), which means (word: index). + :return data_set: a DataSet instance. + + """ + data_set = DataSet() + for example in str_lists: + word_seq, label_seq = example[0], example[1] + x = TextField(word_seq, is_target=False) + y = TextField(label_seq, is_target=True) + data_set.append(Instance(word_seq=x, label_seq=y)) + data_set.index_field("word_seq", word_vocab) + data_set.index_field("label_seq", label_vocab) + return data_set + + +def create_unlabeled_dataset_from_lists(str_lists, word_vocab): + """Create an DataSet instance that contains no labels. + + :param str_lists: list of list of strings, [num_examples, *]. + :: + [ + [word_11, word_12, ...], + ... + ] + + :param word_vocab: dict of (str: int), which means (word: index). + :return data_set: a DataSet instance. + + """ + data_set = DataSet() + for word_seq in str_lists: + x = TextField(word_seq, is_target=False) + data_set.append(Instance(word_seq=x)) + data_set.index_field("word_seq", word_vocab) + return data_set + class DataSet(list): + """A DataSet object is a list of Instance objects. + + """ def __init__(self, name="", instances=None): + """ + + :param name: str, the name of the dataset. (default: "") + :param instances: list of Instance objects. (default: None) + + """ list.__init__([]) self.name = name if instances is not None: diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index 1efa759a..f5347bd6 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -20,9 +20,10 @@ class Field(object): class TextField(Field): - def __init__(self, text: list, is_target): + def __init__(self, text, is_target): """ - :param list text: + :param text: list of strings + :param is_target: bool """ super(TextField, self).__init__(is_target) self.text = text @@ -32,7 +33,7 @@ class TextField(Field): if self._index is None: self._index = [vocab[c] for c in self.text] else: - print('error') + raise RuntimeError("Replicate indexing of this field.") return self._index def get_length(self): diff --git a/fastNLP/core/instance.py b/fastNLP/core/instance.py index 3322e576..32f95197 100644 --- a/fastNLP/core/instance.py +++ b/fastNLP/core/instance.py @@ -41,7 +41,7 @@ class Instance(object): :param padding_length: dict of (str: int), which means (field name: padding_length of this field) :return tensor_x: dict of (str: torch.LongTensor), which means (field name: tensor of shape [padding_length, ]) tensor_y: dict of (str: torch.LongTensor), which means (field name: tensor of shape [padding_length, ]) - + If is_target is False for all fields, tensor_y would be an empty dict. """ tensor_x = {} tensor_y = {} diff --git a/fastNLP/core/predictor.py b/fastNLP/core/predictor.py index d04a6ef0..802661ef 100644 --- a/fastNLP/core/predictor.py +++ b/fastNLP/core/predictor.py @@ -1,53 +1,10 @@ import numpy as np import torch -from fastNLP.core.action import Batchifier, SequentialSampler -from fastNLP.core.action import convert_to_torch_tensor -from fastNLP.core.preprocess import load_pickle, DEFAULT_UNKNOWN_LABEL -from fastNLP.modules import utils - - -def make_batch(iterator, use_cuda, output_length=False, max_len=None, min_len=None): - """Batch and Pad data, only for Inference. - - :param iterator: An iterable object that returns a list of indices representing a mini-batch of samples. - :param use_cuda: bool, whether to use GPU - :param output_length: bool, whether to output the original length of the sequence before padding. (default: False) - :param max_len: int, maximum sequence length. Longer sequences will be clipped. (default: None) - :param min_len: int, minimum sequence length. Shorter sequences will be padded. (default: None) - :return: - """ - for batch_x in iterator: - batch_x = pad(batch_x) - # convert list to tensor - batch_x = convert_to_torch_tensor(batch_x, use_cuda) - - # trim data to max_len - if max_len is not None and batch_x.size(1) > max_len: - batch_x = batch_x[:, :max_len] - if min_len is not None and batch_x.size(1) < min_len: - pad_tensor = torch.zeros(batch_x.size(0), min_len - batch_x.size(1)).to(batch_x) - batch_x = torch.cat((batch_x, pad_tensor), 1) - - if output_length: - seq_len = [len(x) for x in batch_x] - yield tuple([batch_x, seq_len]) - else: - yield batch_x - - -def pad(batch, fill=0): - """ Pad a mini-batch of sequence samples to maximum length of this batch. - - :param batch: list of list - :param fill: word index to pad, default 0. - :return batch: a padded mini-batch - """ - max_length = max([len(x) for x in batch]) - for idx, sample in enumerate(batch): - if len(sample) < max_length: - batch[idx] = sample + ([fill] * (max_length - len(sample))) - return batch +from fastNLP.core.action import SequentialSampler +from fastNLP.core.batch import Batch +from fastNLP.core.dataset import create_dataset_from_lists +from fastNLP.core.preprocess import load_pickle class Predictor(object): @@ -59,11 +16,17 @@ class Predictor(object): Currently, Predictor does not support GPU. """ - def __init__(self, pickle_path): + def __init__(self, pickle_path, task): + """ + + :param pickle_path: str, the path to the pickle files. + :param task: str, specify which task the predictor will perform. One of ("seq_label", "text_classify"). + + """ self.batch_size = 1 self.batch_output = [] - self.iterator = None self.pickle_path = pickle_path + self._task = task # one of ("seq_label", "text_classify") self.index2label = load_pickle(self.pickle_path, "id2class.pkl") self.word2index = load_pickle(self.pickle_path, "word2id.pkl") @@ -71,19 +34,19 @@ class Predictor(object): """Perform inference using the trained model. :param network: a PyTorch model (cpu) - :param data: list of list of strings + :param data: list of list of strings, [num_examples, seq_len] :return: list of list of strings, [num_examples, tag_seq_length] """ - # transform strings into indices + # transform strings into DataSet object data = self.prepare_input(data) # turn on the testing mode; clean up the history self.mode(network, test=True) self.batch_output.clear() - data_iterator = iter(Batchifier(SequentialSampler(data), self.batch_size, drop_last=False)) + data_iterator = Batch(data, batch_size=self.batch_size, sampler=SequentialSampler(), use_cuda=False) - for batch_x in self.make_batch(data_iterator, use_cuda=False): + for batch_x, _ in data_iterator: with torch.no_grad(): prediction = self.data_forward(network, batch_x) @@ -99,103 +62,61 @@ class Predictor(object): def data_forward(self, network, x): """Forward through network.""" - raise NotImplementedError - - def make_batch(self, iterator, use_cuda): - raise NotImplementedError + y = network(**x) + if self._task == "seq_label": + y = network.prediction(y) + return y def prepare_input(self, data): - """Transform two-level list of strings into that of index. + """Transform two-level list of strings into an DataSet object. + In the training pipeline, this is done by Preprocessor. But in inference time, we do not call Preprocessor. - :param data: + :param data: list of list of strings. + :: [ [word_11, word_12, ...], [word_21, word_22, ...], ... ] - :return data_index: list of list of int. + + :return data_set: a DataSet instance. """ assert isinstance(data, list) - data_index = [] - default_unknown_index = self.word2index[DEFAULT_UNKNOWN_LABEL] - for example in data: - data_index.append([self.word2index.get(w, default_unknown_index) for w in example]) - return data_index + return create_dataset_from_lists(data, self.word2index, has_target=False) def prepare_output(self, data): """Transform list of batch outputs into strings.""" - raise NotImplementedError - - -class SeqLabelInfer(Predictor): - """ - Inference on sequence labeling models. - """ - - def __init__(self, pickle_path): - super(SeqLabelInfer, self).__init__(pickle_path) + if self._task == "seq_label": + return self._seq_label_prepare_output(data) + elif self._task == "text_classify": + return self._text_classify_prepare_output(data) + else: + raise NotImplementedError("Unknown task type {}".format(self._task)) - def data_forward(self, network, inputs): - """ - This is only for sequence labeling with CRF decoder. - :param network: a PyTorch model - :param inputs: tuple of (x, seq_len) - x: Tensor of shape [batch_size, max_len], where max_len is the maximum length of the mini-batch - after padding. - seq_len: list of int, the lengths of sequences before padding. - :return prediction: Tensor of shape [batch_size, max_len] - """ - if not isinstance(inputs[1], list) and isinstance(inputs[0], list): - raise RuntimeError("output_length must be true for sequence modeling.") - # unpack the returned value from make_batch - x, seq_len = inputs[0], inputs[1] - batch_size, max_len = x.size(0), x.size(1) - mask = utils.seq_mask(seq_len, max_len) - mask = mask.byte().view(batch_size, max_len) - y = network(x) - prediction = network.prediction(y, mask) - return torch.Tensor(prediction) - - def make_batch(self, iterator, use_cuda): - return make_batch(iterator, use_cuda, output_length=True) - - def prepare_output(self, batch_outputs): - """Transform list of batch outputs into strings. - - :param batch_outputs: list of 2-D Tensor, shape [num_batch, batch-size, tag_seq_length]. - :return results: 2-D list of strings, shape [num_examples, tag_seq_length] - """ + def _seq_label_prepare_output(self, batch_outputs): results = [] for batch in batch_outputs: for example in np.array(batch): results.append([self.index2label[int(x)] for x in example]) return results - -class ClassificationInfer(Predictor): - """ - Inference on Classification models. - """ - - def __init__(self, pickle_path): - super(ClassificationInfer, self).__init__(pickle_path) - - def data_forward(self, network, x): - """Forward through network.""" - logits = network(x) - return logits - - def make_batch(self, iterator, use_cuda): - return make_batch(iterator, use_cuda, output_length=False, min_len=5) - - def prepare_output(self, batch_outputs): - """ - Transform list of batch outputs into strings. - :param batch_outputs: list of 2-D Tensor, of shape [num_batch, batch-size, num_classes]. - :return results: list of strings - """ + def _text_classify_prepare_output(self, batch_outputs): results = [] for batch_out in batch_outputs: idx = np.argmax(batch_out.detach().numpy(), axis=-1) results.extend([self.index2label[i] for i in idx]) return results + + +class SeqLabelInfer(Predictor): + def __init__(self, pickle_path): + print( + "[FastNLP Warning] SeqLabelInfer will be deprecated. Please use Predictor with argument 'task'='seq_label'.") + super(SeqLabelInfer, self).__init__(pickle_path, "seq_label") + + +class ClassificationInfer(Predictor): + def __init__(self, pickle_path): + print( + "[FastNLP Warning] ClassificationInfer will be deprecated. Please use Predictor with argument 'task'='text_classify'.") + super(ClassificationInfer, self).__init__(pickle_path, "text_classify") diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index f23ab704..aaa96283 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -1,7 +1,6 @@ import numpy as np import torch -from fastNLP.core.action import Action from fastNLP.core.action import RandomSampler from fastNLP.core.batch import Batch from fastNLP.saver.logger import create_logger @@ -79,7 +78,7 @@ class BaseTester(object): self._model = network # turn on the testing mode; clean up the history - self.mode(network, test=True) + self.mode(network, is_test=True) self.eval_history.clear() self.batch_output.clear() @@ -102,13 +101,17 @@ class BaseTester(object): print(self.make_eval_output(prediction, eval_results)) step += 1 - def mode(self, model, test): + def mode(self, model, is_test=False): """Train mode or Test mode. This is for PyTorch currently. :param model: a PyTorch model - :param test: bool, whether in test mode. + :param is_test: bool, whether in test mode or not. + """ - Action.mode(model, test) + if is_test: + model.eval() + else: + model.train() def data_forward(self, network, x): """A forward pass of the model. """ diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 1405f156..e638fdde 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -6,7 +6,6 @@ from datetime import timedelta import torch from tensorboardX import SummaryWriter -from fastNLP.core.action import Action from fastNLP.core.action import RandomSampler from fastNLP.core.batch import Batch from fastNLP.core.loss import Loss @@ -126,7 +125,7 @@ class BaseTrainer(object): logger.info("training epoch {}".format(epoch)) # turn on network training mode - self.mode(network, test=False) + self.mode(network, is_test=False) # prepare mini-batch iterator data_iterator = Batch(train_data, batch_size=self.batch_size, sampler=RandomSampler(), use_cuda=self.use_cuda) @@ -201,8 +200,17 @@ class BaseTrainer(object): network_copy = copy.deepcopy(network) self.train(network_copy, train_data_cv[i], dev_data_cv[i]) - def mode(self, network, test): - Action.mode(network, test) + def mode(self, model, is_test=False): + """Train mode or Test mode. This is for PyTorch currently. + + :param model: a PyTorch model + :param is_test: bool, whether in test mode or not. + + """ + if is_test: + model.eval() + else: + model.train() def define_optimizer(self): """Define framework-specific optimizer specified by the models. @@ -284,7 +292,7 @@ class BaseTrainer(object): :param validator: a Tester instance :return: bool, True means current results on dev set is the best. """ - loss, accuracy = validator.metrics() + loss, accuracy = validator.metrics if accuracy > self._best_accuracy: self._best_accuracy = accuracy return True diff --git a/fastNLP/models/sequence_modeling.py b/fastNLP/models/sequence_modeling.py index 8d194947..c2bcc693 100644 --- a/fastNLP/models/sequence_modeling.py +++ b/fastNLP/models/sequence_modeling.py @@ -62,6 +62,8 @@ class SeqLabeling(BaseModel): """ x = x.float() y = y.long() + assert x.shape[:2] == y.shape + assert y.shape == self.mask.shape total_loss = self.Crf(x, y, self.mask) return torch.mean(total_loss) diff --git a/test/core/test_action.py b/test/core/test_action.py deleted file mode 100644 index 8d0f628b..00000000 --- a/test/core/test_action.py +++ /dev/null @@ -1,17 +0,0 @@ -import unittest - -from fastNLP.core.action import Action, Batchifier, SequentialSampler - - -class TestAction(unittest.TestCase): - def test_case_1(self): - x = [1, 2, 3, 4, 5, 6, 7, 8] - y = [1, 1, 1, 1, 2, 2, 2, 2] - data = [] - for i in range(len(x)): - data.append([[x[i]], [y[i]]]) - data = Batchifier(SequentialSampler(data), batch_size=2, drop_last=False) - action = Action() - for batch_x in action.make_batch(data, use_cuda=False, output_length=True, max_len=None): - print(batch_x) - diff --git a/test/core/test_batch.py b/test/core/test_batch.py new file mode 100644 index 00000000..395aeb2b --- /dev/null +++ b/test/core/test_batch.py @@ -0,0 +1,62 @@ +import unittest + +import torch + +from fastNLP.core.batch import Batch +from fastNLP.core.dataset import DataSet, create_dataset_from_lists +from fastNLP.core.field import TextField, LabelField +from fastNLP.core.instance import Instance + +raw_texts = ["i am a cat", + "this is a test of new batch", + "ha ha", + "I am a good boy .", + "This is the most beautiful girl ." + ] +texts = [text.strip().split() for text in raw_texts] +labels = [0, 1, 0, 0, 1] + +# prepare vocabulary +vocab = {} +for text in texts: + for tokens in text: + if tokens not in vocab: + vocab[tokens] = len(vocab) + + +class TestCase1(unittest.TestCase): + def test(self): + data = DataSet() + for text, label in zip(texts, labels): + x = TextField(text, is_target=False) + y = LabelField(label, is_target=True) + ins = Instance(text=x, label=y) + data.append(ins) + + # use vocabulary to index data + data.index_field("text", vocab) + + # define naive sampler for batch class + class SeqSampler: + def __call__(self, dataset): + return list(range(len(dataset))) + + # use batch to iterate dataset + data_iterator = Batch(data, 2, SeqSampler(), False) + for batch_x, batch_y in data_iterator: + self.assertEqual(len(batch_x), 2) + self.assertTrue(isinstance(batch_x, dict)) + self.assertTrue(isinstance(batch_x["text"], torch.LongTensor)) + self.assertTrue(isinstance(batch_y, dict)) + self.assertTrue(isinstance(batch_y["label"], torch.LongTensor)) + + +class TestCase2(unittest.TestCase): + def test(self): + data = DataSet() + for text in texts: + x = TextField(text, is_target=False) + ins = Instance(text=x) + data.append(ins) + data_set = create_dataset_from_lists(texts, vocab, has_target=False) + self.assertTrue(type(data) == type(data_set)) diff --git a/test/core/test_predictor.py b/test/core/test_predictor.py new file mode 100644 index 00000000..c7ad65d7 --- /dev/null +++ b/test/core/test_predictor.py @@ -0,0 +1,51 @@ +import os +import unittest + +from fastNLP.core.predictor import Predictor +from fastNLP.core.preprocess import save_pickle +from fastNLP.models.sequence_modeling import SeqLabeling + + +class TestPredictor(unittest.TestCase): + def test_seq_label(self): + model_args = { + "vocab_size": 10, + "word_emb_dim": 100, + "rnn_hidden_units": 100, + "num_classes": 5 + } + + infer_data = [ + ['a', 'b', 'c', 'd', 'e'], + ['a', '@', 'c', 'd', 'e'], + ['a', 'b', '#', 'd', 'e'], + ['a', 'b', 'c', '?', 'e'], + ['a', 'b', 'c', 'd', '$'], + ['!', 'b', 'c', 'd', 'e'] + ] + vocab = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, '!': 5, '@': 6, '#': 7, '$': 8, '?': 9} + + os.system("mkdir save") + save_pickle({0: "0", 1: "1", 2: "2", 3: "3", 4: "4"}, "./save/", "id2class.pkl") + save_pickle(vocab, "./save/", "word2id.pkl") + + model = SeqLabeling(model_args) + predictor = Predictor("./save/", task="seq_label") + + results = predictor.predict(network=model, data=infer_data) + + self.assertTrue(isinstance(results, list)) + self.assertGreater(len(results), 0) + for res in results: + self.assertTrue(isinstance(res, list)) + self.assertEqual(len(res), 5) + self.assertTrue(isinstance(res[0], str)) + + os.system("rm -rf save") + print("pickle path deleted") + + +class TestPredictor2(unittest.TestCase): + def test_text_classify(self): + # TODO + pass diff --git a/test/core/test_preprocess.py b/test/core/test_preprocess.py index bff33ed3..05c04ce9 100644 --- a/test/core/test_preprocess.py +++ b/test/core/test_preprocess.py @@ -1,24 +1,25 @@ import os import unittest +from fastNLP.core.dataset import DataSet from fastNLP.core.preprocess import SeqLabelPreprocess +data = [ + [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']], + [['Hello', 'world', '!'], ['a', 'n', '.']], + [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']], + [['Hello', 'world', '!'], ['a', 'n', '.']], + [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']], + [['Hello', 'world', '!'], ['a', 'n', '.']], + [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']], + [['Hello', 'world', '!'], ['a', 'n', '.']], + [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']], + [['Hello', 'world', '!'], ['a', 'n', '.']], +] -class TestSeqLabelPreprocess(unittest.TestCase): - def test_case_1(self): - data = [ - [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']], - [['Hello', 'world', '!'], ['a', 'n', '.']], - [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']], - [['Hello', 'world', '!'], ['a', 'n', '.']], - [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']], - [['Hello', 'world', '!'], ['a', 'n', '.']], - [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']], - [['Hello', 'world', '!'], ['a', 'n', '.']], - [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']], - [['Hello', 'world', '!'], ['a', 'n', '.']], - ] +class TestCase1(unittest.TestCase): + def test(self): if os.path.exists("./save"): for root, dirs, files in os.walk("./save", topdown=False): for name in files: @@ -27,17 +28,45 @@ class TestSeqLabelPreprocess(unittest.TestCase): os.rmdir(os.path.join(root, name)) result = SeqLabelPreprocess().run(train_dev_data=data, train_dev_split=0.4, pickle_path="./save") - result = SeqLabelPreprocess().run(train_dev_data=data, train_dev_split=0.4, - pickle_path="./save") + self.assertEqual(len(result), 2) + self.assertEqual(type(result[0]), DataSet) + self.assertEqual(type(result[1]), DataSet) + + os.system("rm -rf save") + print("pickle path deleted") + + +class TestCase2(unittest.TestCase): + def test(self): if os.path.exists("./save"): for root, dirs, files in os.walk("./save", topdown=False): for name in files: os.remove(os.path.join(root, name)) for name in dirs: os.rmdir(os.path.join(root, name)) - result = SeqLabelPreprocess().run(test_data=data, train_dev_data=data, - pickle_path="./save", train_dev_split=0.4, - cross_val=True) result = SeqLabelPreprocess().run(test_data=data, train_dev_data=data, pickle_path="./save", train_dev_split=0.4, - cross_val=True) + cross_val=False) + self.assertEqual(len(result), 3) + self.assertEqual(type(result[0]), DataSet) + self.assertEqual(type(result[1]), DataSet) + self.assertEqual(type(result[2]), DataSet) + + os.system("rm -rf save") + print("pickle path deleted") + + +class TestCase3(unittest.TestCase): + def test(self): + num_folds = 2 + result = SeqLabelPreprocess().run(test_data=None, train_dev_data=data, + pickle_path="./save", train_dev_split=0.4, + cross_val=True, n_fold=num_folds) + self.assertEqual(len(result), 2) + self.assertEqual(len(result[0]), num_folds) + self.assertEqual(len(result[1]), num_folds) + for data_set in result[0] + result[1]: + self.assertEqual(type(data_set), DataSet) + + os.system("rm -rf save") + print("pickle path deleted") diff --git a/test/core/test_tester.py b/test/core/test_tester.py index e4ccf536..aa277b9a 100644 --- a/test/core/test_tester.py +++ b/test/core/test_tester.py @@ -1,37 +1,55 @@ -from fastNLP.core.preprocess import SeqLabelPreprocess +import os +import unittest + +from fastNLP.core.dataset import DataSet +from fastNLP.core.field import TextField +from fastNLP.core.instance import Instance from fastNLP.core.tester import SeqLabelTester -from fastNLP.loader.config_loader import ConfigSection, ConfigLoader -from fastNLP.loader.dataset_loader import TokenizeDatasetLoader from fastNLP.models.sequence_modeling import SeqLabeling data_name = "pku_training.utf8" pickle_path = "data_for_tests" -def foo(): - loader = TokenizeDatasetLoader("./data_for_tests/cws_pku_utf_8") - train_data = loader.load_pku() - - train_args = ConfigSection() - ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS": train_args}) - - # Preprocessor - p = SeqLabelPreprocess() - train_data = p.run(train_data) - train_args["vocab_size"] = p.vocab_size - train_args["num_classes"] = p.num_classes - - model = SeqLabeling(train_args) - - valid_args = {"save_output": True, "validate_in_training": True, "save_dev_input": True, - "save_loss": True, "batch_size": 8, "pickle_path": "./data_for_tests/", - "use_cuda": True} - validator = SeqLabelTester(**valid_args) - - print("start validation.") - validator.test(model, train_data) - print(validator.show_metrics()) - - -if __name__ == "__main__": - foo() +class TestTester(unittest.TestCase): + def test_case_1(self): + model_args = { + "vocab_size": 10, + "word_emb_dim": 100, + "rnn_hidden_units": 100, + "num_classes": 5 + } + valid_args = {"save_output": True, "validate_in_training": True, "save_dev_input": True, + "save_loss": True, "batch_size": 2, "pickle_path": "./save/", + "use_cuda": False, "print_every_step": 1} + + train_data = [ + [['a', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']], + [['a', '@', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']], + [['a', 'b', '#', 'd', 'e'], ['a', '@', 'c', 'd', 'e']], + [['a', 'b', 'c', '?', 'e'], ['a', '@', 'c', 'd', 'e']], + [['a', 'b', 'c', 'd', '$'], ['a', '@', 'c', 'd', 'e']], + [['!', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']], + ] + vocab = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, '!': 5, '@': 6, '#': 7, '$': 8, '?': 9} + label_vocab = {'a': 0, '@': 1, 'c': 2, 'd': 3, 'e': 4} + + data_set = DataSet() + for example in train_data: + text, label = example[0], example[1] + x = TextField(text, False) + y = TextField(label, is_target=True) + ins = Instance(word_seq=x, label_seq=y) + data_set.append(ins) + + data_set.index_field("word_seq", vocab) + data_set.index_field("label_seq", label_vocab) + + model = SeqLabeling(model_args) + + tester = SeqLabelTester(**valid_args) + tester.test(network=model, dev_data=data_set) + # If this can run, everything is OK. + + os.system("rm -rf save") + print("pickle path deleted") diff --git a/test/core/test_trainer.py b/test/core/test_trainer.py index 7db861af..c71cd695 100644 --- a/test/core/test_trainer.py +++ b/test/core/test_trainer.py @@ -1,33 +1,54 @@ import os - -import torch.nn as nn import unittest -from fastNLP.core.trainer import SeqLabelTrainer +from fastNLP.core.dataset import DataSet +from fastNLP.core.field import TextField +from fastNLP.core.instance import Instance from fastNLP.core.loss import Loss from fastNLP.core.optimizer import Optimizer +from fastNLP.core.trainer import SeqLabelTrainer from fastNLP.models.sequence_modeling import SeqLabeling + class TestTrainer(unittest.TestCase): def test_case_1(self): - args = {"epochs": 3, "batch_size": 8, "validate": True, "use_cuda": True, "pickle_path": "./save/", + args = {"epochs": 3, "batch_size": 2, "validate": True, "use_cuda": False, "pickle_path": "./save/", "save_best_dev": True, "model_name": "default_model_name.pkl", "loss": Loss(None), "optimizer": Optimizer("Adam", lr=0.001, weight_decay=0), - "vocab_size": 20, + "vocab_size": 10, "word_emb_dim": 100, "rnn_hidden_units": 100, - "num_classes": 3 + "num_classes": 5 } - trainer = SeqLabelTrainer() + trainer = SeqLabelTrainer(**args) + train_data = [ - [[1, 2, 3, 4, 5, 6], [1, 0, 1, 0, 1, 2]], - [[2, 3, 4, 5, 1, 6], [0, 1, 0, 1, 0, 2]], - [[1, 4, 1, 4, 1, 6], [1, 0, 1, 0, 1, 2]], - [[1, 2, 3, 4, 5, 6], [1, 0, 1, 0, 1, 2]], - [[2, 3, 4, 5, 1, 6], [0, 1, 0, 1, 0, 2]], - [[1, 4, 1, 4, 1, 6], [1, 0, 1, 0, 1, 2]], + [['a', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']], + [['a', '@', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']], + [['a', 'b', '#', 'd', 'e'], ['a', '@', 'c', 'd', 'e']], + [['a', 'b', 'c', '?', 'e'], ['a', '@', 'c', 'd', 'e']], + [['a', 'b', 'c', 'd', '$'], ['a', '@', 'c', 'd', 'e']], + [['!', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']], ] - dev_data = train_data + vocab = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, '!': 5, '@': 6, '#': 7, '$': 8, '?': 9} + label_vocab = {'a': 0, '@': 1, 'c': 2, 'd': 3, 'e': 4} + + data_set = DataSet() + for example in train_data: + text, label = example[0], example[1] + x = TextField(text, False) + y = TextField(label, is_target=True) + ins = Instance(word_seq=x, label_seq=y) + data_set.append(ins) + + data_set.index_field("word_seq", vocab) + data_set.index_field("label_seq", label_vocab) + model = SeqLabeling(args) - trainer.train(network=model, train_data=train_data, dev_data=dev_data) \ No newline at end of file + + trainer.train(network=model, train_data=data_set, dev_data=data_set) + # If this can run, everything is OK. + + os.system("rm -rf save") + print("pickle path deleted") diff --git a/test/model/test_charlm.py b/test/model/test_charlm.py deleted file mode 100644 index e76f6404..00000000 --- a/test/model/test_charlm.py +++ /dev/null @@ -1,8 +0,0 @@ - - -def test_charlm(): - pass - - -if __name__ == "__main__": - test_charlm() diff --git a/test/model/test_seq_label.py b/test/model/test_seq_label.py new file mode 100644 index 00000000..9a0c46cd --- /dev/null +++ b/test/model/test_seq_label.py @@ -0,0 +1,96 @@ +import argparse +import os + +from fastNLP.core.optimizer import Optimizer +from fastNLP.core.preprocess import SeqLabelPreprocess +from fastNLP.core.tester import SeqLabelTester +from fastNLP.core.trainer import SeqLabelTrainer +from fastNLP.loader.config_loader import ConfigLoader, ConfigSection +from fastNLP.loader.dataset_loader import POSDatasetLoader +from fastNLP.loader.model_loader import ModelLoader +from fastNLP.models.sequence_modeling import SeqLabeling +from fastNLP.saver.model_saver import ModelSaver + +parser = argparse.ArgumentParser() +parser.add_argument("-s", "--save", type=str, default="./seq_label/", help="path to save pickle files") +parser.add_argument("-t", "--train", type=str, default="../data_for_tests/people.txt", + help="path to the training data") +parser.add_argument("-c", "--config", type=str, default="../data_for_tests/config", help="path to the config file") +parser.add_argument("-m", "--model_name", type=str, default="seq_label_model.pkl", help="the name of the model") +parser.add_argument("-i", "--infer", type=str, default="../data_for_tests/people_infer.txt", + help="data used for inference") + +args = parser.parse_args() +pickle_path = args.save +model_name = args.model_name +config_dir = args.config +data_path = args.train +data_infer_path = args.infer + + +def test_training(): + # Config Loader + trainer_args = ConfigSection() + model_args = ConfigSection() + ConfigLoader("config.cfg").load_config(config_dir, { + "test_seq_label_trainer": trainer_args, "test_seq_label_model": model_args}) + + # Data Loader + pos_loader = POSDatasetLoader(data_path) + train_data = pos_loader.load_lines() + + # Preprocessor + p = SeqLabelPreprocess() + data_train, data_dev = p.run(train_data, pickle_path=pickle_path, train_dev_split=0.5) + model_args["vocab_size"] = p.vocab_size + model_args["num_classes"] = p.num_classes + + trainer = SeqLabelTrainer( + epochs=trainer_args["epochs"], + batch_size=trainer_args["batch_size"], + validate=False, + use_cuda=False, + pickle_path=pickle_path, + save_best_dev=trainer_args["save_best_dev"], + model_name=model_name, + optimizer=Optimizer("SGD", lr=0.01, momentum=0.9), + ) + + # Model + model = SeqLabeling(model_args) + + # Start training + trainer.train(model, data_train, data_dev) + + # Saver + saver = ModelSaver(os.path.join(pickle_path, model_name)) + saver.save_pytorch(model) + + del model, trainer, pos_loader + + # Define the same model + model = SeqLabeling(model_args) + + # Dump trained parameters into the model + ModelLoader.load_pytorch(model, os.path.join(pickle_path, model_name)) + + # Load test configuration + tester_args = ConfigSection() + ConfigLoader("config.cfg").load_config(config_dir, {"test_seq_label_tester": tester_args}) + + # Tester + tester = SeqLabelTester(save_output=False, + save_loss=True, + save_best_dev=False, + batch_size=4, + use_cuda=False, + pickle_path=pickle_path, + model_name="seq_label_in_test.pkl", + print_every_step=1 + ) + + # Start testing with validation data + tester.test(model, data_dev) + + loss, accuracy = tester.metrics + assert 0 < accuracy < 1