- apply DataSet in Predictor; remove sub-predictors; add a "task" argument to specify which task to predict, as Trainer/Tester do
- remove the Action class
- add helper functions for DataSet, to create a DataSet easily
- more code comments
- clean up unnecessary code
- add unit tests for Batch, Predictor, Preprocessor, Trainer, Tester

Tag: v0.1.0
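As a quick illustration of the headline change, the unified Predictor API introduced here can be exercised as follows. This is a minimal sketch mirroring the new unit tests in this commit; the pickle contents and model sizes are arbitrary placeholders:

    import os

    from fastNLP.core.predictor import Predictor
    from fastNLP.core.preprocess import save_pickle
    from fastNLP.models.sequence_modeling import SeqLabeling

    # Predictor loads these two pickles at construction time.
    os.system("mkdir save")
    save_pickle({0: "0", 1: "1"}, "./save/", "id2class.pkl")
    save_pickle({"a": 0, "b": 1}, "./save/", "word2id.pkl")

    model = SeqLabeling({"vocab_size": 2, "word_emb_dim": 100,
                         "rnn_hidden_units": 100, "num_classes": 2})
    # "task" replaces the removed SeqLabelInfer/ClassificationInfer sub-predictors.
    predictor = Predictor("./save/", task="seq_label")
    results = predictor.predict(network=model, data=[["a", "b"], ["b", "a"]])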
@@ -1 +0,0 @@

@@ -4,88 +4,6 @@ import numpy as np
 import torch

-class Action(object):
-    """Operations shared by Trainer, Tester, or Inference.
-    This is designed to reduce duplicated code.
-    - make_batch: produce a mini-batch of data. @staticmethod
-    - pad: padding method used in sequence modeling. @staticmethod
-    - mode: change network mode for either train or test. (for PyTorch) @staticmethod
-    """
-    def __init__(self):
-        super(Action, self).__init__()
-
-    @staticmethod
-    def make_batch(iterator, use_cuda, output_length=True, max_len=None):
-        """Batch and pad data.
-        :param iterator: an iterator (an object that implements the __next__ method) which returns the next sample.
-        :param use_cuda: bool, whether to use GPU
-        :param output_length: bool, whether to output the original length of the sequence before padding. (default: True)
-        :param max_len: int, maximum sequence length. Longer sequences will be clipped. (default: None)
-        :return:
-            if output_length is True,
-                (batch_x, seq_len): tuple of two elements
-                    batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len]
-                    seq_len: list. The length of the pre-padded sequence, if output_length is True.
-                batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels]
-            if output_length is False,
-                batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len]
-                batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels]
-        """
-        for batch in iterator:
-            batch_x = [sample[0] for sample in batch]
-            batch_y = [sample[1] for sample in batch]
-            batch_x = Action.pad(batch_x)
-            # pad batch_y only if it is a 2-level list
-            if len(batch_y) > 0 and isinstance(batch_y[0], list):
-                batch_y = Action.pad(batch_y)
-            # convert list to tensor
-            batch_x = convert_to_torch_tensor(batch_x, use_cuda)
-            batch_y = convert_to_torch_tensor(batch_y, use_cuda)
-            # trim data to max_len
-            if max_len is not None and batch_x.size(1) > max_len:
-                batch_x = batch_x[:, :max_len]
-            if output_length:
-                seq_len = [len(x) for x in batch_x]
-                yield (batch_x, seq_len), batch_y
-            else:
-                yield batch_x, batch_y
-
-    @staticmethod
-    def pad(batch, fill=0):
-        """Pad a mini-batch of sequence samples to the maximum length of this batch.
-        :param batch: list of list
-        :param fill: word index to pad with. (default: 0)
-        :return batch: a padded mini-batch
-        """
-        max_length = max([len(x) for x in batch])
-        for idx, sample in enumerate(batch):
-            if len(sample) < max_length:
-                batch[idx] = sample + ([fill] * (max_length - len(sample)))
-        return batch
-
-    @staticmethod
-    def mode(model, is_test=False):
-        """Train mode or Test mode. This is for PyTorch currently.
-        :param model: a PyTorch model
-        :param is_test: bool, whether in test mode or not.
-        """
-        if is_test:
-            model.eval()
-        else:
-            model.train()

 def convert_to_torch_tensor(data_list, use_cuda):
     """Convert lists into (cuda) Tensors.

@@ -224,6 +142,7 @@ class BucketBatchifier(Batchifier):
     """Partition all samples into multiple buckets, each of which contains sentences of approximately the same length.
     In sampling, first randomly choose a bucket, then sample data from it.
     The number of buckets is decided dynamically by the variance of sentence lengths.
+    TODO: merge it into Batch
     """

     def __init__(self, data_set, batch_size, num_buckets, drop_last=True, sampler=None):
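The docstring above only describes the sampling scheme. Schematically, length-bucketed batching works like the sketch below; this is illustrative only and not the fastNLP implementation:

    import random

    def bucket_batches(lengths, batch_size, num_buckets):
        # Illustrative only: sort sample indices by length, split them into
        # buckets of similar length, then repeatedly pick a random bucket and
        # draw a mini-batch from inside it.
        order = sorted(range(len(lengths)), key=lambda i: lengths[i])
        size = max(1, len(order) // num_buckets)
        buckets = [order[i:i + size] for i in range(0, len(order), size)]
        while buckets:
            bucket = random.choice(buckets)   # first randomly choose a bucket
            random.shuffle(bucket)
            yield bucket[:batch_size]         # then sample data from it
            del bucket[:batch_size]
            buckets = [b for b in buckets if b]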
@@ -1,8 +1,77 @@
 from collections import defaultdict

+from fastNLP.core.field import TextField
+from fastNLP.core.instance import Instance
+
+def create_dataset_from_lists(str_lists: list, word_vocab: dict, has_target: bool = False, label_vocab: dict = None):
+    if has_target is True:
+        if label_vocab is None:
+            raise RuntimeError("Must provide label vocabulary to transform labels.")
+        return create_labeled_dataset_from_lists(str_lists, word_vocab, label_vocab)
+    else:
+        return create_unlabeled_dataset_from_lists(str_lists, word_vocab)
+
+def create_labeled_dataset_from_lists(str_lists, word_vocab, label_vocab):
+    """Create a DataSet instance that contains labels.
+    :param str_lists: list of list of strings, [num_examples, 2, *].
+        ::
+        [
+            [[word_11, word_12, ...], [label_11, label_12, ...]],
+            ...
+        ]
+    :param word_vocab: dict of (str: int), which means (word: index).
+    :param label_vocab: dict of (str: int), which means (label: index).
+    :return data_set: a DataSet instance.
+    """
+    data_set = DataSet()
+    for example in str_lists:
+        word_seq, label_seq = example[0], example[1]
+        x = TextField(word_seq, is_target=False)
+        y = TextField(label_seq, is_target=True)
+        data_set.append(Instance(word_seq=x, label_seq=y))
+    data_set.index_field("word_seq", word_vocab)
+    data_set.index_field("label_seq", label_vocab)
+    return data_set
+
+def create_unlabeled_dataset_from_lists(str_lists, word_vocab):
+    """Create a DataSet instance that contains no labels.
+    :param str_lists: list of list of strings, [num_examples, *].
+        ::
+        [
+            [word_11, word_12, ...],
+            ...
+        ]
+    :param word_vocab: dict of (str: int), which means (word: index).
+    :return data_set: a DataSet instance.
+    """
+    data_set = DataSet()
+    for word_seq in str_lists:
+        x = TextField(word_seq, is_target=False)
+        data_set.append(Instance(word_seq=x))
+    data_set.index_field("word_seq", word_vocab)
+    return data_set
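Usage of these helpers is straightforward; the vocabularies below are toy examples:

    from fastNLP.core.dataset import create_dataset_from_lists

    word_vocab = {"Hello": 0, "world": 1, "!": 2}
    label_vocab = {"a": 0, "n": 1, ".": 2}

    # unlabeled: one token list per example
    unlabeled = create_dataset_from_lists([["Hello", "world", "!"]], word_vocab)

    # labeled: each example is [token list, label list]
    labeled = create_dataset_from_lists(
        [[["Hello", "world", "!"], ["a", "n", "."]]],
        word_vocab, has_target=True, label_vocab=label_vocab)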
 class DataSet(list):
+    """A DataSet object is a list of Instance objects.
+    """
+
     def __init__(self, name="", instances=None):
+        """
+        :param name: str, the name of the dataset. (default: "")
+        :param instances: list of Instance objects. (default: None)
+        """
         list.__init__([])
         self.name = name
         if instances is not None:

@@ -20,9 +20,10 @@ class Field(object):
 class TextField(Field):
-    def __init__(self, text: list, is_target):
+    def __init__(self, text, is_target):
         """
-        :param list text:
+        :param text: list of strings
+        :param is_target: bool
         """
         super(TextField, self).__init__(is_target)
         self.text = text

@@ -32,7 +33,7 @@ class TextField(Field):
         if self._index is None:
             self._index = [vocab[c] for c in self.text]
         else:
-            print('error')
+            raise RuntimeError("Replicate indexing of this field.")
         return self._index

     def get_length(self):

@@ -41,7 +41,7 @@ class Instance(object):
         :param padding_length: dict of (str: int), which means (field name: padding_length of this field)
         :return tensor_x: dict of (str: torch.LongTensor), which means (field name: tensor of shape [padding_length, ])
                tensor_y: dict of (str: torch.LongTensor), which means (field name: tensor of shape [padding_length, ])
+            If is_target is False for all fields, tensor_y would be an empty dict.
         """
         tensor_x = {}
         tensor_y = {}
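The hunk above does not show the method's def line. Assuming the method is named Instance.to_tensor, as the parameter list suggests, the documented contract would look like the following hypothetical call; both the name and the call shape are assumptions, not shown in this hunk:

    from fastNLP.core.field import TextField
    from fastNLP.core.instance import Instance

    ins = Instance(word_seq=TextField(["Hello", "world"], is_target=False))
    # Hypothetical entry point; `to_tensor` is inferred from the docstring.
    tensor_x, tensor_y = ins.to_tensor(padding_length={"word_seq": 4})
    # tensor_x: {"word_seq": LongTensor of shape [4]}, padded to length 4
    # tensor_y: {} because no field was constructed with is_target=True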
@@ -1,53 +1,10 @@
 import numpy as np
 import torch

-from fastNLP.core.action import Batchifier, SequentialSampler
-from fastNLP.core.action import convert_to_torch_tensor
-from fastNLP.core.preprocess import load_pickle, DEFAULT_UNKNOWN_LABEL
-from fastNLP.modules import utils
-
-def make_batch(iterator, use_cuda, output_length=False, max_len=None, min_len=None):
-    """Batch and pad data, for inference only.
-    :param iterator: an iterable object that returns a list of indices representing a mini-batch of samples.
-    :param use_cuda: bool, whether to use GPU
-    :param output_length: bool, whether to output the original length of the sequence before padding. (default: False)
-    :param max_len: int, maximum sequence length. Longer sequences will be clipped. (default: None)
-    :param min_len: int, minimum sequence length. Shorter sequences will be padded. (default: None)
-    :return:
-    """
-    for batch_x in iterator:
-        batch_x = pad(batch_x)
-        # convert list to tensor
-        batch_x = convert_to_torch_tensor(batch_x, use_cuda)
-        # trim data to max_len
-        if max_len is not None and batch_x.size(1) > max_len:
-            batch_x = batch_x[:, :max_len]
-        if min_len is not None and batch_x.size(1) < min_len:
-            pad_tensor = torch.zeros(batch_x.size(0), min_len - batch_x.size(1)).to(batch_x)
-            batch_x = torch.cat((batch_x, pad_tensor), 1)
-        if output_length:
-            seq_len = [len(x) for x in batch_x]
-            yield tuple([batch_x, seq_len])
-        else:
-            yield batch_x
-
-def pad(batch, fill=0):
-    """Pad a mini-batch of sequence samples to the maximum length of this batch.
-    :param batch: list of list
-    :param fill: word index to pad with. (default: 0)
-    :return batch: a padded mini-batch
-    """
-    max_length = max([len(x) for x in batch])
-    for idx, sample in enumerate(batch):
-        if len(sample) < max_length:
-            batch[idx] = sample + ([fill] * (max_length - len(sample)))
-    return batch
+from fastNLP.core.action import SequentialSampler
+from fastNLP.core.batch import Batch
+from fastNLP.core.dataset import create_dataset_from_lists
+from fastNLP.core.preprocess import load_pickle

 class Predictor(object):

@@ -59,11 +16,17 @@ class Predictor(object):
     Currently, Predictor does not support GPU.
     """

-    def __init__(self, pickle_path):
+    def __init__(self, pickle_path, task):
+        """
+        :param pickle_path: str, the path to the pickle files.
+        :param task: str, specify which task the predictor will perform. One of ("seq_label", "text_classify").
+        """
         self.batch_size = 1
         self.batch_output = []
-        self.iterator = None
         self.pickle_path = pickle_path
+        self._task = task  # one of ("seq_label", "text_classify")
         self.index2label = load_pickle(self.pickle_path, "id2class.pkl")
         self.word2index = load_pickle(self.pickle_path, "word2id.pkl")

@@ -71,19 +34,19 @@ class Predictor(object):
         """Perform inference using the trained model.
         :param network: a PyTorch model (cpu)
-        :param data: list of list of strings
+        :param data: list of list of strings, [num_examples, seq_len]
         :return: list of list of strings, [num_examples, tag_seq_length]
         """
-        # transform strings into indices
+        # transform strings into a DataSet object
         data = self.prepare_input(data)
         # turn on the testing mode; clean up the history
         self.mode(network, test=True)
         self.batch_output.clear()
-        data_iterator = iter(Batchifier(SequentialSampler(data), self.batch_size, drop_last=False))
+        data_iterator = Batch(data, batch_size=self.batch_size, sampler=SequentialSampler(), use_cuda=False)
-        for batch_x in self.make_batch(data_iterator, use_cuda=False):
+        for batch_x, _ in data_iterator:
             with torch.no_grad():
                 prediction = self.data_forward(network, batch_x)

@@ -99,103 +62,61 @@ class Predictor(object):
     def data_forward(self, network, x):
         """Forward through network."""
-        raise NotImplementedError
-
-    def make_batch(self, iterator, use_cuda):
-        raise NotImplementedError
+        y = network(**x)
+        if self._task == "seq_label":
+            y = network.prediction(y)
+        return y

     def prepare_input(self, data):
-        """Transform a two-level list of strings into that of index.
+        """Transform a two-level list of strings into a DataSet object.
+        In the training pipeline, this is done by Preprocessor. But at inference time, we do not call Preprocessor.
-        :param data:
+        :param data: list of list of strings.
+            ::
             [
                 [word_11, word_12, ...],
                 [word_21, word_22, ...],
                 ...
             ]
-        :return data_index: list of list of int.
+        :return data_set: a DataSet instance.
         """
         assert isinstance(data, list)
-        data_index = []
-        default_unknown_index = self.word2index[DEFAULT_UNKNOWN_LABEL]
-        for example in data:
-            data_index.append([self.word2index.get(w, default_unknown_index) for w in example])
-        return data_index
+        return create_dataset_from_lists(data, self.word2index, has_target=False)

     def prepare_output(self, data):
         """Transform list of batch outputs into strings."""
-        raise NotImplementedError
-
-class SeqLabelInfer(Predictor):
-    """
-    Inference on sequence labeling models.
-    """
-    def __init__(self, pickle_path):
-        super(SeqLabelInfer, self).__init__(pickle_path)
+        if self._task == "seq_label":
+            return self._seq_label_prepare_output(data)
+        elif self._task == "text_classify":
+            return self._text_classify_prepare_output(data)
+        else:
+            raise NotImplementedError("Unknown task type {}".format(self._task))

-    def data_forward(self, network, inputs):
-        """
-        This is only for sequence labeling with a CRF decoder.
-        :param network: a PyTorch model
-        :param inputs: tuple of (x, seq_len)
-            x: Tensor of shape [batch_size, max_len], where max_len is the maximum length of the mini-batch
-                after padding.
-            seq_len: list of int, the lengths of sequences before padding.
-        :return prediction: Tensor of shape [batch_size, max_len]
-        """
-        if not isinstance(inputs[1], list) and isinstance(inputs[0], list):
-            raise RuntimeError("output_length must be true for sequence modeling.")
-        # unpack the returned value from make_batch
-        x, seq_len = inputs[0], inputs[1]
-        batch_size, max_len = x.size(0), x.size(1)
-        mask = utils.seq_mask(seq_len, max_len)
-        mask = mask.byte().view(batch_size, max_len)
-        y = network(x)
-        prediction = network.prediction(y, mask)
-        return torch.Tensor(prediction)
-
-    def make_batch(self, iterator, use_cuda):
-        return make_batch(iterator, use_cuda, output_length=True)
-
-    def prepare_output(self, batch_outputs):
-        """Transform list of batch outputs into strings.
-        :param batch_outputs: list of 2-D Tensors, shape [num_batch, batch_size, tag_seq_length].
-        :return results: 2-D list of strings, shape [num_examples, tag_seq_length]
-        """
+    def _seq_label_prepare_output(self, batch_outputs):
         results = []
         for batch in batch_outputs:
             for example in np.array(batch):
                 results.append([self.index2label[int(x)] for x in example])
         return results

-class ClassificationInfer(Predictor):
-    """
-    Inference on classification models.
-    """
-    def __init__(self, pickle_path):
-        super(ClassificationInfer, self).__init__(pickle_path)
-
-    def data_forward(self, network, x):
-        """Forward through network."""
-        logits = network(x)
-        return logits
-
-    def make_batch(self, iterator, use_cuda):
-        return make_batch(iterator, use_cuda, output_length=False, min_len=5)
-
-    def prepare_output(self, batch_outputs):
-        """
-        Transform list of batch outputs into strings.
-        :param batch_outputs: list of 2-D Tensors, of shape [num_batch, batch_size, num_classes].
-        :return results: list of strings
-        """
+    def _text_classify_prepare_output(self, batch_outputs):
         results = []
         for batch_out in batch_outputs:
             idx = np.argmax(batch_out.detach().numpy(), axis=-1)
             results.extend([self.index2label[i] for i in idx])
         return results
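The text-classification branch decodes logits by argmax over the class dimension. With toy values for illustration:

    import numpy as np
    import torch

    index2label = {0: "neg", 1: "pos"}
    batch_out = torch.tensor([[0.1, 2.3], [1.7, -0.4]])   # [batch_size, num_classes] logits
    idx = np.argmax(batch_out.detach().numpy(), axis=-1)  # -> array([1, 0])
    labels = [index2label[i] for i in idx]                # -> ["pos", "neg"]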
+class SeqLabelInfer(Predictor):
+    def __init__(self, pickle_path):
+        print(
+            "[FastNLP Warning] SeqLabelInfer will be deprecated. Please use Predictor with argument 'task'='seq_label'.")
+        super(SeqLabelInfer, self).__init__(pickle_path, "seq_label")
+
+class ClassificationInfer(Predictor):
+    def __init__(self, pickle_path):
+        print(
+            "[FastNLP Warning] ClassificationInfer will be deprecated. Please use Predictor with argument 'task'='text_classify'.")
+        super(ClassificationInfer, self).__init__(pickle_path, "text_classify")

@@ -1,7 +1,6 @@
 import numpy as np
 import torch

-from fastNLP.core.action import Action
 from fastNLP.core.action import RandomSampler
 from fastNLP.core.batch import Batch
 from fastNLP.saver.logger import create_logger

@@ -79,7 +78,7 @@ class BaseTester(object):
         self._model = network

         # turn on the testing mode; clean up the history
-        self.mode(network, test=True)
+        self.mode(network, is_test=True)
         self.eval_history.clear()
         self.batch_output.clear()

@@ -102,13 +101,17 @@ class BaseTester(object):
             print(self.make_eval_output(prediction, eval_results))
             step += 1

-    def mode(self, model, test):
+    def mode(self, model, is_test=False):
         """Train mode or Test mode. This is for PyTorch currently.
         :param model: a PyTorch model
-        :param test: bool, whether in test mode.
+        :param is_test: bool, whether in test mode or not.
         """
-        Action.mode(model, test)
+        if is_test:
+            model.eval()
+        else:
+            model.train()

     def data_forward(self, network, x):
         """A forward pass of the model."""

@@ -6,7 +6,6 @@ from datetime import timedelta
 import torch
 from tensorboardX import SummaryWriter

-from fastNLP.core.action import Action
 from fastNLP.core.action import RandomSampler
 from fastNLP.core.batch import Batch
 from fastNLP.core.loss import Loss

@@ -126,7 +125,7 @@ class BaseTrainer(object):
             logger.info("training epoch {}".format(epoch))

             # turn on network training mode
-            self.mode(network, test=False)
+            self.mode(network, is_test=False)
             # prepare mini-batch iterator
             data_iterator = Batch(train_data, batch_size=self.batch_size, sampler=RandomSampler(),
                                   use_cuda=self.use_cuda)

@@ -201,8 +200,17 @@ class BaseTrainer(object):
             network_copy = copy.deepcopy(network)
             self.train(network_copy, train_data_cv[i], dev_data_cv[i])

-    def mode(self, network, test):
-        Action.mode(network, test)
+    def mode(self, model, is_test=False):
+        """Train mode or Test mode. This is for PyTorch currently.
+        :param model: a PyTorch model
+        :param is_test: bool, whether in test mode or not.
+        """
+        if is_test:
+            model.eval()
+        else:
+            model.train()

     def define_optimizer(self):
         """Define framework-specific optimizer specified by the models.

@@ -284,7 +292,7 @@ class BaseTrainer(object):
         :param validator: a Tester instance
         :return: bool, True means the current results on the dev set are the best.
         """
-        loss, accuracy = validator.metrics()
+        loss, accuracy = validator.metrics
         if accuracy > self._best_accuracy:
             self._best_accuracy = accuracy
             return True

@@ -62,6 +62,8 @@ class SeqLabeling(BaseModel):
         """
         x = x.float()
         y = y.long()
+        assert x.shape[:2] == y.shape
+        assert y.shape == self.mask.shape
         total_loss = self.Crf(x, y, self.mask)
         return torch.mean(total_loss)
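The new assertions document the shape contract of the loss: the scores x carry a trailing per-class dimension, while y and the mask are token-aligned. Illustratively (sizes are arbitrary, and reading x's third dimension as num_classes is an assumption about the CRF input):

    import torch

    batch_size, max_len, num_classes = 2, 5, 5
    x = torch.randn(batch_size, max_len, num_classes)        # assumed emission scores
    y = torch.randint(0, num_classes, (batch_size, max_len)) # gold tag indices
    mask = torch.ones(batch_size, max_len).byte()

    assert x.shape[:2] == y.shape   # batch and sequence dims must agree
    assert y.shape == mask.shape    # mask covers every token position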
@@ -1,17 +0,0 @@
-import unittest
-
-from fastNLP.core.action import Action, Batchifier, SequentialSampler
-
-class TestAction(unittest.TestCase):
-    def test_case_1(self):
-        x = [1, 2, 3, 4, 5, 6, 7, 8]
-        y = [1, 1, 1, 1, 2, 2, 2, 2]
-        data = []
-        for i in range(len(x)):
-            data.append([[x[i]], [y[i]]])
-        data = Batchifier(SequentialSampler(data), batch_size=2, drop_last=False)
-        action = Action()
-        for batch_x in action.make_batch(data, use_cuda=False, output_length=True, max_len=None):
-            print(batch_x)

@@ -0,0 +1,62 @@
+import unittest
+
+import torch
+
+from fastNLP.core.batch import Batch
+from fastNLP.core.dataset import DataSet, create_dataset_from_lists
+from fastNLP.core.field import TextField, LabelField
+from fastNLP.core.instance import Instance
+
+raw_texts = ["i am a cat",
+             "this is a test of new batch",
+             "ha ha",
+             "I am a good boy .",
+             "This is the most beautiful girl ."
+             ]
+texts = [text.strip().split() for text in raw_texts]
+labels = [0, 1, 0, 0, 1]
+
+# prepare vocabulary
+vocab = {}
+for text in texts:
+    for token in text:
+        if token not in vocab:
+            vocab[token] = len(vocab)
+
+class TestCase1(unittest.TestCase):
+    def test(self):
+        data = DataSet()
+        for text, label in zip(texts, labels):
+            x = TextField(text, is_target=False)
+            y = LabelField(label, is_target=True)
+            ins = Instance(text=x, label=y)
+            data.append(ins)
+
+        # use the vocabulary to index the data
+        data.index_field("text", vocab)
+
+        # define a naive sampler for the Batch class
+        class SeqSampler:
+            def __call__(self, dataset):
+                return list(range(len(dataset)))
+
+        # use Batch to iterate over the dataset
+        data_iterator = Batch(data, 2, SeqSampler(), False)
+        for batch_x, batch_y in data_iterator:
+            self.assertEqual(len(batch_x), 2)
+            self.assertTrue(isinstance(batch_x, dict))
+            self.assertTrue(isinstance(batch_x["text"], torch.LongTensor))
+            self.assertTrue(isinstance(batch_y, dict))
+            self.assertTrue(isinstance(batch_y["label"], torch.LongTensor))
+
+class TestCase2(unittest.TestCase):
+    def test(self):
+        data = DataSet()
+        for text in texts:
+            x = TextField(text, is_target=False)
+            ins = Instance(text=x)
+            data.append(ins)
+
+        data_set = create_dataset_from_lists(texts, vocab, has_target=False)
+        self.assertTrue(type(data) == type(data_set))
@@ -0,0 +1,51 @@
+import os
+import unittest
+
+from fastNLP.core.predictor import Predictor
+from fastNLP.core.preprocess import save_pickle
+from fastNLP.models.sequence_modeling import SeqLabeling
+
+class TestPredictor(unittest.TestCase):
+    def test_seq_label(self):
+        model_args = {
+            "vocab_size": 10,
+            "word_emb_dim": 100,
+            "rnn_hidden_units": 100,
+            "num_classes": 5
+        }
+        infer_data = [
+            ['a', 'b', 'c', 'd', 'e'],
+            ['a', '@', 'c', 'd', 'e'],
+            ['a', 'b', '#', 'd', 'e'],
+            ['a', 'b', 'c', '?', 'e'],
+            ['a', 'b', 'c', 'd', '$'],
+            ['!', 'b', 'c', 'd', 'e']
+        ]
+        vocab = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, '!': 5, '@': 6, '#': 7, '$': 8, '?': 9}
+
+        os.system("mkdir save")
+        save_pickle({0: "0", 1: "1", 2: "2", 3: "3", 4: "4"}, "./save/", "id2class.pkl")
+        save_pickle(vocab, "./save/", "word2id.pkl")
+
+        model = SeqLabeling(model_args)
+        predictor = Predictor("./save/", task="seq_label")
+
+        results = predictor.predict(network=model, data=infer_data)
+
+        self.assertTrue(isinstance(results, list))
+        self.assertGreater(len(results), 0)
+        for res in results:
+            self.assertTrue(isinstance(res, list))
+            self.assertEqual(len(res), 5)
+            self.assertTrue(isinstance(res[0], str))
+
+        os.system("rm -rf save")
+        print("pickle path deleted")
+
+class TestPredictor2(unittest.TestCase):
+    def test_text_classify(self):
+        # TODO
+        pass
@@ -1,24 +1,25 @@
 import os
 import unittest

+from fastNLP.core.dataset import DataSet
 from fastNLP.core.preprocess import SeqLabelPreprocess

+data = [
+    [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
+    [['Hello', 'world', '!'], ['a', 'n', '.']],
+    [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
+    [['Hello', 'world', '!'], ['a', 'n', '.']],
+    [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
+    [['Hello', 'world', '!'], ['a', 'n', '.']],
+    [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
+    [['Hello', 'world', '!'], ['a', 'n', '.']],
+    [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
+    [['Hello', 'world', '!'], ['a', 'n', '.']],
+]

-class TestSeqLabelPreprocess(unittest.TestCase):
-    def test_case_1(self):
-        data = [
-            [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
-            [['Hello', 'world', '!'], ['a', 'n', '.']],
-            [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
-            [['Hello', 'world', '!'], ['a', 'n', '.']],
-            [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
-            [['Hello', 'world', '!'], ['a', 'n', '.']],
-            [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
-            [['Hello', 'world', '!'], ['a', 'n', '.']],
-            [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
-            [['Hello', 'world', '!'], ['a', 'n', '.']],
-        ]
+class TestCase1(unittest.TestCase):
+    def test(self):
         if os.path.exists("./save"):
             for root, dirs, files in os.walk("./save", topdown=False):
                 for name in files:

@@ -27,17 +28,45 @@ class TestSeqLabelPreprocess(unittest.TestCase):
                 os.rmdir(os.path.join(root, name))
         result = SeqLabelPreprocess().run(train_dev_data=data, train_dev_split=0.4,
                                           pickle_path="./save")
-        result = SeqLabelPreprocess().run(train_dev_data=data, train_dev_split=0.4,
-                                          pickle_path="./save")
+        self.assertEqual(len(result), 2)
+        self.assertEqual(type(result[0]), DataSet)
+        self.assertEqual(type(result[1]), DataSet)
+        os.system("rm -rf save")
+        print("pickle path deleted")

+class TestCase2(unittest.TestCase):
+    def test(self):
         if os.path.exists("./save"):
             for root, dirs, files in os.walk("./save", topdown=False):
                 for name in files:
                     os.remove(os.path.join(root, name))
                 for name in dirs:
                     os.rmdir(os.path.join(root, name))
-        result = SeqLabelPreprocess().run(test_data=data, train_dev_data=data,
-                                          pickle_path="./save", train_dev_split=0.4,
-                                          cross_val=True)
         result = SeqLabelPreprocess().run(test_data=data, train_dev_data=data,
                                           pickle_path="./save", train_dev_split=0.4,
-                                          cross_val=True)
+                                          cross_val=False)
+        self.assertEqual(len(result), 3)
+        self.assertEqual(type(result[0]), DataSet)
+        self.assertEqual(type(result[1]), DataSet)
+        self.assertEqual(type(result[2]), DataSet)
+        os.system("rm -rf save")
+        print("pickle path deleted")

+class TestCase3(unittest.TestCase):
+    def test(self):
+        num_folds = 2
+        result = SeqLabelPreprocess().run(test_data=None, train_dev_data=data,
+                                          pickle_path="./save", train_dev_split=0.4,
+                                          cross_val=True, n_fold=num_folds)
+        self.assertEqual(len(result), 2)
+        self.assertEqual(len(result[0]), num_folds)
+        self.assertEqual(len(result[1]), num_folds)
+        for data_set in result[0] + result[1]:
+            self.assertEqual(type(data_set), DataSet)
+        os.system("rm -rf save")
+        print("pickle path deleted")
@@ -1,37 +1,55 @@
-from fastNLP.core.preprocess import SeqLabelPreprocess
+import os
 import unittest

+from fastNLP.core.dataset import DataSet
+from fastNLP.core.field import TextField
+from fastNLP.core.instance import Instance
 from fastNLP.core.tester import SeqLabelTester
-from fastNLP.loader.config_loader import ConfigSection, ConfigLoader
-from fastNLP.loader.dataset_loader import TokenizeDatasetLoader
 from fastNLP.models.sequence_modeling import SeqLabeling

 data_name = "pku_training.utf8"
 pickle_path = "data_for_tests"

-def foo():
-    loader = TokenizeDatasetLoader("./data_for_tests/cws_pku_utf_8")
-    train_data = loader.load_pku()
-
-    train_args = ConfigSection()
-    ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS": train_args})
-
-    # Preprocessor
-    p = SeqLabelPreprocess()
-    train_data = p.run(train_data)
-    train_args["vocab_size"] = p.vocab_size
-    train_args["num_classes"] = p.num_classes
-
-    model = SeqLabeling(train_args)
-
-    valid_args = {"save_output": True, "validate_in_training": True, "save_dev_input": True,
-                  "save_loss": True, "batch_size": 8, "pickle_path": "./data_for_tests/",
-                  "use_cuda": True}
-    validator = SeqLabelTester(**valid_args)
-
-    print("start validation.")
-    validator.test(model, train_data)
-    print(validator.show_metrics())
-
-if __name__ == "__main__":
-    foo()
+class TestTester(unittest.TestCase):
+    def test_case_1(self):
+        model_args = {
+            "vocab_size": 10,
+            "word_emb_dim": 100,
+            "rnn_hidden_units": 100,
+            "num_classes": 5
+        }
+        valid_args = {"save_output": True, "validate_in_training": True, "save_dev_input": True,
+                      "save_loss": True, "batch_size": 2, "pickle_path": "./save/",
+                      "use_cuda": False, "print_every_step": 1}
+
+        train_data = [
+            [['a', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
+            [['a', '@', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
+            [['a', 'b', '#', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
+            [['a', 'b', 'c', '?', 'e'], ['a', '@', 'c', 'd', 'e']],
+            [['a', 'b', 'c', 'd', '$'], ['a', '@', 'c', 'd', 'e']],
+            [['!', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
+        ]
+        vocab = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, '!': 5, '@': 6, '#': 7, '$': 8, '?': 9}
+        label_vocab = {'a': 0, '@': 1, 'c': 2, 'd': 3, 'e': 4}
+
+        data_set = DataSet()
+        for example in train_data:
+            text, label = example[0], example[1]
+            x = TextField(text, False)
+            y = TextField(label, is_target=True)
+            ins = Instance(word_seq=x, label_seq=y)
+            data_set.append(ins)
+        data_set.index_field("word_seq", vocab)
+        data_set.index_field("label_seq", label_vocab)
+
+        model = SeqLabeling(model_args)
+
+        tester = SeqLabelTester(**valid_args)
+        tester.test(network=model, dev_data=data_set)
+        # If this can run, everything is OK.
+
+        os.system("rm -rf save")
+        print("pickle path deleted")
@@ -1,33 +1,54 @@
 import os
-import torch.nn as nn
 import unittest

-from fastNLP.core.trainer import SeqLabelTrainer
+from fastNLP.core.dataset import DataSet
+from fastNLP.core.field import TextField
+from fastNLP.core.instance import Instance
 from fastNLP.core.loss import Loss
 from fastNLP.core.optimizer import Optimizer
+from fastNLP.core.trainer import SeqLabelTrainer
 from fastNLP.models.sequence_modeling import SeqLabeling

 class TestTrainer(unittest.TestCase):
     def test_case_1(self):
-        args = {"epochs": 3, "batch_size": 8, "validate": True, "use_cuda": True, "pickle_path": "./save/",
+        args = {"epochs": 3, "batch_size": 2, "validate": True, "use_cuda": False, "pickle_path": "./save/",
                 "save_best_dev": True, "model_name": "default_model_name.pkl",
                 "loss": Loss(None),
                 "optimizer": Optimizer("Adam", lr=0.001, weight_decay=0),
-                "vocab_size": 20,
+                "vocab_size": 10,
                 "word_emb_dim": 100,
                 "rnn_hidden_units": 100,
-                "num_classes": 3
+                "num_classes": 5
                 }

-        trainer = SeqLabelTrainer()
+        trainer = SeqLabelTrainer(**args)

         train_data = [
-            [[1, 2, 3, 4, 5, 6], [1, 0, 1, 0, 1, 2]],
-            [[2, 3, 4, 5, 1, 6], [0, 1, 0, 1, 0, 2]],
-            [[1, 4, 1, 4, 1, 6], [1, 0, 1, 0, 1, 2]],
-            [[1, 2, 3, 4, 5, 6], [1, 0, 1, 0, 1, 2]],
-            [[2, 3, 4, 5, 1, 6], [0, 1, 0, 1, 0, 2]],
-            [[1, 4, 1, 4, 1, 6], [1, 0, 1, 0, 1, 2]],
+            [['a', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
+            [['a', '@', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
+            [['a', 'b', '#', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
+            [['a', 'b', 'c', '?', 'e'], ['a', '@', 'c', 'd', 'e']],
+            [['a', 'b', 'c', 'd', '$'], ['a', '@', 'c', 'd', 'e']],
+            [['!', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
         ]
-        dev_data = train_data
+        vocab = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, '!': 5, '@': 6, '#': 7, '$': 8, '?': 9}
+        label_vocab = {'a': 0, '@': 1, 'c': 2, 'd': 3, 'e': 4}
+
+        data_set = DataSet()
+        for example in train_data:
+            text, label = example[0], example[1]
+            x = TextField(text, False)
+            y = TextField(label, is_target=True)
+            ins = Instance(word_seq=x, label_seq=y)
+            data_set.append(ins)
+        data_set.index_field("word_seq", vocab)
+        data_set.index_field("label_seq", label_vocab)

         model = SeqLabeling(args)

-        trainer.train(network=model, train_data=train_data, dev_data=dev_data)
+        trainer.train(network=model, train_data=data_set, dev_data=data_set)
+        # If this can run, everything is OK.
+
+        os.system("rm -rf save")
+        print("pickle path deleted")
@@ -1,8 +0,0 @@
-def test_charlm():
-    pass
-
-if __name__ == "__main__":
-    test_charlm()

@@ -0,0 +1,96 @@
+import argparse
+import os
+
+from fastNLP.core.optimizer import Optimizer
+from fastNLP.core.preprocess import SeqLabelPreprocess
+from fastNLP.core.tester import SeqLabelTester
+from fastNLP.core.trainer import SeqLabelTrainer
+from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
+from fastNLP.loader.dataset_loader import POSDatasetLoader
+from fastNLP.loader.model_loader import ModelLoader
+from fastNLP.models.sequence_modeling import SeqLabeling
+from fastNLP.saver.model_saver import ModelSaver
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-s", "--save", type=str, default="./seq_label/", help="path to save pickle files")
+parser.add_argument("-t", "--train", type=str, default="../data_for_tests/people.txt",
+                    help="path to the training data")
+parser.add_argument("-c", "--config", type=str, default="../data_for_tests/config", help="path to the config file")
+parser.add_argument("-m", "--model_name", type=str, default="seq_label_model.pkl", help="the name of the model")
+parser.add_argument("-i", "--infer", type=str, default="../data_for_tests/people_infer.txt",
+                    help="data used for inference")
+
+args = parser.parse_args()
+pickle_path = args.save
+model_name = args.model_name
+config_dir = args.config
+data_path = args.train
+data_infer_path = args.infer
+
+def test_training():
+    # Config Loader
+    trainer_args = ConfigSection()
+    model_args = ConfigSection()
+    ConfigLoader("config.cfg").load_config(config_dir, {
+        "test_seq_label_trainer": trainer_args, "test_seq_label_model": model_args})
+
+    # Data Loader
+    pos_loader = POSDatasetLoader(data_path)
+    train_data = pos_loader.load_lines()
+
+    # Preprocessor
+    p = SeqLabelPreprocess()
+    data_train, data_dev = p.run(train_data, pickle_path=pickle_path, train_dev_split=0.5)
+    model_args["vocab_size"] = p.vocab_size
+    model_args["num_classes"] = p.num_classes
+
+    trainer = SeqLabelTrainer(
+        epochs=trainer_args["epochs"],
+        batch_size=trainer_args["batch_size"],
+        validate=False,
+        use_cuda=False,
+        pickle_path=pickle_path,
+        save_best_dev=trainer_args["save_best_dev"],
+        model_name=model_name,
+        optimizer=Optimizer("SGD", lr=0.01, momentum=0.9),
+    )
+
+    # Model
+    model = SeqLabeling(model_args)
+
+    # Start training
+    trainer.train(model, data_train, data_dev)
+
+    # Saver
+    saver = ModelSaver(os.path.join(pickle_path, model_name))
+    saver.save_pytorch(model)
+
+    del model, trainer, pos_loader
+
+    # Define the same model
+    model = SeqLabeling(model_args)
+
+    # Dump trained parameters into the model
+    ModelLoader.load_pytorch(model, os.path.join(pickle_path, model_name))
+
+    # Load test configuration
+    tester_args = ConfigSection()
+    ConfigLoader("config.cfg").load_config(config_dir, {"test_seq_label_tester": tester_args})
+
+    # Tester
+    tester = SeqLabelTester(save_output=False,
+                            save_loss=True,
+                            save_best_dev=False,
+                            batch_size=4,
+                            use_cuda=False,
+                            pickle_path=pickle_path,
+                            model_name="seq_label_in_test.pkl",
+                            print_every_step=1
+                            )
+
+    # Start testing with validation data
+    tester.test(model, data_dev)
+
+    loss, accuracy = tester.metrics
+    assert 0 < accuracy < 1