Text Classification Interface
tags/v0.1.0
@@ -1,7 +1,3 @@
-"""
-This file defines Action(s) and sample methods.
-"""
 from collections import Counter

 import numpy as np

@@ -9,13 +5,12 @@ import torch

 class Action(object):
-    """
-    Operations shared by Trainer, Tester, or Inference.
+    """Operations shared by Trainer, Tester, or Inference.

     This is designed for reducing replicate codes.
         - make_batch: produce a min-batch of data. @staticmethod
        - pad: padding method used in sequence modeling. @staticmethod
        - mode: change network mode for either train or test. (for PyTorch) @staticmethod
+    The base Action shall define operations shared by as much task-specific Actions as possible.
     """

     def __init__(self):
@@ -24,18 +19,20 @@ class Action(object):

     @staticmethod
     def make_batch(iterator, use_cuda, output_length=True, max_len=None):
         """Batch and Pad data.

         :param iterator: an iterator, (object that implements __next__ method) which returns the next sample.
         :param use_cuda: bool, whether to use GPU
         :param output_length: bool, whether to output the original length of the sequence before padding. (default: True)
         :param max_len: int, maximum sequence length. Longer sequences will be clipped. (default: None)
-        :return
-            if output_length is True:
+        :return :
+            if output_length is True,
                 (batch_x, seq_len): tuple of two elements
                     batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len]
                     seq_len: list. The length of the pre-padded sequence, if output_length is True.
                 batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels]
-            if output_length is False:
+            if output_length is False,
                 batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len]
                 batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels]
         """
@@ -77,21 +74,21 @@ class Action(object):
         return batch

     @staticmethod
-    def mode(model, test=False):
-        """
-        Train mode or Test mode. This is for PyTorch currently.
-        :param model:
-        :param test:
+    def mode(model, is_test=False):
+        """Train mode or Test mode. This is for PyTorch currently.
+
+        :param model: a PyTorch model
+        :param is_test: bool, whether in test mode or not.
         """
-        if test:
+        if is_test:
             model.eval()
         else:
             model.train()


 def convert_to_torch_tensor(data_list, use_cuda):
-    """
-    convert lists into (cuda) Tensors.
+    """Convert lists into (cuda) Tensors.

     :param data_list: 2-level lists
     :param use_cuda: bool, whether to use GPU or not
     :return data_list: PyTorch Tensor of shape [batch_size, max_seq_len]
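The renamed is_test flag simply toggles PyTorch's train/eval switch; a quick usage sketch (the import path matches the test file at the end of this diff):

    import torch
    from fastNLP.core.action import Action

    model = torch.nn.Linear(4, 2)
    Action.mode(model, is_test=True)    # calls model.eval()
    assert model.training is False
    Action.mode(model, is_test=False)   # calls model.train()
    assert model.training is True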
@@ -103,8 +100,8 @@ def convert_to_torch_tensor(data_list, use_cuda):


 def k_means_1d(x, k, max_iter=100):
-    """
-    Perform k-means on 1-D data.
+    """Perform k-means on 1-D data.

     :param x: list of int, representing points in 1-D.
     :param k: the number of clusters required.
     :param max_iter: maximum iteration
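A toy illustration of what 1-D k-means does to a list of sentence lengths (standalone numpy with hypothetical data and initial centers; not the library code):

    import numpy as np

    lengths = np.array([3, 4, 5, 21, 22, 23, 50, 52])
    centers = np.array([5.0, 25.0, 50.0])  # k = 3 initial centroids
    for _ in range(100):
        # assign each point to its nearest centroid, then recompute centroids
        assign = np.argmin(np.abs(lengths[:, None] - centers[None, :]), axis=1)
        centers = np.array([lengths[assign == j].mean() for j in range(3)])
    print(assign)  # [0 0 0 1 1 1 2 2]: short, medium, and long sentences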
@@ -132,21 +129,28 @@ def k_means_1d(x, k, max_iter=100):


 def k_means_bucketing(all_inst, buckets):
-    """
+    """Assign all instances into possible buckets using k-means, such that instances in the same bucket have similar lengths.

     :param all_inst: 3-level list
+        E.g. ::
+
         [
             [[word_11, word_12, word_13], [label_11. label_12]],  # sample 1
             [[word_21, word_22, word_23], [label_21. label_22]],  # sample 2
             ...
         ]

     :param buckets: list of int. The length of the list is the number of buckets. Each integer is the maximum length
         threshold for each bucket (This is usually None.).
     :return data: 2-level list
+        ::
+
         [
             [index_11, index_12, ...],  # bucket 1
             [index_21, index_22, ...],  # bucket 2
             ...
         ]
     """
     bucket_data = [[] for _ in buckets]
     num_buckets = len(buckets)
@@ -160,11 +164,16 @@ def k_means_bucketing(all_inst, buckets):


 class BaseSampler(object):
-    """
-    Base class for all samplers.
+    """The base class of all samplers.
     """

     def __init__(self, data_set):
+        """
+        :param data_set: multi-level list, of shape [num_example, *]
+        """
         self.data_set_length = len(data_set)
         self.data = data_set
@@ -176,11 +185,16 @@ class BaseSampler(object):


 class SequentialSampler(BaseSampler):
-    """
-    Sample data in the original order.
+    """Sample data in the original order.
     """

     def __init__(self, data_set):
+        """
+        :param data_set: multi-level list
+        """
        super(SequentialSampler, self).__init__(data_set)

     def __iter__(self):
@@ -188,11 +202,16 @@ class SequentialSampler(BaseSampler):


 class RandomSampler(BaseSampler):
-    """
-    Sample data in random permutation order.
+    """Sample data in random permutation order.
     """

     def __init__(self, data_set):
+        """
+        :param data_set: multi-level list
+        """
         super(RandomSampler, self).__init__(data_set)
         self.order = np.random.permutation(self.data_set_length)
@@ -201,11 +220,18 @@ class RandomSampler(BaseSampler):


 class Batchifier(object):
-    """
-    Wrap random or sequential sampler to generate a mini-batch.
+    """Wrap random or sequential sampler to generate a mini-batch.
     """

     def __init__(self, sampler, batch_size, drop_last=True):
+        """
+        :param sampler: a Sampler object
+        :param batch_size: int, the size of the mini-batch
+        :param drop_last: bool, whether to drop the last examples that are not enough to make a mini-batch.
+        """
         super(Batchifier, self).__init__()
         self.sampler = sampler
         self.batch_size = batch_size
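A usage sketch for the sampler/batchifier pair (toy data; assumes, as the test file below suggests, that Batchifier yields lists of samples drawn from the wrapped sampler):

    from fastNLP.core.action import Batchifier, SequentialSampler

    data = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
    for batch in Batchifier(SequentialSampler(data), batch_size=2, drop_last=True):
        print(batch)
    # [[1, 2], [3, 4]]
    # [[5, 6], [7, 8]]   (the odd fifth sample is dropped)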
@@ -223,8 +249,7 @@ class Batchifier(object):


 class BucketBatchifier(Batchifier):
-    """
-    Partition all samples into multiple buckets, each of which contains sentences of approximately the same length.
+    """Partition all samples into multiple buckets, each of which contains sentences of approximately the same length.

     In sampling, first random choose a bucket. Then sample data from it.
     The number of buckets is decided dynamically by the variance of sentence lengths.
     """

@@ -237,6 +262,7 @@ class BucketBatchifier(Batchifier):
         :param num_buckets: int, number of buckets for grouping these sequences.
         :param drop_last: bool, useless currently.
         :param sampler: Sampler, useless currently.
         """
+
         super(BucketBatchifier, self).__init__(sampler, batch_size, drop_last)
         buckets = ([None] * num_buckets)
@@ -8,6 +8,11 @@ class Loss(object):
     """

     def __init__(self, args):
+        """
+        :param args: None or str, the name of a loss function.
+        """
         if args is None:
             # this is useful when Trainer.__init__ performs type check
             self._loss = None
@@ -17,10 +22,19 @@ class Loss(object):
             raise NotImplementedError

     def get(self):
+        """
+        :return self._loss: the loss function
+        """
         return self._loss

     @staticmethod
     def _borrow_from_pytorch(loss_name):
+        """Given a name of a loss function, return it from PyTorch.
+
+        :param loss_name: str, the name of a loss function
+        :return loss: a PyTorch loss
+        """
         if loss_name == "cross_entropy":
             return torch.nn.CrossEntropyLoss()
         else:
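A usage sketch based on the docstrings above (assumes Loss is importable from this module):

    import torch

    loss_func = Loss("cross_entropy").get()  # a torch.nn.CrossEntropyLoss instance
    logits = torch.randn(4, 3)               # [batch_size, num_classes]
    target = torch.tensor([0, 2, 1, 0])
    print(loss_func(logits, target))         # a scalar tensor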
@@ -1,11 +1,12 @@
 import warnings

 import numpy as np
 import torch


 def _conver_numpy(x):
-    """
-    convert input data to numpy array
+    """convert input data to numpy array
     """
     if isinstance(x, np.ndarray):
         return x
@@ -17,21 +18,20 @@ def _conver_numpy(x):


 def _check_same_len(*arrays, axis=0):
-    """
-    check if input array list has same length for one dimension
+    """check if input array list has same length for one dimension
     """
     lens = set([x.shape[axis] for x in arrays if x is not None])
     return len(lens) == 1


 def _label_types(y):
-    """
-    determine the type
-        "binary"
-        "multiclass"
-        "multiclass-multioutput"
-        "multilabel"
-        "unknown"
+    """Determine the type
+        - "binary"
+        - "multiclass"
+        - "multiclass-multioutput"
+        - "multilabel"
+        - "unknown"
     """
     # never squeeze the first dimension
     y = y.squeeze() if y.shape[0] > 1 else y.resize(1, -1)
@@ -46,8 +46,8 @@ def _label_types(y):


 def _check_data(y_true, y_pred):
-    """
-    check if y_true and y_pred is same type of data e.g both binary or multiclass
+    """Check if y_true and y_pred is same type of data e.g both binary or multiclass
     """
     y_true, y_pred = _conver_numpy(y_true), _conver_numpy(y_pred)
     if not _check_same_len(y_true, y_pred):
@@ -174,16 +174,13 @@ def classification_report(y_true, y_pred, labels=None, target_names=None, digits


 def accuracy_topk(y_true, y_prob, k=1):
-    """
-    Compute accuracy of y_true matching top-k probable
+    """Compute accuracy of y_true matching top-k probable
     labels in y_prob.

-    Paras:
-        y_ture - ndarray, true label, [n_samples]
-        y_prob - ndarray, label probabilities, [n_samples, n_classes]
-        k - int, k in top-k
-    Returns:
-        accuracy of top-k
+    :param y_true: ndarray, true label, [n_samples]
+    :param y_prob: ndarray, label probabilities, [n_samples, n_classes]
+    :param k: int, k in top-k
+    :return :accuracy of top-k
     """

     y_pred_topk = np.argsort(y_prob, axis=-1)[:, -1:-k - 1:-1]
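A worked example of the top-k indexing used in the function body (standalone numpy, toy data):

    import numpy as np

    y_true = np.array([0, 1, 2])
    y_prob = np.array([[0.6, 0.3, 0.1],
                       [0.2, 0.5, 0.3],
                       [0.4, 0.4, 0.2]])
    k = 2
    # the two most probable labels per row, most probable first
    y_pred_topk = np.argsort(y_prob, axis=-1)[:, -1:-k - 1:-1]
    acc = np.mean([t in row for t, row in zip(y_true, y_pred_topk)])
    print(acc)  # 2/3: the third row's true label 2 is not among its top-2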
@@ -195,16 +192,14 @@ def accuracy_topk(y_true, y_prob, k=1):


 def pred_topk(y_prob, k=1):
-    """
-    Return top-k predicted labels and corresponding probabilities.
-
-    Args:
-        y_prob - ndarray, size [n_samples, n_classes], probabilities on labels
-        k - int, k of top-k
-    Returns:
-        y_pred_topk - ndarray, size [n_samples, k], predicted top-k labels
-        y_prob_topk - ndarray, size [n_samples, k], probabilities for
-        top-k labels
+    """Return top-k predicted labels and corresponding probabilities.
+
+    :param y_prob: ndarray, size [n_samples, n_classes], probabilities on labels
+    :param k: int, k of top-k
+    :returns
+        y_pred_topk: ndarray, size [n_samples, k], predicted top-k labels
+        y_prob_topk: ndarray, size [n_samples, k], probabilities for top-k labels
     """

     y_pred_topk = np.argsort(y_prob, axis=-1)[:, -1:-k - 1:-1]
@@ -4,7 +4,6 @@ import torch


 class Optimizer(object):
     """Wrapper of optimizer from framework

-        names: arguments (type)
         1. Adam: lr (float), weight_decay (float)
         2. AdaGrad
         3. RMSProp
@@ -16,20 +15,29 @@ class Optimizer(object):
         """
         :param optimizer_name: str, the name of the optimizer
         :param kwargs: the arguments
         """
         self.optim_name = optimizer_name
         self.kwargs = kwargs

     @property
     def name(self):
+        """The name of the optimizer.
+
+        :return: str
+        """
         return self.optim_name

     @property
     def params(self):
+        """The arguments used to create the optimizer.
+
+        :return: dict of (str, *)
+        """
         return self.kwargs

     def construct_from_pytorch(self, model_params):
-        """construct a optimizer from framework over given model parameters"""
+        """Construct a optimizer from framework over given model parameters."""

         if self.optim_name in ["SGD", "sgd"]:
             if "lr" in self.kwargs:
@@ -17,12 +17,24 @@ DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1,

 # the first vocab in dict with the index = 5


 def save_pickle(obj, pickle_path, file_name):
+    """Save an object into a pickle file.
+
+    :param obj: an object
+    :param pickle_path: str, the directory where the pickle file is to be saved
+    :param file_name: str, the name of the pickle file. In general, it should be ended by "pkl".
+    """
     with open(os.path.join(pickle_path, file_name), "wb") as f:
         _pickle.dump(obj, f)
     print("{} saved in {}".format(file_name, pickle_path))


 def load_pickle(pickle_path, file_name):
+    """Load an object from a given pickle file.
+
+    :param pickle_path: str, the directory where the pickle file is.
+    :param file_name: str, the name of the pickle file.
+    :return obj: an object stored in the pickle
+    """
     with open(os.path.join(pickle_path, file_name), "rb") as f:
         obj = _pickle.load(f)
     print("{} loaded from {}".format(file_name, pickle_path))
@@ -30,7 +42,8 @@ def load_pickle(pickle_path, file_name):


 def pickle_exist(pickle_path, pickle_name):
-    """
+    """Check if a given pickle file exists in the directory.
+
     :param pickle_path: the directory of target pickle file
     :param pickle_name: the filename of target pickle file
     :return: True if file exists else False
@@ -45,6 +58,19 @@ def pickle_exist(pickle_path, pickle_name):


 class BasePreprocess(object):
+    """Base class of all preprocessors.
+
+    Preprocessors are responsible for converting data of strings into data of indices.
+    During the pre-processing, the following pickle files will be built:
+
+        - "word2id.pkl", a mapping from words(tokens) to indices
+        - "id2word.pkl", a reversed dictionary
+        - "label2id.pkl", a dictionary on labels
+        - "id2label.pkl", a reversed dictionary on labels
+
+    These four pickle files are expected to be saved in the given pickle directory once they are constructed.
+    Preprocessors will check if those files are already in the directory and will reuse them in future calls.
+    """
     def __init__(self):
         self.word2index = None
         self.label2index = None
@@ -59,6 +85,7 @@ class BasePreprocess(object):

     def run(self, train_dev_data, test_data=None, pickle_path="./", train_dev_split=0, cross_val=False, n_fold=10):
         """Main preprocessing pipeline.
+
         :param train_dev_data: three-level list, with either single label or multiple labels in a sample.
         :param test_data: three-level list, with either single label or multiple labels in a sample. (optional)
         :param pickle_path: str, the path to save the pickle files.

@@ -67,6 +94,7 @@ class BasePreprocess(object):
         :param n_fold: int, the number of folds of cross validation. Only useful when cross_val is True.
         :return results: a tuple of datasets after preprocessing.
         """
+
         if pickle_exist(pickle_path, "word2id.pkl") and pickle_exist(pickle_path, "class2id.pkl"):
             self.word2index = load_pickle(pickle_path, "word2id.pkl")
             self.label2index = load_pickle(pickle_path, "class2id.pkl")
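A hypothetical run() call consistent with this signature (toy sequence-labeling corpus; SeqLabelPreprocess is the concrete subclass shown below, and the exact return tuple depends on the split/cross-validation arguments):

    p = SeqLabelPreprocess()
    train_dev = [
        [["ms.", "lee", "smiled"], ["B-PER", "I-PER", "O"]],
        [["paris", "is", "big"], ["B-LOC", "O", "O"]],
    ]
    results = p.run(train_dev, pickle_path="./save/", train_dev_split=0.5)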
@@ -182,25 +210,31 @@ class SeqLabelPreprocess(BasePreprocess):
     """Preprocess pipeline, including building mapping from words to index, from index to words,
     from labels/classes to index, from index to labels/classes.
     data of three-level list which have multiple labels in each sample.
+    ::
+
         [
             [ [word_11, word_12, ...], [label_1, label_1, ...] ],
             [ [word_21, word_22, ...], [label_2, label_1, ...] ],
             ...
         ]
     """

     def __init__(self):
         super(SeqLabelPreprocess, self).__init__()

     def build_dict(self, data):
-        """
-        Add new words with indices into self.word_dict, new labels with indices into self.label_dict.
+        """Add new words with indices into self.word_dict, new labels with indices into self.label_dict.

         :param data: three-level list
+        ::
+
         [
             [ [word_11, word_12, ...], [label_1, label_1, ...] ],
             [ [word_21, word_22, ...], [label_2, label_1, ...] ],
             ...
         ]
         :return word2index: dict of {str, int}
                 label2index: dict of {str, int}
         """
@@ -216,14 +250,17 @@ class SeqLabelPreprocess(BasePreprocess):
         return word2index, label2index

     def to_index(self, data):
-        """
-        Convert word strings and label strings into indices.
+        """Convert word strings and label strings into indices.

         :param data: three-level list
+        ::
+
         [
             [ [word_11, word_12, ...], [label_1, label_1, ...] ],
             [ [word_21, word_22, ...], [label_2, label_1, ...] ],
             ...
         ]
         :return data_index: the same shape as data, but each string is replaced by its corresponding index
         """
         data_index = []
@@ -242,11 +279,14 @@ class ClassPreprocess(BasePreprocess):
     Preprocess pipeline, including building mapping from words to index, from index to words,
     from labels/classes to index, from index to labels/classes.
     design for data of three-level list which has a single label in each sample.
+    ::
+
         [
             [ [word_11, word_12, ...], label_1 ],
             [ [word_21, word_22, ...], label_2 ],
             ...
         ]
     """

     def __init__(self):
@@ -273,14 +313,17 @@ class ClassPreprocess(BasePreprocess):
         return word2index, label2index

     def to_index(self, data):
-        """
-        Convert word strings and label strings into indices.
+        """Convert word strings and label strings into indices.

         :param data: three-level list
+        ::
+
         [
             [ [word_11, word_12, ...], label_1 ],
             [ [word_21, word_22, ...], label_2 ],
             ...
         ]
         :return data_index: the same shape as data, but each string is replaced by its corresponding index
         """
         data_index = []
@@ -295,14 +338,15 @@ class ClassPreprocess(BasePreprocess):


 def infer_preprocess(pickle_path, data):
-    """
-    Preprocess over inference data.
-    Transform three-level list of strings into that of index.
+    """Preprocess over inference data. Transform three-level list of strings into that of index.
+    ::
+
         [
             [word_11, word_12, ...],
             [word_21, word_22, ...],
             ...
         ]
     """
     word2index = load_pickle(pickle_path, "word2id.pkl")
     data_index = []
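The string-to-index transform that these to_index/infer_preprocess helpers perform reduces to a dictionary lookup with an unknown-word fallback; a toy sketch (this vocabulary is hypothetical, standing in for the DEFAULT_PADDING_LABEL/DEFAULT_UNKNOWN_LABEL entries mentioned earlier):

    word2index = {"<pad>": 0, "<unk>": 1, "hello": 2, "world": 3}
    data = [["hello", "world"], ["hello", "there"]]
    data_index = [[word2index.get(w, word2index["<unk>"]) for w in seq] for seq in data]
    # [[2, 3], [2, 1]]: "there" is out of vocabulary and maps to <unk>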
@@ -155,8 +155,8 @@ class BaseTester(object):
         raise NotImplementedError


 class SeqLabelTester(BaseTester):
-    """
-    Tester for sequence labeling.
+    """Tester for sequence labeling.
     """

     def __init__(self, **test_args):
@@ -215,8 +215,8 @@ class SeqLabelTester(BaseTester):
         return batch_loss, batch_accuracy

     def show_metrics(self):
-        """
-        This is called by Trainer to print evaluation on dev set.
+        """This is called by Trainer to print evaluation on dev set.

         :return print_str: str
         """
         loss, accuracy = self.metrics()
@@ -1,4 +1,5 @@
 import copy
+import os
 import time
 from datetime import timedelta
@@ -26,10 +27,10 @@ class BaseTrainer(object):
         :param kwargs: dict of (key, value), or dict-like object. key is str.

         The base trainer requires the following keys:
-        - epochs: int, the number of epochs in training
-        - validate: bool, whether or not to validate on dev set
-        - batch_size: int
-        - pickle_path: str, the path to pickle files for pre-processing
+            - epochs: int, the number of epochs in training
+            - validate: bool, whether or not to validate on dev set
+            - batch_size: int
+            - pickle_path: str, the path to pickle files for pre-processing
         """
         super(BaseTrainer, self).__init__()
@@ -88,6 +89,7 @@ class BaseTrainer(object):

     def train(self, network, train_data, dev_data=None):
         """General Training Procedure
+
         :param network: a model
         :param train_data: three-level list, the training set.
         :param dev_data: three-level list, the validation data (optional)
@@ -144,6 +146,7 @@ class BaseTrainer(object):

     def _train_step(self, data_iterator, network, **kwargs):
         """Training process in one epoch.
+
         kwargs should contain:
             - n_print: int, print training information every n steps.
             - start: time.time(), the starting time of this step.
@@ -199,14 +202,13 @@ class BaseTrainer(object):
         Action.mode(network, test)

     def define_optimizer(self):
-        """
-        Define framework-specific optimizer specified by the models.
+        """Define framework-specific optimizer specified by the models.
         """
         self._optimizer = self._optimizer_proto.construct_from_pytorch(self._model.parameters())

     def update(self):
-        """
-        Perform weight update on a model.
+        """Perform weight update on a model.

         For PyTorch, just call optimizer to update.
         """
@@ -216,8 +218,8 @@ class BaseTrainer(object):
         raise NotImplementedError

     def grad_backward(self, loss):
-        """
-        Compute gradient with link rules.
+        """Compute gradient with link rules.

         :param loss: a scalar where back-prop starts
         For PyTorch, just do "loss.backward()"
@@ -226,8 +228,8 @@ class BaseTrainer(object):
         loss.backward()

     def get_loss(self, predict, truth):
-        """
-        Compute loss given prediction and ground truth.
+        """Compute loss given prediction and ground truth.

         :param predict: prediction label vector
         :param truth: ground truth label vector
         :return: a scalar
@@ -235,8 +237,9 @@ class BaseTrainer(object):
         return self._loss_func(predict, truth)

     def define_loss(self):
-        """
-        if the model defines a loss, use model's loss.
+        """Define a loss for the trainer.
+
+        If the model defines a loss, use model's loss.
         Otherwise, Trainer must has a loss argument, use it as loss.
         These two losses cannot be defined at the same time.
         Trainer does not handle loss definition or choose default losses.
@@ -253,7 +256,8 @@ class BaseTrainer(object):
         logger.info("The model didn't define loss, use Trainer's loss.")

     def best_eval_result(self, validator):
-        """
+        """Check if the current epoch yields better validation results.
+
         :param validator: a Tester instance
         :return: bool, True means current results on dev set is the best.
         """
@@ -268,15 +272,14 @@ class BaseTrainer(object):
         """
         if model_name[-4:] != ".pkl":
             model_name += ".pkl"
-        ModelSaver(self.pickle_path + model_name).save_pytorch(network)
+        ModelSaver(os.path.join(self.pickle_path, model_name)).save_pytorch(network)

     def _create_validator(self, valid_args):
         raise NotImplementedError


 class SeqLabelTrainer(BaseTrainer):
-    """
-    Trainer for Sequence Labeling
+    """Trainer for Sequence Labeling
     """
@@ -306,11 +309,11 @@ class SeqLabelTrainer(BaseTrainer):
         return y

     def get_loss(self, predict, truth):
-        """
-        Compute loss given prediction and ground truth.
+        """Compute loss given prediction and ground truth.

         :param predict: prediction label vector, [batch_size, max_len, tag_size]
         :param truth: ground truth label vector, [batch_size, max_len]
-        :return: a scalar
+        :return loss: a scalar
         """
         batch_size, max_len = predict.size(0), predict.size(1)
         assert truth.shape == (batch_size, max_len)
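Feeding [batch_size, max_len, tag_size] predictions to a token-level loss usually means flattening both tensors first; a sketch of that reshape under hypothetical shapes (the exact steps inside this method may differ):

    import torch

    batch_size, max_len, tag_size = 2, 5, 7
    predict = torch.randn(batch_size, max_len, tag_size)
    truth = torch.randint(0, tag_size, (batch_size, max_len))
    loss = torch.nn.CrossEntropyLoss()(
        predict.view(batch_size * max_len, tag_size),  # [10, 7]
        truth.view(batch_size * max_len))              # [10]
    print(loss)  # a scalar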
@@ -1,3 +1,5 @@
+import os
+
 from fastNLP.core.predictor import SeqLabelInfer, ClassificationInfer
 from fastNLP.core.preprocess import load_pickle
 from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
@@ -39,8 +41,15 @@ FastNLP_MODEL_COLLECTION = {
         "type": "seq_label",
         "config_file_name": "pos_tag.config",
         "config_section_name": "pos_tag_model"
+    },
+    "text_classify_model": {
+        "url": "",
+        "class": "cnn_text_classification.CNNText",
+        "pickle": "text_class_model_v0.pkl",
+        "type": "text_class",
+        "config_file_name": "text_classify.cfg",
+        "config_section_name": "model"
     }
 }
@@ -86,7 +95,7 @@ class FastNLP(object):
         print("Restore model class {}".format(str(model_class)))

         model_args = ConfigSection()
-        ConfigLoader.load_config(self.model_dir + config_file, {section_name: model_args})
+        ConfigLoader.load_config(os.path.join(self.model_dir, config_file), {section_name: model_args})
         print("Restore model hyper-parameters {}".format(str(model_args.data)))

         # fetch dictionary size and number of labels from pickle files
@@ -100,7 +109,7 @@ class FastNLP(object):
         print("Model constructed.")

         # To do: framework independent
-        ModelLoader.load_pytorch(model, self.model_dir + FastNLP_MODEL_COLLECTION[model_name]["pickle"])
+        ModelLoader.load_pytorch(model, os.path.join(self.model_dir, FastNLP_MODEL_COLLECTION[model_name]["pickle"]))
         print("Model weights loaded.")

         self.model = model
@@ -13,8 +13,8 @@ with open('requirements.txt') as f:

 setup(
     name='fastNLP',
-    version='1.0',
-    description=('fudan fastNLP '),
+    version='0.0.1',
+    description='fastNLP: Deep Learning Toolkit for NLP, developed by Fudan FastNLP Team',
     long_description=readme,
     license=license,
     author='fudanNLP',
@@ -1,9 +1,8 @@
-import os
 import unittest

 from fastNLP.core.action import Action, Batchifier, SequentialSampler


 class TestAction(unittest.TestCase):
     def test_case_1(self):
         x = [1, 2, 3, 4, 5, 6, 7, 8]
@@ -6,7 +6,7 @@ from fastNLP.fastnlp import interpret_word_seg_results, interpret_cws_pos_result

 PATH_TO_CWS_PICKLE_FILES = "/home/zyfeng/fastNLP/reproduction/chinese_word_segment/save/"
 PATH_TO_POS_TAG_PICKLE_FILES = "/home/zyfeng/data/crf_seg/"
+PATH_TO_TEXT_CLASSIFICATION_PICKLE_FILES = "/home/zyfeng/data/text_classify/"


 def word_seg():
     nlp = FastNLP(model_dir=PATH_TO_CWS_PICKLE_FILES)

@@ -68,7 +68,18 @@ def pos_tag():
     print(interpret_cws_pos_results(words, labels))


+def text_classify():
+    nlp = FastNLP(model_dir=PATH_TO_TEXT_CLASSIFICATION_PICKLE_FILES)
+    nlp.load("text_classify_model", config_file="text_classify.cfg", section_name="model")
+    text = [
+        "世界物联网大会明日在京召开龙头股启动在即",
+        "乌鲁木齐市新增一处城市中心旅游目的地",
+        "朱元璋的大明朝真的源于明教吗?——告诉你一个真实的“明教”"]
+    results = nlp.run(text)
+    print(results)
+    """
+    ['finance', 'travel', 'history']
+    """
+
+
 if __name__ == "__main__":
-    pos_tag()
+    text_classify()
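(For non-Chinese readers: the three test sentences are news headlines about a stock-moving IoT conference in Beijing, a new city-center tourist destination in Urumqi, and Ming-dynasty history, matching the expected labels ['finance', 'travel', 'history'].)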