- move preprocess.py from loader/ to core/
- change the interface of preprocess:
  1. add a run() method that performs the main processing
  2. add a cross validation split
  3. add return values
  4. merge the subclasses
- Trainer supports cross validation
- pass data as arguments to Trainer.train & Tester.test
- add readme.example.py, to run the example program shown in README.md
- other corresponding changes
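In short: a preprocessor no longer does its work in __init__. Callers construct it empty, call run() to get the processed datasets back, and pass those datasets explicitly to Trainer.train and Tester.test. A minimal sketch of the new call pattern (variable and path names here are placeholders, not taken from the commit):

    from fastNLP.core.preprocess import SeqLabelPreprocess

    preprocess = SeqLabelPreprocess()
    data_train, data_dev = preprocess.run(train_data, pickle_path="./save/", train_dev_split=0.3)
    trainer.train(model, data_train, data_dev)  # data passed in; train() no longer loads pickles
    tester.test(model, data_dev)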
@@ -3,7 +3,7 @@ import torch
from fastNLP.core.action import Batchifier, SequentialSampler
from fastNLP.core.action import convert_to_torch_tensor
from fastNLP.loader.preprocess import load_pickle, DEFAULT_UNKNOWN_LABEL
from fastNLP.core.preprocess import load_pickle, DEFAULT_UNKNOWN_LABEL
from fastNLP.modules import utils
@@ -0,0 +1,306 @@
import _pickle
import os

import numpy as np

DEFAULT_PADDING_LABEL = '<pad>'  # dict index = 0
DEFAULT_UNKNOWN_LABEL = '<unk>'  # dict index = 1
DEFAULT_RESERVED_LABEL = ['<reserved-2>',
                          '<reserved-3>',
                          '<reserved-4>']  # dict index = 2~4
DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1,
                         DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3,
                         DEFAULT_RESERVED_LABEL[2]: 4}
# the first vocab in dict with the index = 5

def save_pickle(obj, pickle_path, file_name):
    with open(os.path.join(pickle_path, file_name), "wb") as f:
        _pickle.dump(obj, f)
    print("{} saved. ".format(file_name))

def load_pickle(pickle_path, file_name):
    with open(os.path.join(pickle_path, file_name), "rb") as f:
        obj = _pickle.load(f)
    print("{} loaded. ".format(file_name))
    return obj

def pickle_exist(pickle_path, pickle_name):
    """
    :param pickle_path: the directory of target pickle file
    :param pickle_name: the filename of target pickle file
    :return: True if file exists else False
    """
    if not os.path.exists(pickle_path):
        os.makedirs(pickle_path)
    file_name = os.path.join(pickle_path, pickle_name)
    return os.path.exists(file_name)

class BasePreprocess(object):
    def __init__(self):
        self.word2index = None
        self.label2index = None

    @property
    def vocab_size(self):
        return len(self.word2index)

    @property
    def num_classes(self):
        return len(self.label2index)
    def run(self, train_dev_data, test_data=None, pickle_path="./", train_dev_split=0, cross_val=False, n_fold=10):
        """Main preprocessing pipeline.

        :param train_dev_data: three-level list, with either a single label or multiple labels in each sample.
        :param test_data: three-level list, with either a single label or multiple labels in each sample. (optional)
        :param pickle_path: str, the path to save the pickle files.
        :param train_dev_split: float in [0, 1]. The ratio of training data used as the validation set.
        :param cross_val: bool, whether to do cross validation.
        :param n_fold: int, the number of folds of cross validation. Only used when cross_val is True.
        :return results: the preprocessed dataset, or a tuple of datasets if more than one is produced.
        """
        if pickle_exist(pickle_path, "word2id.pkl") and pickle_exist(pickle_path, "class2id.pkl"):
            self.word2index = load_pickle(pickle_path, "word2id.pkl")
            self.label2index = load_pickle(pickle_path, "class2id.pkl")
        else:
            self.word2index, self.label2index = self.build_dict(train_dev_data)
            save_pickle(self.word2index, pickle_path, "word2id.pkl")
            save_pickle(self.label2index, pickle_path, "class2id.pkl")

        if not pickle_exist(pickle_path, "id2word.pkl"):
            index2word = self.build_reverse_dict(self.word2index)
            save_pickle(index2word, pickle_path, "id2word.pkl")
        if not pickle_exist(pickle_path, "id2class.pkl"):
            index2label = self.build_reverse_dict(self.label2index)
            save_pickle(index2label, pickle_path, "id2class.pkl")

        data_train = []
        data_dev = []
        if not cross_val:
            if not pickle_exist(pickle_path, "data_train.pkl"):
                data_train.extend(self.to_index(train_dev_data))
                if train_dev_split > 0 and not pickle_exist(pickle_path, "data_dev.pkl"):
                    data_train, data_dev = self.data_split(data_train, train_dev_split)
                    save_pickle(data_dev, pickle_path, "data_dev.pkl")
                    print("{} of the training data is split for validation. ".format(train_dev_split))
                save_pickle(data_train, pickle_path, "data_train.pkl")
            else:
                data_train = load_pickle(pickle_path, "data_train.pkl")
                if pickle_exist(pickle_path, "data_dev.pkl"):
                    data_dev = load_pickle(pickle_path, "data_dev.pkl")
        else:
            # cross validation
            if not pickle_exist(pickle_path, "data_train_0.pkl"):
                data_idx = self.to_index(train_dev_data)
                data_cv = self.cv_split(data_idx, n_fold)
                for i, (data_train_cv, data_dev_cv) in enumerate(data_cv):
                    save_pickle(data_train_cv, pickle_path, "data_train_{}.pkl".format(i))
                    save_pickle(data_dev_cv, pickle_path, "data_dev_{}.pkl".format(i))
                    data_train.append(data_train_cv)
                    data_dev.append(data_dev_cv)
                print("{}-fold cross validation.".format(n_fold))
            else:
                for i in range(n_fold):
                    data_train_cv = load_pickle(pickle_path, "data_train_{}.pkl".format(i))
                    data_dev_cv = load_pickle(pickle_path, "data_dev_{}.pkl".format(i))
                    data_train.append(data_train_cv)
                    data_dev.append(data_dev_cv)

        # prepare test data if provided
        data_test = []
        if test_data is not None:
            if not pickle_exist(pickle_path, "data_test.pkl"):
                data_test = self.to_index(test_data)
                save_pickle(data_test, pickle_path, "data_test.pkl")
            else:
                data_test = load_pickle(pickle_path, "data_test.pkl")

        # return preprocessed results
        results = [data_train]
        if cross_val or train_dev_split > 0:
            results.append(data_dev)
        if test_data is not None:
            results.append(data_test)
        if len(results) == 1:
            return results[0]
        return tuple(results)
    def build_dict(self, data):
        raise NotImplementedError

    def to_index(self, data):
        raise NotImplementedError

    def build_reverse_dict(self, word_dict):
        id2word = {word_dict[w]: w for w in word_dict}
        return id2word

    def data_split(self, data, train_dev_split):
        """Split data into train and dev sets."""
        split = int(len(data) * train_dev_split)
        data_dev = data[: split]
        data_train = data[split:]
        return data_train, data_dev

    def cv_split(self, data, n_fold):
        """Split data for cross validation."""
        data_copy = data.copy()
        np.random.shuffle(data_copy)
        fold_size = round(len(data_copy) / n_fold)
        data_cv = []
        for i in range(n_fold - 1):
            start = i * fold_size
            end = (i + 1) * fold_size
            data_dev = data_copy[start:end]
            data_train = data_copy[:start] + data_copy[end:]
            data_cv.append((data_train, data_dev))
        start = (n_fold - 1) * fold_size
        data_dev = data_copy[start:]
        data_train = data_copy[:start]
        data_cv.append((data_train, data_dev))
        return data_cv
class SeqLabelPreprocess(BasePreprocess):
    """Preprocess pipeline for sequence labeling: builds the mappings from words to indices, from indices
    to words, from labels/classes to indices, and from indices to labels/classes.
    Designed for data given as a three-level list with multiple labels in each sample:
        [
            [ [word_11, word_12, ...], [label_1, label_1, ...] ],
            [ [word_21, word_22, ...], [label_2, label_1, ...] ],
            ...
        ]
    """
    def __init__(self):
        super(SeqLabelPreprocess, self).__init__()

    def build_dict(self, data):
        """
        Build the word-to-index and label-to-index dictionaries from the data.
        :param data: three-level list
            [
                [ [word_11, word_12, ...], [label_1, label_1, ...] ],
                [ [word_21, word_22, ...], [label_2, label_1, ...] ],
                ...
            ]
        :return word2index: dict of {str: int}
                label2index: dict of {str: int}
        """
        # In seq labeling, both word seq and label seq need to be padded to the same length in a mini-batch.
        label2index = DEFAULT_WORD_TO_INDEX.copy()
        word2index = DEFAULT_WORD_TO_INDEX.copy()
        for example in data:
            for word, label in zip(example[0], example[1]):
                if word not in word2index:
                    word2index[word] = len(word2index)
                if label not in label2index:
                    label2index[label] = len(label2index)
        return word2index, label2index

    def to_index(self, data):
        """
        Convert word strings and label strings into indices.
        :param data: three-level list
            [
                [ [word_11, word_12, ...], [label_1, label_1, ...] ],
                [ [word_21, word_22, ...], [label_2, label_1, ...] ],
                ...
            ]
        :return data_index: the same shape as data, but with each string replaced by its index
        """
        data_index = []
        for example in data:
            word_list = []
            label_list = []
            for word, label in zip(example[0], example[1]):
                word_list.append(self.word2index.get(word, DEFAULT_WORD_TO_INDEX[DEFAULT_UNKNOWN_LABEL]))
                label_list.append(self.label2index.get(label, DEFAULT_WORD_TO_INDEX[DEFAULT_UNKNOWN_LABEL]))
            data_index.append([word_list, label_list])
        return data_index
class ClassPreprocess(BasePreprocess):
    """Preprocess pipeline for classification datasets: builds the mappings from words to indices, from
    indices to words, from labels/classes to indices, and from indices to labels/classes.
    Designed for data given as a three-level list with a single label in each sample:
        [
            [ [word_11, word_12, ...], label_1 ],
            [ [word_21, word_22, ...], label_2 ],
            ...
        ]
    """
    def __init__(self):
        super(ClassPreprocess, self).__init__()

    def build_dict(self, data):
        """Build the vocabulary and the label dictionary."""
        # build vocabulary from scratch if nothing exists
        word2index = DEFAULT_WORD_TO_INDEX.copy()
        label2index = DEFAULT_WORD_TO_INDEX.copy()

        # collect every word and label
        for sent, label in data:
            if len(sent) <= 1:
                continue
            if label not in label2index:
                label2index[label] = len(label2index)
            for word in sent:
                if word not in word2index:
                    word2index[word] = len(word2index)
        return word2index, label2index
    def to_index(self, data):
        """
        Convert word strings and label strings into indices.
        :param data: three-level list
            [
                [ [word_11, word_12, ...], label_1 ],
                [ [word_21, word_22, ...], label_2 ],
                ...
            ]
        :return data_index: the same shape as data, but with each string replaced by its index
        """
        data_index = []
        for example in data:
            word_list = []
            for word in example[0]:
                word_list.append(self.word2index.get(word, DEFAULT_WORD_TO_INDEX[DEFAULT_UNKNOWN_LABEL]))
            label_index = self.label2index.get(example[1], DEFAULT_WORD_TO_INDEX[DEFAULT_UNKNOWN_LABEL])
            data_index.append([word_list, label_index])
        return data_index
def infer_preprocess(pickle_path, data):
    """
    Preprocess inference data.
    Transform a two-level list of word strings into the corresponding indices:
        [
            [word_11, word_12, ...],
            [word_21, word_22, ...],
            ...
        ]
    """
    word2index = load_pickle(pickle_path, "word2id.pkl")
    data_index = []
    for example in data:
        data_index.append([word2index.get(w, DEFAULT_WORD_TO_INDEX[DEFAULT_UNKNOWN_LABEL]) for w in example])
    return data_index
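A quick usage sketch of run()'s three return shapes, under the single-result unpacking above (paths and variable names are placeholders):

    p = SeqLabelPreprocess()
    # train only
    data_train = p.run(train_data, pickle_path="./save/")
    # held-out validation split
    data_train, data_dev = p.run(train_data, pickle_path="./save/", train_dev_split=0.3)
    # 5-fold cross validation: two lists with one dataset per fold
    train_cv, dev_cv = p.run(train_data, pickle_path="./save/", cross_val=True, n_fold=5)
    assert len(train_cv) == len(dev_cv) == 5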
@@ -34,7 +34,7 @@ class BaseTester(object):
        self.eval_history = []
        self.batch_output = []

    def test(self, network):
    def test(self, network, dev_data):
        if torch.cuda.is_available() and self.use_cuda:
            self.model = network.cuda()
        else:
@@ -45,8 +45,8 @@ class BaseTester(object):
        self.eval_history.clear()
        self.batch_output.clear()
        dev_data = self.prepare_input(self.pickle_path)
        logger.info("validation data loaded")
        # dev_data = self.prepare_input(self.pickle_path)
        # logger.info("validation data loaded")
        iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True))
        n_batches = len(dev_data) // self.batch_size
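Callers now supply the validation data directly when testing; a minimal sketch of the updated call (object names assumed from the examples later in this commit):

    tester = SeqLabelTester(test_args)
    tester.test(model, data_dev)  # data_dev comes from Preprocessor.run, not from pickles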
@@ -1,4 +1,5 @@
import _pickle
import copy
import os
import time
from datetime import timedelta
@@ -52,9 +53,11 @@ class BaseTrainer(object):
        self.loss_func = None
        self.optimizer = None

    def train(self, network):
    def train(self, network, train_data, dev_data=None):
        """General training steps.

        :param network: a model
        :param train_data: three-level list, the training set.
        :param dev_data: three-level list, the validation data (optional)

        The method is framework-independent. It works by calling the following methods:
@@ -73,8 +76,8 @@ class BaseTrainer(object):
        else:
            self.model = network

        data_train = self.load_train_data(self.pickle_path)
        logger.info("training data loaded")
        # train_data = self.load_train_data(self.pickle_path)
        # logger.info("training data loaded")

        # define a tester over dev data
        if self.validate:
@@ -88,8 +91,7 @@ class BaseTrainer(object):
        logger.info("optimizer defined as {}".format(str(self.optimizer)))

        # main training epochs
        n_samples = len(data_train)
        n_samples = len(train_data)
        n_batches = n_samples // self.batch_size
        n_print = 1
        start = time.time()
@@ -101,14 +103,14 @@ class BaseTrainer(object):
            # turn on network training mode
            self.mode(network, test=False)
            # prepare mini-batch iterator
            data_iterator = iter(Batchifier(RandomSampler(data_train), self.batch_size, drop_last=False))
            data_iterator = iter(Batchifier(RandomSampler(train_data), self.batch_size, drop_last=False))
            logger.info("prepared data iterator")

            self._train_step(data_iterator, network, start=start, n_print=n_print, epoch=epoch)

            if self.validate:
                logger.info("validation started")
                validator.test(network)
                validator.test(network, dev_data)
                if self.save_best_dev and self.best_eval_result(validator):
                    self.save_model(network)
@@ -139,6 +141,26 @@ class BaseTrainer(object):
                logger.info(print_output)
                step += 1
    def cross_validate(self, network, train_data_cv, dev_data_cv):
        """Train with cross validation.

        :param network: the model
        :param train_data_cv: four-level list, of shape [num_folds, num_examples, 2, ?]
        :param dev_data_cv: four-level list, of shape [num_folds, num_examples, 2, ?]
        """
        if len(train_data_cv) != len(dev_data_cv):
            logger.error("the number of folds in train and dev data does not match: {} != {}".format(
                len(train_data_cv), len(dev_data_cv)))
            raise RuntimeError("the number of folds in train and dev data does not match")
        n_fold = len(train_data_cv)
        logger.info("performing {}-fold cross validation.".format(n_fold))
        for i in range(n_fold):
            print("CV:", i)
            logger.info("running fold {} of {} in cross validation".format(i + 1, n_fold))
            network_copy = copy.deepcopy(network)
            self.train(network_copy, train_data_cv[i], dev_data_cv[i])

    def load_train_data(self, pickle_path):
        """
        For task-specific processing.
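Putting the pieces together, cross validation is now driven from outside the trainer; a sketch of the intended flow (config dicts and paths are placeholders):

    preprocess = SeqLabelPreprocess()
    train_cv, dev_cv = preprocess.run(all_data, pickle_path="./save/", cross_val=True, n_fold=10)
    trainer = SeqLabelTrainer(train_args)
    # trains a deep copy of the model on each fold, validating on the held-out fold
    trainer.cross_validate(model, train_cv, dev_cv)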
@@ -1,366 +0,0 @@
import _pickle
import os

DEFAULT_PADDING_LABEL = '<pad>'  # dict index = 0
DEFAULT_UNKNOWN_LABEL = '<unk>'  # dict index = 1
DEFAULT_RESERVED_LABEL = ['<reserved-2>',
                          '<reserved-3>',
                          '<reserved-4>']  # dict index = 2~4
DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1,
                         DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3,
                         DEFAULT_RESERVED_LABEL[2]: 4}
# the first vocab in dict with the index = 5

def save_pickle(obj, pickle_path, file_name):
    with open(os.path.join(pickle_path, file_name), "wb") as f:
        _pickle.dump(obj, f)
    print("{} saved. ".format(file_name))

def load_pickle(pickle_path, file_name):
    with open(os.path.join(pickle_path, file_name), "rb") as f:
        obj = _pickle.load(f)
    print("{} loaded. ".format(file_name))
    return obj

def pickle_exist(pickle_path, pickle_name):
    """
    :param pickle_path: the directory of the target pickle file
    :param pickle_name: the filename of the target pickle file
    :return: True if the file exists else False
    """
    if not os.path.exists(pickle_path):
        os.makedirs(pickle_path)
    file_name = os.path.join(pickle_path, pickle_name)
    if os.path.exists(file_name):
        return True
    else:
        return False

class BasePreprocess(object):
    def __init__(self, data, pickle_path):
        super(BasePreprocess, self).__init__()
        # self.data = data
        self.pickle_path = pickle_path
        if not self.pickle_path.endswith('/'):
            self.pickle_path = self.pickle_path + '/'

class POSPreprocess(BasePreprocess):
    """
    This class is used to preprocess POS tag datasets.
    """
    def __init__(self, data, pickle_path="./", train_dev_split=0):
        """
        Preprocess pipeline, including building mapping from words to index, from index to words,
        from labels/classes to index, from index to labels/classes.
        :param data: three-level list
            [
                [ [word_11, word_12, ...], [label_1, label_1, ...] ],
                [ [word_21, word_22, ...], [label_2, label_1, ...] ],
                ...
            ]
        :param pickle_path: str, the directory to the pickle files. Default: "./"
        :param train_dev_split: float in [0, 1]. The ratio of dev data split from training data. Default: 0.
        """
        super(POSPreprocess, self).__init__(data, pickle_path)
        self.pickle_path = pickle_path
        if pickle_exist(pickle_path, "word2id.pkl") and pickle_exist(pickle_path, "class2id.pkl"):
            self.word2index = load_pickle(self.pickle_path, "word2id.pkl")
            self.label2index = load_pickle(self.pickle_path, "class2id.pkl")
        else:
            self.word2index, self.label2index = self.build_dict(data)
            save_pickle(self.word2index, self.pickle_path, "word2id.pkl")
            save_pickle(self.label2index, self.pickle_path, "class2id.pkl")

        if not pickle_exist(pickle_path, "id2word.pkl"):
            index2word = self.build_reverse_dict(self.word2index)
            save_pickle(index2word, self.pickle_path, "id2word.pkl")
        if not pickle_exist(pickle_path, "id2class.pkl"):
            index2label = self.build_reverse_dict(self.label2index)
            save_pickle(index2label, self.pickle_path, "id2class.pkl")

        if not pickle_exist(pickle_path, "data_train.pkl"):
            data_train = self.to_index(data)
            if train_dev_split > 0 and not pickle_exist(pickle_path, "data_dev.pkl"):
                split = int(len(data_train) * train_dev_split)
                data_dev = data_train[: split]
                data_train = data_train[split:]
                save_pickle(data_dev, self.pickle_path, "data_dev.pkl")
                print("{} of the training data is split for validation. ".format(train_dev_split))
            save_pickle(data_train, self.pickle_path, "data_train.pkl")

    def build_dict(self, data):
        """
        Add new words with indices into self.word_dict, new labels with indices into self.label_dict.
        :param data: three-level list
            [
                [ [word_11, word_12, ...], [label_1, label_1, ...] ],
                [ [word_21, word_22, ...], [label_2, label_1, ...] ],
                ...
            ]
        :return word2index: dict of {str: int}
                label2index: dict of {str: int}
        """
        # In seq labeling, both word seq and label seq need to be padded to the same length in a mini-batch.
        label2index = DEFAULT_WORD_TO_INDEX.copy()
        word2index = DEFAULT_WORD_TO_INDEX.copy()
        for example in data:
            for word, label in zip(example[0], example[1]):
                if word not in word2index:
                    word2index[word] = len(word2index)
                if label not in label2index:
                    label2index[label] = len(label2index)
        return word2index, label2index

    def build_reverse_dict(self, word_dict):
        id2word = {word_dict[w]: w for w in word_dict}
        return id2word

    def to_index(self, data):
        """
        Convert word strings and label strings into indices.
        :param data: three-level list
            [
                [ [word_11, word_12, ...], [label_1, label_1, ...] ],
                [ [word_21, word_22, ...], [label_2, label_1, ...] ],
                ...
            ]
        :return data_index: the same shape as data, but with each string replaced by its index
        """
        data_index = []
        for example in data:
            word_list = []
            label_list = []
            for word, label in zip(example[0], example[1]):
                word_list.append(self.word2index[word])
                label_list.append(self.label2index[label])
            data_index.append([word_list, label_list])
        return data_index

    @property
    def vocab_size(self):
        return len(self.word2index)

    @property
    def num_classes(self):
        return len(self.label2index)

class ClassPreprocess(BasePreprocess):
    """
    Pre-process the classification datasets.

    Params:
        pickle_path - directory to save the results of pre-processing
    Saves:
        word2id.pkl
        id2word.pkl
        class2id.pkl
        id2class.pkl
        embedding.pkl
        data_train.pkl
        data_dev.pkl
        data_test.pkl
    """
    def __init__(self, pickle_path):
        # super(ClassPreprocess, self).__init__(data, pickle_path)
        self.word_dict = None
        self.label_dict = None
        self.pickle_path = pickle_path  # save directory

    def process(self, data, save_name):
        """
        Process data.

        Params:
            data - nested list, data = [sample1, sample2, ...],
                sample = [sentence, label], sentence = [word1, word2, ...]
            save_name - name of processed data, such as data_train.pkl
        Returns:
            vocab_size - vocabulary size
            n_classes - number of classes
        """
        self.build_dict(data)
        self.word2id()
        vocab_size = self.id2word()
        self.class2id()
        num_classes = self.id2class()
        self.embedding()
        self.data_generate(data, save_name)
        return vocab_size, num_classes

    def build_dict(self, data):
        """Build vocabulary."""
        # just read if word2id.pkl and class2id.pkl exist
        if self.pickle_exist("word2id.pkl") and \
                self.pickle_exist("class2id.pkl"):
            file_name = os.path.join(self.pickle_path, "word2id.pkl")
            with open(file_name, 'rb') as f:
                self.word_dict = _pickle.load(f)
            file_name = os.path.join(self.pickle_path, "class2id.pkl")
            with open(file_name, 'rb') as f:
                self.label_dict = _pickle.load(f)
            return

        # build vocabulary from scratch if nothing exists
        self.word_dict = {
            DEFAULT_PADDING_LABEL: 0,
            DEFAULT_UNKNOWN_LABEL: 1,
            DEFAULT_RESERVED_LABEL[0]: 2,
            DEFAULT_RESERVED_LABEL[1]: 3,
            DEFAULT_RESERVED_LABEL[2]: 4}
        self.label_dict = {}

        # collect every word and label
        for sent, label in data:
            if len(sent) <= 1:
                continue
            if label not in self.label_dict:
                index = len(self.label_dict)
                self.label_dict[label] = index
            for word in sent:
                if word not in self.word_dict:
                    index = len(self.word_dict)
                    self.word_dict[word[0]] = index

    def pickle_exist(self, pickle_name):
        """
        Check whether a pickle file exists.

        Params:
            pickle_name - the filename of the target pickle file
        Return:
            True if the file exists else False
        """
        if not os.path.exists(self.pickle_path):
            os.makedirs(self.pickle_path)
        file_name = os.path.join(self.pickle_path, pickle_name)
        if os.path.exists(file_name):
            return True
        else:
            return False

    def word2id(self):
        """Save the vocabulary in {word: id} mapping format."""
        # nothing will be done if word2id.pkl exists
        if self.pickle_exist("word2id.pkl"):
            return

        file_name = os.path.join(self.pickle_path, "word2id.pkl")
        with open(file_name, "wb") as f:
            _pickle.dump(self.word_dict, f)

    def id2word(self):
        """Save the vocabulary in {id: word} mapping format."""
        # nothing will be done if id2word.pkl exists
        if self.pickle_exist("id2word.pkl"):
            file_name = os.path.join(self.pickle_path, "id2word.pkl")
            with open(file_name, 'rb') as f:
                id2word_dict = _pickle.load(f)
            return len(id2word_dict)

        id2word_dict = {self.word_dict[w]: w for w in self.word_dict}
        file_name = os.path.join(self.pickle_path, "id2word.pkl")
        with open(file_name, "wb") as f:
            _pickle.dump(id2word_dict, f)
        return len(id2word_dict)

    def class2id(self):
        """Save the mapping of {class: id}."""
        # nothing will be done if class2id.pkl exists
        if self.pickle_exist("class2id.pkl"):
            return

        file_name = os.path.join(self.pickle_path, "class2id.pkl")
        with open(file_name, "wb") as f:
            _pickle.dump(self.label_dict, f)

    def id2class(self):
        """Save the mapping of {id: class}."""
        # nothing will be done if id2class.pkl exists
        if self.pickle_exist("id2class.pkl"):
            file_name = os.path.join(self.pickle_path, "id2class.pkl")
            with open(file_name, "rb") as f:
                id2class_dict = _pickle.load(f)
            return len(id2class_dict)

        id2class_dict = {self.label_dict[c]: c for c in self.label_dict}
        file_name = os.path.join(self.pickle_path, "id2class.pkl")
        with open(file_name, "wb") as f:
            _pickle.dump(id2class_dict, f)
        return len(id2class_dict)

    def embedding(self):
        """Save the embedding lookup table corresponding to the vocabulary."""
        # nothing will be done if embedding.pkl exists
        if self.pickle_exist("embedding.pkl"):
            return
        # retrieve vocabulary from pre-trained embedding (not implemented)

    def data_generate(self, data_src, save_name):
        """Convert dataset from text to digit."""
        # nothing will be done if the file exists
        save_path = os.path.join(self.pickle_path, save_name)
        if os.path.exists(save_path):
            return

        data = []
        # for every sample
        for sent, label in data_src:
            if len(sent) <= 1:
                continue
            label_id = self.label_dict[label]  # label id
            sent_id = []  # sentence ids
            for word in sent:
                if word in self.word_dict:
                    sent_id.append(self.word_dict[word])
                else:
                    sent_id.append(self.word_dict[DEFAULT_UNKNOWN_LABEL])
            data.append([sent_id, label_id])

        # save data
        with open(save_path, "wb") as f:
            _pickle.dump(data, f)

class LMPreprocess(BasePreprocess):
    def __init__(self, data, pickle_path):
        super(LMPreprocess, self).__init__(data, pickle_path)

def infer_preprocess(pickle_path, data):
    """
    Preprocess inference data.
    Transform a two-level list of word strings into the corresponding indices:
        [
            [word_11, word_12, ...],
            [word_21, word_22, ...],
            ...
        ]
    """
    word2index = load_pickle(pickle_path, "word2id.pkl")
    data_index = []
    for example in data:
        data_index.append([word2index.get(w, DEFAULT_UNKNOWN_LABEL) for w in example])
    return data_index
@@ -1,13 +1,12 @@
import os
import torch
import torch.nn as nn
from torch.autograd import Variable
import .dataset as dst
from .model import CNN_text
from . import dataset as dst
from .model import CNN_text

# Hyper Parameters
batch_size = 50
learning_rate = 0.0001
@@ -5,7 +5,7 @@ sys.path.append("..")
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader
from fastNLP.loader.preprocess import POSPreprocess, load_pickle
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.core.tester import SeqLabelTester
@@ -48,7 +48,7 @@ def infer():
    print("Inference finished!")

def train():
def train_test():
    # Config Loader
    train_args = ConfigSection()
    test_args = ConfigSection()
@@ -59,9 +59,10 @@ def train():
    train_data = loader.load_pku()

    # Preprocessor
    p = POSPreprocess(train_data, pickle_path, train_dev_split=0.3)
    train_args["vocab_size"] = p.vocab_size
    train_args["num_classes"] = p.num_classes
    preprocess = SeqLabelPreprocess()
    data_train, data_dev = preprocess.run(train_data, pickle_path=pickle_path, train_dev_split=0.3)
    train_args["vocab_size"] = preprocess.vocab_size
    train_args["num_classes"] = preprocess.num_classes

    # Trainer
    trainer = SeqLabelTrainer(train_args)
@@ -70,7 +71,7 @@ def train():
    model = SeqLabeling(train_args)

    # Start training
    trainer.train(model)
    trainer.train(model, data_train, data_dev)
    print("Training finished!")

    # Saver
@@ -78,8 +79,11 @@ def train():
    saver.save_pytorch(model)
    print("Model saved!")

    # testing with validation set
    test(data_dev)

def test():
def test(test_data):
    # Config Loader
    train_args = ConfigSection()
    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
@@ -99,7 +103,7 @@ def test():
    tester = SeqLabelTester(test_args)

    # Start testing
    tester.test(model)
    tester.test(model, test_data)

    # print test results
    print(tester.show_matrices())
@@ -107,4 +111,4 @@ def test():
if __name__ == "__main__":
    train()
    train_test()
@@ -4,9 +4,9 @@ import os
import numpy as np
import torch

from fastNLP.core.preprocess import SeqLabelPreprocess
from fastNLP.core.tester import SeqLabelTester
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.preprocess import POSPreprocess
from fastNLP.models.sequence_modeling import AdvSeqLabel
@@ -114,7 +114,8 @@ emb_path = "data_for_tests/emb50.txt"
save_path = "data_for_tests/"

if __name__ == "__main__":
    data = data_load(data_path)
    p = POSPreprocess(data, pickle_path=pick_path, train_dev_split=0.3)
    preprocess = SeqLabelPreprocess()
    data_train, data_dev = preprocess.run(data, pickle_path=pick_path, train_dev_split=0.3)
    # emb = embedding_process(emb_path, p.word2index, 50, os.path.join(pick_path, "embedding.pkl"))
    emb = None
    args = {"epochs": 20,
@@ -125,13 +126,13 @@ if __name__ == "__main__":
            "model_saved_path": save_path,
            "use_cuda": True,
            "vocab_size": p.vocab_size,
            "num_classes": p.num_classes,
            "vocab_size": preprocess.vocab_size,
            "num_classes": preprocess.num_classes,
            "word_emb_dim": 50,
            "rnn_hidden_units": 100
            }
    # emb = torch.Tensor(emb).float().cuda()
    networks = AdvSeqLabel(args, emb)
    trainer = MyNERTrainer(args)
    trainer.train(network=networks)
    trainer.train(networks, data_train, data_dev)
    print("Training finished!")
@@ -0,0 +1,78 @@
# python: 3.5
# pytorch: 0.4

################
# Test cross validation.
################

from fastNLP.core.preprocess import ClassPreprocess
from fastNLP.core.predictor import ClassificationInfer
from fastNLP.core.trainer import ClassificationTrainer
from fastNLP.loader.dataset_loader import ClassDatasetLoader
from fastNLP.models.base_model import BaseModel
from fastNLP.modules import aggregation
from fastNLP.modules import encoder

class ClassificationModel(BaseModel):
    """
    Simple text classification model based on CNN.
    """
    def __init__(self, class_num, vocab_size):
        super(ClassificationModel, self).__init__()
        self.embed = encoder.Embedding(nums=vocab_size, dims=300)
        self.conv = encoder.Conv(
            in_channels=300, out_channels=100, kernel_size=3)
        self.pool = aggregation.MaxPool()
        self.output = encoder.Linear(input_size=100, output_size=class_num)

    def forward(self, x):
        x = self.embed(x)  # [N,L] -> [N,L,C]
        x = self.conv(x)  # [N,L,C_in] -> [N,L,C_out]
        x = self.pool(x)  # [N,L,C] -> [N,C]
        x = self.output(x)  # [N,C] -> [N,N_class]
        return x

data_dir = 'data'  # directory to save data and model
train_path = 'test/data_for_tests/text_classify.txt'  # training set file

# load dataset
ds_loader = ClassDatasetLoader("train", train_path)
data = ds_loader.load()

# pre-process dataset with 5-fold cross validation
pre = ClassPreprocess()
train_data_cv, dev_data_cv = pre.run(data, pickle_path=data_dir, cross_val=True, n_fold=5)
n_classes = pre.num_classes
vocab_size = pre.vocab_size

# construct model
model_args = {
    'num_classes': n_classes,
    'vocab_size': vocab_size
}
model = ClassificationModel(class_num=n_classes, vocab_size=vocab_size)

# train model with cross validation
train_args = {
    "epochs": 10,
    "batch_size": 50,
    "pickle_path": data_dir,
    "validate": False,
    "save_best_dev": False,
    "model_saved_path": None,
    "use_cuda": True,
    "learn_rate": 1e-3,
    "momentum": 0.9}
trainer = ClassificationTrainer(train_args)
trainer.cross_validate(model, train_data_cv, dev_data_cv)

# predict using model
data_infer = [x[0] for x in data]
infer = ClassificationInfer(data_dir)
labels_pred = infer.predict(model, data_infer)
@@ -5,7 +5,7 @@ sys.path.append("..")
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.dataset_loader import POSDatasetLoader, BaseLoader
from fastNLP.loader.preprocess import POSPreprocess, load_pickle
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.core.tester import SeqLabelTester
@@ -68,7 +68,8 @@ def train_and_test():
    train_data = pos_loader.load_lines()

    # Preprocessor
    p = POSPreprocess(train_data, pickle_path, train_dev_split=0.5)
    p = SeqLabelPreprocess()
    data_train, data_dev = p.run(train_data, pickle_path=pickle_path, train_dev_split=0.5)
    train_args["vocab_size"] = p.vocab_size
    train_args["num_classes"] = p.num_classes
@@ -79,7 +80,7 @@ def train_and_test():
    model = SeqLabeling(train_args)

    # Start training
    trainer.train(model)
    trainer.train(model, data_train, data_dev)
    print("Training finished!")

    # Saver
@@ -103,8 +104,8 @@ def train_and_test():
    # Tester
    tester = SeqLabelTester(test_args)

    # Start testing
    tester.test(model)
    # Start testing with validation data
    tester.test(model, data_dev)

    # print test results
    print(tester.show_matrices())
@@ -5,7 +5,7 @@ sys.path.append("..")
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader
from fastNLP.loader.preprocess import POSPreprocess, load_pickle
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.core.tester import SeqLabelTester
@@ -68,7 +68,8 @@ def train_test():
    train_data = loader.load_pku()

    # Preprocessor
    p = POSPreprocess(train_data, pickle_path)
    p = SeqLabelPreprocess()
    data_train = p.run(train_data, pickle_path=pickle_path)
    train_args["vocab_size"] = p.vocab_size
    train_args["num_classes"] = p.num_classes
@@ -79,7 +80,7 @@ def train_test():
    model = SeqLabeling(train_args)

    # Start training
    trainer.train(model)
    trainer.train(model, data_train)
    print("Training finished!")

    # Saver
@@ -104,7 +105,7 @@ def train_test():
    tester = SeqLabelTester(test_args)

    # Start testing
    tester.test(model)
    tester.test(model, data_train)

    # print test results
    print(tester.show_matrices())
@@ -1,7 +1,7 @@
from fastNLP.core.preprocess import SeqLabelPreprocess
from fastNLP.core.tester import SeqLabelTester
from fastNLP.loader.config_loader import ConfigSection, ConfigLoader
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader
from fastNLP.loader.preprocess import POSPreprocess
from fastNLP.models.sequence_modeling import SeqLabeling

data_name = "pku_training.utf8"
@@ -17,7 +17,8 @@ def foo():
    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})

    # Preprocessor
    p = POSPreprocess(train_data, pickle_path)
    p = SeqLabelPreprocess()
    data_train = p.run(train_data, pickle_path=pickle_path)
    train_args["vocab_size"] = p.vocab_size
    train_args["num_classes"] = p.num_classes
@@ -10,7 +10,7 @@ from fastNLP.core.trainer import ClassificationTrainer
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.loader.dataset_loader import ClassDatasetLoader
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.loader.preprocess import ClassPreprocess
from fastNLP.core.preprocess import ClassPreprocess
from fastNLP.models.cnn_text_classification import CNNText
from fastNLP.saver.model_saver import ModelSaver
@@ -59,28 +59,28 @@ def train():
    print(data[0])

    # pre-process data
    pre = ClassPreprocess(data_dir)
    vocab_size, n_classes = pre.process(data, "data_train.pkl")
    print("vocabulary size:", vocab_size)
    print("number of classes:", n_classes)
    pre = ClassPreprocess()
    data_train = pre.run(data, pickle_path=data_dir)
    print("vocabulary size:", pre.vocab_size)
    print("number of classes:", pre.num_classes)

    # construct model
    print("Building model...")
    cnn = CNNText(model_args)
    model = CNNText(model_args)

    # train
    print("Training...")
    trainer = ClassificationTrainer(train_args)
    trainer.train(cnn)
    trainer.train(model, data_train)
    print("Training finished!")

    saver = ModelSaver("./data_for_tests/saved_model.pkl")
    saver.save_pytorch(cnn)
    saver.save_pytorch(model)
    print("Model saved!")

if __name__ == "__main__":
    # train()
    infer()
    train()
    # infer()
# infer() |