From 4c8c2dfdb88c6c51d990d075c668f5518fc7f5a6 Mon Sep 17 00:00:00 2001
From: FengZiYjun
Date: Sun, 19 Aug 2018 16:21:14 +0800
Subject: [PATCH 1/2] updates to core, loader, test:

- move preprocess.py from loader/ to core/
- changes to the interface of preprocess:
   1. add run method, to run the main processing
   2. add cross validation split
   3. add return value
   4. merge subclasses
- Trainer supports cross validation
- add data as arguments in Trainer.train & Tester.test
- add readme_example.py, to run the example program shown in README.md
- other corresponding changes
---
 fastNLP/core/predictor.py                  |   2 +-
 fastNLP/core/preprocess.py                 | 306 +++++++++++++++
 fastNLP/core/tester.py                     |   6 +-
 fastNLP/core/trainer.py                    |  36 +-
 fastNLP/loader/preprocess.py               | 366 ------------------
 .../CNN-sentence_classification/train.py   |   7 +-
 reproduction/chinese_word_seg/cws_train.py |  22 +-
 test/ner.py                                |  11 +-
 test/readme_example.py                     |  78 ++++
 test/seq_labeling.py                       |  11 +-
 test/test_cws.py                           |   9 +-
 test/test_tester.py                        |   4 +-
 test/text_classify.py                      |  20 +-
 13 files changed, 462 insertions(+), 416 deletions(-)
 create mode 100644 fastNLP/core/preprocess.py
 delete mode 100644 fastNLP/loader/preprocess.py
 create mode 100644 test/readme_example.py

diff --git a/fastNLP/core/predictor.py b/fastNLP/core/predictor.py
index 758f5efb..03a6e43c 100644
--- a/fastNLP/core/predictor.py
+++ b/fastNLP/core/predictor.py
@@ -3,7 +3,7 @@ import torch
 
 from fastNLP.core.action import Batchifier, SequentialSampler
 from fastNLP.core.action import convert_to_torch_tensor
-from fastNLP.loader.preprocess import load_pickle, DEFAULT_UNKNOWN_LABEL
+from fastNLP.core.preprocess import load_pickle, DEFAULT_UNKNOWN_LABEL
 from fastNLP.modules import utils
 
diff --git a/fastNLP/core/preprocess.py b/fastNLP/core/preprocess.py
new file mode 100644
index 00000000..6b81bff1
--- /dev/null
+++ b/fastNLP/core/preprocess.py
@@ -0,0 +1,306 @@
+import _pickle
+import os
+
+import numpy as np
+
+DEFAULT_PADDING_LABEL = '<pad>'  # dict index = 0
+DEFAULT_UNKNOWN_LABEL = '<unk>'  # dict index = 1
+DEFAULT_RESERVED_LABEL = ['<reserved-2>',
+                          '<reserved-3>',
+                          '<reserved-4>']  # dict index = 2~4
+
+DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1,
+                         DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3,
+                         DEFAULT_RESERVED_LABEL[2]: 4}
+
+
+# the first word added to the vocabulary gets index 5
+
+def save_pickle(obj, pickle_path, file_name):
+    with open(os.path.join(pickle_path, file_name), "wb") as f:
+        _pickle.dump(obj, f)
+    print("{} saved. ".format(file_name))
+
+
+def load_pickle(pickle_path, file_name):
+    with open(os.path.join(pickle_path, file_name), "rb") as f:
+        obj = _pickle.load(f)
+    print("{} loaded. ".format(file_name))
+    return obj
+
+
+def pickle_exist(pickle_path, pickle_name):
+    """
+    :param pickle_path: the directory of the target pickle file
+    :param pickle_name: the filename of the target pickle file
+    :return: True if the file exists else False
+    """
+    if not os.path.exists(pickle_path):
+        os.makedirs(pickle_path)
+    file_name = os.path.join(pickle_path, pickle_name)
+    if os.path.exists(file_name):
+        return True
+    else:
+        return False
+
+
+class BasePreprocess(object):
+    def __init__(self):
+        self.word2index = None
+        self.label2index = None
+
+    @property
+    def vocab_size(self):
+        return len(self.word2index)
+
+    @property
+    def num_classes(self):
+        return len(self.label2index)
+
+    def run(self, train_dev_data, test_data=None, pickle_path="./", train_dev_split=0, cross_val=False, n_fold=10):
+        """Main preprocessing pipeline.
+
+        :param train_dev_data: three-level list, with either a single label or multiple labels per sample.
+        :param test_data: three-level list, with either a single label or multiple labels per sample. (optional)
+        :param pickle_path: str, the directory in which the pickle files are saved.
+        :param train_dev_split: float in [0, 1). The ratio of the training data held out as the validation set.
+        :param cross_val: bool, whether to do cross validation.
+        :param n_fold: int, the number of folds for cross validation. Only used when cross_val is True.
+        :return results: a tuple of datasets after preprocessing.
+        """
+        if pickle_exist(pickle_path, "word2id.pkl") and pickle_exist(pickle_path, "class2id.pkl"):
+            self.word2index = load_pickle(pickle_path, "word2id.pkl")
+            self.label2index = load_pickle(pickle_path, "class2id.pkl")
+        else:
+            self.word2index, self.label2index = self.build_dict(train_dev_data)
+            save_pickle(self.word2index, pickle_path, "word2id.pkl")
+            save_pickle(self.label2index, pickle_path, "class2id.pkl")
+
+        if not pickle_exist(pickle_path, "id2word.pkl"):
+            index2word = self.build_reverse_dict(self.word2index)
+            save_pickle(index2word, pickle_path, "id2word.pkl")
+
+        if not pickle_exist(pickle_path, "id2class.pkl"):
+            index2label = self.build_reverse_dict(self.label2index)
+            save_pickle(index2label, pickle_path, "id2class.pkl")
+
+        data_train = []
+        data_dev = []
+        if not cross_val:
+            if not pickle_exist(pickle_path, "data_train.pkl"):
+                data_train.extend(self.to_index(train_dev_data))
+                if train_dev_split > 0 and not pickle_exist(pickle_path, "data_dev.pkl"):
+                    split = int(len(data_train) * train_dev_split)
+                    data_dev = data_train[: split]
+                    data_train = data_train[split:]
+                    save_pickle(data_dev, pickle_path, "data_dev.pkl")
+                    print("{} of the training data is split for validation.".format(train_dev_split))
".format(train_dev_split)) + save_pickle(data_train, pickle_path, "data_train.pkl") + else: + data_train = load_pickle(pickle_path, "data_train.pkl") + else: + # cross_val is True + if not pickle_exist(pickle_path, "data_train_0.pkl"): + # cross validation + data_idx = self.to_index(train_dev_data) + data_cv = self.cv_split(data_idx, n_fold) + for i, (data_train_cv, data_dev_cv) in enumerate(data_cv): + save_pickle( + data_train_cv, pickle_path, + "data_train_{}.pkl".format(i)) + save_pickle( + data_dev_cv, pickle_path, + "data_dev_{}.pkl".format(i)) + data_train.append(data_train_cv) + data_dev.append(data_dev_cv) + print("{}-fold cross validation.".format(n_fold)) + else: + for i in range(n_fold): + data_train_cv = load_pickle(pickle_path, "data_train_{}.pkl".format(i)) + data_dev_cv = load_pickle(pickle_path, "data_dev_{}.pkl".format(i)) + data_train.append(data_train_cv) + data_dev.append(data_dev_cv) + + # prepare test data if provided + data_test = [] + if test_data is not None: + if not pickle_exist(pickle_path, "data_test.pkl"): + data_test = self.to_index(test_data) + save_pickle(data_test, pickle_path, "data_test.pkl") + + # return preprocessed results + results = [data_train] + if cross_val or train_dev_split > 0: + results.append(data_dev) + if test_data: + results.append(data_test) + return tuple(results) + + def build_dict(self, data): + raise NotImplementedError + + def to_index(self, data): + raise NotImplementedError + + def build_reverse_dict(self, word_dict): + id2word = {word_dict[w]: w for w in word_dict} + return id2word + + def data_split(self, data, train_dev_split): + """Split data into train and dev set.""" + split = int(len(data) * train_dev_split) + data_dev = data[: split] + data_train = data[split:] + return data_train, data_dev + + def cv_split(self, data, n_fold): + """Split data for cross validation.""" + data_copy = data.copy() + np.random.shuffle(data_copy) + fold_size = round(len(data_copy) / n_fold) + + data_cv = [] + for i in range(n_fold - 1): + start = i * fold_size + end = (i + 1) * fold_size + data_dev = data_copy[start:end] + data_train = data_copy[:start] + data_copy[end:] + data_cv.append((data_train, data_dev)) + start = (n_fold - 1) * fold_size + data_dev = data_copy[start:] + data_train = data_copy[:start] + data_cv.append((data_train, data_dev)) + + return data_cv + + +class SeqLabelPreprocess(BasePreprocess): + """Preprocess pipeline, including building mapping from words to index, from index to words, + from labels/classes to index, from index to labels/classes. + data of three-level list which have multiple labels in each sample. + [ + [ [word_11, word_12, ...], [label_1, label_1, ...] ], + [ [word_21, word_22, ...], [label_2, label_1, ...] ], + ... + ] + """ + + def __init__(self): + super(SeqLabelPreprocess, self).__init__() + + def build_dict(self, data): + """ + Add new words with indices into self.word_dict, new labels with indices into self.label_dict. + :param data: three-level list + [ + [ [word_11, word_12, ...], [label_1, label_1, ...] ], + [ [word_21, word_22, ...], [label_2, label_1, ...] ], + ... + ] + :return word2index: dict of {str, int} + label2index: dict of {str, int} + """ + # In seq labeling, both word seq and label seq need to be padded to the same length in a mini-batch. 
+        label2index = DEFAULT_WORD_TO_INDEX.copy()
+        word2index = DEFAULT_WORD_TO_INDEX.copy()
+        for example in data:
+            for word, label in zip(example[0], example[1]):
+                if word not in word2index:
+                    word2index[word] = len(word2index)
+                if label not in label2index:
+                    label2index[label] = len(label2index)
+        return word2index, label2index
+
+    def to_index(self, data):
+        """
+        Convert word strings and label strings into indices.
+        :param data: three-level list
+        [
+            [ [word_11, word_12, ...], [label_1, label_1, ...] ],
+            [ [word_21, word_22, ...], [label_2, label_1, ...] ],
+            ...
+        ]
+        :return data_index: the same shape as data, but each string is replaced by its corresponding index
+        """
+        data_index = []
+        for example in data:
+            word_list = []
+            label_list = []
+            for word, label in zip(example[0], example[1]):
+                word_list.append(self.word2index.get(word, DEFAULT_WORD_TO_INDEX[DEFAULT_UNKNOWN_LABEL]))
+                label_list.append(self.label2index.get(label, DEFAULT_WORD_TO_INDEX[DEFAULT_UNKNOWN_LABEL]))
+            data_index.append([word_list, label_list])
+        return data_index
+
+
+class ClassPreprocess(BasePreprocess):
+    """Preprocess pipeline for classification datasets,
+    including building mapping from words to index, from index to words,
+    from labels/classes to index, from index to labels/classes.
+    Designed for data organized as a three-level list, where each sample has a single label:
+    [
+        [ [word_11, word_12, ...], label_1 ],
+        [ [word_21, word_22, ...], label_2 ],
+        ...
+    ]
+    """
+
+    def __init__(self):
+        super(ClassPreprocess, self).__init__()
+
+    def build_dict(self, data):
+        """Build vocabulary."""
+
+        # build vocabulary from scratch if nothing exists
+        word2index = DEFAULT_WORD_TO_INDEX.copy()
+        label2index = DEFAULT_WORD_TO_INDEX.copy()
+
+        # collect every word and label
+        for sent, label in data:
+            if len(sent) <= 1:
+                continue
+
+            if label not in label2index:
+                label2index[label] = len(label2index)
+
+            for word in sent:
+                if word not in word2index:
+                    word2index[word] = len(word2index)
+        return word2index, label2index
+
+    def to_index(self, data):
+        """
+        Convert word strings and label strings into indices.
+        :param data: three-level list
+        [
+            [ [word_11, word_12, ...], label_1 ],
+            [ [word_21, word_22, ...], label_2 ],
+            ...
+        ]
+        :return data_index: the same shape as data, but each string is replaced by its corresponding index
+        """
+        data_index = []
+        for example in data:
+            word_list = []
+            for word, label in zip(example[0]):
+                word_list.append(self.word2index.get(word, DEFAULT_WORD_TO_INDEX[DEFAULT_UNKNOWN_LABEL]))
+            label_index = self.label2index.get(example[1], DEFAULT_WORD_TO_INDEX[DEFAULT_UNKNOWN_LABEL])
+            data_index.append([word_list, label_index])
+        return data_index
+
+
+def infer_preprocess(pickle_path, data):
+    """
+    Preprocess over inference data.
+    Transform a two-level list of strings into a two-level list of indices.
+    [
+        [word_11, word_12, ...],
+        [word_21, word_22, ...],
+        ...
+    ]
+    """
+    word2index = load_pickle(pickle_path, "word2id.pkl")
+    data_index = []
+    for example in data:
+        data_index.append([word2index.get(w, DEFAULT_WORD_TO_INDEX[DEFAULT_UNKNOWN_LABEL]) for w in example])
+    return data_index
diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py
index 313b7dcb..bafc0b82 100644
--- a/fastNLP/core/tester.py
+++ b/fastNLP/core/tester.py
@@ -34,7 +34,7 @@ class BaseTester(object):
         self.eval_history = []
         self.batch_output = []
 
-    def test(self, network):
+    def test(self, network, dev_data):
         if torch.cuda.is_available() and self.use_cuda:
             self.model = network.cuda()
         else:
@@ -45,8 +45,8 @@
         self.eval_history.clear()
         self.batch_output.clear()
 
-        dev_data = self.prepare_input(self.pickle_path)
-        logger.info("validation data loaded")
+        # dev_data = self.prepare_input(self.pickle_path)
+        # logger.info("validation data loaded")
 
         iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True))
         n_batches = len(dev_data) // self.batch_size
diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py
index a9e74e22..4b3e5de1 100644
--- a/fastNLP/core/trainer.py
+++ b/fastNLP/core/trainer.py
@@ -1,4 +1,5 @@
 import _pickle
+import copy
 import os
 import time
 from datetime import timedelta
@@ -52,9 +53,11 @@ class BaseTrainer(object):
         self.loss_func = None
         self.optimizer = None
 
-    def train(self, network):
+    def train(self, network, train_data, dev_data=None):
         """General Training Steps
         :param network: a model
+        :param train_data: three-level list, the training set.
+        :param dev_data: three-level list, the validation data (optional)
 
         The method is framework independent.
         Work by calling the following methods:
@@ -73,8 +76,8 @@
         else:
             self.model = network
 
-        data_train = self.load_train_data(self.pickle_path)
-        logger.info("training data loaded")
+        # train_data = self.load_train_data(self.pickle_path)
+        # logger.info("training data loaded")
 
         # define tester over dev data
         if self.validate:
@@ -88,8 +91,7 @@
         logger.info("optimizer defined as {}".format(str(self.optimizer)))
 
         # main training epochs
-
-        n_samples = len(data_train)
+        n_samples = len(train_data)
         n_batches = n_samples // self.batch_size
         n_print = 1
         start = time.time()
@@ -101,14 +103,14 @@
             # turn on network training mode
             self.mode(network, test=False)
             # prepare mini-batch iterator
-            data_iterator = iter(Batchifier(RandomSampler(data_train), self.batch_size, drop_last=False))
+            data_iterator = iter(Batchifier(RandomSampler(train_data), self.batch_size, drop_last=False))
             logger.info("prepared data iterator")
 
             self._train_step(data_iterator, network, start=start, n_print=n_print, epoch=epoch)
 
             if self.validate:
                 logger.info("validation started")
-                validator.test(network)
+                validator.test(network, dev_data)
 
                 if self.save_best_dev and self.best_eval_result(validator):
                     self.save_model(network)
@@ -139,6 +141,26 @@
                 logger.info(print_output)
             step += 1
 
+    def cross_validate(self, network, train_data_cv, dev_data_cv):
+        """Training with cross validation.
+
+        :param network: the model
+        :param train_data_cv: four-level list, of shape [num_folds, num_examples, 2, ?]
+        :param dev_data_cv: four-level list, of shape [num_folds, num_examples, 2, ?]
+
+        """
+        if len(train_data_cv) != len(dev_data_cv):
+            logger.error("the number of folds in train and dev data does not match: {} != {}".format(
+                len(train_data_cv), len(dev_data_cv)))
+            raise RuntimeError("the number of folds in train and dev data does not match")
+        n_fold = len(train_data_cv)
+        logger.info("performing {}-fold cross validation.".format(n_fold))
+        for i in range(n_fold):
+            print("CV:", i)
+            logger.info("running fold {} of {} in cross validation".format(i + 1, n_fold))
+            network_copy = copy.deepcopy(network)
+            self.train(network_copy, train_data_cv[i], dev_data_cv[i])
+
     def load_train_data(self, pickle_path):
         """
         For task-specific processing.
diff --git a/fastNLP/loader/preprocess.py b/fastNLP/loader/preprocess.py
deleted file mode 100644
index 2c972ddd..00000000
--- a/fastNLP/loader/preprocess.py
+++ /dev/null
@@ -1,366 +0,0 @@
-import _pickle
-import os
-
-DEFAULT_PADDING_LABEL = '<pad>'  # dict index = 0
-DEFAULT_UNKNOWN_LABEL = '<unk>'  # dict index = 1
-DEFAULT_RESERVED_LABEL = ['<reserved-2>',
-                          '<reserved-3>',
-                          '<reserved-4>']  # dict index = 2~4
-
-DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1,
-                         DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3,
-                         DEFAULT_RESERVED_LABEL[2]: 4}
-
-
-# the first vocab in dict with the index = 5
-
-def save_pickle(obj, pickle_path, file_name):
-    with open(os.path.join(pickle_path, file_name), "wb") as f:
-        _pickle.dump(obj, f)
-    print("{} saved. ".format(file_name))
-
-
-def load_pickle(pickle_path, file_name):
-    with open(os.path.join(pickle_path, file_name), "rb") as f:
-        obj = _pickle.load(f)
-    print("{} loaded. ".format(file_name))
-    return obj
-
-
-def pickle_exist(pickle_path, pickle_name):
-    """
-    :param pickle_path: the directory of target pickle file
-    :param pickle_name: the filename of target pickle file
-    :return: True if file exists else False
-    """
-    if not os.path.exists(pickle_path):
-        os.makedirs(pickle_path)
-    file_name = os.path.join(pickle_path, pickle_name)
-    if os.path.exists(file_name):
-        return True
-    else:
-        return False
-
-
-class BasePreprocess(object):
-
-    def __init__(self, data, pickle_path):
-        super(BasePreprocess, self).__init__()
-        # self.data = data
-        self.pickle_path = pickle_path
-        if not self.pickle_path.endswith('/'):
-            self.pickle_path = self.pickle_path + '/'
-
-
-class POSPreprocess(BasePreprocess):
-    """
-    This class is used to preprocess POS tagging datasets.
-
-    """
-
-    def __init__(self, data, pickle_path="./", train_dev_split=0):
-        """
-        Preprocess pipeline, including building mapping from words to index, from index to words,
-        from labels/classes to index, from index to labels/classes.
-        :param data: three-level list
-        [
-            [ [word_11, word_12, ...], [label_1, label_1, ...] ],
-            [ [word_21, word_22, ...], [label_2, label_1, ...] ],
-            ...
-        ]
-        :param pickle_path: str, the directory to the pickle files. Default: "./"
-        :param train_dev_split: float in [0, 1]. The ratio of dev data split from training data. Default: 0.
-
-        """
-        super(POSPreprocess, self).__init__(data, pickle_path)
-
-        self.pickle_path = pickle_path
-
-        if pickle_exist(pickle_path, "word2id.pkl") and pickle_exist(pickle_path, "class2id.pkl"):
-            self.word2index = load_pickle(self.pickle_path, "word2id.pkl")
-            self.label2index = load_pickle(self.pickle_path, "class2id.pkl")
-        else:
-            self.word2index, self.label2index = self.build_dict(data)
-            save_pickle(self.word2index, self.pickle_path, "word2id.pkl")
-            save_pickle(self.label2index, self.pickle_path, "class2id.pkl")
-
-        if not pickle_exist(pickle_path, "id2word.pkl"):
-            index2word = self.build_reverse_dict(self.word2index)
-            save_pickle(index2word, self.pickle_path, "id2word.pkl")
-
-        if not pickle_exist(pickle_path, "id2class.pkl"):
-            index2label = self.build_reverse_dict(self.label2index)
-            save_pickle(index2label, self.pickle_path, "id2class.pkl")
-
-        if not pickle_exist(pickle_path, "data_train.pkl"):
-            data_train = self.to_index(data)
-            if train_dev_split > 0 and not pickle_exist(pickle_path, "data_dev.pkl"):
-                split = int(len(data_train) * train_dev_split)
-                data_dev = data_train[: split]
-                data_train = data_train[split:]
-                save_pickle(data_dev, self.pickle_path, "data_dev.pkl")
-                print("{} of the training data is split for validation. ".format(train_dev_split))
-            save_pickle(data_train, self.pickle_path, "data_train.pkl")
-
-    def build_dict(self, data):
-        """
-        Add new words with indices into self.word_dict, new labels with indices into self.label_dict.
-        :param data: three-level list
-        [
-            [ [word_11, word_12, ...], [label_1, label_1, ...] ],
-            [ [word_21, word_22, ...], [label_2, label_1, ...] ],
-            ...
-        ]
-        :return word2index: dict of {str, int}
-                label2index: dict of {str, int}
-        """
-        # In seq labeling, both word seq and label seq need to be padded to the same length in a mini-batch.
-        label2index = DEFAULT_WORD_TO_INDEX.copy()
-        word2index = DEFAULT_WORD_TO_INDEX.copy()
-        for example in data:
-            for word, label in zip(example[0], example[1]):
-                if word not in word2index:
-                    word2index[word] = len(word2index)
-                if label not in label2index:
-                    label2index[label] = len(label2index)
-        return word2index, label2index
-
-    def build_reverse_dict(self, word_dict):
-        id2word = {word_dict[w]: w for w in word_dict}
-        return id2word
-
-    def to_index(self, data):
-        """
-        Convert word strings and label strings into indices.
-        :param data: three-level list
-        [
-            [ [word_11, word_12, ...], [label_1, label_1, ...] ],
-            [ [word_21, word_22, ...], [label_2, label_1, ...] ],
-            ...
-        ]
-        :return data_index: the shape of data, but each string is replaced by its corresponding index
-        """
-        data_index = []
-        for example in data:
-            word_list = []
-            label_list = []
-            for word, label in zip(example[0], example[1]):
-                word_list.append(self.word2index[word])
-                label_list.append(self.label2index[label])
-            data_index.append([word_list, label_list])
-        return data_index
-
-    @property
-    def vocab_size(self):
-        return len(self.word2index)
-
-    @property
-    def num_classes(self):
-        return len(self.label2index)
-
-
-class ClassPreprocess(BasePreprocess):
-    """
-    Pre-process the classification datasets.
-
-    Params:
-        pickle_path - directory to save result of pre-processing
-    Saves:
-        word2id.pkl
-        id2word.pkl
-        class2id.pkl
-        id2class.pkl
-        embedding.pkl
-        data_train.pkl
-        data_dev.pkl
-        data_test.pkl
-    """
-
-    def __init__(self, pickle_path):
-        # super(ClassPreprocess, self).__init__(data, pickle_path)
-        self.word_dict = None
-        self.label_dict = None
-        self.pickle_path = pickle_path  # save directory
-
-    def process(self, data, save_name):
-        """
-        Process data.
-
-        Params:
-            data - nested list, data = [sample1, sample2, ...],
-                sample = [sentence, label], sentence = [word1, word2, ...]
-            save_name - name of processed data, such as data_train.pkl
-        Returns:
-            vocab_size - vocabulary size
-            n_classes - number of classes
-        """
-        self.build_dict(data)
-        self.word2id()
-        vocab_size = self.id2word()
-        self.class2id()
-        num_classes = self.id2class()
-        self.embedding()
-        self.data_generate(data, save_name)
-
-        return vocab_size, num_classes
-
-    def build_dict(self, data):
-        """Build vocabulary."""
-
-        # just read if word2id.pkl and class2id.pkl exists
-        if self.pickle_exist("word2id.pkl") and \
-                self.pickle_exist("class2id.pkl"):
-            file_name = os.path.join(self.pickle_path, "word2id.pkl")
-            with open(file_name, 'rb') as f:
-                self.word_dict = _pickle.load(f)
-            file_name = os.path.join(self.pickle_path, "class2id.pkl")
-            with open(file_name, 'rb') as f:
-                self.label_dict = _pickle.load(f)
-            return
-
-        # build vocabulary from scratch if nothing exists
-        self.word_dict = {
-            DEFAULT_PADDING_LABEL: 0,
-            DEFAULT_UNKNOWN_LABEL: 1,
-            DEFAULT_RESERVED_LABEL[0]: 2,
-            DEFAULT_RESERVED_LABEL[1]: 3,
-            DEFAULT_RESERVED_LABEL[2]: 4}
-        self.label_dict = {}
-
-        # collect every word and label
-        for sent, label in data:
-            if len(sent) <= 1:
-                continue
-
-            if label not in self.label_dict:
-                index = len(self.label_dict)
-                self.label_dict[label] = index
-
-            for word in sent:
-                if word not in self.word_dict:
-                    index = len(self.word_dict)
-                    self.word_dict[word[0]] = index
-
-    def pickle_exist(self, pickle_name):
-        """
-        Check whether a pickle file exists.
-
-        Params
-            pickle_name: the filename of target pickle file
-        Return
-            True if file exists else False
-        """
-        if not os.path.exists(self.pickle_path):
-            os.makedirs(self.pickle_path)
-        file_name = os.path.join(self.pickle_path, pickle_name)
-        if os.path.exists(file_name):
-            return True
-        else:
-            return False
-
-    def word2id(self):
-        """Save vocabulary of {word:id} mapping format."""
-        # nothing will be done if word2id.pkl exists
-        if self.pickle_exist("word2id.pkl"):
-            return
-
-        file_name = os.path.join(self.pickle_path, "word2id.pkl")
-        with open(file_name, "wb") as f:
-            _pickle.dump(self.word_dict, f)
-
-    def id2word(self):
-        """Save vocabulary of {id:word} mapping format."""
-        # nothing will be done if id2word.pkl exists
-        if self.pickle_exist("id2word.pkl"):
-            file_name = os.path.join(self.pickle_path, "id2word.pkl")
-            with open(file_name, 'rb') as f:
-                id2word_dict = _pickle.load(f)
-            return len(id2word_dict)
-
-        id2word_dict = {self.word_dict[w]: w for w in self.word_dict}
-        file_name = os.path.join(self.pickle_path, "id2word.pkl")
-        with open(file_name, "wb") as f:
-            _pickle.dump(id2word_dict, f)
-        return len(id2word_dict)
-
-    def class2id(self):
-        """Save mapping of {class:id}."""
-        # nothing will be done if class2id.pkl exists
-        if self.pickle_exist("class2id.pkl"):
-            return
-
-        file_name = os.path.join(self.pickle_path, "class2id.pkl")
-        with open(file_name, "wb") as f:
-            _pickle.dump(self.label_dict, f)
-
-    def id2class(self):
-        """Save mapping of {id:class}."""
-        # nothing will be done if id2class.pkl exists
-        if self.pickle_exist("id2class.pkl"):
-            file_name = os.path.join(self.pickle_path, "id2class.pkl")
-            with open(file_name, "rb") as f:
-                id2class_dict = _pickle.load(f)
-            return len(id2class_dict)
-
-        id2class_dict = {self.label_dict[c]: c for c in self.label_dict}
-        file_name = os.path.join(self.pickle_path, "id2class.pkl")
-        with open(file_name, "wb") as f:
-            _pickle.dump(id2class_dict, f)
-        return len(id2class_dict)
-
-    def embedding(self):
-        """Save embedding lookup table corresponding to vocabulary."""
-        # nothing will be done if embedding.pkl exists
-        if self.pickle_exist("embedding.pkl"):
-            return
-
-        # retrieve vocabulary from pre-trained embedding (not implemented)
-
-    def data_generate(self, data_src, save_name):
-        """Convert dataset from text to digit."""
-
-        # nothing will be done if file exists
-        save_path = os.path.join(self.pickle_path, save_name)
-        if os.path.exists(save_path):
-            return
-
-        data = []
-        # for every sample
-        for sent, label in data_src:
-            if len(sent) <= 1:
-                continue
-
-            label_id = self.label_dict[label]  # label id
-            sent_id = []  # sentence ids
-            for word in sent:
-                if word in self.word_dict:
-                    sent_id.append(self.word_dict[word])
-                else:
-                    sent_id.append(self.word_dict[DEFAULT_UNKNOWN_LABEL])
-            data.append([sent_id, label_id])
-
-        # save data
-        with open(save_path, "wb") as f:
-            _pickle.dump(data, f)
-
-
-class LMPreprocess(BasePreprocess):
-    def __init__(self, data, pickle_path):
-        super(LMPreprocess, self).__init__(data, pickle_path)
-
-
-def infer_preprocess(pickle_path, data):
-    """
-    Preprocess over inference data.
-    Transform three-level list of strings into that of index.
-    [
-        [word_11, word_12, ...],
-        [word_21, word_22, ...],
-        ...
-    ]
-    """
-    word2index = load_pickle(pickle_path, "word2id.pkl")
-    data_index = []
-    for example in data:
-        data_index.append([word2index.get(w, DEFAULT_UNKNOWN_LABEL) for w in example])
-    return data_index
diff --git a/reproduction/CNN-sentence_classification/train.py b/reproduction/CNN-sentence_classification/train.py
index d22f054e..6e35ee5e 100644
--- a/reproduction/CNN-sentence_classification/train.py
+++ b/reproduction/CNN-sentence_classification/train.py
@@ -1,13 +1,12 @@
 import os
-import
-import
 import torch
 import torch.nn as nn
-.dataset as dst
-from .model import CNN_text
 from torch.autograd import Variable
 
+from . import dataset as dst
+from .model import CNN_text
+
 # Hyper Parameters
 batch_size = 50
 learning_rate = 0.0001
diff --git a/reproduction/chinese_word_seg/cws_train.py b/reproduction/chinese_word_seg/cws_train.py
index afb0ec7e..b63a9401 100644
--- a/reproduction/chinese_word_seg/cws_train.py
+++ b/reproduction/chinese_word_seg/cws_train.py
@@ -5,7 +5,7 @@ sys.path.append("..")
 from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
 from fastNLP.core.trainer import SeqLabelTrainer
 from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader
-from fastNLP.loader.preprocess import POSPreprocess, load_pickle
+from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
 from fastNLP.saver.model_saver import ModelSaver
 from fastNLP.loader.model_loader import ModelLoader
 from fastNLP.core.tester import SeqLabelTester
@@ -48,7 +48,7 @@ def infer():
     print("Inference finished!")
 
 
-def train():
+def train_test():
     # Config Loader
     train_args = ConfigSection()
     test_args = ConfigSection()
@@ -59,9 +59,10 @@
     train_data = loader.load_pku()
 
     # Preprocessor
-    p = POSPreprocess(train_data, pickle_path, train_dev_split=0.3)
-    train_args["vocab_size"] = p.vocab_size
-    train_args["num_classes"] = p.num_classes
+    preprocess = SeqLabelPreprocess()
+    data_train, data_dev = preprocess.run(train_data, pickle_path=pickle_path, train_dev_split=0.3)
+    train_args["vocab_size"] = preprocess.vocab_size
+    train_args["num_classes"] = preprocess.num_classes
 
     # Trainer
     trainer = SeqLabelTrainer(train_args)
@@ -70,7 +71,7 @@
     model = SeqLabeling(train_args)
 
     # Start training
-    trainer.train(model)
+    trainer.train(model, data_train, data_dev)
     print("Training finished!")
 
     # Saver
@@ -78,8 +79,11 @@
     saver.save_pytorch(model)
     print("Model saved!")
 
+    # testing with validation set
+    test(data_dev)
 
-def test():
+
+def test(test_data):
     # Config Loader
     train_args = ConfigSection()
     ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
@@ -99,7 +103,7 @@
     tester = SeqLabelTester(test_args)
 
     # Start testing
-    tester.test(model)
+    tester.test(model, test_data)
 
     # print test results
     print(tester.show_matrices())
@@ -107,4 +111,4 @@
 
 
 if __name__ == "__main__":
-    train()
+    train_test()
diff --git a/test/ner.py b/test/ner.py
index beaac1d6..accf92c2 100644
--- a/test/ner.py
+++ b/test/ner.py
@@ -4,9 +4,9 @@ import os
 import numpy as np
 import torch
 
+from fastNLP.core.preprocess import SeqLabelPreprocess
 from fastNLP.core.tester import SeqLabelTester
 from fastNLP.core.trainer import SeqLabelTrainer
-from fastNLP.loader.preprocess import POSPreprocess
 from fastNLP.models.sequence_modeling import AdvSeqLabel
 
 
@@ -114,7 +114,8 @@ emb_path = "data_for_tests/emb50.txt"
 save_path = "data_for_tests/"
 if __name__ == "__main__":
     data = data_load(data_path)
-    p = POSPreprocess(data, pickle_path=pick_path, train_dev_split=0.3)
+    preprocess = SeqLabelPreprocess()
+    data_train, data_dev = preprocess.run(data, pickle_path=pick_path, train_dev_split=0.3)
     # emb = embedding_process(emb_path, p.word2index, 50, os.path.join(pick_path, "embedding.pkl"))
     emb = None
     args = {"epochs": 20,
@@ -125,13 +126,13 @@
             "model_saved_path": save_path,
             "use_cuda": True,
 
-            "vocab_size": p.vocab_size,
-            "num_classes": p.num_classes,
+            "vocab_size": preprocess.vocab_size,
+            "num_classes": preprocess.num_classes,
             "word_emb_dim": 50,
             "rnn_hidden_units": 100
             }
     # emb = torch.Tensor(emb).float().cuda()
     networks = AdvSeqLabel(args, emb)
     trainer = MyNERTrainer(args)
-    trainer.train(network=networks)
+    trainer.train(networks, data_train, data_dev)
     print("Training finished!")
diff --git a/test/readme_example.py b/test/readme_example.py
new file mode 100644
index 00000000..03cae2e6
--- /dev/null
+++ b/test/readme_example.py
@@ -0,0 +1,78 @@
+# python: 3.5
+# pytorch: 0.4
+
+################
+# Test cross validation.
+################
+
+from fastNLP.core.preprocess import ClassPreprocess
+
+from fastNLP.core.predictor import ClassificationInfer
+from fastNLP.core.trainer import ClassificationTrainer
+from fastNLP.loader.dataset_loader import ClassDatasetLoader
+from fastNLP.models.base_model import BaseModel
+from fastNLP.modules import aggregation
+from fastNLP.modules import encoder
+
+
+class ClassificationModel(BaseModel):
+    """
+    Simple text classification model based on CNN.
+    """
+
+    def __init__(self, class_num, vocab_size):
+        super(ClassificationModel, self).__init__()
+
+        self.embed = encoder.Embedding(nums=vocab_size, dims=300)
+        self.conv = encoder.Conv(
+            in_channels=300, out_channels=100, kernel_size=3)
+        self.pool = aggregation.MaxPool()
+        self.output = encoder.Linear(input_size=100, output_size=class_num)
+
+    def forward(self, x):
+        x = self.embed(x)  # [N,L] -> [N,L,C]
+        x = self.conv(x)  # [N,L,C_in] -> [N,L,C_out]
+        x = self.pool(x)  # [N,L,C] -> [N,C]
+        x = self.output(x)  # [N,C] -> [N,N_class]
+        return x
+
+
+data_dir = 'data'  # directory to save data and model
+train_path = 'test/data_for_tests/text_classify.txt'  # training set file
+
+# load dataset
+ds_loader = ClassDatasetLoader("train", train_path)
+data = ds_loader.load()
+
+# pre-process dataset
+pre = ClassPreprocess()
+data_train_cv, data_dev_cv = pre.run(data, pickle_path=data_dir, cross_val=True, n_fold=5)
+n_classes = pre.num_classes
+vocab_size = pre.vocab_size
+
+# construct model
+model_args = {
+    'num_classes': n_classes,
+    'vocab_size': vocab_size
+}
+model = ClassificationModel(class_num=n_classes, vocab_size=vocab_size)
+
+# train model
+train_args = {
+    "epochs": 10,
+    "batch_size": 50,
+    "pickle_path": data_dir,
+    "validate": False,
+    "save_best_dev": False,
+    "model_saved_path": None,
+    "use_cuda": True,
+    "learn_rate": 1e-3,
+    "momentum": 0.9}
+trainer = ClassificationTrainer(train_args)
+trainer.cross_validate(model, data_train_cv, data_dev_cv)
+
+# predict using model
+data_infer = [x[0] for x in data]
+infer = ClassificationInfer(data_dir)
+labels_pred = infer.predict(model, data_infer)
diff --git a/test/seq_labeling.py b/test/seq_labeling.py
index 79f542fb..fe67b79c 100644
--- a/test/seq_labeling.py
+++ b/test/seq_labeling.py
@@ -5,7 +5,7 @@ sys.path.append("..")
 from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
 from fastNLP.core.trainer import SeqLabelTrainer
 from fastNLP.loader.dataset_loader import POSDatasetLoader, BaseLoader
-from fastNLP.loader.preprocess import POSPreprocess, load_pickle
+from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
 from fastNLP.saver.model_saver import ModelSaver
 from fastNLP.loader.model_loader import ModelLoader
 from fastNLP.core.tester import SeqLabelTester
@@ -68,7 +68,8 @@ def train_and_test():
     train_data = pos_loader.load_lines()
 
     # Preprocessor
-    p = POSPreprocess(train_data, pickle_path, train_dev_split=0.5)
+    p = SeqLabelPreprocess()
+    data_train, data_dev = p.run(train_data, pickle_path, train_dev_split=0.5)
     train_args["vocab_size"] = p.vocab_size
     train_args["num_classes"] = p.num_classes
@@ -79,7 +80,7 @@
     model = SeqLabeling(train_args)
 
     # Start training
-    trainer.train(model)
+    trainer.train(model, data_train, data_dev)
     print("Training finished!")
 
     # Saver
@@ -103,8 +104,8 @@
     # Tester
     tester = SeqLabelTester(test_args)
 
-    # Start testing
-    tester.test(model)
+    # Start testing with validation data
+    tester.test(model, data_dev)
 
     # print test results
     print(tester.show_matrices())
diff --git a/test/test_cws.py b/test/test_cws.py
index 74451e24..bbbef67f 100644
--- a/test/test_cws.py
+++ b/test/test_cws.py
@@ -5,7 +5,7 @@ sys.path.append("..")
 from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
 from fastNLP.core.trainer import SeqLabelTrainer
 from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader
-from fastNLP.loader.preprocess import POSPreprocess, load_pickle
+from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
 from fastNLP.saver.model_saver import ModelSaver
 from fastNLP.loader.model_loader import ModelLoader
 from fastNLP.core.tester import SeqLabelTester
@@ -68,7 +68,8 @@ def train_test():
     train_data = loader.load_pku()
 
     # Preprocessor
-    p = POSPreprocess(train_data, pickle_path)
+    p = SeqLabelPreprocess()
+    data_train = p.run(train_data, pickle_path=pickle_path)
     train_args["vocab_size"] = p.vocab_size
     train_args["num_classes"] = p.num_classes
@@ -79,7 +80,7 @@
     model = SeqLabeling(train_args)
 
     # Start training
-    trainer.train(model)
+    trainer.train(model, data_train)
     print("Training finished!")
 
     # Saver
@@ -104,7 +105,7 @@
     tester = SeqLabelTester(test_args)
 
     # Start testing
-    tester.test(model)
+    tester.test(model, data_train)
 
     # print test results
     print(tester.show_matrices())
diff --git a/test/test_tester.py b/test/test_tester.py
index 9a3d949e..1c2658ef 100644
--- a/test/test_tester.py
+++ b/test/test_tester.py
@@ -1,7 +1,7 @@
+from fastNLP.core.preprocess import SeqLabelPreprocess
 from fastNLP.core.tester import SeqLabelTester
 from fastNLP.loader.config_loader import ConfigSection, ConfigLoader
 from fastNLP.loader.dataset_loader import TokenizeDatasetLoader
-from fastNLP.loader.preprocess import POSPreprocess
 from fastNLP.models.sequence_modeling import SeqLabeling
 
 data_name = "pku_training.utf8"
@@ -17,7 +17,7 @@ def foo():
     ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
 
     # Preprocessor
-    p = POSPreprocess(train_data, pickle_path)
+    p = SeqLabelPreprocess()
+    p.run(train_data, pickle_path=pickle_path)
     train_args["vocab_size"] = p.vocab_size
     train_args["num_classes"] = p.num_classes
diff --git a/test/text_classify.py b/test/text_classify.py
index f8353f27..d6a77781 100644
--- a/test/text_classify.py
+++ b/test/text_classify.py
@@ -10,7 +10,7 @@
 from fastNLP.core.trainer import ClassificationTrainer
 from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
 from fastNLP.loader.dataset_loader import ClassDatasetLoader
 from fastNLP.loader.model_loader import ModelLoader
-from fastNLP.loader.preprocess import ClassPreprocess
+from fastNLP.core.preprocess import ClassPreprocess
 from fastNLP.models.cnn_text_classification import CNNText
 from fastNLP.saver.model_saver import ModelSaver
 
@@ -59,28 +59,28 @@ def train():
     print(data[0])
 
     # pre-process data
-    pre = ClassPreprocess(data_dir)
-    vocab_size, n_classes = pre.process(data, "data_train.pkl")
-    print("vocabulary size:", vocab_size)
-    print("number of classes:", n_classes)
+    pre = ClassPreprocess()
+    data_train = pre.run(data, pickle_path=data_dir)
+    print("vocabulary size:", pre.vocab_size)
+    print("number of classes:", pre.num_classes)
 
     # construct model
     print("Building model...")
-    cnn = CNNText(model_args)
+    model = CNNText(model_args)
 
     # train
     print("Training...")
     trainer = ClassificationTrainer(train_args)
-    trainer.train(cnn)
+    trainer.train(model, data_train)
     print("Training finished!")
 
     saver = ModelSaver("./data_for_tests/saved_model.pkl")
-    saver.save_pytorch(cnn)
+    saver.save_pytorch(model)
     print("Model saved!")
 
 
 if __name__ == "__main__":
-    # train()
-    infer()
+    train()
+    # infer()

From fac830e1cd4bdad4fa7146e63efb97cfdeaeec1a Mon Sep 17 00:00:00 2001
From: FengZiYjun
Date: Mon, 20 Aug 2018 19:25:19 +0800
Subject: [PATCH 2/2] fix bugs and clean up

---
 fastNLP/core/preprocess.py |  8 ++++++--
 test/data_for_tests/config |  4 ++--
 test/seq_labeling.py       | 19 +++++--------------
 test/text_classify.py      |  7 ++++---
 4 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/fastNLP/core/preprocess.py b/fastNLP/core/preprocess.py
index 6b81bff1..dfaf3e94 100644
--- a/fastNLP/core/preprocess.py
+++ b/fastNLP/core/preprocess.py
@@ -134,7 +134,10 @@ class BasePreprocess(object):
             results.append(data_dev)
         if test_data:
             results.append(data_test)
-        return tuple(results)
+        if len(results) == 1:
+            return results[0]
+        else:
+            return tuple(results)
 
     def build_dict(self, data):
         raise NotImplementedError
@@ -282,7 +285,8 @@ class ClassPreprocess(BasePreprocess):
         data_index = []
         for example in data:
             word_list = []
-            for word, label in zip(example[0]):
+            # example[0] is the word list, example[1] is the single label
+            for word in example[0]:
                 word_list.append(self.word2index.get(word, DEFAULT_WORD_TO_INDEX[DEFAULT_UNKNOWN_LABEL]))
             label_index = self.label2index.get(example[1], DEFAULT_WORD_TO_INDEX[DEFAULT_UNKNOWN_LABEL])
             data_index.append([word_list, label_index])
diff --git a/test/data_for_tests/config b/test/data_for_tests/config
index 60a7c9a5..2ffdcf3b 100644
--- a/test/data_for_tests/config
+++ b/test/data_for_tests/config
@@ -95,10 +95,10 @@ num_classes = 27
 [text_class]
 epochs = 1
 batch_size = 10
-pickle_path = "./data_for_tests/"
+pickle_path = "./save_path/"
 validate = false
 save_best_dev = false
-model_saved_path = "./data_for_tests/"
+model_saved_path = "./save_path/"
 use_cuda = true
 learn_rate = 1e-3
 momentum = 0.9
diff --git a/test/seq_labeling.py b/test/seq_labeling.py
index fe67b79c..b4007092 100644
--- a/test/seq_labeling.py
+++ b/test/seq_labeling.py
@@ -14,7 +14,7 @@ from fastNLP.core.predictor import SeqLabelInfer
 
 data_name = "people.txt"
 data_path = "data_for_tests/people.txt"
-pickle_path = "data_for_tests"
+pickle_path = "seq_label/"
 data_infer_path = "data_for_tests/people_infer.txt"
 
@@ -33,21 +33,12 @@ def infer():
     model = SeqLabeling(test_args)
 
     # Dump trained parameters into the model
-    ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
+    ModelLoader.load_pytorch(model, pickle_path + "saved_model.pkl")
     print("model loaded!")
 
     # Data Loader
     raw_data_loader = BaseLoader(data_name, data_infer_path)
     infer_data = raw_data_loader.load_lines()
-    """
-        Transform strings into list of list of strings.
-        [
-            [word_11, word_12, ...],
-            [word_21, word_22, ...],
-            ...
-        ]
-        In this case, each line in "people_infer.txt" is already a sentence. So load_lines() just splits them.
-    """
 
     # Inference interface
     infer = SeqLabelInfer(pickle_path)
@@ -69,7 +60,7 @@
 
     # Preprocessor
     p = SeqLabelPreprocess()
-    data_train, data_dev = p.run(train_data, pickle_path, train_dev_split=0.5)
+    data_train, data_dev = p.run(train_data, pickle_path=pickle_path, train_dev_split=0.5)
     train_args["vocab_size"] = p.vocab_size
     train_args["num_classes"] = p.num_classes
@@ -84,7 +75,7 @@
     print("Training finished!")
 
     # Saver
-    saver = ModelSaver("./data_for_tests/saved_model.pkl")
+    saver = ModelSaver(pickle_path + "saved_model.pkl")
     saver.save_pytorch(model)
     print("Model saved!")
@@ -94,7 +85,7 @@
     model = SeqLabeling(train_args)
 
     # Dump trained parameters into the model
-    ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
+    ModelLoader.load_pytorch(model, pickle_path + "saved_model.pkl")
     print("model loaded!")
 
     # Load test configuration
diff --git a/test/text_classify.py b/test/text_classify.py
index d6a77781..c452e86c 100644
--- a/test/text_classify.py
+++ b/test/text_classify.py
@@ -14,6 +14,7 @@
 from fastNLP.core.preprocess import ClassPreprocess
 from fastNLP.models.cnn_text_classification import CNNText
 from fastNLP.saver.model_saver import ModelSaver
 
+save_path = "./test_classification/"
 data_dir = "./data_for_tests/"
 train_file = 'text_classify.txt'
 model_name = "model_class.pkl"
@@ -27,8 +28,9 @@ def infer():
     unlabeled_data = [x[0] for x in data]
 
     # pre-process data
-    pre = ClassPreprocess(data_dir)
-    vocab_size, n_classes = pre.process(data, "data_train.pkl")
+    pre = ClassPreprocess()
+    pre.run(data, pickle_path=save_path)
+    vocab_size, n_classes = pre.vocab_size, pre.num_classes
     print("vocabulary size:", vocab_size)
     print("number of classes:", n_classes)
@@ -60,7 +61,7 @@ def train():
 
     # pre-process data
     pre = ClassPreprocess()
-    data_train = pre.run(data, pickle_path=data_dir)
+    data_train = pre.run(data, pickle_path=save_path)
     print("vocabulary size:", pre.vocab_size)
     print("number of classes:", pre.num_classes)
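
A minimal usage sketch of the preprocessing interface introduced by these two
patches. This is illustrative code, not taken from the repository: it assumes
fastNLP at this commit is importable, and the toy samples and pickle
directories ("./toy_pickle/", "./toy_cv_pickle/") are invented for the
example.

    from fastNLP.core.preprocess import SeqLabelPreprocess

    # Three-level list: each sample is [word sequence, label sequence].
    train_data = [
        [["the", "cat", "sat"], ["DET", "NOUN", "VERB"]],
        [["dogs", "bark"], ["NOUN", "VERB"]],
        [["birds", "fly", "high"], ["NOUN", "VERB", "ADV"]],
        [["fish", "swim"], ["NOUN", "VERB"]],
    ]

    p = SeqLabelPreprocess()

    # Hold-out split: run() returns (data_train, data_dev) because
    # train_dev_split > 0. Note that pickle_path must be passed as a keyword
    # argument: the second positional parameter of run() is test_data, which
    # is exactly the call-site mistake PATCH 2/2 fixes in test/seq_labeling.py.
    data_train, data_dev = p.run(train_data, pickle_path="./toy_pickle/",
                                 train_dev_split=0.25)
    print(p.vocab_size, p.num_classes)

    # Cross validation: run() returns a list of training folds and a list of
    # validation folds, so data_train_cv[i] / data_dev_cv[i] form fold i.
    data_train_cv, data_dev_cv = p.run(train_data, pickle_path="./toy_cv_pickle/",
                                       cross_val=True, n_fold=2)

Because run() caches every artifact under pickle_path ("word2id.pkl",
"data_train.pkl", and so on), a second call pointed at the same directory
reloads the cached dictionaries and datasets instead of rebuilding them.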
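Continuing the sketch, the reworked Trainer consumes these datasets directly
instead of loading pickles itself. The argument keys below mirror the dicts
used in test/ner.py and test/readme_example.py above; whether SeqLabeling
consumes exactly this set of keys is an assumption of the sketch.

    from fastNLP.core.trainer import SeqLabelTrainer
    from fastNLP.models.sequence_modeling import SeqLabeling

    train_args = {"epochs": 5, "batch_size": 2, "pickle_path": "./toy_cv_pickle/",
                  "validate": True, "save_best_dev": False,
                  "model_saved_path": None, "use_cuda": False,
                  "vocab_size": p.vocab_size, "num_classes": p.num_classes,
                  "word_emb_dim": 50, "rnn_hidden_units": 100}
    trainer = SeqLabelTrainer(train_args)

    # Hold-out training: the train and dev sets are passed in explicitly.
    trainer.train(SeqLabeling(train_args), data_train, data_dev)

    # Cross validation: cross_validate() deep-copies the network once per
    # fold, so every fold starts from the same initial weights, and fold i
    # is validated on data_dev_cv[i].
    trainer.cross_validate(SeqLabeling(train_args), data_train_cv, data_dev_cv)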