From c83008add9068a9afbbb931a2579055692b2ef58 Mon Sep 17 00:00:00 2001
From: FengZiYjun <writerphone@163.com>
Date: Thu, 26 Jul 2018 22:19:30 +0800
Subject: [PATCH] fastnlp.py works, see test/test_fastNLP.py for high-level API

---
 fastNLP/action/inference.py                   |  10 +-
 fastNLP/{fastNLP.py => fastnlp.py}            |  93 ++++++++++++--
 fastNLP/loader/config_loader.py               |   6 +-
 fastNLP/loader/dataset_loader.py              |  51 ++++++++
 test/data_for_tests/config                    |  13 +-
 test/test_cws.py                              | 115 ++++++++++++++++++
 test/test_fastNLP.py                          |  14 +++
 test/test_keras_like.py                       |  28 -----
 ...t_POS_pipeline.py => test_seq_labeling.py} |  11 +-
 9 files changed, 289 insertions(+), 52 deletions(-)
 rename fastNLP/{fastNLP.py => fastnlp.py} (51%)
 create mode 100644 test/test_cws.py
 create mode 100644 test/test_fastNLP.py
 delete mode 100644 test/test_keras_like.py
 rename test/{test_POS_pipeline.py => test_seq_labeling.py} (90%)

diff --git a/fastNLP/action/inference.py b/fastNLP/action/inference.py
index c0692f28..ce6a8b62 100644
--- a/fastNLP/action/inference.py
+++ b/fastNLP/action/inference.py
@@ -38,7 +38,7 @@ class Inference(object):
         num_iter = len(data) // self.batch_size
 
         for step in range(num_iter):
-            batch_x = self.batchify(data)
+            batch_x = self.make_batch(data)
 
             prediction = self.data_forward(network, batch_x)
 
@@ -68,10 +68,11 @@ class Inference(object):
         results = torch.Tensor(prediction).view(-1, )
         return list(results.data)
 
-    def batchify(self, data):
+    def make_batch(self, data):
         indices = next(self.iterator)
         batch_x = [data[idx] for idx in indices]
-        batch_x = self.pad(batch_x)
+        if self.batch_size > 1:
+            batch_x = self.pad(batch_x)
         return batch_x
 
     @staticmethod
@@ -98,6 +99,7 @@ class Inference(object):
             ...
         ]
         """
+        assert isinstance(data, list)
         data_index = []
         default_unknown_index = self.word2index[DEFAULT_UNKNOWN_LABEL]
         for example in data:
@@ -107,7 +109,7 @@ class Inference(object):
     def prepare_output(self, batch_outputs):
         """
         Transform list of batch outputs into strings.
-        :param batch_outputs: list of list [num_batch, tag_seq_length]
+        :param batch_outputs: list of list, of shape [num_batch, tag_seq_length]. Element type is Tensor.
         :return:
         """
         results = []
diff --git a/fastNLP/fastNLP.py b/fastNLP/fastnlp.py
similarity index 51%
rename from fastNLP/fastNLP.py
rename to fastNLP/fastnlp.py
index cfda830c..cb97aa53 100644
--- a/fastNLP/fastNLP.py
+++ b/fastNLP/fastnlp.py
@@ -3,14 +3,14 @@ from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
 from fastNLP.loader.model_loader import ModelLoader
 
 """
-mapping from model name to [URL, file_name.class_name]
+mapping from model name to [URL, file_name.class_name, model_pickle_name]
 Notice that the class of the model should be in "models" directory.
 
 Example:
-    "zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling"]
+    "zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling", "saved_model.pkl"]
 """
 FastNLP_MODEL_COLLECTION = {
-    "zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling"]
+    "zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling", "saved_model.pkl"]
 }
 
 
@@ -26,6 +26,13 @@ class FastNLP(object):
     """
 
     def __init__(self, model_dir="./"):
+        """
+        :param model_dir: this directory should contain the following files:
+            1. a pre-trained model
+            2. a config file
+            3. "id2class.pkl"
+            4. "word2id.pkl"
+        """
         self.model_dir = model_dir
         self.model = None
 
@@ -45,27 +52,32 @@ class FastNLP(object):
 
         model_args = ConfigSection()
         # To do: customized config file for model init parameters
-        ConfigLoader.load_config(self.model_dir + "default.cfg", model_args)
+        ConfigLoader.load_config(self.model_dir + "config", {"POS_infer": model_args})
 
+        # Construct the model
         model = model_class(model_args)
 
         # To do: framework independent
-        ModelLoader.load_pytorch(model, self.model_dir + model_name)
+        ModelLoader.load_pytorch(model, self.model_dir + FastNLP_MODEL_COLLECTION[model_name][2])
 
         self.model = model
 
         print("Model loaded. ")
 
-    def run(self, infer_input):
+    def run(self, raw_input):
         """
         Perform inference over given input using the loaded model.
-        :param infer_input: str, raw text
+        :param raw_input: str, raw text; string_to_list splits it into sentences on newlines
         :return results:
         """
-        infer = Inference()
-        data = infer.prepare_input(infer_input)
-        results = infer.predict(self.model, data)
-        return results
+        # NOTE: the value returned is make_output()'s list of strings, not the raw predictions
+        infer = Inference(self.model_dir)
+        infer_input = self.string_to_list(raw_input)
+
+        results = infer.predict(self.model, infer_input)
+
+        outputs = self.make_output(results)
+        return outputs
 
     @staticmethod
     def _get_model_class(file_class_name):
@@ -101,4 +113,61 @@ class FastNLP(object):
         Check whether the desired model is already in the directory.
         :param model_dir:
         """
-        pass
+        return True
+
+    def string_to_list(self, text, delimiter="\n"):
+        """
+        For word seg only, currently.
+        Transform raw input into lists, which is done by DatasetLoader in training.
+        Leading and trailing whitespace of the whole text is stripped first.
+        Split the text into sentences and each sentence into characters, giving
+        two-level lists that are returned and also stored in self.data:
+        [
+            [char_11, char_12, ...],
+            [char_21, char_22, ...],
+            ...
+        ]
+        An empty sentence (e.g. between two adjacent delimiters) becomes [].
+        :param text: string
+        :param delimiter: str, character used to split text into sentences.
+        :return data: two-level lists, one list of characters per sentence
+        """
+        sentences = text.strip().split(delimiter)
+        # list(str) breaks a sentence into its individual characters
+        data = [list(sentence) for sentence in sentences]
+
+        # To refactor: make_output reads the characters back from self.data
+        self.data = data
+        return data
+
+    def make_output(self, results):
+        """
+        Transform model output into user-friendly contents.
+        Example: In CWS, convert <BMES> labeling into segmented text.
+        A word still open when its sentence ends is kept, not dropped.
+        :param results: label sequences, one per sentence stored in self.data
+        :return outputs: list of str, the words of each sentence joined by " "
+        :raises ValueError: if a label starts with anything but B, M, E or S
+        """
+        outputs = []
+        for sent_char, sent_label in zip(self.data, results):
+            words = []
+            word = ""
+            for char, label in zip(sent_char, sent_label):
+                tag = label[0]
+                if tag not in ("B", "M", "E", "S"):
+                    raise ValueError("invalid label")
+                if tag in ("B", "S") and word != "":
+                    # a new word starts: flush the one left unfinished
+                    words.append(word)
+                    word = ""
+                word += char
+                if tag in ("E", "S"):
+                    # the current word is complete
+                    words.append(word)
+                    word = ""
+            if word != "":
+                # the labels ended on B/M: keep the trailing characters
+                words.append(word)
+            outputs.append(" ".join(words))
+        return outputs
diff --git a/fastNLP/loader/config_loader.py b/fastNLP/loader/config_loader.py
index e3a856d9..d348e75e 100644
--- a/fastNLP/loader/config_loader.py
+++ b/fastNLP/loader/config_loader.py
@@ -20,9 +20,13 @@ class ConfigLoader(BaseLoader):
     def load_config(file_path, sections):
         """
         :param file_path: the path of config file
-        :param sections: the dict of sections
+        :param sections: the dict of {section_name(string): Section instance}
+        Example:
+            test_args = ConfigSection()
+            ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
         :return:
         """
+        assert isinstance(sections, dict)
         cfg = configparser.ConfigParser()
         if not os.path.exists(file_path):
             raise FileNotFoundError("config file {} not found. ".format(file_path))
diff --git a/fastNLP/loader/dataset_loader.py b/fastNLP/loader/dataset_loader.py
index dc5640f1..88ff151d 100644
--- a/fastNLP/loader/dataset_loader.py
+++ b/fastNLP/loader/dataset_loader.py
@@ -22,6 +22,7 @@ class POSDatasetLoader(DatasetLoader):
         and label2
         Jerry   label1
         .   label3
+        (separated by an empty line)
         Hello   label4
         world   label5
         !   label3
@@ -77,6 +78,51 @@ class POSDatasetLoader(DatasetLoader):
         return data
 
 
+class TokenizeDatasetLoader(DatasetLoader):
+    """
+    Data set loader for tokenization data sets.
+    load_pku() reads a word-segmented corpus as character/label sequences.
+    """
+
+    def __init__(self, data_name, data_path):
+        super().__init__(data_name, data_path)
+
+    def load_pku(self):
+        """
+        Load the pku data set for Chinese word segmentation (CWS).
+        Format of the pku training file:
+            1. Each line is a sentence.
+            2. The words of a sentence are separated by whitespace.
+        Every word is split into characters, each tagged with a <BMES> label:
+            B: beginning of a word
+            M: middle of a word
+            E: ending of a word
+            S: single character
+        A blank line yields an empty [[], []] entry.
+
+        :return: three-level lists, [[characters, labels], ...], one pair per line
+        """
+        # read everything up front; the file is closed before conversion starts
+        with open(self.data_path, "r", encoding="utf-8") as f:
+            lines = f.readlines()
+
+        data = []
+        for line in lines:
+            chars, tags = [], []
+            for token in line.strip().split():
+                # a string extends the list with its individual characters
+                chars.extend(token)
+                if len(token) == 1:
+                    tags.append("S")
+                else:
+                    # first character, (len - 2) inner characters, last character
+                    tags.append("B")
+                    tags.extend("M" * (len(token) - 2))
+                    tags.append("E")
+            data.append([chars, tags])
+        return data
+
+
 class ClassDatasetLoader(DatasetLoader):
     """Loader for classification data sets"""
 
@@ -163,7 +209,12 @@ class LMDatasetLoader(DatasetLoader):
 
 
 if __name__ == "__main__":
+    """
     data = POSDatasetLoader("xxx", "../../test/data_for_tests/people.txt").load_lines()
     for example in data:
         for w, l in zip(example[0], example[1]):
             print(w, l)
+    """
+    # NOTE(review): machine-specific absolute path below; point it at a local pku-format file
+    ans = TokenizeDatasetLoader("xxx", "/home/zyfeng/Desktop/data/icwb2-data/training/test").load_pku()
+    print(ans)
diff --git a/test/data_for_tests/config b/test/data_for_tests/config
index fad9d876..243ad1ff 100644
--- a/test/data_for_tests/config
+++ b/test/data_for_tests/config
@@ -54,8 +54,8 @@ test = 5
 new_attr = 40
 
 [POS]
-epochs = 20
-batch_size = 1
+epochs = 1
+batch_size = 32
 pickle_path = "./data_for_tests/"
 validate = true
 save_best_dev = true
@@ -80,3 +80,12 @@ rnn_bi_direction = true
 word_emb_dim = 100
 dropout = 0.5
 use_crf = true
+
+[POS_infer]
+pickle_path = "./data_for_tests/"
+rnn_hidden_units = 100
+rnn_layers = 1
+rnn_bi_direction = true
+word_emb_dim = 100
+vocab_size = 52
+num_classes = 22
\ No newline at end of file
diff --git a/test/test_cws.py b/test/test_cws.py
new file mode 100644
index 00000000..8cee7177
--- /dev/null
+++ b/test/test_cws.py
@@ -0,0 +1,115 @@
+import sys
+
+sys.path.append("..")
+
+from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
+from fastNLP.action.trainer import POSTrainer
+from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader
+from fastNLP.loader.preprocess import POSPreprocess, load_pickle
+from fastNLP.saver.model_saver import ModelSaver
+from fastNLP.loader.model_loader import ModelLoader
+from fastNLP.action.tester import POSTester
+from fastNLP.models.sequence_modeling import SeqLabeling
+from fastNLP.action.inference import Inference
+
+data_name = "pku_training.utf8"  # passed as the data_name argument of both loaders below
+cws_data_path = "/home/zyfeng/Desktop/data/pku_training.utf8"  # NOTE(review): machine-specific absolute path
+pickle_path = "data_for_tests"  # directory of word2id.pkl / id2class.pkl
+data_infer_path = "data_for_tests/people_infer.txt"  # input file read by infer()
+
+
+def infer():
+    # Load the inference configuration; it reuses the "POS_test" section of the test config
+    test_args = ConfigSection()
+    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
+
+    # Fetch vocabulary size and number of labels from the pickle files under pickle_path
+    word2index = load_pickle(pickle_path, "word2id.pkl")
+    test_args["vocab_size"] = len(word2index)
+    index2label = load_pickle(pickle_path, "id2class.pkl")
+    test_args["num_classes"] = len(index2label)
+
+    # Build a SeqLabeling model from the completed arguments
+    model = SeqLabeling(test_args)
+
+    # Load the trained parameters from the saved model file into the model
+    ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
+    print("model loaded!")
+
+    # Data loader: read the inference input from data_infer_path via load_lines()
+    raw_data_loader = BaseLoader(data_name, data_infer_path)
+    infer_data = raw_data_loader.load_lines()
+    """
+        Transform strings into list of list of strings. 
+        [
+            [word_11, word_12, ...],
+            [word_21, word_22, ...],
+            ...
+        ]
+        In this case, each line in "people_infer.txt" is already a sentence. So load_lines() just splits them.
+    """
+
+    # Inference interface (note: this local name shadows the function infer inside its own body)
+    infer = Inference(pickle_path)
+    results = infer.predict(model, infer_data)
+
+    print(results)
+    print("Inference finished!")
+
+
+def train_test():
+    # Config loader: training arguments come from the "POS" section of the test config
+    train_args = ConfigSection()
+    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
+
+    # Data loader: read the corpus at cws_data_path through load_pku()
+    loader = TokenizeDatasetLoader(data_name, cws_data_path)
+    train_data = loader.load_pku()
+
+    # Preprocessor: supplies vocab_size and num_classes for the model arguments
+    p = POSPreprocess(train_data, pickle_path)
+    train_args["vocab_size"] = p.vocab_size
+    train_args["num_classes"] = p.num_classes
+
+    # Trainer, configured with the same arguments as the model
+    trainer = POSTrainer(train_args)
+
+    # Model
+    model = SeqLabeling(train_args)
+
+    # Start training
+    trainer.train(model)
+    print("Training finished!")
+
+    # Saver: write the trained model to ./data_for_tests/saved_model.pkl
+    saver = ModelSaver("./data_for_tests/saved_model.pkl")
+    saver.save_pytorch(model)
+    print("Model saved!")
+
+    del model, trainer, loader
+
+    # Rebuild a fresh model with the same arguments (presumably to exercise the save/load round trip)
+    model = SeqLabeling(train_args)
+
+    # Load the trained parameters from the file saved above into the new model
+    ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
+    print("model loaded!")
+
+    # Load test configuration from the "POS_test" section
+    test_args = ConfigSection()
+    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
+
+    # Tester
+    tester = POSTester(test_args)
+
+    # Start testing
+    tester.test(model)
+
+    # Print the test results reported by the tester
+    print(tester.show_matrices())
+    print("model tested!")
+
+
+if __name__ == "__main__":
+    train_test()
+    # infer()  # optional: loads the saved_model.pkl that train_test() writes
diff --git a/test/test_fastNLP.py b/test/test_fastNLP.py
new file mode 100644
index 00000000..35bac153
--- /dev/null
+++ b/test/test_fastNLP.py
@@ -0,0 +1,14 @@
+from fastNLP.fastnlp import FastNLP
+
+
+def foo():
+    # high-level API: load a pre-trained model by name, then run it on raw text
+    fast_nlp = FastNLP("./data_for_tests/")
+    fast_nlp.load("zh_pos_tag_model")
+    output = fast_nlp.run("这是最好的基于深度学习的中文分词系统。")
+    print(output)
+    print("FastNLP finished!")
+
+
+if __name__ == "__main__":
+    foo()  # run the high-level API demo when executed as a script
diff --git a/test/test_keras_like.py b/test/test_keras_like.py
deleted file mode 100644
index 08f7d6ae..00000000
--- a/test/test_keras_like.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import aggregation
-import decoder
-import encoder
-
-
-class Input(object):
-    def __init__(self):
-        pass
-
-
-class Trainer(object):
-    def __init__(self, input, target, truth):
-        pass
-
-    def train(self):
-        pass
-
-
-def test_keras_like():
-    data_train, label_train = dataLoader("./data_path")
-
-    x = Input()
-    x = encoder.LSTM(input=x)
-    x = aggregation.max_pool(input=x)
-    y = decoder.CRF(input=x)
-
-    trainer = Trainer(input=data_train, target=y, truth=label_train)
-    trainer.train()
diff --git a/test/test_POS_pipeline.py b/test/test_seq_labeling.py
similarity index 90%
rename from test/test_POS_pipeline.py
rename to test/test_seq_labeling.py
index fdf5de3e..9a5fa711 100644
--- a/test/test_POS_pipeline.py
+++ b/test/test_seq_labeling.py
@@ -23,7 +23,7 @@ def infer():
     test_args = ConfigSection()
     ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
 
-    # fetch dictinary size and number of labels from pickle files
+    # fetch dictionary size and number of labels from pickle files
     word2index = load_pickle(pickle_path, "word2id.pkl")
     test_args["vocab_size"] = len(word2index)
     index2label = load_pickle(pickle_path, "id2class.pkl")
@@ -33,7 +33,7 @@ def infer():
     model = SeqLabeling(test_args)
 
     # Dump trained parameters into the model
-    ModelLoader.load_pytorch(model, "./saved_model.pkl")
+    ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
     print("model loaded!")
 
     # Data Loader
@@ -82,7 +82,7 @@ def train_test():
     print("Training finished!")
 
     # Saver
-    saver = ModelSaver("./saved_model.pkl")
+    saver = ModelSaver("./data_for_tests/saved_model.pkl")
     saver.save_pytorch(model)
     print("Model saved!")
 
@@ -92,7 +92,7 @@ def train_test():
     model = SeqLabeling(train_args)
 
     # Dump trained parameters into the model
-    ModelLoader.load_pytorch(model, "./saved_model.pkl")
+    ModelLoader.load_pytorch(model, "./data_for_tests/saved_model.pkl")
     print("model loaded!")
 
     # Load test configuration
@@ -111,4 +111,5 @@ def train_test():
 
 
 if __name__ == "__main__":
-    infer()
+    train_test()
+    # infer()