From 83fe6f9f2127336910a264907886e6e4abab6ba3 Mon Sep 17 00:00:00 2001
From: FengZiYjun <writerphone@163.com>
Date: Sun, 1 Jul 2018 18:28:09 +0800
Subject: [PATCH 01/13] combine controller and trainer

---
 fastNLP/action/action.py        |  11 --
 fastNLP/action/trainer.py       | 174 ++++++++++++++++++++++++--------
 fastNLP/loader/config_loader.py |   3 +-
 fastNLP/models/base_model.py    |  95 -----------------
 4 files changed, 134 insertions(+), 149 deletions(-)

diff --git a/fastNLP/action/action.py b/fastNLP/action/action.py
index c85a74df..5512c7b1 100644
--- a/fastNLP/action/action.py
+++ b/fastNLP/action/action.py
@@ -1,4 +1,3 @@
-from saver.logger import Logger
 
 
 class Action(object):
@@ -8,16 +7,6 @@ class Action(object):
 
     def __init__(self):
         super(Action, self).__init__()
-        self.logger = Logger("logger_output.txt")
-
-    def load_config(self, args):
-        raise NotImplementedError
-
-    def load_dataset(self, args):
-        raise NotImplementedError
-
-    def log(self, string):
-        self.logger.log(string)
 
     def batchify(self, batch_size, X, Y=None):
         """
diff --git a/fastNLP/action/trainer.py b/fastNLP/action/trainer.py
index b3640ba2..79f14df3 100644
--- a/fastNLP/action/trainer.py
+++ b/fastNLP/action/trainer.py
@@ -1,36 +1,56 @@
 from collections import namedtuple
 
-from .action import Action
-from .tester import Tester
+import numpy as np
+import torch
 
+from fastNLP.action.action import Action
+from fastNLP.action.tester import Tester
 
-class Trainer(Action):
-    """
-        Trainer is a common training pipeline shared among all models.
+
+class BaseTrainer(Action):
+    """Base trainer for all trainers.
+        Trainer receives a model and data, and then performs training.
+
+        Subclasses must implement the following abstract methods:
+        - prepare_input
+        - mode
+        - define_optimizer
+        - data_forward
+        - grad_backward
+        - get_loss
     """
     TrainConfig = namedtuple("config", ["epochs", "validate", "save_when_better",
                                         "log_per_step", "log_validation", "batch_size"])
 
     def __init__(self, train_args):
         """
-        :param train_args: namedtuple
+        training parameters
         """
-        super(Trainer, self).__init__()
+        super(BaseTrainer, self).__init__()
         self.n_epochs = train_args.epochs
         self.validate = train_args.validate
-        self.save_when_better = train_args.save_when_better
-        self.log_per_step = train_args.log_per_step
-        self.log_validation = train_args.log_validation
         self.batch_size = train_args.batch_size
+        self.model = None
 
     def train(self, network, train_data, dev_data=None):
-        """
-        :param network: the models controller
+        """General training loop.
+        :param network: a model
         :param train_data: raw data for training
         :param dev_data: raw data for validation
-        This method will call all the base methods of network (implemented in models.base_model).
+
+        The method is framework independent.
+        Work by calling the following methods:
+            - prepare_input
+            - mode
+            - define_optimizer
+            - data_forward
+            - get_loss
+            - grad_backward
+            - update
+        Subclasses must implement these methods with a specific framework.
         """
-        train_x, train_y = network.prepare_input(train_data)
+        self.model = network
+        train_x, train_y = self.prepare_input(train_data)
 
         iterations, train_batch_generator = self.batchify(self.batch_size, train_x, train_y)
 
@@ -39,55 +59,125 @@ class Trainer(Action):
         evaluator = Tester(test_args)
 
         best_loss = 1e10
-        loss_history = list()
 
         for epoch in range(self.n_epochs):
-            network.mode(test=False)  # turn on the train mode
+            self.mode(test=False)  # turn on the train mode
 
-            network.define_optimizer()
+            self.define_optimizer()
             for step in range(iterations):
                 batch_x, batch_y = train_batch_generator.__next__()
 
-                prediction = network.data_forward(batch_x)
-
-                loss = network.get_loss(prediction, batch_y)
-                network.grad_backward()
+                prediction = self.data_forward(network, batch_x)
 
-                if step % self.log_per_step == 0:
-                    print("step ", step)
-                    loss_history.append(loss)
-                    self.log(self.make_log(epoch, step, loss))
+                loss = self.get_loss(prediction, batch_y)
+                self.grad_backward(loss)
+                self.update()
 
-            #################### evaluate over dev set  ###################
             if self.validate:
                 if dev_data is None:
                     raise RuntimeError("No validation data provided.")
-                # give all controls to tester
                 evaluator.test(network, dev_data)
-
-                if self.log_validation:
-                    self.log(self.make_valid_log(epoch, evaluator.loss))
                 if evaluator.loss < best_loss:
                     best_loss = evaluator.loss
-                    if self.save_when_better:
-                        self.save_model(network)
 
         # finish training
 
-    def make_log(self, *args):
-        return "make a log"
+    def prepare_input(self, data):
+        """
+        Perform data transformation from raw input to vector/matrix inputs.
+        :param data: raw inputs
+        :return (X, Y): tuple, input features and labels
+        """
+        raise NotImplementedError
 
-    def make_valid_log(self, *args):
-        return "make a valid log"
+    def mode(self, test=False):
+        """
+        Tell the network to be trained or not.
+        :param test: bool
+        """
+        raise NotImplementedError
 
-    def save_model(self, model):
-        model.save()
+    def define_optimizer(self):
+        """
+        Define framework-specific optimizer specified by the models.
+        """
+        raise NotImplementedError
 
-    def load_data(self, data_name):
-        print("load data")
+    def update(self):
+        """
+        Perform weight update on a model.
 
-    def load_config(self, args):
+        For PyTorch, just call optimizer to update.
+        """
         raise NotImplementedError
 
-    def load_dataset(self, args):
+    def data_forward(self, network, *x):
+        """
+        Forward pass of the data.
+        :param network: a model
+        :param x: input feature matrix and label vector
+        :return: output by the models
+
+        For PyTorch, just do "network(*x)"
+        """
         raise NotImplementedError
+
+    def grad_backward(self, loss):
+        """
+        Compute gradient with link rules.
+        :param loss: a scalar where back-prop starts
+
+        For PyTorch, just do "loss.backward()"
+        """
+        raise NotImplementedError
+
+    def get_loss(self, predict, truth):
+        """
+        Compute loss given prediction and ground truth.
+        :param predict: prediction label vector
+        :param truth: ground truth label vector
+        :return: a scalar
+        """
+        raise NotImplementedError
+
+
+class ToyTrainer(BaseTrainer):
+    """A simple trainer for a PyTorch model."""
+
+    def __init__(self, train_args):
+        super(ToyTrainer, self).__init__(train_args)
+        self.test_mode = False
+        self.weight = np.random.rand(5, 1)
+        self.bias = np.random.rand()
+        self._loss = 0
+        self._optimizer = None
+
+    def prepare_input(self, data):
+        return data[:, :-1], data[:, -1]
+
+    def mode(self, test=False):
+        self.model.mode(test)
+
+    def data_forward(self, network, *x):
+        return np.matmul(x, self.weight) + self.bias
+
+    def grad_backward(self, loss):
+        loss.backward()
+
+    def get_loss(self, pred, truth):
+        self._loss = np.mean(np.square(pred - truth))
+        return self._loss
+
+    def define_optimizer(self):
+        self._optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01)
+
+    def update(self):
+        self._optimizer.step()
+
+
+if __name__ == "__name__":
+    Config = namedtuple("config", ["epochs", "validate", "save_when_better", "log_per_step",
+                                   "log_validation", "batch_size"])
+    train_config = Config(epochs=5, validate=True, save_when_better=True, log_per_step=10, log_validation=True,
+                          batch_size=32)
+    trainer = ToyTrainer(train_config)
diff --git a/fastNLP/loader/config_loader.py b/fastNLP/loader/config_loader.py
index fa1d446d..0f40ec51 100644
--- a/fastNLP/loader/config_loader.py
+++ b/fastNLP/loader/config_loader.py
@@ -1,4 +1,4 @@
-from loader.base_loader import BaseLoader
+from fastNLP.loader.base_loader import BaseLoader
 
 
 class ConfigLoader(BaseLoader):
@@ -11,3 +11,4 @@ class ConfigLoader(BaseLoader):
     @staticmethod
     def parse(string):
         raise NotImplementedError
+
diff --git a/fastNLP/models/base_model.py b/fastNLP/models/base_model.py
index 1a2782c3..9249e2e3 100644
--- a/fastNLP/models/base_model.py
+++ b/fastNLP/models/base_model.py
@@ -1,4 +1,3 @@
-import numpy as np
 import torch
 
 
@@ -30,100 +29,6 @@ class BaseModel(torch.nn.Module):
         raise NotImplementedError
 
 
-class BaseController(object):
-    """Base Controller for all controllers.
-        This class and its subclasses are actually "controllers" of the PyTorch models.
-        They act as an interface between Trainer and the PyTorch models.
-        This controller provides the following methods to be called by Trainer.
-        - prepare_input
-        - mode
-        - define_optimizer
-        - data_forward
-        - grad_backward
-        - get_loss
-    """
-
-    def __init__(self):
-        """
-        Define PyTorch model parameters here.
-        """
-        pass
-
-    def prepare_input(self, data):
-        """
-        Perform data transformation from raw input to vector/matrix inputs.
-        :param data: raw inputs
-        :return (X, Y): tuple, input features and labels
-        """
-        raise NotImplementedError
-
-    def mode(self, test=False):
-        """
-        Tell the network to be trained or not, required by PyTorch.
-        :param test: bool
-        """
-        raise NotImplementedError
-
-    def define_optimizer(self):
-        """
-        Define PyTorch optimizer specified by the models.
-        """
-        raise NotImplementedError
-
-    def data_forward(self, *x):
-        """
-        Forward pass of the data.
-        :param x: input feature matrix and label vector
-        :return: output by the models
-        """
-        # required by PyTorch nn
-        raise NotImplementedError
-
-    def grad_backward(self):
-        """
-        Perform gradient descent to update the models parameters.
-        """
-        raise NotImplementedError
-
-    def get_loss(self, pred, truth):
-        """
-        Compute loss given models prediction and ground truth. Loss function specified by the models.
-        :param pred: prediction label vector
-        :param truth: ground truth label vector
-        :return: a scalar
-        """
-        raise NotImplementedError
-
-
-class ToyController(BaseController):
-    """This is for code testing."""
-
-    def __init__(self):
-        super(ToyController, self).__init__()
-        self.test_mode = False
-        self.weight = np.random.rand(5, 1)
-        self.bias = np.random.rand()
-        self._loss = 0
-
-    def prepare_input(self, data):
-        return data[:, :-1], data[:, -1]
-
-    def mode(self, test=False):
-        self.test_mode = test
-
-    def data_forward(self, x):
-        return np.matmul(x, self.weight) + self.bias
-
-    def grad_backward(self):
-        print("loss gradient backward")
-
-    def get_loss(self, pred, truth):
-        self._loss = np.mean(np.square(pred - truth))
-        return self._loss
-
-    def define_optimizer(self):
-        pass
-
 
 class Vocabulary(object):
     """A look-up table that allows you to access `Lexeme` objects. The `Vocab`

From 1426fc3582ed5d99f5471bd1136f9706c17bac19 Mon Sep 17 00:00:00 2001
From: FengZiYjun <writerphone@163.com>
Date: Sun, 1 Jul 2018 19:50:07 +0800
Subject: [PATCH 02/13] refactor word_seg model & its test

---
 fastNLP/action/tester.py              |  2 +-
 fastNLP/action/trainer.py             | 89 +++++++++++++++++++++++++-
 fastNLP/models/char_language_model.py |  7 ++-
 fastNLP/models/word_seg_model.py      | 91 +--------------------------
 test/test_word_seg.py                 | 21 +++----
 5 files changed, 104 insertions(+), 106 deletions(-)

diff --git a/fastNLP/action/tester.py b/fastNLP/action/tester.py
index 0be1b010..7f660bb0 100644
--- a/fastNLP/action/tester.py
+++ b/fastNLP/action/tester.py
@@ -2,7 +2,7 @@ from collections import namedtuple
 
 import numpy as np
 
-from fastNLP.action import Action
+from fastNLP.action.action import Action
 
 
 class Tester(Action):
diff --git a/fastNLP/action/trainer.py b/fastNLP/action/trainer.py
index 79f14df3..0bbcccd7 100644
--- a/fastNLP/action/trainer.py
+++ b/fastNLP/action/trainer.py
@@ -111,7 +111,7 @@ class BaseTrainer(Action):
         """
         raise NotImplementedError
 
-    def data_forward(self, network, *x):
+    def data_forward(self, network, x):
         """
         Forward pass of the data.
         :param network: a model
@@ -158,7 +158,7 @@ class ToyTrainer(BaseTrainer):
     def mode(self, test=False):
         self.model.mode(test)
 
-    def data_forward(self, network, *x):
+    def data_forward(self, network, x):
         return np.matmul(x, self.weight) + self.bias
 
     def grad_backward(self, loss):
@@ -175,6 +175,91 @@ class ToyTrainer(BaseTrainer):
         self._optimizer.step()
 
 
+class WordSegTrainer(BaseTrainer):
+    """
+        reserve for changes
+    """
+
+    def __init__(self, train_args):
+        super(WordSegTrainer, self).__init__(train_args)
+        self.id2word = None
+        self.word2id = None
+        self.id2tag = None
+        self.tag2id = None
+
+        self.lstm_batch_size = 8
+        self.lstm_seq_len = 32  # Trainer batch_size == lstm_batch_size * lstm_seq_len
+        self.hidden_dim = 100
+        self.lstm_num_layers = 2
+        self.vocab_size = 100
+        self.word_emb_dim = 100
+
+        # Must be set before to_var() is called below: to_var() reads self.USE_GPU.
+        self.USE_GPU = False
+        self.hidden = (self.to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim)),
+                       self.to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim)))
+
+        self.optimizer = None
+        self._loss = None
+
+    def to_var(self, x):
+        if torch.cuda.is_available() and self.USE_GPU:
+            x = x.cuda()
+        return torch.autograd.Variable(x)
+
+    def prepare_input(self, data):
+        """
+            perform word indices lookup to convert strings into indices
+            :param data: list of string, each string contains word + space + [B, M, E, S]
+            :return
+        """
+        word_list = []
+        tag_list = []
+        for line in data:
+            if len(line) > 2:
+                tokens = line.split("#")
+                word_list.append(tokens[0])
+                tag_list.append(tokens[2][0])
+        self.id2word = list(set(word_list))
+        self.word2id = {word: idx for idx, word in enumerate(self.id2word)}
+        self.id2tag = list(set(tag_list))
+        self.tag2id = {tag: idx for idx, tag in enumerate(self.id2tag)}
+        words = np.array([self.word2id[w] for w in word_list]).reshape(-1, 1)
+        tags = np.array([self.tag2id[t] for t in tag_list]).reshape(-1, 1)
+        return words, tags
+
+    def mode(self, test=False):
+        if test:
+            self.model.eval()
+        else:
+            self.model.train()
+
+    def data_forward(self, network, x):
+        """
+        :param network: a PyTorch model
+        :param x: sequence of length [batch_size], word indices
+        :return:
+        """
+        x = x.reshape(self.lstm_batch_size, self.lstm_seq_len)
+        output, self.hidden = network(x, self.hidden)
+        return output
+
+    def define_optimizer(self):
+        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01, momentum=0.85)
+
+    def get_loss(self, predict, truth):
+        self._loss = torch.nn.CrossEntropyLoss()(predict, truth)  # build the criterion, then apply it
+        return self._loss
+
+    def grad_backward(self, network):
+        self.model.zero_grad()
+        self._loss.backward()
+        torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2)
+
+    def update(self):
+        self.optimizer.step()
+
+
 if __name__ == "__name__":
     Config = namedtuple("config", ["epochs", "validate", "save_when_better", "log_per_step",
                                    "log_validation", "batch_size"])
diff --git a/fastNLP/models/char_language_model.py b/fastNLP/models/char_language_model.py
index 9a6997b9..27a83903 100644
--- a/fastNLP/models/char_language_model.py
+++ b/fastNLP/models/char_language_model.py
@@ -6,11 +6,16 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
-from model.base_model import BaseModel
 from torch.autograd import Variable
 
+from fastNLP.models.base_model import BaseModel
+
 USE_GPU = True
 
+"""
+    To be deprecated.
+"""
+
 
 class CharLM(BaseModel):
     """
diff --git a/fastNLP/models/word_seg_model.py b/fastNLP/models/word_seg_model.py
index 58d7186d..969c7ff7 100644
--- a/fastNLP/models/word_seg_model.py
+++ b/fastNLP/models/word_seg_model.py
@@ -1,95 +1,6 @@
-import numpy as np
-import torch
 import torch.nn as nn
-import torch.optim as optim
-from torch.autograd import Variable
 
-from fastNLP.models.base_model import BaseModel, BaseController
-
-USE_GPU = True
-
-
-def to_var(x):
-    if torch.cuda.is_available() and USE_GPU:
-        x = x.cuda()
-    return Variable(x)
-
-
-class WordSegModel(BaseController):
-    """
-        Model controller for WordSeg
-    """
-
-    def __init__(self):
-        super(WordSegModel, self).__init__()
-        self.id2word = None
-        self.word2id = None
-        self.id2tag = None
-        self.tag2id = None
-
-        self.lstm_batch_size = 8
-        self.lstm_seq_len = 32  # Trainer batch_size == lstm_batch_size * lstm_seq_len
-        self.hidden_dim = 100
-        self.lstm_num_layers = 2
-        self.vocab_size = 100
-        self.word_emb_dim = 100
-
-        self.model = WordSeg(self.hidden_dim, self.lstm_num_layers, self.vocab_size, self.word_emb_dim)
-        self.hidden = (to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim)),
-                       to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim)))
-
-        self.optimizer = None
-        self._loss = None
-
-    def prepare_input(self, data):
-        """
-            perform word indices lookup to convert strings into indices
-            :param data: list of string, each string contains word + space + [B, M, E, S]
-            :return
-        """
-        word_list = []
-        tag_list = []
-        for line in data:
-            if len(line) > 2:
-                tokens = line.split("#")
-                word_list.append(tokens[0])
-                tag_list.append(tokens[2][0])
-        self.id2word = list(set(word_list))
-        self.word2id = {word: idx for idx, word in enumerate(self.id2word)}
-        self.id2tag = list(set(tag_list))
-        self.tag2id = {tag: idx for idx, tag in enumerate(self.id2tag)}
-        words = np.array([self.word2id[w] for w in word_list]).reshape(-1, 1)
-        tags = np.array([self.tag2id[t] for t in tag_list]).reshape(-1, 1)
-        return words, tags
-
-    def mode(self, test=False):
-        if test:
-            self.model.eval()
-        else:
-            self.model.train()
-
-    def data_forward(self, x):
-        """
-        :param x: sequence of length [batch_size], word indices
-        :return:
-        """
-        x = x.reshape(self.lstm_batch_size, self.lstm_seq_len)
-        output, self.hidden = self.model(x, self.hidden)
-        return output
-
-    def define_optimizer(self):
-        self.optimizer = optim.SGD(self.model.parameters(), lr=0.01, momentum=0.85)
-
-    def get_loss(self, pred, truth):
-
-        self._loss = nn.CrossEntropyLoss(pred, truth)
-        return self._loss
-
-    def grad_backward(self):
-        self.model.zero_grad()
-        self._loss.backward()
-        torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2)
-        self.optimizer.step()
+from fastNLP.models.base_model import BaseModel
 
 
 class WordSeg(BaseModel):
diff --git a/test/test_word_seg.py b/test/test_word_seg.py
index bf693350..fca75356 100644
--- a/test/test_word_seg.py
+++ b/test/test_word_seg.py
@@ -1,23 +1,20 @@
-from loader.base_loader import BaseLoader
-from model.word_seg_model import WordSegModel
+from fastNLP.action.tester import Tester
+from fastNLP.action.trainer import WordSegTrainer
+from fastNLP.loader.base_loader import BaseLoader
+from fastNLP.models.word_seg_model import WordSeg
 
-from fastNLP.action import Tester
-from fastNLP.action.trainer import Trainer
 
-
-def test_charlm():
-    train_config = Trainer.TrainConfig(epochs=5, validate=False, save_when_better=False,
+def test_wordseg():
+    train_config = WordSegTrainer.TrainConfig(epochs=5, validate=False, save_when_better=False,
                                        log_per_step=10, log_validation=False, batch_size=254)
-    trainer = Trainer(train_config)
+    trainer = WordSegTrainer(train_config)
 
-    model = WordSegModel()
+    model = WordSeg(100, 2, 1000)
 
     train_data = BaseLoader("load_train", "./data_for_tests/cws_train").load_lines()
 
     trainer.train(model, train_data)
 
-    trainer.save_model(model)
-
     test_config = Tester.TestConfig(save_output=False, validate_in_training=False,
                                     save_dev_input=False, save_loss=False, batch_size=254)
     tester = Tester(test_config)
@@ -28,4 +25,4 @@ def test_charlm():
 
 
 if __name__ == "__main__":
-    test_charlm()
+    test_wordseg()

From 0e11dd5f242887e6cbff4dd4e12c2a9bc17791fe Mon Sep 17 00:00:00 2001
From: FengZiYjun <writerphone@163.com>
Date: Sun, 1 Jul 2018 19:53:55 +0800
Subject: [PATCH 03/13] correct spell error

---
 fastNLP/loader/dataset_loader.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/fastNLP/loader/dataset_loader.py b/fastNLP/loader/dataset_loader.py
index 0cec50e5..3871626b 100644
--- a/fastNLP/loader/dataset_loader.py
+++ b/fastNLP/loader/dataset_loader.py
@@ -1,6 +1,7 @@
-from fastNLP.loader.base_loader import BaseLoader
 import os
 
+from fastNLP.loader.base_loader import BaseLoader
+
 
 class DatasetLoader(BaseLoader):
     """"loader for data sets"""
@@ -16,7 +17,6 @@ class POSDatasetLoader(DatasetLoader):
         super(POSDatasetLoader, self).__init__(data_name, data_path)
         #self.data_set = self.load()
 
-
     def load(self):
         assert os.path.exists(self.data_path)
         with open(self.data_path, "r", encoding="utf-8") as f:
@@ -42,11 +42,12 @@ class POSDatasetLoader(DatasetLoader):
             dataset.append(sentence)
         return dataset
 
-class ClassficationDatasetLoader(DatasetLoader):
+
+class ClassificationDatasetLoader(DatasetLoader):
     """loader for classfication data sets"""
 
     def __init__(self, data_name, data_path):
-        super(ClassficationDatasetLoader, data_name)
+        super(ClassificationDatasetLoader, self).__init__(data_name, data_path)
 
     def load(self):
         assert os.path.exists(self.data_path)
@@ -70,6 +71,7 @@ class ClassficationDatasetLoader(DatasetLoader):
             dataset.append(sentence)
         return dataset
 
+
 class ConllLoader(DatasetLoader):
     """loader for conll format files"""
 

From b93cf0869122058dece64d732ba8128f1deca460 Mon Sep 17 00:00:00 2001
From: HENRY L <henryL7>
Date: Mon, 2 Jul 2018 01:40:17 +0800
Subject: [PATCH 04/13] initial commit

---
 fastNLP/modules/prototype/Word2Idx.py    |  62 +++++++++++++
 fastNLP/modules/prototype/aggregation.py |  41 +++++++++
 fastNLP/modules/prototype/dataloader.py  |  82 +++++++++++++++++
 fastNLP/modules/prototype/embedding.py   |  23 +++++
 fastNLP/modules/prototype/encoder.py     |  25 ++++++
 fastNLP/modules/prototype/example.py     | 108 +++++++++++++++++++++++
 fastNLP/modules/prototype/predict.py     |  25 ++++++
 7 files changed, 366 insertions(+)
 create mode 100644 fastNLP/modules/prototype/Word2Idx.py
 create mode 100644 fastNLP/modules/prototype/aggregation.py
 create mode 100644 fastNLP/modules/prototype/dataloader.py
 create mode 100644 fastNLP/modules/prototype/embedding.py
 create mode 100644 fastNLP/modules/prototype/encoder.py
 create mode 100644 fastNLP/modules/prototype/example.py
 create mode 100644 fastNLP/modules/prototype/predict.py

diff --git a/fastNLP/modules/prototype/Word2Idx.py b/fastNLP/modules/prototype/Word2Idx.py
new file mode 100644
index 00000000..544126be
--- /dev/null
+++ b/fastNLP/modules/prototype/Word2Idx.py
@@ -0,0 +1,62 @@
+import collections
+import pickle
+
+class Word2Idx():
+    """
+    Build a word index according to word frequency.
+    If "min_freq" is given, then only words with a frequency of at least min_freq will be kept.
+    If "max_num" is given, then at most the most frequent $max_num words will be kept.
+    "words" should be a list [ w_1,w_2,...,w_i,...,w_n ] where each w_i is a string representing a word.
+    
+    num is the size of the lookup table.
+    w2i is a lookup table assigning each word an index.
+    Note that index 0 will be returned for any unregistered words.
+    i2w is a vector which serves as the inverse mapping of w2i.
+    Token "<UNK>" will be returned for index 0
+    e.g. i2w[w2i["word"]] == "word"
+    """
+    def __init__(self):
+        self.__w2i = dict()
+        self.__i2w = []
+        self.num = 0
+
+    def build(self, words, min_freq=0, max_num=None):
+        """build a model from words"""
+        counter = collections.Counter(words)
+        word_set = set(words)
+        if max_num is not None:
+            most_common = counter.most_common(min(len(word_set), max_num - 1))
+        else:
+            most_common = counter.most_common()
+        self.__w2i = dict((w[0],i + 1) for i,w in enumerate(most_common) if w[1] >= min_freq)
+        self.__w2i["<UNK>"] = 0
+        self.__i2w = ["<UNK>"] + [ w[0] for w in most_common if w[1] >= min_freq ]
+        self.num = len(self.__i2w)
+
+    def w2i(self,word):
+        """word to index"""
+        if word in self.__w2i:
+            return self.__w2i[word]
+        return 0
+
+    def i2w(self,idx):
+        """index to word"""
+        if idx >= self.num:
+            raise Exception("out of range\n")
+        return self.__i2w[idx]
+
+    def save(self,addr):
+        """save the model to a file with address "addr" """
+        f = open(addr,"wb")
+        pickle.dump([self.__i2w, self.__w2i, self.num], f)
+        f.close()
+
+    def load(self,addr):
+        """load a model from a file with address "addr" """
+        f = open(addr,"rb")
+        paras = pickle.load(f)
+        self.__i2w, self.__w2i, self.num = paras[0], paras[1], paras[2]
+        f.close()
+
+    
+
diff --git a/fastNLP/modules/prototype/aggregation.py b/fastNLP/modules/prototype/aggregation.py
new file mode 100644
index 00000000..e87862b8
--- /dev/null
+++ b/fastNLP/modules/prototype/aggregation.py
@@ -0,0 +1,41 @@
+import torch
+import torch.nn as nn
+
+class Selfattention(nn.Module):
+    """
+    Self Attention Module.
+
+    Args:
+    input_size : the size for the input vector
+    d_a : the width of weight matrix
+    r : the number of encoded vectors
+    """
+    def __init__(self, input_size, d_a, r):
+        super(Selfattention, self).__init__()
+        self.W_s1 = nn.Parameter(torch.randn(d_a, input_size), requires_grad=True)
+        self.W_s2 = nn.Parameter(torch.randn(r, d_a), requires_grad=True)
+        self.softmax = nn.Softmax(dim=2)
+        self.tanh = nn.Tanh()
+
+    def penalization(self, A):
+        """
+        compute the penalization term: per-batch sum of squares of (A * A^T - I)
+        """
+        if self.W_s1.is_cuda:
+            I = torch.autograd.Variable(torch.eye(A.size(1)).cuda(), requires_grad=False)
+        else:
+            I = torch.autograd.Variable(torch.eye(A.size(1)), requires_grad=False)
+        M = torch.matmul(A, torch.transpose(A, 1, 2)) - I
+        M = M.view(M.size(0), -1)
+        return torch.sum(M ** 2, dim=1)
+        
+    def forward(self, x):
+        inter = self.tanh(torch.matmul(self.W_s1, torch.transpose(x, 1, 2)))
+        A = self.softmax(torch.matmul(self.W_s2, inter))
+        out = torch.matmul(A, x)  # attention-weighted sum of the input vectors
+        out = out.view(out.size(0), -1)
+        penalty = self.penalization(A)
+        return out, penalty
+
+if __name__ == "__main__":
+    model = Selfattention(100, 10, 20)
diff --git a/fastNLP/modules/prototype/dataloader.py b/fastNLP/modules/prototype/dataloader.py
new file mode 100644
index 00000000..a7eafdc2
--- /dev/null
+++ b/fastNLP/modules/prototype/dataloader.py
@@ -0,0 +1,82 @@
+import random
+import pickle
+import torch
+import numpy as np
+from torch.autograd import Variable
+
+def float_wrapper(x, requires_grad=True, using_cuda=True):
+    """
+    transform float type list to pytorch variable
+    """
+    if using_cuda==True:
+        return Variable(torch.FloatTensor(x).cuda(), requires_grad=requires_grad)
+    else:
+        return Variable(torch.FloatTensor(x), requires_grad=requires_grad)
+
+def long_wrapper(x, requires_grad=True, using_cuda=True):
+    """
+    transform long type list to pytorch variable
+    """
+    if using_cuda==True:
+        return Variable(torch.LongTensor(x).cuda(), requires_grad=requires_grad)
+    else:
+        return Variable(torch.LongTensor(x), requires_grad=requires_grad)
+    
+def pad(X, using_cuda):
+        """
+        zero-pad sequences to the same length, then stack them into one tensor
+        """
+        maxlen = max([x.size(0) for x in X])
+        Y = []
+        for x in X:
+            padlen = maxlen - x.size(0)
+            if padlen > 0:
+                # build the zeros from x so the padded batch keeps the dtype of x
+                paddings = Variable(x.data.new(padlen).zero_())
+                if using_cuda:
+                    paddings = paddings.cuda()
+                x_ = torch.cat([x, paddings])
+                Y.append(x_)
+            else:
+                Y.append(x)
+        return torch.stack(Y)
+
+class DataLoader(object):
+    """
+    load data with form {"feature", "class"}
+
+    Args:
+    fdir : data file address
+    batch_size : batch_size
+    shuffle : if True, shuffle dataset every epoch
+    using_cuda : if True, return tensors on GPU
+    """
+    def __init__(self, fdir, batch_size, shuffle=True, using_cuda=True):
+        with open(fdir, "rb") as f:
+            self.data = pickle.load(f)
+        self.batch_size = batch_size
+        self.num = len(self.data)
+        self.count = 0
+        self.iters = int(self.num / batch_size)
+        self.shuffle = shuffle
+        self.using_cuda = using_cuda
+        
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        if self.count == self.iters:
+            self.count = 0
+            if self.shuffle:
+                random.shuffle(self.data)
+            raise StopIteration()
+        else:
+            batch = self.data[self.count * self.batch_size : (self.count + 1) * self.batch_size]
+            self.count += 1
+            X = [long_wrapper(x["sent"], requires_grad=False, using_cuda=self.using_cuda) for x in batch]
+            X = pad(X, self.using_cuda)
+            y = [long_wrapper(x["class"], requires_grad=False, using_cuda=self.using_cuda) for x in batch]
+            y = torch.stack(y)
+            return {"feature" : X, "class" : y}
+            
+
diff --git a/fastNLP/modules/prototype/embedding.py b/fastNLP/modules/prototype/embedding.py
new file mode 100644
index 00000000..1ee88a92
--- /dev/null
+++ b/fastNLP/modules/prototype/embedding.py
@@ -0,0 +1,23 @@
+import torch
+import torch.nn as nn
+
+class Lookuptable(nn.Module):
+    """
+    A simple lookup table
+
+    Args:
+    nums : the size of the lookup table
+    dims : the size of each vector
+    padding_idx : pads the tensor with zeros whenever it encounters this index
+    sparse : If True, gradient matrix will be a sparse tensor. In this case,
+    only optim.SGD(cuda and cpu) and optim.Adagrad(cpu) can be used
+    """
+    def __init__(self, nums, dims, padding_idx=0, sparse=False):
+        super(Lookuptable, self).__init__()
+        self.embed = nn.Embedding(nums, dims, padding_idx, sparse=sparse)
+        
+    def forward(self, x):
+        return self.embed(x)
+
+if __name__ == "__main__":
+    model = Lookuptable(10, 20)
diff --git a/fastNLP/modules/prototype/encoder.py b/fastNLP/modules/prototype/encoder.py
new file mode 100644
index 00000000..249eaf8c
--- /dev/null
+++ b/fastNLP/modules/prototype/encoder.py
@@ -0,0 +1,25 @@
+import torch
+import torch.nn as nn
+
+class Lstm(nn.Module):
+    """
+    LSTM module
+
+    Args:
+    input_size : input size
+    hidden_size : hidden size
+    num_layers : number of hidden layers
+    dropout : dropout rate
+    bidirectional : If True, becomes a bidirectional RNN
+    """
+    def __init__(self, input_size, hidden_size, num_layers, dropout, bidirectional):
+        super(Lstm, self).__init__()
+        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=True,\
+         dropout=dropout, bidirectional=bidirectional)
+        
+    def forward(self, x):
+        x, _ = self.lstm(x)
+        return x
+
+if __name__ == "__main__":
+    model = Lstm(20, 30, 1, 0.5, False)
diff --git a/fastNLP/modules/prototype/example.py b/fastNLP/modules/prototype/example.py
new file mode 100644
index 00000000..9dffc59a
--- /dev/null
+++ b/fastNLP/modules/prototype/example.py
@@ -0,0 +1,108 @@
+import torch
+import torch.nn as nn
+import encoder
+import aggregation
+import embedding
+import predict
+import torch.optim as optim
+import time
+import dataloader
+
+WORD_SIZE = 100
+HIDDEN_SIZE = 300
+D_A = 350
+R = 20
+MLP_HIDDEN = 2000 
+CLASSES_NUM = 5
+WORD_NUM = 357361
+
+class Net(nn.Module):
+    """
+    A model for sentiment analysis using lstm and self-attention
+    """
+    def __init__(self):
+        super(Net, self).__init__()
+        self.embedding = embedding.Lookuptable(WORD_NUM, WORD_SIZE)
+        self.encoder = encoder.Lstm(WORD_SIZE, HIDDEN_SIZE, 1, 0.5, True)
+        self.aggregation = aggregation.Selfattention(2 * HIDDEN_SIZE, D_A, R)
+        self.predict = predict.MLP(R * HIDDEN_SIZE * 2, MLP_HIDDEN, CLASSES_NUM)
+
+    def forward(self, x):
+        x = self.embedding(x)
+        x = self.encoder(x)
+        x, penalty = self.aggregation(x)
+        x = self.predict(x)
+        return r, x
+
+def train(model_dict=None, using_cuda=True, learning_rate=0.06,\
+    momentum=0.3, batch_size=32, epochs=5, coef=1.0, interval=10):
+    """
+    training procedure
+
+    Args: 
+    If model_dict is given (a file address), it will continue training on the given model.
+    Otherwise, it would train a new model from scratch.
+    If using_cuda is true, the training would be conducted on GPU.
+    learning_rate and momentum are for the SGD optimizer.
+    coef is the coefficient between the cross-entropy loss and the penalization term.
+    interval is the frequency of reporting.
+
+    the result will be saved with a form "model_dict_+current time", which could be used for further training
+    """
+    
+    if using_cuda == True:
+        net = Net().cuda()
+    else:
+        net = Net()
+        
+    if model_dict != None:
+        net.load_state_dict(torch.load(model_dict))
+
+    optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=momentum)
+    criterion = nn.CrossEntropyLoss()
+    dataset = dataloader.DataLoader("trainset.pkl", using_cuda=using_cuda)
+
+    #statistics
+    loss_count = 0
+    prepare_time = 0
+    run_time = 0
+    count = 0
+
+    for epoch in range(epochs):
+        for i, batch in enumerate(dataset):
+            t1 = time.time()
+            X = batch["feature"]
+            y = batch["class"]
+            
+            t2 = time.time()
+            y_pred, y_penl = net(X)
+            loss = criterion(y_pred, y) + torch.sum(y_penl) / batch_size * coef
+            optimizer.zero_grad()
+            loss.backward()
+            nn.utils.clip_grad_norm(net.parameters(), 0.5)
+            optimizer.step()
+            t3 = time.time()
+
+            loss_count += torch.sum(y_penl).data[0]
+            prepare_time += (t2 - t1)
+            run_time += (t3 - t2)
+            p, idx = torch.max(y_pred, dim=1)
+            idx = idx.data
+            count += torch.sum(torch.eq(idx.cpu(), y))
+
+            if i % interval == 0:
+                print(i)      
+                print("loss count:" + str(loss_count / batch_size))
+                print("acuracy:" + str(count / batch_size))
+                print("penalty:" + str(torch.sum(y_penl).data[0] / batch_size))
+                print("prepare time:" + str(prepare_time / batch_size))
+                print("run time:" + str(run_time / batch_size))
+                prepare_time = 0
+                run_time = 0
+                loss_count = 0
+                count = 0
+        torch.save(net.state_dict(), "model_dict_%s.pkl"%(str(time.time())))
+
+if __name__ == "__main__":
+    train(using_cuda=torch.cuda.is_available())
+
diff --git a/fastNLP/modules/prototype/predict.py b/fastNLP/modules/prototype/predict.py
new file mode 100644
index 00000000..c8e72629
--- /dev/null
+++ b/fastNLP/modules/prototype/predict.py
@@ -0,0 +1,25 @@
+import torch
+import torch.nn as nn
+
+class MLP(nn.Module):
+    """
+    A two-layer perceptron for classification.
+
+    Output : Unnormalized probability distribution
+    Args:
+    input_size : the size of input
+    hidden_size : the size of hidden layer
+    output_size : the size of output
+    """
+    def __init__(self, input_size, hidden_size, output_size):
+        super(MLP,self).__init__()
+        self.L1 = nn.Linear(input_size, hidden_size)
+        self.L2 = nn.Linear(hidden_size, output_size)
+        self.softmax = nn.Softmax(dim=1)
+
+    def forward(self, x):
+        out = self.L2(F.relu(self.L1(x)))
+        return out
+
+if __name__ == "__main__":
+    MLP(20, 30, 20)
\ No newline at end of file

From 561305e03d51eb9209300fb21a32f7b5c0560ff8 Mon Sep 17 00:00:00 2001
From: HENRY L <henryL7>
Date: Mon, 2 Jul 2018 02:06:33 +0800
Subject: [PATCH 05/13] update and add readme

---
 fastNLP/modules/prototype/README.md      | 41 +++++++++++++++++++
 fastNLP/modules/prototype/Word2Idx.py    | 19 ++++-----
 fastNLP/modules/prototype/aggregation.py |  5 +--
 fastNLP/modules/prototype/dataloader.py  | 13 +++---
 fastNLP/modules/prototype/encoder.py     |  3 --
 fastNLP/modules/prototype/example.py     | 51 +++++++++++++++++-------
 fastNLP/modules/prototype/predict.py     |  2 +-
 fastNLP/modules/prototype/prepare.py     | 50 +++++++++++++++++++++++
 8 files changed, 146 insertions(+), 38 deletions(-)
 create mode 100644 fastNLP/modules/prototype/README.md
 create mode 100644 fastNLP/modules/prototype/prepare.py

diff --git a/fastNLP/modules/prototype/README.md b/fastNLP/modules/prototype/README.md
new file mode 100644
index 00000000..2dff7caa
--- /dev/null
+++ b/fastNLP/modules/prototype/README.md
@@ -0,0 +1,41 @@
+# Prototype
+
+## Word2Idx.py
+A mapping model between words and indexes
+
+## embedding.py
+embedding modules
+
+Contains a simple encapsulation for torch.nn.Embedding
+
+## encoder.py
+encoder modules
+
+Contains a simple encapsulation for torch.nn.LSTM
+
+## aggregation.py
+aggregation modules
+
+Contains a self-attention model, according to the paper "A Structured Self-attentive Sentence Embedding", https://arxiv.org/abs/1703.03130
+
+## predict.py
+predict modules
+
+Contains a two-layer perceptron for classification
+
+## example.py
+An example showing how to use the above modules to build a model
+
+Contains a model for sentiment analysis on Yelp dataset, and its training and testing procedures. See https://arxiv.org/abs/1703.03130 for more details.
+
+## prepare.py
+A case of using Word2Idx to build Yelp datasets
+
+## dataloader.py
+A dataloader for Yelp dataset
+
+It is an iterable object, returning a zero-padded batch every iteration.
+
+
+
+
diff --git a/fastNLP/modules/prototype/Word2Idx.py b/fastNLP/modules/prototype/Word2Idx.py
index 544126be..2499aeae 100644
--- a/fastNLP/modules/prototype/Word2Idx.py
+++ b/fastNLP/modules/prototype/Word2Idx.py
@@ -4,15 +4,15 @@ import pickle
 class Word2Idx():
     """
     Build a word index according to word frequency.
+
     If "min_freq" is given, then only words with a frequncy not lesser than min_freq will be kept.
     If "max_num" is given, then at most the most frequent $max_num words will be kept.
     "words" should be a list [ w_1,w_2,...,w_i,...,w_n ] where each w_i is a string representing a word.
-    
     num is the size of the lookup table.
     w2i is a lookup table assigning each word an index.
-    Note that index 0 will be returned for any unregistered words.
     i2w is a vector which serves as an invert mapping of w2i.
-    Token "<UNK>" will be returned for index 0
+    Note that index 0 is token "<PAD>" for padding
+    index 1 is token "<UNK>" for unregistered words
     e.g. i2w[w2i["word"]] == "word"
     """
     def __init__(self):
@@ -29,29 +29,30 @@ class Word2Idx():
         else:
             most_common = counter.most_common()
         self.__w2i = dict((w[0],i + 1) for i,w in enumerate(most_common) if w[1] >= min_freq)
-        self.__w2i["<UNK>"] = 0
-        self.__i2w = ["<UNK>"] + [ w[0] for w in most_common if w[1] >= min_freq ]
+        self.__w2i["<PAD>"] = 0
+        self.__w2i["<UNK>"] = 1
+        self.__i2w = ["<PAD>", "<UNK>"] + [ w[0] for w in most_common if w[1] >= min_freq ]
         self.num = len(self.__i2w)
 
-    def w2i(self,word):
+    def w2i(self, word):
         """word to index"""
         if word in self.__w2i:
             return self.__w2i[word]
         return 0
 
-    def i2w(self,idx):
+    def i2w(self, idx):
         """index to word"""
         if idx >= self.num:
             raise Exception("out of range\n")
         return self.__i2w[idx]
 
-    def save(self,addr):
+    def save(self, addr):
         """save the model to a file with address "addr" """
         f = open(addr,"wb")
         pickle.dump([self.__i2w, self.__w2i, self.num], f)
         f.close()
 
-    def load(self,addr):
+    def load(self, addr):
         """load a model from a file with address "addr" """
         f = open(addr,"rb")
         paras = pickle.load(f)
diff --git a/fastNLP/modules/prototype/aggregation.py b/fastNLP/modules/prototype/aggregation.py
index e87862b8..59e50e99 100644
--- a/fastNLP/modules/prototype/aggregation.py
+++ b/fastNLP/modules/prototype/aggregation.py
@@ -1,5 +1,6 @@
 import torch
 import torch.nn as nn
+from torch.autograd import Variable
 
 class Selfattention(nn.Module):
     """
@@ -32,10 +33,8 @@ class Selfattention(nn.Module):
     def forward(self, x):
         inter = self.tanh(torch.matmul(self.W_s1, torch.transpose(x, 1, 2)))
         A = self.softmax(torch.matmul(self.W_s2, inter))
-        out = torch.matmul(A, H)
+        out = torch.matmul(A, x)
         out = out.view(out.size(0), -1)
         penalty = self.penalization(A)
         return out, penalty
 
-if __name__ == "__main__":
-    model = Selfattention(100, 10, 20)
diff --git a/fastNLP/modules/prototype/dataloader.py b/fastNLP/modules/prototype/dataloader.py
index a7eafdc2..af5cd8b8 100644
--- a/fastNLP/modules/prototype/dataloader.py
+++ b/fastNLP/modules/prototype/dataloader.py
@@ -32,10 +32,10 @@ def pad(X, using_cuda):
             padlen = maxlen - x.size(0)
             if padlen > 0:
                 if using_cuda:
-                    paddings = torch.zeros(padlen).cuda()
+                    paddings = Variable(torch.zeros(padlen).long()).cuda()
                 else:
-                    paddings = torch.zeros(padlen)
-                x_ = torch.cat(x, paddings)
+                    paddings = Variable(torch.zeros(padlen).long())
+                x_ = torch.cat((x, paddings), 0)
                 Y.append(x_)
             else:
                 Y.append(x)
@@ -71,12 +71,11 @@ class DataLoader(object):
                 random.shuffle(self.data)
             raise StopIteration()
         else:
-            X = self.data[self.count * self.batch_size : (self.count + 1) * self.batch_size]
+            batch = self.data[self.count * self.batch_size : (self.count + 1) * self.batch_size]
             self.count += 1
-            X = [long_wrapper(x["sent"], using_cuda=self.using_cuda) for x in X]
+            X = [long_wrapper(x["sent"], using_cuda=self.using_cuda, requires_grad=False) for x in batch]
             X = pad(X, self.using_cuda)
-            y = [long_wrapper(x["class"], using_cuda=self.using_cuda) for x in X]
-            y = torch.stack(y)
+            y = long_wrapper([x["class"] for x in batch], using_cuda=self.using_cuda, requires_grad=False)
             return {"feature" : X, "class" : y}
             
 
diff --git a/fastNLP/modules/prototype/encoder.py b/fastNLP/modules/prototype/encoder.py
index 249eaf8c..142496e1 100644
--- a/fastNLP/modules/prototype/encoder.py
+++ b/fastNLP/modules/prototype/encoder.py
@@ -20,6 +20,3 @@ class Lstm(nn.Module):
     def forward(self, x):
         x, _ = self.lstm(x)
         return x
-
-if __name__ == "__main__":
-    model = Lstm(20, 30, 1, 0.5, False)
diff --git a/fastNLP/modules/prototype/example.py b/fastNLP/modules/prototype/example.py
index 9dffc59a..782937fe 100644
--- a/fastNLP/modules/prototype/example.py
+++ b/fastNLP/modules/prototype/example.py
@@ -8,13 +8,13 @@ import torch.optim as optim
 import time
 import dataloader
 
+WORD_NUM = 357361
 WORD_SIZE = 100
 HIDDEN_SIZE = 300
 D_A = 350
-R = 20
+R = 10
 MLP_HIDDEN = 2000 
 CLASSES_NUM = 5
-WORD_NUM = 357361
 
 class Net(nn.Module):
     """
@@ -32,7 +32,7 @@ class Net(nn.Module):
         x = self.encoder(x)
         x, penalty = self.aggregation(x)
         x = self.predict(x)
-        return r, x
+        return x, penalty
 
 def train(model_dict=None, using_cuda=True, learning_rate=0.06,\
     momentum=0.3, batch_size=32, epochs=5, coef=1.0, interval=10):
@@ -50,7 +50,7 @@ def train(model_dict=None, using_cuda=True, learning_rate=0.06,\
     the result will be saved with a form "model_dict_+current time", which could be used for further training
     """
     
-    if using_cuda == True:
+    if using_cuda:
         net = Net().cuda()
     else:
         net = Net()
@@ -60,7 +60,7 @@ def train(model_dict=None, using_cuda=True, learning_rate=0.06,\
 
     optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=momentum)
     criterion = nn.CrossEntropyLoss()
-    dataset = dataloader.DataLoader("trainset.pkl", using_cuda=using_cuda)
+    dataset = dataloader.DataLoader("test_set.pkl", batch_size, using_cuda=using_cuda)
 
     #statistics
     loss_count = 0
@@ -69,6 +69,7 @@ def train(model_dict=None, using_cuda=True, learning_rate=0.06,\
     count = 0
 
     for epoch in range(epochs):
+        print("epoch: %d"%(epoch))
         for i, batch in enumerate(dataset):
             t1 = time.time()
             X = batch["feature"]
@@ -86,23 +87,43 @@ def train(model_dict=None, using_cuda=True, learning_rate=0.06,\
             loss_count += torch.sum(y_penl).data[0]
             prepare_time += (t2 - t1)
             run_time += (t3 - t2)
-            p, idx = torch.max(y_pred, dim=1)
-            idx = idx.data
-            count += torch.sum(torch.eq(idx.cpu(), y))
+            p, idx = torch.max(y_pred.data, dim=1)
+            count += torch.sum(torch.eq(idx.cpu(), y.data.cpu()))
 
-            if i % interval == 0:
-                print(i)      
-                print("loss count:" + str(loss_count / batch_size))
-                print("acuracy:" + str(count / batch_size))
+            if (i + 1) % interval == 0:
+                print("epoch : %d, iters: %d"%(epoch, i + 1))     
+                print("loss count:" + str(loss_count / (interval * batch_size)))
+                print("acuracy:" + str(count / (interval * batch_size)))
                 print("penalty:" + str(torch.sum(y_penl).data[0] / batch_size))
-                print("prepare time:" + str(prepare_time / batch_size))
-                print("run time:" + str(run_time / batch_size))
+                print("prepare time:" + str(prepare_time))
+                print("run time:" + str(run_time))
                 prepare_time = 0
                 run_time = 0
                 loss_count = 0
                 count = 0
-        torch.save(net.state_dict(), "model_dict_%s.pkl"%(str(time.time())))
+        string = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())
+        torch.save(net.state_dict(), "model_dict_%s.dict"%(string))
+
+def test(model_dict, using_cuda=True):
+    if using_cuda:
+        net = Net().cuda()
+    else:
+        net = Net()
+    net.load_state_dict(torch.load(model_dict))
+    dataset = dataloader.DataLoader("test_set.pkl", batch_size=1, using_cuda=using_cuda)
+    count = 0
+    for i, batch in enumerate(dataset):
+        X = batch["feature"]
+        y = batch["class"]
+        y_pred, _ = net(X)
+        p, idx = torch.max(y_pred.data, dim=1)
+        count += torch.sum(torch.eq(idx.cpu(), y.data.cpu()))
+    print("accuracy: %f"%(count / dataset.num))
+        
 
 if __name__ == "__main__":
     train(using_cuda=torch.cuda.is_available())
+    
+    
+    
 
diff --git a/fastNLP/modules/prototype/predict.py b/fastNLP/modules/prototype/predict.py
index c8e72629..d5346c0e 100644
--- a/fastNLP/modules/prototype/predict.py
+++ b/fastNLP/modules/prototype/predict.py
@@ -1,5 +1,6 @@
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 
 class MLP(nn.Module):
     """
@@ -15,7 +16,6 @@ class MLP(nn.Module):
         super(MLP,self).__init__()
         self.L1 = nn.Linear(input_size, hidden_size)
         self.L2 = nn.Linear(hidden_size, output_size)
-        self.softmax = nn.Softmax(dim=1)
 
     def forward(self, x):
         out = self.L2(F.relu(self.L1(x)))
diff --git a/fastNLP/modules/prototype/prepare.py b/fastNLP/modules/prototype/prepare.py
new file mode 100644
index 00000000..02fd19c5
--- /dev/null
+++ b/fastNLP/modules/prototype/prepare.py
@@ -0,0 +1,50 @@
+import pickle
+import Word2Idx
+
+def get_sets(m, n):
+    """
+    get a train set containing m samples and a test set containing n samples
+    """
+    samples = pickle.load(open("tuples.pkl","rb"))
+    if m+n > len(samples):
+        print("asking for too many tuples\n")
+        return
+    train_samples = samples[ : m]
+    test_samples = samples[m: m+n]
+    return train_samples, test_samples
+
+def build_wordidx():
+    """
+    build wordidx using word2idx
+    """
+    train, test = get_sets(500000, 2000)
+    words = []
+    for x in train:
+        words += x[0]
+    wordidx = Word2Idx.Word2Idx()
+    wordidx.build(words)
+    print(wordidx.num)
+    print(wordidx.i2w(0))
+    wordidx.save("wordidx.pkl")
+
+def build_sets():
+    """
+    build train set and test set, transform word to index
+    """
+    train, test = get_sets(500000, 2000)
+    wordidx = Word2Idx.Word2Idx()
+    wordidx.load("wordidx.pkl")
+    train_set = []
+    for x in train:
+        sent = [wordidx.w2i(w) for w in x[0]]
+        train_set.append({"sent" : sent, "class" : x[1]})
+    test_set = []
+    for x in test:
+        sent = [wordidx.w2i(w) for w in x[0]]
+        test_set.append({"sent" : sent, "class" : x[1]})
+    pickle.dump(train_set, open("train_set.pkl", "wb"))
+    pickle.dump(test_set, open("test_set.pkl", "wb"))
+
+if __name__ == "__main__":
+    build_wordidx()
+    build_sets()

From f585a9aa7df9b73e757dd51526a45bf3380b2ead Mon Sep 17 00:00:00 2001
From: HENRY L <henryL7>
Date: Mon, 2 Jul 2018 02:49:55 +0800
Subject: [PATCH 06/13] update

---
 fastNLP/modules/prototype/example.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fastNLP/modules/prototype/example.py b/fastNLP/modules/prototype/example.py
index 782937fe..a19898c6 100644
--- a/fastNLP/modules/prototype/example.py
+++ b/fastNLP/modules/prototype/example.py
@@ -60,7 +60,7 @@ def train(model_dict=None, using_cuda=True, learning_rate=0.06,\
 
     optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=momentum)
     criterion = nn.CrossEntropyLoss()
-    dataset = dataloader.DataLoader("test_set.pkl", batch_size, using_cuda=using_cuda)
+    dataset = dataloader.DataLoader("train_set.pkl", batch_size, using_cuda=using_cuda)
 
     #statistics
     loss_count = 0

From b297f93537600760ad6026808fe778cf2667fac6 Mon Sep 17 00:00:00 2001
From: Ke Zhen <keezen@qq.com>
Date: Mon, 2 Jul 2018 12:03:45 +0800
Subject: [PATCH 07/13] add conv and pooling module

---
 fastNLP/modules/convolution/AvgPool1d.py | 22 +++++++++++++++++++
 fastNLP/modules/convolution/Conv1d.py    | 28 ++++++++++++++++++++++++
 fastNLP/modules/convolution/MaxPool1d.py | 23 +++++++++++++++++++
 3 files changed, 73 insertions(+)
 create mode 100644 fastNLP/modules/convolution/AvgPool1d.py
 create mode 100644 fastNLP/modules/convolution/Conv1d.py
 create mode 100644 fastNLP/modules/convolution/MaxPool1d.py

diff --git a/fastNLP/modules/convolution/AvgPool1d.py b/fastNLP/modules/convolution/AvgPool1d.py
new file mode 100644
index 00000000..c427fc9a
--- /dev/null
+++ b/fastNLP/modules/convolution/AvgPool1d.py
@@ -0,0 +1,22 @@
+# python: 3.6
+# encoding: utf-8
+
+import torch.nn as nn
+# import torch.nn.functional as F
+
+
+class AvgPool1d(nn.Module):
+    """1-d average pooling module."""
+
+    def __init__(self, kernel_size, stride=None, padding=0,
+                 ceil_mode=False, count_include_pad=True):
+        super(AvgPool1d, self).__init__()
+        self.pool = nn.AvgPool1d(
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            ceil_mode=ceil_mode,
+            count_include_pad=count_include_pad)
+
+    def forward(self, x):
+        return self.pool(x)
diff --git a/fastNLP/modules/convolution/Conv1d.py b/fastNLP/modules/convolution/Conv1d.py
new file mode 100644
index 00000000..60554a24
--- /dev/null
+++ b/fastNLP/modules/convolution/Conv1d.py
@@ -0,0 +1,28 @@
+# python: 3.6
+# encoding: utf-8
+
+import torch.nn as nn
+# import torch.nn.functional as F
+
+
+class Conv1d(nn.Module):
+    """
+    Basic 1-d convolution module.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size,
+                 stride=1, padding=0, dilation=1,
+                 groups=1, bias=True):
+        super(Conv1d, self).__init__()
+        self.conv = nn.Conv1d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias)
+
+    def forward(self, x):
+        return self.conv(x)
diff --git a/fastNLP/modules/convolution/MaxPool1d.py b/fastNLP/modules/convolution/MaxPool1d.py
new file mode 100644
index 00000000..d1f39395
--- /dev/null
+++ b/fastNLP/modules/convolution/MaxPool1d.py
@@ -0,0 +1,23 @@
+# python: 3.6
+# encoding: utf-8
+
+import torch.nn as nn
+# import torch.nn.functional as F
+
+
+class MaxPool1d(nn.Module):
+    """1-d max-pooling module."""
+
+    def __init__(self, kernel_size, stride=None, padding=0,
+                 dilation=1, return_indices=False, ceil_mode=False):
+        super(MaxPool1d, self).__init__()
+        self.maxpool = nn.MaxPool1d(
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            return_indices=return_indices,
+            ceil_mode=ceil_mode)
+
+    def forward(self, x):
+        return self.maxpool(x)

From 2569c85c8e87c195b89e8fc58bd0088ae09cc12e Mon Sep 17 00:00:00 2001
From: Ke Zhen <keezen@qq.com>
Date: Mon, 2 Jul 2018 14:02:45 +0800
Subject: [PATCH 08/13] modify conv and pool module

---
 fastNLP/modules/convolution/AvgPool.py        | 24 +++++++++++++++++
 fastNLP/modules/convolution/AvgPool1d.py      | 22 ----------------
 .../convolution/{Conv1d.py => Conv.py}        |  6 ++---
 fastNLP/modules/convolution/MaxPool.py        | 26 +++++++++++++++++++
 fastNLP/modules/convolution/MaxPool1d.py      | 23 ----------------
 5 files changed, 53 insertions(+), 48 deletions(-)
 create mode 100644 fastNLP/modules/convolution/AvgPool.py
 delete mode 100644 fastNLP/modules/convolution/AvgPool1d.py
 rename fastNLP/modules/convolution/{Conv1d.py => Conv.py} (85%)
 create mode 100644 fastNLP/modules/convolution/MaxPool.py
 delete mode 100644 fastNLP/modules/convolution/MaxPool1d.py

diff --git a/fastNLP/modules/convolution/AvgPool.py b/fastNLP/modules/convolution/AvgPool.py
new file mode 100644
index 00000000..70e473df
--- /dev/null
+++ b/fastNLP/modules/convolution/AvgPool.py
@@ -0,0 +1,24 @@
+# python: 3.6
+# encoding: utf-8
+
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class AvgPool(nn.Module):
+    """1-d average pooling module."""
+
+    def __init__(self, stride=None, padding=0):
+        super(AvgPool, self).__init__()
+        self.stride = stride
+        self.padding = padding
+
+    def forward(self, x):
+        # [N,C,L] -> [N,C]
+        kernel_size = x.size(2)
+        x = F.max_pool1d(
+            input=x,
+            kernel_size=kernel_size,
+            stride=self.stride,
+            padding=self.padding)
+        return x.squeeze(dim=-1)
diff --git a/fastNLP/modules/convolution/AvgPool1d.py b/fastNLP/modules/convolution/AvgPool1d.py
deleted file mode 100644
index c427fc9a..00000000
--- a/fastNLP/modules/convolution/AvgPool1d.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# python: 3.6
-# encoding: utf-8
-
-import torch.nn as nn
-# import torch.nn.functional as F
-
-
-class AvgPool1d(nn.Module):
-    """1-d average pooling module."""
-
-    def __init__(self, kernel_size, stride=None, padding=0,
-                 ceil_mode=False, count_include_pad=True):
-        super(AvgPool1d, self).__init__()
-        self.pool = nn.AvgPool1d(
-            kernel_size=kernel_size,
-            stride=stride,
-            padding=padding,
-            ceil_mode=ceil_mode,
-            count_include_pad=count_include_pad)
-
-    def forward(self, x):
-        return self.pool(x)
diff --git a/fastNLP/modules/convolution/Conv1d.py b/fastNLP/modules/convolution/Conv.py
similarity index 85%
rename from fastNLP/modules/convolution/Conv1d.py
rename to fastNLP/modules/convolution/Conv.py
index 60554a24..a3a572d9 100644
--- a/fastNLP/modules/convolution/Conv1d.py
+++ b/fastNLP/modules/convolution/Conv.py
@@ -5,7 +5,7 @@ import torch.nn as nn
 # import torch.nn.functional as F
 
 
-class Conv1d(nn.Module):
+class Conv(nn.Module):
     """
     Basic 1-d convolution module.
     """
@@ -13,7 +13,7 @@ class Conv1d(nn.Module):
     def __init__(self, in_channels, out_channels, kernel_size,
                  stride=1, padding=0, dilation=1,
                  groups=1, bias=True):
-        super(Conv1d, self).__init__()
+        super(Conv, self).__init__()
         self.conv = nn.Conv1d(
             in_channels=in_channels,
             out_channels=out_channels,
@@ -25,4 +25,4 @@ class Conv1d(nn.Module):
             bias=bias)
 
     def forward(self, x):
-        return self.conv(x)
+        return self.conv(x)  # [N,C,L]
diff --git a/fastNLP/modules/convolution/MaxPool.py b/fastNLP/modules/convolution/MaxPool.py
new file mode 100644
index 00000000..12bdd96f
--- /dev/null
+++ b/fastNLP/modules/convolution/MaxPool.py
@@ -0,0 +1,26 @@
+# python: 3.6
+# encoding: utf-8
+
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class MaxPool(nn.Module):
+    """1-d max-pooling module."""
+
+    def __init__(self, stride=None, padding=0, dilation=1):
+        super(MaxPool, self).__init__()
+        self.stride = stride
+        self.padding = padding
+        self.dilation = dilation
+
+    def forward(self, x):
+        # [N,C,L] -> [N,C]
+        kernel_size = x.size(2)
+        x = F.max_pool1d(
+            input=x,
+            kernel_size=kernel_size,
+            stride=self.stride,
+            padding=self.padding,
+            dilation=self.dilation)
+        return x.squeeze(dim=-1)
diff --git a/fastNLP/modules/convolution/MaxPool1d.py b/fastNLP/modules/convolution/MaxPool1d.py
deleted file mode 100644
index d1f39395..00000000
--- a/fastNLP/modules/convolution/MaxPool1d.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# python: 3.6
-# encoding: utf-8
-
-import torch.nn as nn
-# import torch.nn.functional as F
-
-
-class MaxPool1d(nn.Module):
-    """1-d max-pooling module."""
-
-    def __init__(self, kernel_size, stride=None, padding=0,
-                 dilation=1, return_indices=False, ceil_mode=False):
-        super(MaxPool1d, self).__init__()
-        self.maxpool = nn.MaxPool1d(
-            kernel_size=kernel_size,
-            stride=stride,
-            padding=padding,
-            dilation=dilation,
-            return_indices=return_indices,
-            ceil_mode=ceil_mode)
-
-    def forward(self, x):
-        return self.maxpool(x)

From 7b7826544e3f59241290fe88a4830ad7ea7b6e43 Mon Sep 17 00:00:00 2001
From: Ke Zhen <keezen@qq.com>
Date: Mon, 2 Jul 2018 15:12:18 +0800
Subject: [PATCH 09/13] add kmax pooling module

---
 .../convolution/{AvgPool.py => avg_pool.py}   |  0
 fastNLP/modules/convolution/kmax_pool.py      | 20 +++++++++++++++++++
 .../convolution/{MaxPool.py => max_pool.py}   |  0
 3 files changed, 20 insertions(+)
 rename fastNLP/modules/convolution/{AvgPool.py => avg_pool.py} (100%)
 create mode 100644 fastNLP/modules/convolution/kmax_pool.py
 rename fastNLP/modules/convolution/{MaxPool.py => max_pool.py} (100%)

diff --git a/fastNLP/modules/convolution/AvgPool.py b/fastNLP/modules/convolution/avg_pool.py
similarity index 100%
rename from fastNLP/modules/convolution/AvgPool.py
rename to fastNLP/modules/convolution/avg_pool.py
diff --git a/fastNLP/modules/convolution/kmax_pool.py b/fastNLP/modules/convolution/kmax_pool.py
new file mode 100644
index 00000000..17fa9248
--- /dev/null
+++ b/fastNLP/modules/convolution/kmax_pool.py
@@ -0,0 +1,20 @@
+# python: 3.6
+# encoding: utf-8
+
+import torch
+import torch.nn as nn
+# import torch.nn.functional as F
+
+
+class KMaxPool(nn.Module):
+    """K max-pooling module."""
+
+    def __init__(self, k):
+        super(KMaxPool, self).__init__()
+        self.k = k
+
+    def forward(self, x):
+        # [N,C,L] -> [N,C*k]
+        x, index = torch.topk(x, self.k, dim=-1, sorted=False)
+        x = torch.reshape(x, (x.size(0), -1))
+        return x
diff --git a/fastNLP/modules/convolution/MaxPool.py b/fastNLP/modules/convolution/max_pool.py
similarity index 100%
rename from fastNLP/modules/convolution/MaxPool.py
rename to fastNLP/modules/convolution/max_pool.py

From d6187274be8759cc8dd74d7dab264e343ff28204 Mon Sep 17 00:00:00 2001
From: Zhen Ke <keezen@qq.com>
Date: Mon, 2 Jul 2018 15:16:17 +0800
Subject: [PATCH 10/13] Rename Conv.py to conv.py

---
 fastNLP/modules/convolution/{Conv.py => conv.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename fastNLP/modules/convolution/{Conv.py => conv.py} (100%)

diff --git a/fastNLP/modules/convolution/Conv.py b/fastNLP/modules/convolution/conv.py
similarity index 100%
rename from fastNLP/modules/convolution/Conv.py
rename to fastNLP/modules/convolution/conv.py

From ceffed6a1615cfbb7fe1520bdd6fd3f0d9670473 Mon Sep 17 00:00:00 2001
From: FengZiYjun <writerphone@163.com>
Date: Tue, 3 Jul 2018 09:00:29 +0800
Subject: [PATCH 11/13] update trainer: add sampling and padding in batchify,
 add pkl loading in prepare_input, check model loss in get_loss

---
 fastNLP/action/action.py  | 86 +++++++++++++++++++++++++++------------
 fastNLP/action/trainer.py | 71 ++++++++++++++++++++++++++------
 2 files changed, 119 insertions(+), 38 deletions(-)

diff --git a/fastNLP/action/action.py b/fastNLP/action/action.py
index 5512c7b1..ea12a37e 100644
--- a/fastNLP/action/action.py
+++ b/fastNLP/action/action.py
@@ -1,3 +1,4 @@
+import numpy as np
 
 
 class Action(object):
@@ -8,28 +9,63 @@ class Action(object):
     def __init__(self):
         super(Action, self).__init__()
 
-    def batchify(self, batch_size, X, Y=None):
-        """
-        :param batch_size: int
-        :param X: feature matrix of size [n_sample, m_feature]
-        :param Y: label vector of size [n_sample, 1] (optional)
-        :return iteration:int, the number of step in each epoch
-                 generator:generator, to generate batch inputs
-        """
-        n_samples = X.shape[0]
-        num_iter = n_samples // batch_size
-        if Y is None:
-            generator = self._batch_generate(batch_size, num_iter, X)
-        else:
-            generator = self._batch_generate(batch_size, num_iter, X, Y)
-        return num_iter, generator
-
-    @staticmethod
-    def _batch_generate(batch_size, num_iter, *data):
-        for step in range(num_iter):
-            start = batch_size * step
-            end = batch_size * (step + 1)
-            yield tuple([x[start:end] for x in data])
-
-    def make_log(self, *args):
-        return "log"
+
+class BaseSampler(object):
+    """
+        Base class for all samplers: stores the data set size; subclasses define the order.
+    """
+
+    def __init__(self, data_set):
+        self.data_set_length = len(data_set)  # only the size is kept, not the data itself
+
+    def __len__(self):
+        return self.data_set_length
+
+    def __iter__(self):
+        raise NotImplementedError  # subclasses return an iterator over sample indices
+
+
+class SequentialSampler(BaseSampler):
+    """
+    Sample data in the original order, i.e. yield indices 0 .. len(data_set) - 1.
+    """
+
+    def __init__(self, data_set):
+        super(SequentialSampler, self).__init__(data_set)
+
+    def __iter__(self):
+        return iter(range(self.data_set_length))  # a fresh iterator on every call
+
+
+class RandomSampler(BaseSampler):
+    """
+    Sample data in random permutation order; every iteration draws a new permutation.
+    """
+
+    def __init__(self, data_set):
+        super(RandomSampler, self).__init__(data_set)
+
+    def __iter__(self):
+        return iter(np.random.permutation(self.data_set_length))  # shuffled 0 .. length - 1
+
+
+class Batchifier(object):
+    """
+    Group the indices produced by a sampler into mini-batches (lists of indices).
+    """
+
+    def __init__(self, sampler, batch_size, drop_last=True):
+        super(Batchifier, self).__init__()
+        self.sampler = sampler  # any iterable of sample indices
+        self.batch_size = batch_size  # number of indices in a full batch
+        self.drop_last = drop_last  # True: discard a trailing batch that is not full
+
+    def __iter__(self):
+        batch = []
+        for idx in self.sampler:
+            batch.append(idx)
+            if len(batch) == self.batch_size:
+                yield batch
+                batch = []
+        if 0 < len(batch) < self.batch_size and not self.drop_last:  # never yield an empty batch
+            yield batch
diff --git a/fastNLP/action/trainer.py b/fastNLP/action/trainer.py
index 0bbcccd7..8b9eb717 100644
--- a/fastNLP/action/trainer.py
+++ b/fastNLP/action/trainer.py
@@ -1,9 +1,11 @@
+import pickle
 from collections import namedtuple
 
 import numpy as np
 import torch
 
 from fastNLP.action.action import Action
+from fastNLP.action.action import RandomSampler, Batchifier
 from fastNLP.action.tester import Tester
 
 
@@ -31,8 +33,10 @@ class BaseTrainer(Action):
         self.validate = train_args.validate
         self.batch_size = train_args.batch_size
         self.model = None
+        self.iterator = None
+        self.loss_func = None
 
-    def train(self, network, train_data, dev_data=None):
+    def train(self, network):
         """General training loop.
         :param network: a model
         :param train_data: raw data for training
@@ -50,22 +54,21 @@ class BaseTrainer(Action):
         Subclasses must implement these methods with a specific framework.
         """
         self.model = network
-        train_x, train_y = self.prepare_input(train_data)
-
-        iterations, train_batch_generator = self.batchify(self.batch_size, train_x, train_y)
+        data_train, data_dev, data_test, embedding = self.prepare_input("./save/")
 
         test_args = Tester.TestConfig(save_output=True, validate_in_training=True,
                                       save_dev_input=True, save_loss=True, batch_size=self.batch_size)
         evaluator = Tester(test_args)
 
         best_loss = 1e10
+        iterations = len(data_train) // self.batch_size
 
         for epoch in range(self.n_epochs):
-            self.mode(test=False)  # turn on the train mode
+            self.mode(test=False)
 
             self.define_optimizer()
             for step in range(iterations):
-                batch_x, batch_y = train_batch_generator.__next__()
+                batch_x, batch_y = self.batchify(self.batch_size, data_train)
 
                 prediction = self.data_forward(network, batch_x)
 
@@ -74,21 +77,23 @@ class BaseTrainer(Action):
                 self.update()
 
             if self.validate:
-                if dev_data is None:
+                if data_dev is None:
                     raise RuntimeError("No validation data provided.")
-                evaluator.test(network, dev_data)
+                evaluator.test(network, data_dev)
                 if evaluator.loss < best_loss:
                     best_loss = evaluator.loss
 
         # finish training
 
-    def prepare_input(self, data):
+    def prepare_input(self, data_path):
         """
-        Perform data transformation from raw input to vector/matrix inputs.
-        :param data: raw inputs
-        :return (X, Y): tuple, input features and labels
+            To do: Load pkl files of train/dev/test and embedding
         """
-        raise NotImplementedError
+        data_train = pickle.load(open(data_path + "data_train.pkl", "rb"))
+        data_dev = pickle.load(open(data_path + "data_dev.pkl", "rb"))
+        data_test = pickle.load(open(data_path + "data_test.pkl", "rb"))
+        embedding = pickle.load(open(data_path + "embedding.pkl", "rb"))
+        return data_train, data_dev, data_test, embedding
 
     def mode(self, test=False):
         """
@@ -138,8 +143,48 @@ class BaseTrainer(Action):
         :param truth: ground truth label vector
         :return: a scalar
         """
+        if self.loss_func is None:
+            if hasattr(self.model, "loss"):
+                self.loss_func = self.model.loss
+            else:
+                self.loss_func = self.define_loss()
+        return self.loss_func(predict, truth)
+
+    def define_loss(self):
         raise NotImplementedError
 
+    def batchify(self, batch_size, data):
+        """
+        Perform batching from data and produce a batch of training data.
+        Add padding.
+        :param batch_size:
+        :param data:
+        :param pad:
+        :return: batch_x, batch_y
+        """
+        if self.iterator is None:
+            self.iterator = iter(Batchifier(RandomSampler(data), batch_size, drop_last=True))
+        indices = next(self.iterator, None)
+        if indices is None:  # previous pass is exhausted: reshuffle and start the next epoch
+            self.iterator = iter(Batchifier(RandomSampler(data), batch_size, drop_last=True))
+            indices = next(self.iterator)
+        batch_x = self.pad([data[idx][0] for idx in indices])
+        return batch_x, [data[idx][1] for idx in indices]
+
+    @staticmethod
+    def pad(batch, fill=0):
+        """
+        Pad every sample of a batch, in place, to the length of the longest sample.
+        :param batch: list of list; shorter samples are replaced by padded copies
+        :param fill: word index appended once per missing position, default 0.
+        :return: the same batch object, all samples having equal length ([] stays [])
+        """
+        max_length = max((len(x) for x in batch), default=0)  # default: empty batch is fine
+        for idx, sample in enumerate(batch):
+            if len(sample) < max_length:
+                batch[idx] = sample + [fill] * (max_length - len(sample))
+        return batch
+
 
 class ToyTrainer(BaseTrainer):
     """A simple trainer for a PyTorch model."""

From 982503d03329b9942ef2fb143cb6f7e8e176e65a Mon Sep 17 00:00:00 2001
From: FengZiYjun <writerphone@163.com>
Date: Wed, 4 Jul 2018 22:56:24 +0800
Subject: [PATCH 12/13] optimize code style

---
 fastNLP/loader/base_preprocess.py | 35 ---------------
 fastNLP/loader/config_loader.py   | 11 +++--
 fastNLP/loader/dataset_loader.py  |  1 -
 fastNLP/loader/preprocess.py      | 73 +++++++++++++++++++++----------
 fastNLP/saver/base_saver.py       | 14 ++++++
 fastNLP/saver/logger.py           | 12 +++++
 fastNLP/saver/model_saver.py      |  8 ++++
 7 files changed, 88 insertions(+), 66 deletions(-)
 delete mode 100644 fastNLP/loader/base_preprocess.py
 create mode 100644 fastNLP/saver/base_saver.py
 create mode 100644 fastNLP/saver/logger.py
 create mode 100644 fastNLP/saver/model_saver.py

diff --git a/fastNLP/loader/base_preprocess.py b/fastNLP/loader/base_preprocess.py
deleted file mode 100644
index 806fbd18..00000000
--- a/fastNLP/loader/base_preprocess.py
+++ /dev/null
@@ -1,35 +0,0 @@
-
-
-class BasePreprocess(object):
-
-
-    def __init__(self, data, pickle_path):
-        super(BasePreprocess, self).__init__()
-        self.data = data
-        self.pickle_path = pickle_path
-        if not self.pickle_path.endswith('/'):
-            self.pickle_path = self.pickle_path + '/'
-
-    def word2id(self):
-        raise NotImplementedError
-
-    def id2word(self):
-        raise NotImplementedError
-
-    def class2id(self):
-        raise NotImplementedError
-
-    def id2class(self):
-        raise NotImplementedError
-
-    def embedding(self):
-        raise NotImplementedError
-
-    def data_train(self):
-        raise NotImplementedError
-
-    def data_dev(self):
-        raise NotImplementedError
-
-    def data_test(self):
-        raise NotImplementedError
diff --git a/fastNLP/loader/config_loader.py b/fastNLP/loader/config_loader.py
index 371de4f1..e57d9891 100644
--- a/fastNLP/loader/config_loader.py
+++ b/fastNLP/loader/config_loader.py
@@ -1,9 +1,8 @@
-from fastNLP.loader.base_loader import BaseLoader
-
 import configparser
-import traceback
 import json
 
+from fastNLP.loader.base_loader import BaseLoader
+
 
 class ConfigLoader(BaseLoader):
     """loader for configuration files"""
@@ -17,14 +16,14 @@ class ConfigLoader(BaseLoader):
         raise NotImplementedError
 
     @staticmethod
-    def loadConfig(filePath, sections):
+    def load_config(file_path, sections):
         """
-        :param filePath: the path of config file
+        :param file_path: the path of config file
         :param sections: the dict of sections
         :return:
         """
         cfg = configparser.ConfigParser()
-        cfg.read(filePath)
+        cfg.read(file_path)
         for s in sections:
             attr_list = [i for i in type(sections[s]).__dict__.keys() if
                          not callable(getattr(sections[s], i)) and not i.startswith("__")]
diff --git a/fastNLP/loader/dataset_loader.py b/fastNLP/loader/dataset_loader.py
index f8bcb276..7132eb3b 100644
--- a/fastNLP/loader/dataset_loader.py
+++ b/fastNLP/loader/dataset_loader.py
@@ -30,7 +30,6 @@ class POSDatasetLoader(DatasetLoader):
         return lines
 
 
-
 class ClassificationDatasetLoader(DatasetLoader):
     """loader for classfication data sets"""
 
diff --git a/fastNLP/loader/preprocess.py b/fastNLP/loader/preprocess.py
index 8e880107..b8d88c35 100644
--- a/fastNLP/loader/preprocess.py
+++ b/fastNLP/loader/preprocess.py
@@ -1,25 +1,57 @@
-import pickle
 import _pickle
 import os
 
-from fastNLP.loader.base_preprocess import BasePreprocess
-
-DEFAULT_PADDING_LABEL = '<pad>'             #dict index = 0
-DEFAULT_UNKNOWN_LABEL = '<unk>'             #dict index = 1
+DEFAULT_PADDING_LABEL = '<pad>'  # dict index = 0
+DEFAULT_UNKNOWN_LABEL = '<unk>'  # dict index = 1
 DEFAULT_RESERVED_LABEL = ['<reserved-2>',
                           '<reserved-3>',
-                          '<reserved-4>']   #dict index = 2~4
-#the first vocab in dict with the index = 5
+                          '<reserved-4>']  # dict index = 2~4
+
+
+# the first vocab in dict with the index = 5
+
+
+class BasePreprocess(object):
+    """Base class of preprocessors: keeps the raw data and a '/'-terminated pickle path."""
+    def __init__(self, data, pickle_path):
+        super(BasePreprocess, self).__init__()
+        self.data = data
+        self.pickle_path = pickle_path
+        if not self.pickle_path.endswith('/'):  # guarantee a trailing '/'
+            self.pickle_path = self.pickle_path + '/'
+
+    def word2id(self):
+        raise NotImplementedError  # this and the hooks below are left to subclasses
+
+    def id2word(self):
+        raise NotImplementedError
+
+    def class2id(self):
+        raise NotImplementedError
+
+    def id2class(self):
+        raise NotImplementedError
 
+    def embedding(self):
+        raise NotImplementedError
+
+    def data_train(self):
+        raise NotImplementedError
+
+    def data_dev(self):
+        raise NotImplementedError
+
+    def data_test(self):
+        raise NotImplementedError
 
 
 class POSPreprocess(BasePreprocess):
 
     """
         This class are used to preprocess the pos datasets.
-        In these datasets, each line are divided by '\t'
-    while the first Col is the vocabulary and the second
-    Col is the label.
+        In these datasets, each line is divided by '\t'
+        The first Col is the vocabulary.
+        The second Col is the labels.
         Different sentence are divided by an empty line.
         e.g:
         Tom label1
@@ -36,7 +68,9 @@ class POSPreprocess(BasePreprocess):
     """
 
     def __init__(self, data, pickle_path):
-        super(POSPreprocess, self).__init(data, pickle_path)
+        super(POSPreprocess, self).__init__(data, pickle_path)
+        self.word_dict = None
+        self.label_dict = None
         self.build_dict()
         self.word2id()
         self.id2word()
@@ -46,8 +80,6 @@ class POSPreprocess(BasePreprocess):
         self.data_train()
         self.data_dev()
         self.data_test()
-        #...
-
 
     def build_dict(self):
         self.word_dict = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1,
@@ -68,7 +100,6 @@ class POSPreprocess(BasePreprocess):
                     index = len(self.label_dict)
                     self.label_dict[label] = index
 
-
     def pickle_exist(self, pickle_name):
         """
         :param pickle_name: the filename of target pickle file
@@ -82,7 +113,6 @@ class POSPreprocess(BasePreprocess):
         else:
             return False
 
-
     def word2id(self):
         if self.pickle_exist("word2id.pkl"):
             return
@@ -92,11 +122,10 @@ class POSPreprocess(BasePreprocess):
         with open(file_name, "wb", encoding='utf-8') as f:
             _pickle.dump(self.word_dict, f)
 
-
     def id2word(self):
         if self.pickle_exist("id2word.pkl"):
             return
-        #nothing will be done if id2word.pkl exists
+        # nothing will be done if id2word.pkl exists
 
         id2word_dict = {}
         for word in self.word_dict:
@@ -105,7 +134,6 @@ class POSPreprocess(BasePreprocess):
         with open(file_name, "wb", encoding='utf-8') as f:
             _pickle.dump(id2word_dict, f)
 
-
     def class2id(self):
         if self.pickle_exist("class2id.pkl"):
             return
@@ -115,11 +143,10 @@ class POSPreprocess(BasePreprocess):
         with open(file_name, "wb", encoding='utf-8') as f:
             _pickle.dump(self.label_dict, f)
 
-
     def id2class(self):
         if self.pickle_exist("id2class.pkl"):
             return
-        #nothing will be done if id2class.pkl exists
+        # nothing will be done if id2class.pkl exists
 
         id2class_dict = {}
         for label in self.label_dict:
@@ -128,17 +155,15 @@ class POSPreprocess(BasePreprocess):
         with open(file_name, "wb", encoding='utf-8') as f:
             _pickle.dump(id2class_dict, f)
 
-
     def embedding(self):
         if self.pickle_exist("embedding.pkl"):
             return
-        #nothing will be done if embedding.pkl exists
-
+        # nothing will be done if embedding.pkl exists
 
     def data_train(self):
         if self.pickle_exist("data_train.pkl"):
             return
-        #nothing will be done if data_train.pkl exists
+        # nothing will be done if data_train.pkl exists
 
         data_train = []
         sentence = []
diff --git a/fastNLP/saver/base_saver.py b/fastNLP/saver/base_saver.py
new file mode 100644
index 00000000..d721da2c
--- /dev/null
+++ b/fastNLP/saver/base_saver.py
@@ -0,0 +1,14 @@
+class BaseSaver(object):
+    """Base class for all savers: stores save_path; the save/compress hooks are abstract."""
+
+    def __init__(self, save_path):
+        self.save_path = save_path  # destination used by concrete savers
+
+    def save_bytes(self):
+        raise NotImplementedError  # to be implemented by subclasses
+
+    def save_str(self):
+        raise NotImplementedError  # to be implemented by subclasses
+
+    def compress(self):
+        raise NotImplementedError  # to be implemented by subclasses
diff --git a/fastNLP/saver/logger.py b/fastNLP/saver/logger.py
new file mode 100644
index 00000000..be38de40
--- /dev/null
+++ b/fastNLP/saver/logger.py
@@ -0,0 +1,12 @@
+from saver.base_saver import BaseSaver
+
+
+class Logger(BaseSaver):
+    """Logger that appends text to the file at save_path."""
+
+    def __init__(self, save_path):
+        super(Logger, self).__init__(save_path)
+
+    def log(self, string):
+        with open(self.save_path, "a") as f:  # append mode: earlier entries are kept
+            f.write(string)  # written verbatim; no newline is added
diff --git a/fastNLP/saver/model_saver.py b/fastNLP/saver/model_saver.py
new file mode 100644
index 00000000..3b3cbeca
--- /dev/null
+++ b/fastNLP/saver/model_saver.py
@@ -0,0 +1,8 @@
+from saver.base_saver import BaseSaver
+
+
+class ModelSaver(BaseSaver):
+    """Saver for models; at this point it only stores the save path."""
+
+    def __init__(self, save_path):
+        super(ModelSaver, self).__init__(save_path)

From 7ea015c0f96b27bcb6091154adfac4ffae563766 Mon Sep 17 00:00:00 2001
From: FengZiYjun <writerphone@163.com>
Date: Wed, 4 Jul 2018 23:28:48 +0800
Subject: [PATCH 13/13] update trainer: loading data with _pickle; add
 arguments comments.

---
 fastNLP/action/trainer.py | 51 ++++++++++++++++++++++-----------------
 1 file changed, 29 insertions(+), 22 deletions(-)

diff --git a/fastNLP/action/trainer.py b/fastNLP/action/trainer.py
index 8b9eb717..437ab7d2 100644
--- a/fastNLP/action/trainer.py
+++ b/fastNLP/action/trainer.py
@@ -1,4 +1,4 @@
-import pickle
+import _pickle
 from collections import namedtuple
 
 import numpy as np
@@ -21,8 +21,7 @@ class BaseTrainer(Action):
         - grad_backward
         - get_loss
     """
-    TrainConfig = namedtuple("config", ["epochs", "validate", "save_when_better",
-                                        "log_per_step", "log_validation", "batch_size"])
+    TrainConfig = namedtuple("config", ["epochs", "validate", "batch_size", "pickle_path"])
 
     def __init__(self, train_args):
         """
@@ -32,6 +31,7 @@ class BaseTrainer(Action):
         self.n_epochs = train_args.epochs
         self.validate = train_args.validate
         self.batch_size = train_args.batch_size
+        self.pickle_path = train_args.pickle_path
         self.model = None
         self.iterator = None
         self.loss_func = None
@@ -39,8 +39,6 @@ class BaseTrainer(Action):
     def train(self, network):
         """General training loop.
         :param network: a model
-        :param train_data: raw data for training
-        :param dev_data: raw data for validation
 
         The method is framework independent.
         Work by calling the following methods:
@@ -54,7 +52,7 @@ class BaseTrainer(Action):
         Subclasses must implement these methods with a specific framework.
         """
         self.model = network
-        data_train, data_dev, data_test, embedding = self.prepare_input("./save/")
+        data_train, data_dev, data_test, embedding = self.prepare_input(self.pickle_path)
 
         test_args = Tester.TestConfig(save_output=True, validate_in_training=True,
                                       save_dev_input=True, save_loss=True, batch_size=self.batch_size)
@@ -89,10 +87,10 @@ class BaseTrainer(Action):
         """
             To do: Load pkl files of train/dev/test and embedding
         """
-        data_train = pickle.load(open(data_path + "data_train.pkl", "rb"))
-        data_dev = pickle.load(open(data_path + "data_dev.pkl", "rb"))
-        data_test = pickle.load(open(data_path + "data_test.pkl", "rb"))
-        embedding = pickle.load(open(data_path + "embedding.pkl", "rb"))
+        data_train = _pickle.load(open(data_path + "data_train.pkl", "rb"))
+        data_dev = _pickle.load(open(data_path + "data_dev.pkl", "rb"))
+        data_test = _pickle.load(open(data_path + "data_test.pkl", "rb"))
+        embedding = _pickle.load(open(data_path + "embedding.pkl", "rb"))
         return data_train, data_dev, data_test, embedding
 
     def mode(self, test=False):
@@ -147,20 +145,30 @@ class BaseTrainer(Action):
             if hasattr(self.model, "loss"):
                 self.loss_func = self.model.loss
             else:
-                self.loss_func = self.define_loss()
+                self.define_loss()
         return self.loss_func(predict, truth)
 
     def define_loss(self):
+        """
+            Subclasses must assign an instance of a loss function to self.loss_func.
+            E.g. self.loss_func = nn.CrossEntropyLoss()
+        """
         raise NotImplementedError
 
     def batchify(self, batch_size, data):
         """
-        Perform batching from data and produce a batch of training data.
-        Add padding.
-        :param batch_size:
-        :param data:
-        :param pad:
-        :return: batch_x, batch_y
+        1. Perform batching from data and produce a batch of training data.
+        2. Add padding.
+        :param batch_size: int, the size of a batch
+        :param data: list. Each entry is a sample, which is also a list of features and label(s).
+            E.g.
+                [
+                    [[feature_1, feature_2, feature_3], [label_1, label_2]],  # sample 1
+                    [[feature_1, feature_2, feature_3], [label_1, label_2]],  # sample 2
+                    ...
+                ]
+        :return batch_x: list. Each entry is a list of features of a sample.
+                 batch_y: list. Each entry is a list of labels of a sample.
         """
         if self.iterator is None:
             self.iterator = iter(Batchifier(RandomSampler(data), batch_size, drop_last=True))
@@ -306,8 +314,7 @@ class WordSegTrainer(BaseTrainer):
 
 
 if __name__ == "__name__":
-    Config = namedtuple("config", ["epochs", "validate", "save_when_better", "log_per_step",
-                                   "log_validation", "batch_size"])
-    train_config = Config(epochs=5, validate=True, save_when_better=True, log_per_step=10, log_validation=True,
-                          batch_size=32)
-    trainer = ToyTrainer(train_config)
+    train_args = BaseTrainer.TrainConfig(epochs=1, validate=False, batch_size=3, pickle_path="./")
+    trainer = BaseTrainer(train_args)
+    data_train = [[[1, 2, 3, 4], [0]] * 10] + [[[1, 3, 5, 2], [1]] * 10]
+    trainer.batchify(batch_size=3, data=data_train)