
Merge pull request #1 from fastnlp/master

tags/v0.1.0
Yige XU GitHub 6 years ago
parent
commit
b998593bc5
27 changed files with 1019 additions and 353 deletions
  1. +58 -33    fastNLP/action/action.py
  2. +1 -1      fastNLP/action/tester.py
  3. +279 -52   fastNLP/action/trainer.py
  4. +0 -35     fastNLP/loader/base_preprocess.py
  5. +5 -6      fastNLP/loader/config_loader.py
  6. +5 -4      fastNLP/loader/dataset_loader.py
  7. +49 -24    fastNLP/loader/preprocess.py
  8. +0 -95     fastNLP/models/base_model.py
  9. +6 -1      fastNLP/models/char_language_model.py
  10. +1 -90    fastNLP/models/word_seg_model.py
  11. +24 -0    fastNLP/modules/convolution/avg_pool.py
  12. +28 -0    fastNLP/modules/convolution/conv.py
  13. +20 -0    fastNLP/modules/convolution/kmax_pool.py
  14. +26 -0    fastNLP/modules/convolution/max_pool.py
  15. +41 -0    fastNLP/modules/prototype/README.md
  16. +63 -0    fastNLP/modules/prototype/Word2Idx.py
  17. +40 -0    fastNLP/modules/prototype/aggregation.py
  18. +81 -0    fastNLP/modules/prototype/dataloader.py
  19. +23 -0    fastNLP/modules/prototype/embedding.py
  20. +22 -0    fastNLP/modules/prototype/encoder.py
  21. +129 -0   fastNLP/modules/prototype/example.py
  22. +25 -0    fastNLP/modules/prototype/predict.py
  23. +50 -0    fastNLP/modules/prototype/prepare.py
  24. +14 -0    fastNLP/saver/base_saver.py
  25. +12 -0    fastNLP/saver/logger.py
  26. +8 -0     fastNLP/saver/model_saver.py
  27. +9 -12    test/test_word_seg.py

+ 58
- 33
fastNLP/action/action.py View File

@@ -1,4 +1,4 @@
from saver.logger import Logger
import numpy as np


class Action(object):
@@ -8,39 +8,64 @@ class Action(object):

def __init__(self):
super(Action, self).__init__()
self.logger = Logger("logger_output.txt")

def load_config(self, args):
raise NotImplementedError

def load_dataset(self, args):
class BaseSampler(object):
"""
Base class for all samplers.
"""

def __init__(self, data_set):
self.data_set_length = len(data_set)

def __len__(self):
return self.data_set_length

def __iter__(self):
raise NotImplementedError

def log(self, string):
self.logger.log(string)

def batchify(self, batch_size, X, Y=None):
"""
:param batch_size: int
:param X: feature matrix of size [n_sample, m_feature]
:param Y: label vector of size [n_sample, 1] (optional)
:return iteration:int, the number of step in each epoch
generator:generator, to generate batch inputs
"""
n_samples = X.shape[0]
num_iter = n_samples // batch_size
if Y is None:
generator = self._batch_generate(batch_size, num_iter, X)
else:
generator = self._batch_generate(batch_size, num_iter, X, Y)
return num_iter, generator

@staticmethod
def _batch_generate(batch_size, num_iter, *data):
for step in range(num_iter):
start = batch_size * step
end = batch_size * (step + 1)
yield tuple([x[start:end] for x in data])

def make_log(self, *args):
return "log"

class SequentialSampler(BaseSampler):
"""
Sample data in the original order.
"""

def __init__(self, data_set):
super(SequentialSampler, self).__init__(data_set)

def __iter__(self):
return iter(range(self.data_set_length))


class RandomSampler(BaseSampler):
"""
Sample data in random permutation order.
"""

def __init__(self, data_set):
super(RandomSampler, self).__init__(data_set)

def __iter__(self):
return iter(np.random.permutation(self.data_set_length))


class Batchifier(object):
"""
Wrap random or sequential sampler to generate a mini-batch.
"""

def __init__(self, sampler, batch_size, drop_last=True):
super(Batchifier, self).__init__()
self.sampler = sampler
self.batch_size = batch_size
self.drop_last = drop_last

def __iter__(self):
batch = []
for idx in self.sampler:
batch.append(idx)
if len(batch) == self.batch_size:
yield batch
batch = []
if len(batch) > 0 and not self.drop_last:
yield batch
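
A minimal usage sketch, assuming the sampler and batchifier classes defined above; the toy data set is hypothetical:

# Sketch only: iterate random index batches over a toy data set of 10 items.
data_set = list(range(10))                                   # only len() is used by the sampler
batches = Batchifier(RandomSampler(data_set), batch_size=4, drop_last=True)
for indices in batches:
    print(indices)                                           # e.g. [7, 2, 9, 0]; the last 2 indices are dropped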

+ 1
- 1
fastNLP/action/tester.py View File

@@ -2,7 +2,7 @@ from collections import namedtuple

import numpy as np

from fastNLP.action import Action
from fastNLP.action.action import Action


class Tester(Action):


+ 279
- 52
fastNLP/action/trainer.py View File

@@ -1,93 +1,320 @@
import _pickle
from collections import namedtuple

from .action import Action
from .tester import Tester
import numpy as np
import torch

from fastNLP.action.action import Action
from fastNLP.action.action import RandomSampler, Batchifier
from fastNLP.action.tester import Tester

class Trainer(Action):
"""
Trainer is a common training pipeline shared among all models.

class BaseTrainer(Action):
"""Base trainer for all trainers.
Trainer receives a model and data, and then performs training.

Subclasses must implement the following abstract methods:
- prepare_input
- mode
- define_optimizer
- data_forward
- grad_backward
- get_loss
"""
TrainConfig = namedtuple("config", ["epochs", "validate", "save_when_better",
"log_per_step", "log_validation", "batch_size"])
TrainConfig = namedtuple("config", ["epochs", "validate", "batch_size", "pickle_path"])

def __init__(self, train_args):
"""
:param train_args: namedtuple
training parameters
"""
super(Trainer, self).__init__()
super(BaseTrainer, self).__init__()
self.n_epochs = train_args.epochs
self.validate = train_args.validate
self.save_when_better = train_args.save_when_better
self.log_per_step = train_args.log_per_step
self.log_validation = train_args.log_validation
self.batch_size = train_args.batch_size
self.pickle_path = train_args.pickle_path
self.model = None
self.iterator = None
self.loss_func = None

def train(self, network, train_data, dev_data=None):
"""
:param network: the models controller
:param train_data: raw data for training
:param dev_data: raw data for validation
This method will call all the base methods of network (implemented in models.base_model).
"""
train_x, train_y = network.prepare_input(train_data)
def train(self, network):
"""General training loop.
:param network: a model

iterations, train_batch_generator = self.batchify(self.batch_size, train_x, train_y)
The method is framework independent.
Work by calling the following methods:
- prepare_input
- mode
- define_optimizer
- data_forward
- get_loss
- grad_backward
- update
Subclasses must implement these methods with a specific framework.
"""
self.model = network
data_train, data_dev, data_test, embedding = self.prepare_input(self.pickle_path)

test_args = Tester.TestConfig(save_output=True, validate_in_training=True,
save_dev_input=True, save_loss=True, batch_size=self.batch_size)
evaluator = Tester(test_args)

best_loss = 1e10
loss_history = list()
iterations = len(data_train) // self.batch_size

for epoch in range(self.n_epochs):
network.mode(test=False) # turn on the train mode
self.mode(test=False)

network.define_optimizer()
self.define_optimizer()
for step in range(iterations):
batch_x, batch_y = train_batch_generator.__next__()

prediction = network.data_forward(batch_x)
batch_x, batch_y = self.batchify(self.batch_size, data_train)

loss = network.get_loss(prediction, batch_y)
network.grad_backward()
prediction = self.data_forward(network, batch_x)

if step % self.log_per_step == 0:
print("step ", step)
loss_history.append(loss)
self.log(self.make_log(epoch, step, loss))
loss = self.get_loss(prediction, batch_y)
self.grad_backward(loss)
self.update()

#################### evaluate over dev set ###################
if self.validate:
if dev_data is None:
if data_dev is None:
raise RuntimeError("No validation data provided.")
# give all controls to tester
evaluator.test(network, dev_data)

if self.log_validation:
self.log(self.make_valid_log(epoch, evaluator.loss))
evaluator.test(network, data_dev)
if evaluator.loss < best_loss:
best_loss = evaluator.loss
if self.save_when_better:
self.save_model(network)

# finish training

def make_log(self, *args):
return "make a log"
def prepare_input(self, data_path):
"""
To do: Load pkl files of train/dev/test and embedding
"""
data_train = _pickle.load(open(data_path + "data_train.pkl", "rb"))
data_dev = _pickle.load(open(data_path + "data_dev.pkl", "rb"))
data_test = _pickle.load(open(data_path + "data_test.pkl", "rb"))
embedding = _pickle.load(open(data_path + "embedding.pkl", "rb"))
return data_train, data_dev, data_test, embedding

def make_valid_log(self, *args):
return "make a valid log"
def mode(self, test=False):
"""
Tell the network to be trained or not.
:param test: bool
"""
raise NotImplementedError

def save_model(self, model):
model.save()
def define_optimizer(self):
"""
Define framework-specific optimizer specified by the models.
"""
raise NotImplementedError

def load_data(self, data_name):
print("load data")
def update(self):
"""
Perform weight update on a model.

def load_config(self, args):
For PyTorch, just call optimizer to update.
"""
raise NotImplementedError

def load_dataset(self, args):
def data_forward(self, network, x):
"""
Forward pass of the data.
:param network: a model
:param x: input feature matrix and label vector
:return: output by the models

For PyTorch, just do "network(*x)"
"""
raise NotImplementedError

def grad_backward(self, loss):
"""
Compute gradients with the chain rule.
:param loss: a scalar where back-prop starts

For PyTorch, just do "loss.backward()"
"""
raise NotImplementedError

def get_loss(self, predict, truth):
"""
Compute loss given prediction and ground truth.
:param predict: prediction label vector
:param truth: ground truth label vector
:return: a scalar
"""
if self.loss_func is None:
if hasattr(self.model, "loss"):
self.loss_func = self.model.loss
else:
self.define_loss()
return self.loss_func(predict, truth)

def define_loss(self):
"""
Assign an instance of loss function to self.loss_func
E.g. self.loss_func = nn.CrossEntropyLoss()
"""
raise NotImplementedError

def batchify(self, batch_size, data):
"""
1. Perform batching from data and produce a batch of training data.
2. Add padding.
:param batch_size: int, the size of a batch
:param data: list. Each entry is a sample, which is also a list of features and label(s).
E.g.
[
[[feature_1, feature_2, feature_3], [label_1, label_2]], # sample 1
[[feature_1, feature_2, feature_3], [label_1, label_2]], # sample 2
...
]
:return batch_x: list. Each entry is a list of features of a sample.
batch_y: list. Each entry is a list of labels of a sample.
"""
if self.iterator is None:
self.iterator = iter(Batchifier(RandomSampler(data), batch_size, drop_last=True))
indices = next(self.iterator)
batch = [data[idx] for idx in indices]
batch_x = [sample[0] for sample in batch]
batch_y = [sample[1] for sample in batch]
batch_x = self.pad(batch_x)
return batch_x, batch_y

@staticmethod
def pad(batch, fill=0):
"""
Pad a batch of samples to maximum length.
:param batch: list of list
:param fill: word index to pad, default 0.
:return: a padded batch
"""
max_length = max([len(x) for x in batch])
for idx, sample in enumerate(batch):
if len(sample) < max_length:
batch[idx] = sample + [fill] * (max_length - len(sample))
return batch


class ToyTrainer(BaseTrainer):
"""A simple trainer for a PyTorch model."""

def __init__(self, train_args):
super(ToyTrainer, self).__init__(train_args)
self.test_mode = False
self.weight = np.random.rand(5, 1)
self.bias = np.random.rand()
self._loss = 0
self._optimizer = None

def prepare_input(self, data):
return data[:, :-1], data[:, -1]

def mode(self, test=False):
self.model.mode(test)

def data_forward(self, network, x):
return np.matmul(x, self.weight) + self.bias

def grad_backward(self, loss):
loss.backward()

def get_loss(self, pred, truth):
self._loss = np.mean(np.square(pred - truth))
return self._loss

def define_optimizer(self):
self._optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01)

def update(self):
self._optimizer.step()


class WordSegTrainer(BaseTrainer):
"""
Reserved for future changes.
"""

def __init__(self, train_args):
super(WordSegTrainer, self).__init__(train_args)
self.id2word = None
self.word2id = None
self.id2tag = None
self.tag2id = None

self.lstm_batch_size = 8
self.lstm_seq_len = 32 # Trainer batch_size == lstm_batch_size * lstm_seq_len
self.hidden_dim = 100
self.lstm_num_layers = 2
self.vocab_size = 100
self.word_emb_dim = 100

self.hidden = (self.to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim)),
self.to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim)))

self.optimizer = None
self._loss = None

self.USE_GPU = False

def to_var(self, x):
if torch.cuda.is_available() and self.USE_GPU:
x = x.cuda()
return torch.autograd.Variable(x)

def prepare_input(self, data):
"""
perform word indices lookup to convert strings into indices
:param data: list of string, each string contains word + space + [B, M, E, S]
:return
"""
word_list = []
tag_list = []
for line in data:
if len(line) > 2:
tokens = line.split("#")
word_list.append(tokens[0])
tag_list.append(tokens[2][0])
self.id2word = list(set(word_list))
self.word2id = {word: idx for idx, word in enumerate(self.id2word)}
self.id2tag = list(set(tag_list))
self.tag2id = {tag: idx for idx, tag in enumerate(self.id2tag)}
words = np.array([self.word2id[w] for w in word_list]).reshape(-1, 1)
tags = np.array([self.tag2id[t] for t in tag_list]).reshape(-1, 1)
return words, tags

def mode(self, test=False):
if test:
self.model.eval()
else:
self.model.train()

def data_forward(self, network, x):
"""
:param network: a PyTorch model
:param x: sequence of length [batch_size], word indices
:return:
"""
x = x.reshape(self.lstm_batch_size, self.lstm_seq_len)
output, self.hidden = network(x, self.hidden)
return output

def define_optimizer(self):
self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01, momentum=0.85)

def get_loss(self, predict, truth):
self._loss = torch.nn.CrossEntropyLoss()(predict, truth)
return self._loss

def grad_backward(self, network):
self.model.zero_grad()
self._loss.backward()
torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2)

def update(self):
self.optimizer.step()


if __name__ == "__main__":
train_args = BaseTrainer.TrainConfig(epochs=1, validate=False, batch_size=3, pickle_path="./")
trainer = BaseTrainer(train_args)
data_train = [[[1, 2, 3, 4], [0]]] * 10 + [[[1, 3, 5, 2], [1]]] * 10
trainer.batchify(batch_size=3, data=data_train)
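
A quick check of the padding helper above; a sketch, assuming the corrected pad that extends each sample with fill values:

# Sketch only: pad a batch of variable-length index lists to the batch maximum.
batch_x = [[4, 7, 1], [2, 5], [9]]
print(BaseTrainer.pad(batch_x, fill=0))
# -> [[4, 7, 1], [2, 5, 0], [9, 0, 0]]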

+ 0
- 35
fastNLP/loader/base_preprocess.py View File

@@ -1,35 +0,0 @@
class BasePreprocess(object):
def __init__(self, data, pickle_path):
super(BasePreprocess, self).__init__()
self.data = data
self.pickle_path = pickle_path
if not self.pickle_path.endswith('/'):
self.pickle_path = self.pickle_path + '/'
def word2id(self):
raise NotImplementedError
def id2word(self):
raise NotImplementedError
def class2id(self):
raise NotImplementedError
def id2class(self):
raise NotImplementedError
def embedding(self):
raise NotImplementedError
def data_train(self):
raise NotImplementedError
def data_dev(self):
raise NotImplementedError
def data_test(self):
raise NotImplementedError

+ 5
- 6
fastNLP/loader/config_loader.py View File

@@ -1,9 +1,8 @@
from fastNLP.loader.base_loader import BaseLoader

import configparser
import traceback
import json

from fastNLP.loader.base_loader import BaseLoader


class ConfigLoader(BaseLoader):
"""loader for configuration files"""
@@ -17,14 +16,14 @@ class ConfigLoader(BaseLoader):
raise NotImplementedError

@staticmethod
def loadConfig(filePath, sections):
def load_config(file_path, sections):
"""
:param filePath: the path of config file
:param file_path: the path of config file
:param sections: the dict of sections
:return:
"""
cfg = configparser.ConfigParser()
cfg.read(filePath)
cfg.read(file_path)
for s in sections:
attr_list = [i for i in type(sections[s]).__dict__.keys() if
not callable(getattr(sections[s], i)) and not i.startswith("__")]


+ 5
- 4
fastNLP/loader/dataset_loader.py View File

@@ -1,6 +1,7 @@
from fastNLP.loader.base_loader import BaseLoader
import os

from fastNLP.loader.base_loader import BaseLoader


class DatasetLoader(BaseLoader):
""""loader for data sets"""
@@ -16,7 +17,6 @@ class POSDatasetLoader(DatasetLoader):
super(POSDatasetLoader, self).__init__(data_name, data_path)
#self.data_set = self.load()


def load(self):
assert os.path.exists(self.data_path)
with open(self.data_path, "r", encoding="utf-8") as f:
@@ -30,11 +30,11 @@ class POSDatasetLoader(DatasetLoader):
return lines


class ClassficationDatasetLoader(DatasetLoader):
class ClassificationDatasetLoader(DatasetLoader):
"""loader for classfication data sets"""

def __init__(self, data_name, data_path):
super(ClassficationDatasetLoader, data_name)
super(ClassificationDatasetLoader, self).__init__(data_name, data_path)

def load(self):
assert os.path.exists(self.data_path)
@@ -58,6 +58,7 @@ class ClassficationDatasetLoader(DatasetLoader):
dataset.append(sentence)
return dataset


class ConllLoader(DatasetLoader):
"""loader for conll format files"""



+ 49
- 24
fastNLP/loader/preprocess.py View File

@@ -1,25 +1,57 @@
import pickle
import _pickle
import os
from fastNLP.loader.base_preprocess import BasePreprocess
DEFAULT_PADDING_LABEL = '<pad>' #dict index = 0
DEFAULT_UNKNOWN_LABEL = '<unk>' #dict index = 1
DEFAULT_PADDING_LABEL = '<pad>' # dict index = 0
DEFAULT_UNKNOWN_LABEL = '<unk>' # dict index = 1
DEFAULT_RESERVED_LABEL = ['<reserved-2>',
'<reserved-3>',
'<reserved-4>'] #dict index = 2~4
#the first vocab in dict with the index = 5
'<reserved-4>'] # dict index = 2~4
# the first vocab in dict with the index = 5
class BasePreprocess(object):
def __init__(self, data, pickle_path):
super(BasePreprocess, self).__init__()
self.data = data
self.pickle_path = pickle_path
if not self.pickle_path.endswith('/'):
self.pickle_path = self.pickle_path + '/'
def word2id(self):
raise NotImplementedError
def id2word(self):
raise NotImplementedError
def class2id(self):
raise NotImplementedError
def id2class(self):
raise NotImplementedError
def embedding(self):
raise NotImplementedError
def data_train(self):
raise NotImplementedError
def data_dev(self):
raise NotImplementedError
def data_test(self):
raise NotImplementedError
class POSPreprocess(BasePreprocess):
"""
This class is used to preprocess POS datasets.
In these datasets, each line are divided by '\t'
while the first Col is the vocabulary and the second
Col is the label.
In these datasets, each line is divided by '\t'
The first Col is the vocabulary.
The second Col is the labels.
Different sentences are divided by an empty line.
e.g:
Tom label1
@@ -36,7 +68,9 @@ class POSPreprocess(BasePreprocess):
"""
def __init__(self, data, pickle_path):
super(POSPreprocess, self).__init(data, pickle_path)
super(POSPreprocess, self).__init__(data, pickle_path)
self.word_dict = None
self.label_dict = None
self.build_dict()
self.word2id()
self.id2word()
@@ -46,8 +80,6 @@ class POSPreprocess(BasePreprocess):
self.data_train()
self.data_dev()
self.data_test()
#...
def build_dict(self):
self.word_dict = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1,
@@ -68,7 +100,6 @@ class POSPreprocess(BasePreprocess):
index = len(self.label_dict)
self.label_dict[label] = index
def pickle_exist(self, pickle_name):
"""
:param pickle_name: the filename of target pickle file
@@ -82,7 +113,6 @@ class POSPreprocess(BasePreprocess):
else:
return False
def word2id(self):
if self.pickle_exist("word2id.pkl"):
return
@@ -92,11 +122,10 @@ class POSPreprocess(BasePreprocess):
with open(file_name, "wb", encoding='utf-8') as f:
_pickle.dump(self.word_dict, f)
def id2word(self):
if self.pickle_exist("id2word.pkl"):
return
#nothing will be done if id2word.pkl exists
# nothing will be done if id2word.pkl exists
id2word_dict = {}
for word in self.word_dict:
@@ -105,7 +134,6 @@ class POSPreprocess(BasePreprocess):
with open(file_name, "wb", encoding='utf-8') as f:
_pickle.dump(id2word_dict, f)
def class2id(self):
if self.pickle_exist("class2id.pkl"):
return
@@ -115,11 +143,10 @@ class POSPreprocess(BasePreprocess):
with open(file_name, "wb", encoding='utf-8') as f:
_pickle.dump(self.label_dict, f)
def id2class(self):
if self.pickle_exist("id2class.pkl"):
return
#nothing will be done if id2class.pkl exists
# nothing will be done if id2class.pkl exists
id2class_dict = {}
for label in self.label_dict:
@@ -128,17 +155,15 @@ class POSPreprocess(BasePreprocess):
with open(file_name, "wb", encoding='utf-8') as f:
_pickle.dump(id2class_dict, f)
def embedding(self):
if self.pickle_exist("embedding.pkl"):
return
#nothing will be done if embedding.pkl exists
# nothing will be done if embedding.pkl exists
def data_train(self):
if self.pickle_exist("data_train.pkl"):
return
#nothing will be done if data_train.pkl exists
# nothing will be done if data_train.pkl exists
data_train = []
sentence = []


+ 0
- 95
fastNLP/models/base_model.py View File

@@ -1,4 +1,3 @@
import numpy as np
import torch


@@ -30,100 +29,6 @@ class BaseModel(torch.nn.Module):
raise NotImplementedError


class BaseController(object):
"""Base Controller for all controllers.
This class and its subclasses are actually "controllers" of the PyTorch models.
They act as an interface between Trainer and the PyTorch models.
This controller provides the following methods to be called by Trainer.
- prepare_input
- mode
- define_optimizer
- data_forward
- grad_backward
- get_loss
"""

def __init__(self):
"""
Define PyTorch model parameters here.
"""
pass

def prepare_input(self, data):
"""
Perform data transformation from raw input to vector/matrix inputs.
:param data: raw inputs
:return (X, Y): tuple, input features and labels
"""
raise NotImplementedError

def mode(self, test=False):
"""
Tell the network to be trained or not, required by PyTorch.
:param test: bool
"""
raise NotImplementedError

def define_optimizer(self):
"""
Define PyTorch optimizer specified by the models.
"""
raise NotImplementedError

def data_forward(self, *x):
"""
Forward pass of the data.
:param x: input feature matrix and label vector
:return: output by the models
"""
# required by PyTorch nn
raise NotImplementedError

def grad_backward(self):
"""
Perform gradient descent to update the models parameters.
"""
raise NotImplementedError

def get_loss(self, pred, truth):
"""
Compute loss given models prediction and ground truth. Loss function specified by the models.
:param pred: prediction label vector
:param truth: ground truth label vector
:return: a scalar
"""
raise NotImplementedError


class ToyController(BaseController):
"""This is for code testing."""

def __init__(self):
super(ToyController, self).__init__()
self.test_mode = False
self.weight = np.random.rand(5, 1)
self.bias = np.random.rand()
self._loss = 0

def prepare_input(self, data):
return data[:, :-1], data[:, -1]

def mode(self, test=False):
self.test_mode = test

def data_forward(self, x):
return np.matmul(x, self.weight) + self.bias

def grad_backward(self):
print("loss gradient backward")

def get_loss(self, pred, truth):
self._loss = np.mean(np.square(pred - truth))
return self._loss

def define_optimizer(self):
pass


class Vocabulary(object):
"""A look-up table that allows you to access `Lexeme` objects. The `Vocab`


+ 6
- 1
fastNLP/models/char_language_model.py View File

@@ -6,11 +6,16 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from model.base_model import BaseModel
from torch.autograd import Variable

from fastNLP.models.base_model import BaseModel

USE_GPU = True

"""
To be deprecated.
"""


class CharLM(BaseModel):
"""


+ 1
- 90
fastNLP/models/word_seg_model.py View File

@@ -1,95 +1,6 @@
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

from fastNLP.models.base_model import BaseModel, BaseController

USE_GPU = True


def to_var(x):
if torch.cuda.is_available() and USE_GPU:
x = x.cuda()
return Variable(x)


class WordSegModel(BaseController):
"""
Model controller for WordSeg
"""

def __init__(self):
super(WordSegModel, self).__init__()
self.id2word = None
self.word2id = None
self.id2tag = None
self.tag2id = None

self.lstm_batch_size = 8
self.lstm_seq_len = 32 # Trainer batch_size == lstm_batch_size * lstm_seq_len
self.hidden_dim = 100
self.lstm_num_layers = 2
self.vocab_size = 100
self.word_emb_dim = 100

self.model = WordSeg(self.hidden_dim, self.lstm_num_layers, self.vocab_size, self.word_emb_dim)
self.hidden = (to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim)),
to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim)))

self.optimizer = None
self._loss = None

def prepare_input(self, data):
"""
perform word indices lookup to convert strings into indices
:param data: list of string, each string contains word + space + [B, M, E, S]
:return
"""
word_list = []
tag_list = []
for line in data:
if len(line) > 2:
tokens = line.split("#")
word_list.append(tokens[0])
tag_list.append(tokens[2][0])
self.id2word = list(set(word_list))
self.word2id = {word: idx for idx, word in enumerate(self.id2word)}
self.id2tag = list(set(tag_list))
self.tag2id = {tag: idx for idx, tag in enumerate(self.id2tag)}
words = np.array([self.word2id[w] for w in word_list]).reshape(-1, 1)
tags = np.array([self.tag2id[t] for t in tag_list]).reshape(-1, 1)
return words, tags

def mode(self, test=False):
if test:
self.model.eval()
else:
self.model.train()

def data_forward(self, x):
"""
:param x: sequence of length [batch_size], word indices
:return:
"""
x = x.reshape(self.lstm_batch_size, self.lstm_seq_len)
output, self.hidden = self.model(x, self.hidden)
return output

def define_optimizer(self):
self.optimizer = optim.SGD(self.model.parameters(), lr=0.01, momentum=0.85)

def get_loss(self, pred, truth):

self._loss = nn.CrossEntropyLoss(pred, truth)
return self._loss

def grad_backward(self):
self.model.zero_grad()
self._loss.backward()
torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2)
self.optimizer.step()
from fastNLP.models.base_model import BaseModel


class WordSeg(BaseModel):


+ 24
- 0
fastNLP/modules/convolution/avg_pool.py View File

@@ -0,0 +1,24 @@
# python: 3.6
# encoding: utf-8

import torch.nn as nn
import torch.nn.functional as F


class AvgPool(nn.Module):
"""1-d average pooling module."""

def __init__(self, stride=None, padding=0):
super(AvgPool, self).__init__()
self.stride = stride
self.padding = padding

def forward(self, x):
# [N,C,L] -> [N,C]
kernel_size = x.size(2)
x = F.avg_pool1d(
input=x,
kernel_size=kernel_size,
stride=self.stride,
padding=self.padding)
return x.squeeze(dim=-1)

+ 28
- 0
fastNLP/modules/convolution/conv.py View File

@@ -0,0 +1,28 @@
# python: 3.6
# encoding: utf-8

import torch.nn as nn
# import torch.nn.functional as F


class Conv(nn.Module):
"""
Basic 1-d convolution module.
"""

def __init__(self, in_channels, out_channels, kernel_size,
stride=1, padding=0, dilation=1,
groups=1, bias=True):
super(Conv, self).__init__()
self.conv = nn.Conv1d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias)

def forward(self, x):
return self.conv(x) # [N,C,L]
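
A shape sketch for the Conv wrapper above, assuming a batch of 2 sequences with 50 input channels and length 30:

# Sketch only: 1-d convolution keeps the [N, C, L] layout.
import torch
conv = Conv(in_channels=50, out_channels=100, kernel_size=3, padding=1)
x = torch.randn(2, 50, 30)        # [N, C_in, L]
print(conv(x).size())             # -> torch.Size([2, 100, 30]), i.e. [N, C_out, L]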

+ 20
- 0
fastNLP/modules/convolution/kmax_pool.py View File

@@ -0,0 +1,20 @@
# python: 3.6
# encoding: utf-8

import torch
import torch.nn as nn
# import torch.nn.functional as F


class KMaxPool(nn.Module):
"""K max-pooling module."""

def __init__(self, k):
super(KMaxPool, self).__init__()
self.k = k

def forward(self, x):
# [N,C,L] -> [N,C*k]
x, index = torch.topk(x, self.k, dim=-1, sorted=False)
x = torch.reshape(x, (x.size(0), -1))
return x
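
A shape sketch for KMaxPool: it keeps the k largest activations per channel and flattens them.

# Sketch only: k-max pooling over the length dimension.
import torch
pool = KMaxPool(k=2)
x = torch.randn(4, 16, 30)        # [N, C, L]
print(pool(x).size())             # -> torch.Size([4, 32]), i.e. [N, C*k]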

+ 26
- 0
fastNLP/modules/convolution/max_pool.py View File

@@ -0,0 +1,26 @@
# python: 3.6
# encoding: utf-8

import torch.nn as nn
import torch.nn.functional as F


class MaxPool(nn.Module):
"""1-d max-pooling module."""

def __init__(self, stride=None, padding=0, dilation=1):
super(MaxPool, self).__init__()
self.stride = stride
self.padding = padding
self.dilation = dilation

def forward(self, x):
# [N,C,L] -> [N,C]
kernel_size = x.size(2)
x = F.max_pool1d(
input=x,
kernel_size=kernel_size,
stride=self.stride,
padding=self.padding,
dilation=self.dilation)
return x.squeeze(dim=-1)

+ 41
- 0
fastNLP/modules/prototype/README.md View File

@@ -0,0 +1,41 @@
# Prototype

## Word2Idx.py
A mapping between words and indices

## embedding.py
Embedding modules

Contains a simple wrapper around torch.nn.Embedding

## encoder.py
Encoder modules

Contains a simple wrapper around torch.nn.LSTM

## aggregation.py
Aggregation modules

Contains a self-attention model, following the paper "A Structured Self-attentive Sentence Embedding", https://arxiv.org/abs/1703.03130

## predict.py
Prediction modules

Contains a two-layer perceptron for classification

## example.py
An example showing how to use the above modules to build a model

Contains a model for sentiment analysis on the Yelp dataset, together with its training and testing procedures. See https://arxiv.org/abs/1703.03130 for more details.

## prepare.py
An example of using Word2Idx to build the Yelp datasets

## dataloader.py
A dataloader for the Yelp dataset

It is an iterable object, returning a zero-padded batch every iteration.





+ 63
- 0
fastNLP/modules/prototype/Word2Idx.py View File

@@ -0,0 +1,63 @@
import collections
import pickle

class Word2Idx():
"""
Build a word index according to word frequency.

If "min_freq" is given, then only words with a frequncy not lesser than min_freq will be kept.
If "max_num" is given, then at most the most frequent $max_num words will be kept.
"words" should be a list [ w_1,w_2,...,w_i,...,w_n ] where each w_i is a string representing a word.
num is the size of the lookup table.
w2i is a lookup table assigning each word an index.
i2w is a vector which serves as an invert mapping of w2i.
Note that index 0 is token "<PAD>" for padding
index 1 is token "<UNK>" for unregistered words
e.g. i2w[w2i["word"]] == "word"
"""
def __init__(self):
self.__w2i = dict()
self.__i2w = []
self.num = 0

def build(self, words, min_freq=0, max_num=None):
"""build a model from words"""
counter = collections.Counter(words)
word_set = set(words)
if max_num is not None:
most_common = counter.most_common(min(len(word_set), max_num - 1))
else:
most_common = counter.most_common()
self.__w2i = dict((w[0], i + 2) for i, w in enumerate(most_common) if w[1] >= min_freq)
self.__w2i["<PAD>"] = 0
self.__w2i["<UNK>"] = 1
self.__i2w = ["<PAD>", "<UNK>"] + [ w[0] for w in most_common if w[1] >= min_freq ]
self.num = len(self.__i2w)

def w2i(self, word):
"""word to index"""
if word in self.__w2i:
return self.__w2i[word]
return 1  # index of "<UNK>" for unregistered words

def i2w(self, idx):
"""index to word"""
if idx >= self.num:
raise Exception("out of range\n")
return self.__i2w[idx]

def save(self, addr):
"""save the model to a file with address "addr" """
f = open(addr,"wb")
pickle.dump([self.__i2w, self.__w2i, self.num], f)
f.close()

def load(self, addr):
"""load a model from a file with address "addr" """
f = open(addr,"rb")
paras = pickle.load(f)
self.__i2w, self.__w2i, self.num = paras[0], paras[1], paras[2]
f.close()
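
A minimal usage sketch, assuming the corrected index offsets above; the toy corpus is hypothetical:

# Sketch only: build a vocabulary from a toy corpus and map words both ways.
words = "the cat sat on the mat".split()
vocab = Word2Idx()
vocab.build(words)
idx = vocab.w2i("the")            # "the" is the most frequent word, so idx == 2
print(vocab.i2w(idx))             # -> "the"
print(vocab.w2i("dog"))           # unseen word falls back to 1, the "<UNK>" index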



+ 40
- 0
fastNLP/modules/prototype/aggregation.py View File

@@ -0,0 +1,40 @@
import torch
import torch.nn as nn
from torch.autograd import Variable

class Selfattention(nn.Module):
"""
Self Attention Module.

Args:
input_size : the size for the input vector
d_a : the width of weight matrix
r : the number of encoded vectors
"""
def __init__(self, input_size, d_a, r):
super(Selfattention, self).__init__()
self.W_s1 = nn.Parameter(torch.randn(d_a, input_size), requires_grad=True)
self.W_s2 = nn.Parameter(torch.randn(r, d_a), requires_grad=True)
self.softmax = nn.Softmax(dim=2)
self.tanh = nn.Tanh()

def penalization(self, A):
"""
compute the penalization term for attention module
"""
if self.W_s1.is_cuda:
I = Variable(torch.eye(A.size(1)).cuda(), requires_grad=False)
else:
I = Variable(torch.eye(A.size(1)), requires_grad=False)
M = torch.matmul(A, torch.transpose(A, 1, 2)) - I
M = M.view(M.size(0), -1)
return torch.sum(M ** 2, dim=1)
def forward(self, x):
inter = self.tanh(torch.matmul(self.W_s1, torch.transpose(x, 1, 2)))
A = self.softmax(torch.matmul(self.W_s2, inter))
out = torch.matmul(A, x)
out = out.view(out.size(0), -1)
penalty = self.penalization(A)
return out, penalty
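
A shape sketch for the self-attention module, with sizes chosen to mimic a bidirectional LSTM output:

# Sketch only: r attention heads over [batch, length, 2*hidden] inputs.
import torch
attn = Selfattention(input_size=600, d_a=350, r=10)
x = torch.randn(8, 40, 600)
out, penalty = attn(x)
print(out.size())                 # -> torch.Size([8, 6000]), r * input_size flattened per sample
print(penalty.size())             # -> torch.Size([8]), one penalization term per sample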


+ 81
- 0
fastNLP/modules/prototype/dataloader.py View File

@@ -0,0 +1,81 @@
import random
import pickle
import torch
import numpy as np
from torch.autograd import Variable

def float_wrapper(x, requires_grad=True, using_cuda=True):
"""
transform float type list to pytorch variable
"""
if using_cuda==True:
return Variable(torch.FloatTensor(x).cuda(), requires_grad=requires_grad)
else:
return Variable(torch.FloatTensor(x), requires_grad=requires_grad)

def long_wrapper(x, requires_grad=True, using_cuda=True):
"""
transform long type list to pytorch variable
"""
if using_cuda==True:
return Variable(torch.LongTensor(x).cuda(), requires_grad=requires_grad)
else:
return Variable(torch.LongTensor(x), requires_grad=requires_grad)
def pad(X, using_cuda):
"""
zero-pad sequences to the same length, then stack them together
"""
maxlen = max([x.size(0) for x in X])
Y = []
for x in X:
padlen = maxlen - x.size(0)
if padlen > 0:
if using_cuda:
paddings = Variable(torch.zeros(padlen).long()).cuda()
else:
paddings = Variable(torch.zeros(padlen).long())
x_ = torch.cat((x, paddings), 0)
Y.append(x_)
else:
Y.append(x)
return torch.stack(Y)

class DataLoader(object):
"""
load data with form {"feature", "class"}

Args:
fdir : data file address
batch_size : batch_size
shuffle : if True, shuffle dataset every epoch
using_cuda : if True, return tensors on GPU
"""
def __init__(self, fdir, batch_size, shuffle=True, using_cuda=True):
with open(fdir, "rb") as f:
self.data = pickle.load(f)
self.batch_size = batch_size
self.num = len(self.data)
self.count = 0
self.iters = int(self.num / batch_size)
self.shuffle = shuffle
self.using_cuda = using_cuda
def __iter__(self):
return self

def __next__(self):
if self.count == self.iters:
self.count = 0
if self.shuffle:
random.shuffle(self.data)
raise StopIteration()
else:
batch = self.data[self.count * self.batch_size : (self.count + 1) * self.batch_size]
self.count += 1
X = [long_wrapper(x["sent"], using_cuda=self.using_cuda, requires_grad=False) for x in batch]
X = pad(X, self.using_cuda)
y = long_wrapper([x["class"] for x in batch], using_cuda=self.using_cuda, requires_grad=False)
return {"feature" : X, "class" : y}


+ 23
- 0
fastNLP/modules/prototype/embedding.py View File

@@ -0,0 +1,23 @@
import torch
import torch.nn as nn

class Lookuptable(nn.Module):
"""
A simple lookup table

Args:
nums : the size of the lookup table
dims : the size of each vector
padding_idx : pads the tensor with zeros whenever it encounters this index
sparse : If True, gradient matrix will be a sparse tensor. In this case,
only optim.SGD(cuda and cpu) and optim.Adagrad(cpu) can be used
"""
def __init__(self, nums, dims, padding_idx=0, sparse=False):
super(Lookuptable, self).__init__()
self.embed = nn.Embedding(nums, dims, padding_idx, sparse=sparse)
def forward(self, x):
return self.embed(x)

if __name__ == "__main__":
model = Lookuptable(10, 20)

+ 22
- 0
fastNLP/modules/prototype/encoder.py View File

@@ -0,0 +1,22 @@
import torch
import torch.nn as nn

class Lstm(nn.Module):
"""
LSTM module

Args:
input_size : input size
hidden_size : hidden size
num_layers : number of hidden layers
dropout : dropout rate
bidirectional : If True, becomes a bidirectional RNN
"""
def __init__(self, input_size, hidden_size, num_layers, dropout, bidirectional):
super(Lstm, self).__init__()
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=True,\
dropout=dropout, bidirectional=bidirectional)
def forward(self, x):
x, _ = self.lstm(x)
return x
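
A shape sketch for the Lstm wrapper; note it is batch-first and returns only the output sequence:

# Sketch only: bidirectional output doubles the hidden size.
import torch
lstm = Lstm(input_size=100, hidden_size=300, num_layers=1, dropout=0.5, bidirectional=True)
x = torch.randn(8, 40, 100)       # [batch, length, input_size]
print(lstm(x).size())             # -> torch.Size([8, 40, 600])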

+ 129
- 0
fastNLP/modules/prototype/example.py View File

@@ -0,0 +1,129 @@
import torch
import torch.nn as nn
import encoder
import aggregation
import embedding
import predict
import torch.optim as optim
import time
import dataloader

WORD_NUM = 357361
WORD_SIZE = 100
HIDDEN_SIZE = 300
D_A = 350
R = 10
MLP_HIDDEN = 2000
CLASSES_NUM = 5

class Net(nn.Module):
"""
A model for sentiment analysis using lstm and self-attention
"""
def __init__(self):
super(Net, self).__init__()
self.embedding = embedding.Lookuptable(WORD_NUM, WORD_SIZE)
self.encoder = encoder.Lstm(WORD_SIZE, HIDDEN_SIZE, 1, 0.5, True)
self.aggregation = aggregation.Selfattention(2 * HIDDEN_SIZE, D_A, R)
self.predict = predict.MLP(R * HIDDEN_SIZE * 2, MLP_HIDDEN, CLASSES_NUM)

def forward(self, x):
x = self.embedding(x)
x = self.encoder(x)
x, penalty = self.aggregation(x)
x = self.predict(x)
return x, penalty

def train(model_dict=None, using_cuda=True, learning_rate=0.06,\
momentum=0.3, batch_size=32, epochs=5, coef=1.0, interval=10):
"""
training procedure

Args:
If model_dict is given (a file path), training continues from the given model.
Otherwise, a new model is trained from scratch.
If using_cuda is True, training is conducted on the GPU.
learning_rate and momentum are passed to the SGD optimizer.
coef is the coefficient weighting the penalization term against the cross-entropy loss.
interval is the reporting frequency (in iterations).

The resulting model is saved as "model_dict_<current time>.dict", which can be used for further training.
"""
if using_cuda:
net = Net().cuda()
else:
net = Net()
if model_dict != None:
net.load_state_dict(torch.load(model_dict))

optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=momentum)
criterion = nn.CrossEntropyLoss()
dataset = dataloader.DataLoader("train_set.pkl", batch_size, using_cuda=using_cuda)

#statistics
loss_count = 0
prepare_time = 0
run_time = 0
count = 0

for epoch in range(epochs):
print("epoch: %d"%(epoch))
for i, batch in enumerate(dataset):
t1 = time.time()
X = batch["feature"]
y = batch["class"]
t2 = time.time()
y_pred, y_penl = net(X)
loss = criterion(y_pred, y) + torch.sum(y_penl) / batch_size * coef
optimizer.zero_grad()
loss.backward()
nn.utils.clip_grad_norm(net.parameters(), 0.5)
optimizer.step()
t3 = time.time()

loss_count += torch.sum(y_penl).data[0]
prepare_time += (t2 - t1)
run_time += (t3 - t2)
p, idx = torch.max(y_pred.data, dim=1)
count += torch.sum(torch.eq(idx.cpu(), y.data.cpu()))

if (i + 1) % interval == 0:
print("epoch : %d, iters: %d"%(epoch, i + 1))
print("loss count:" + str(loss_count / (interval * batch_size)))
print("acuracy:" + str(count / (interval * batch_size)))
print("penalty:" + str(torch.sum(y_penl).data[0] / batch_size))
print("prepare time:" + str(prepare_time))
print("run time:" + str(run_time))
prepare_time = 0
run_time = 0
loss_count = 0
count = 0
string = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())
torch.save(net.state_dict(), "model_dict_%s.dict"%(string))

def test(model_dict, using_cuda=True):
if using_cuda:
net = Net().cuda()
else:
net = Net()
net.load_state_dict(torch.load(model_dict))
dataset = dataloader.DataLoader("test_set.pkl", batch_size=1, using_cuda=using_cuda)
count = 0
for i, batch in enumerate(dataset):
X = batch["feature"]
y = batch["class"]
y_pred, _ = net(X)
p, idx = torch.max(y_pred.data, dim=1)
count += torch.sum(torch.eq(idx.cpu(), y.data.cpu()))
print("accuracy: %f"%(count / dataset.num))

if __name__ == "__main__":
train(using_cuda=torch.cuda.is_available())


+ 25
- 0
fastNLP/modules/prototype/predict.py View File

@@ -0,0 +1,25 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

class MLP(nn.Module):
"""
A two layers perceptron for classification.

Output : Unnormalized probability distribution (logits)
Args:
input_size : the size of input
hidden_size : the size of hidden layer
output_size : the size of output
"""
def __init__(self, input_size, hidden_size, output_size):
super(MLP,self).__init__()
self.L1 = nn.Linear(input_size, hidden_size)
self.L2 = nn.Linear(hidden_size, output_size)

def forward(self, x):
out = self.L2(F.relu(self.L1(x)))
return out

if __name__ == "__main__":
MLP(20, 30, 20)

+ 50
- 0
fastNLP/modules/prototype/prepare.py View File

@@ -0,0 +1,50 @@
import pickle
import Word2Idx

def get_sets(m, n):
"""
get a train set containing m samples and a test set containing n samples
"""
samples = pickle.load(open("tuples.pkl","rb"))
if m+n > len(samples):
print("asking for too many tuples\n")
return
train_samples = samples[ : m]
test_samples = samples[m: m+n]
return train_samples, test_samples

def build_wordidx():
"""
build wordidx using word2idx
"""
train, test = get_sets(500000, 2000)
words = []
for x in train:
words += x[0]
wordidx = Word2Idx.Word2Idx()
wordidx.build(words)
print(wordidx.num)
print(wordidx.i2w(0))
wordidx.save("wordidx.pkl")

def build_sets():
"""
build train set and test set, transform word to index
"""
train, test = get_sets(500000, 2000)
wordidx = Word2Idx.Word2Idx()
wordidx.load("wordidx.pkl")
train_set = []
for x in train:
sent = [wordidx.w2i(w) for w in x[0]]
train_set.append({"sent" : sent, "class" : x[1]})
test_set = []
for x in test:
sent = [wordidx.w2i(w) for w in x[0]]
test_set.append({"sent" : sent, "class" : x[1]})
pickle.dump(train_set, open("train_set.pkl", "wb"))
pickle.dump(test_set, open("test_set.pkl", "wb"))

if __name__ == "__main__":
build_wordidx()
build_sets()

+ 14
- 0
fastNLP/saver/base_saver.py View File

@@ -0,0 +1,14 @@
class BaseSaver(object):
"""base class for all savers"""

def __init__(self, save_path):
self.save_path = save_path

def save_bytes(self):
raise NotImplementedError

def save_str(self):
raise NotImplementedError

def compress(self):
raise NotImplementedError

+ 12
- 0
fastNLP/saver/logger.py View File

@@ -0,0 +1,12 @@
from fastNLP.saver.base_saver import BaseSaver


class Logger(BaseSaver):
"""Logging"""

def __init__(self, save_path):
super(Logger, self).__init__(save_path)

def log(self, string):
with open(self.save_path, "a") as f:
f.write(string)

+ 8
- 0
fastNLP/saver/model_saver.py View File

@@ -0,0 +1,8 @@
from fastNLP.saver.base_saver import BaseSaver


class ModelSaver(BaseSaver):
"""Save a models"""

def __init__(self, save_path):
super(ModelSaver, self).__init__(save_path)

+ 9
- 12
test/test_word_seg.py View File

@@ -1,23 +1,20 @@
from loader.base_loader import BaseLoader
from model.word_seg_model import WordSegModel
from fastNLP.action.tester import Tester
from fastNLP.action.trainer import WordSegTrainer
from fastNLP.loader.base_loader import BaseLoader
from fastNLP.models.word_seg_model import WordSeg

from fastNLP.action import Tester
from fastNLP.action.trainer import Trainer


def test_charlm():
train_config = Trainer.TrainConfig(epochs=5, validate=False, save_when_better=False,
def test_wordseg():
train_config = WordSegTrainer.TrainConfig(epochs=5, validate=False, save_when_better=False,
log_per_step=10, log_validation=False, batch_size=254)
trainer = Trainer(train_config)
trainer = WordSegTrainer(train_config)

model = WordSegModel()
model = WordSeg(100, 2, 1000)

train_data = BaseLoader("load_train", "./data_for_tests/cws_train").load_lines()

trainer.train(model, train_data)

trainer.save_model(model)

test_config = Tester.TestConfig(save_output=False, validate_in_training=False,
save_dev_input=False, save_loss=False, batch_size=254)
tester = Tester(test_config)
@@ -28,4 +25,4 @@ def test_charlm():


if __name__ == "__main__":
test_charlm()
test_wordseg()
