
Merge pull request #10 from fastnlp/master

update
tags/v0.3.0
lyhuang18 (GitHub) committed 5 years ago
commit 7d0efebb1d
50 changed files with 1800 additions and 1421 deletions
  1. +3 -3 README.md
  2. +12 -6 docs/source/fastNLP.core.rst
  3. +0 -36 docs/source/fastNLP.modules.aggregation.rst
  4. +36 -0 docs/source/fastNLP.modules.aggregator.rst
  5. +0 -5 docs/source/fastNLP.modules.interaction.rst
  6. +5 -0 docs/source/fastNLP.modules.interactor.rst
  7. +2 -2 docs/source/fastNLP.modules.rst
  8. +76 -222 fastNLP/core/dataset.py
  9. +38 -0 fastNLP/core/field.py
  10. +29 -1 fastNLP/core/instance.py
  11. +14 -0 fastNLP/core/metrics.py
  12. +3 -2 fastNLP/core/predictor.py
  13. +0 -262 fastNLP/core/preprocess.py
  14. +8 -0 fastNLP/core/tester.py
  15. +14 -2 fastNLP/core/trainer.py
  16. +77 -27 fastNLP/core/vocabulary.py
  17. +7 -8 fastNLP/fastnlp.py
  18. +3 -2 fastNLP/loader/config_loader.py
  19. +196 -9 fastNLP/loader/dataset_loader.py
  20. +61 -26 fastNLP/loader/embed_loader.py
  21. +5 -65 fastNLP/models/base_model.py
  22. +364 -0 fastNLP/models/biaffine_parser.py
  23. +1 -1 fastNLP/models/char_language_model.py
  24. +161 -0 fastNLP/models/snli.py
  25. +10 -9 fastNLP/modules/decoder/MLP.py
  26. +15 -0 fastNLP/modules/dropout.py
  27. +7 -21 fastNLP/modules/encoder/char_embedding.py
  28. +4 -1 fastNLP/modules/encoder/linear.py
  29. +13 -6 fastNLP/modules/encoder/lstm.py
  30. +123 -354 fastNLP/modules/encoder/variational_rnn.py
  31. +37 -0 reproduction/Biaffine_parser/cfg.cfg
  32. +260 -0 reproduction/Biaffine_parser/run.py
  33. +1 -17 reproduction/Char-aware_NLM/main.py
  34. +1 -1 reproduction/chinese_word_segment/run.py
  35. +1 -12 test/core/test_batch.py
  36. +6 -195 test/core/test_dataset.py
  37. +6 -7 test/core/test_predictor.py
  38. +0 -72 test/core/test_preprocess.py
  39. +2 -2 test/core/test_tester.py
  40. +2 -2 test/core/test_trainer.py
  41. +25 -0 test/data_for_tests/config
  42. +12 -0 test/data_for_tests/glove.6B.50d_test.txt
  43. +12 -2 test/loader/test_dataset_loader.py
  44. +33 -0 test/loader/test_embed_loader.py
  45. +15 -3 test/model/seq_labeling.py
  46. +25 -0 test/model/test_char_language_model.py
  47. +19 -13 test/model/test_cws.py
  48. +14 -8 test/model/test_seq_label.py
  49. +28 -0 test/modules/test_char_embedding.py
  50. +14 -17 test/modules/test_variational_rnn.py

+3 -3 README.md

@@ -8,8 +8,8 @@

fastNLP is a modular Natural Language Processing system based on PyTorch, built for fast development of NLP tools. It divides deep-learning NLP models into different modules that fall into four categories: encoder, interaction, aggregation and decoder, and each category contains several implemented modules. Encoder modules encode the input into an abstract representation, interaction modules let the information in the representation interact with itself, aggregation modules aggregate and reduce information, and decoder modules decode the representation into the output. Most current NLP models can be built from these modules, which vastly simplifies the process of developing NLP models. The architecture of fastNLP is shown in the figure below:

![](https://github.com/fastnlp/fastNLP/raw/master/fastnlp-architecture.jpg)
![](https://github.com/fastnlp/fastNLP/raw/master/docs/source/figures/procedures.PNG)
![](https://github.com/fastnlp/fastNLP/raw/master/docs/source/figures/text_classification.png)

## Requirements

@@ -62,4 +62,4 @@ pip install fastNLP
<td><b> fastNLP.fastnlp </b></td>
<td> a high-level interface for prediction </td>
</tr>
</table>
</table>
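
The README paragraph above describes fastNLP's four module categories. A generic PyTorch sketch of that decomposition (plain torch.nn, not fastNLP's actual module classes; the interaction stage is omitted for brevity) may help map the categories onto code:

import torch
from torch import nn

class TinyClassifier(nn.Module):
    """Toy text classifier organised along fastNLP's encoder / aggregation / decoder split."""
    def __init__(self, vocab_size=1000, emb_dim=50, hidden=64, num_classes=2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden, batch_first=True)   # encoder: input -> representation
        self.decoder = nn.Linear(hidden, num_classes)                # decoder: representation -> output

    def forward(self, word_idx):
        hidden_states, _ = self.encoder(self.embed(word_idx))
        pooled = hidden_states.mean(dim=1)                           # aggregation: reduce over the sequence
        return self.decoder(pooled)

logits = TinyClassifier()(torch.randint(0, 1000, (4, 7)))           # batch of 4 sequences of length 7
print(logits.shape)                                                  # torch.Size([4, 2])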

+12 -6 docs/source/fastNLP.core.rst

@@ -1,12 +1,6 @@
fastNLP.core
=============

fastNLP.core.action
--------------------

.. automodule:: fastNLP.core.action
:members:

fastNLP.core.batch
-------------------

@@ -61,6 +55,12 @@ fastNLP.core.preprocess
.. automodule:: fastNLP.core.preprocess
:members:

fastNLP.core.sampler
---------------------

.. automodule:: fastNLP.core.sampler
:members:

fastNLP.core.tester
--------------------

@@ -73,6 +73,12 @@ fastNLP.core.trainer
.. automodule:: fastNLP.core.trainer
:members:

fastNLP.core.vocabulary
------------------------

.. automodule:: fastNLP.core.vocabulary
:members:


.. automodule:: fastNLP.core
:members:

+0 -36 docs/source/fastNLP.modules.aggregation.rst

@@ -1,36 +0,0 @@
fastNLP.modules.aggregation
============================

fastNLP.modules.aggregation.attention
--------------------------------------

.. automodule:: fastNLP.modules.aggregation.attention
:members:

fastNLP.modules.aggregation.avg\_pool
--------------------------------------

.. automodule:: fastNLP.modules.aggregation.avg_pool
:members:

fastNLP.modules.aggregation.kmax\_pool
---------------------------------------

.. automodule:: fastNLP.modules.aggregation.kmax_pool
:members:

fastNLP.modules.aggregation.max\_pool
--------------------------------------

.. automodule:: fastNLP.modules.aggregation.max_pool
:members:

fastNLP.modules.aggregation.self\_attention
--------------------------------------------

.. automodule:: fastNLP.modules.aggregation.self_attention
:members:


.. automodule:: fastNLP.modules.aggregation
:members:

+36 -0 docs/source/fastNLP.modules.aggregator.rst

@@ -0,0 +1,36 @@
fastNLP.modules.aggregator
===========================

fastNLP.modules.aggregator.attention
-------------------------------------

.. automodule:: fastNLP.modules.aggregator.attention
:members:

fastNLP.modules.aggregator.avg\_pool
-------------------------------------

.. automodule:: fastNLP.modules.aggregator.avg_pool
:members:

fastNLP.modules.aggregator.kmax\_pool
--------------------------------------

.. automodule:: fastNLP.modules.aggregator.kmax_pool
:members:

fastNLP.modules.aggregator.max\_pool
-------------------------------------

.. automodule:: fastNLP.modules.aggregator.max_pool
:members:

fastNLP.modules.aggregator.self\_attention
-------------------------------------------

.. automodule:: fastNLP.modules.aggregator.self_attention
:members:


.. automodule:: fastNLP.modules.aggregator
:members:

+0 -5 docs/source/fastNLP.modules.interaction.rst

@@ -1,5 +0,0 @@
fastNLP.modules.interaction
============================

.. automodule:: fastNLP.modules.interaction
:members:

+5 -0 docs/source/fastNLP.modules.interactor.rst

@@ -0,0 +1,5 @@
fastNLP.modules.interactor
===========================

.. automodule:: fastNLP.modules.interactor
:members:

+2 -2 docs/source/fastNLP.modules.rst

@@ -3,10 +3,10 @@ fastNLP.modules

.. toctree::

fastNLP.modules.aggregation
fastNLP.modules.aggregator
fastNLP.modules.decoder
fastNLP.modules.encoder
fastNLP.modules.interaction
fastNLP.modules.interactor

fastNLP.modules.other\_modules
-------------------------------


+76 -222 fastNLP/core/dataset.py

@@ -6,91 +6,45 @@ from copy import deepcopy
from fastNLP.core.field import TextField, LabelField
from fastNLP.core.instance import Instance
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.loader.dataset_loader import POSDataSetLoader, ClassDataSetLoader


def create_dataset_from_lists(str_lists: list, word_vocab: dict, has_target: bool = False, label_vocab: dict = None):
if has_target is True:
if label_vocab is None:
raise RuntimeError("Must provide label vocabulary to transform labels.")
return create_labeled_dataset_from_lists(str_lists, word_vocab, label_vocab)
else:
return create_unlabeled_dataset_from_lists(str_lists, word_vocab)


def create_labeled_dataset_from_lists(str_lists, word_vocab, label_vocab):
"""Create an DataSet instance that contains labels.

:param str_lists: list of list of strings, [num_examples, 2, *].
::
[
[[word_11, word_12, ...], [label_11, label_12, ...]],
...
]

:param word_vocab: dict of (str: int), which means (word: index).
:param label_vocab: dict of (str: int), which means (word: index).
:return data_set: a DataSet instance.

"""
data_set = DataSet()
for example in str_lists:
word_seq, label_seq = example[0], example[1]
x = TextField(word_seq, is_target=False)
y = TextField(label_seq, is_target=True)
data_set.append(Instance(word_seq=x, label_seq=y))
data_set.index_field("word_seq", word_vocab)
data_set.index_field("label_seq", label_vocab)
return data_set


def create_unlabeled_dataset_from_lists(str_lists, word_vocab):
"""Create an DataSet instance that contains no labels.

:param str_lists: list of list of strings, [num_examples, *].
::
[
[word_11, word_12, ...],
...
]

:param word_vocab: dict of (str: int), which means (word: index).
:return data_set: a DataSet instance.

"""
data_set = DataSet()
for word_seq in str_lists:
x = TextField(word_seq, is_target=False)
data_set.append(Instance(word_seq=x))
data_set.index_field("word_seq", word_vocab)
return data_set

_READERS = {}

class DataSet(list):
"""A DataSet object is a list of Instance objects.

"""

def __init__(self, name="", instances=None, load_func=None):
def __init__(self, name="", instances=None):
"""

:param name: str, the name of the dataset. (default: "")
:param instances: list of Instance objects. (default: None)
:param load_func: a function that takes the dataset path (string) as input and returns multi-level lists.
"""
list.__init__([])
self.name = name
self.origin_len = None
if instances is not None:
self.extend(instances)
self.data_set_load_func = load_func

def index_all(self, vocab):
for ins in self:
ins.index_all(vocab)
return self

def index_field(self, field_name, vocab):
for ins in self:
ins.index_field(field_name, vocab)
if isinstance(field_name, str):
field_list = [field_name]
vocab_list = [vocab]
else:
classes = (list, tuple)
assert isinstance(field_name, classes) and isinstance(vocab, classes) and len(field_name) == len(vocab)
field_list = field_name
vocab_list = vocab

for name, vocabs in zip(field_list, vocab_list):
for ins in self:
ins.index_field(name, vocabs)
return self

def to_tensor(self, idx: int, padding_length: dict):
"""Convert an instance in a dataset to tensor.
@@ -102,7 +56,7 @@ class DataSet(list):

"""
ins = self[idx]
return ins.to_tensor(padding_length)
return ins.to_tensor(padding_length, self.origin_len)

def get_length(self):
"""Fetch lengths of all fields in all instances in a dataset.
@@ -117,42 +71,9 @@ class DataSet(list):
lengths[field_name].append(field_length)
return lengths

def convert(self, data):
"""Convert lists of strings into Instances with Fields, creating Vocabulary for labeled data. Used in Training."""
raise NotImplementedError

def convert_with_vocabs(self, data, vocabs):
"""Convert lists of strings into Instances with Fields, using existing Vocabulary, with labels. Used in Testing."""
raise NotImplementedError

def convert_for_infer(self, data, vocabs):
"""Convert lists of strings into Instances with Fields, using existing Vocabulary, without labels. Used in predicting."""

def load(self, data_path, vocabs=None, infer=False):
"""Load data from the given files.

:param data_path: str, the path to the data
:param infer: bool. If True, there is no label information in the data. Default: False.
:param vocabs: dict of (name: Vocabulary object), used to index data. If not provided, a new vocabulary will be constructed.

"""
raw_data = self.data_set_load_func(data_path)
if infer is True:
self.convert_for_infer(raw_data, vocabs)
else:
if vocabs is not None:
self.convert_with_vocabs(raw_data, vocabs)
else:
self.convert(raw_data)

def load_raw(self, raw_data, vocabs):
"""Load raw data without loader. Used in FastNLP class.

:param raw_data:
:param vocabs:
:return:
"""
self.convert_for_infer(raw_data, vocabs)
def shuffle(self):
random.shuffle(self)
return self

def split(self, ratio, shuffle=True):
"""Train/dev splitting
@@ -165,7 +86,7 @@ class DataSet(list):
"""
assert 0 < ratio < 1
if shuffle:
random.shuffle(self)
self.shuffle()
split_idx = int(len(self) * ratio)
dev_set = deepcopy(self)
train_set = deepcopy(self)
@@ -173,134 +94,67 @@ class DataSet(list):
del dev_set[split_idx:]
return train_set, dev_set


class SeqLabelDataSet(DataSet):
def __init__(self, instances=None, load_func=POSDataSetLoader().load):
super(SeqLabelDataSet, self).__init__(name="", instances=instances, load_func=load_func)
self.word_vocab = Vocabulary()
self.label_vocab = Vocabulary()

def convert(self, data):
"""Convert lists of strings into Instances with Fields.

:param data: 3-level lists. Entries are strings.
def rename_field(self, old_name, new_name):
"""rename a field
"""
bar = ProgressBar(total=len(data))
for example in data:
word_seq, label_seq = example[0], example[1]
# list, list
self.word_vocab.update(word_seq)
self.label_vocab.update(label_seq)
x = TextField(word_seq, is_target=False)
x_len = LabelField(len(word_seq), is_target=False)
y = TextField(label_seq, is_target=False)
instance = Instance()
instance.add_field("word_seq", x)
instance.add_field("truth", y)
instance.add_field("word_seq_origin_len", x_len)
self.append(instance)
bar.move()
self.index_field("word_seq", self.word_vocab)
self.index_field("truth", self.label_vocab)
# no need to index "word_seq_origin_len"

def convert_with_vocabs(self, data, vocabs):
for example in data:
word_seq, label_seq = example[0], example[1]
# list, list
x = TextField(word_seq, is_target=False)
x_len = LabelField(len(word_seq), is_target=False)
y = TextField(label_seq, is_target=False)
instance = Instance()
instance.add_field("word_seq", x)
instance.add_field("truth", y)
instance.add_field("word_seq_origin_len", x_len)
self.append(instance)
self.index_field("word_seq", vocabs["word_vocab"])
self.index_field("truth", vocabs["label_vocab"])
# no need to index "word_seq_origin_len"

def convert_for_infer(self, data, vocabs):
for word_seq in data:
# list
x = TextField(word_seq, is_target=False)
x_len = LabelField(len(word_seq), is_target=False)
instance = Instance()
instance.add_field("word_seq", x)
instance.add_field("word_seq_origin_len", x_len)
self.append(instance)
self.index_field("word_seq", vocabs["word_vocab"])
# no need to index "word_seq_origin_len"


class TextClassifyDataSet(DataSet):
def __init__(self, instances=None, load_func=ClassDataSetLoader().load):
super(TextClassifyDataSet, self).__init__(name="", instances=instances, load_func=load_func)
self.word_vocab = Vocabulary()
self.label_vocab = Vocabulary(need_default=False)

def convert(self, data):
for example in data:
word_seq, label = example[0], example[1]
# list, str
self.word_vocab.update(word_seq)
self.label_vocab.update(label)
x = TextField(word_seq, is_target=False)
y = LabelField(label, is_target=True)
instance = Instance()
instance.add_field("word_seq", x)
instance.add_field("label", y)
self.append(instance)
self.index_field("word_seq", self.word_vocab)
self.index_field("label", self.label_vocab)

def convert_with_vocabs(self, data, vocabs):
for example in data:
word_seq, label = example[0], example[1]
# list, str
x = TextField(word_seq, is_target=False)
y = LabelField(label, is_target=True)
instance = Instance()
instance.add_field("word_seq", x)
instance.add_field("label", y)
self.append(instance)
self.index_field("word_seq", vocabs["word_vocab"])
self.index_field("label", vocabs["label_vocab"])
for ins in self:
ins.rename_field(old_name, new_name)
return self

def convert_for_infer(self, data, vocabs):
for word_seq in data:
# list
x = TextField(word_seq, is_target=False)
instance = Instance()
instance.add_field("word_seq", x)
self.append(instance)
self.index_field("word_seq", vocabs["word_vocab"])
def set_target(self, **fields):
"""Change the flag of `is_target` for all instance. For fields not set here, leave their `is_target` unchanged.

:param key-value pairs for field-name and `is_target` value(True, False or None).
"""
for ins in self:
ins.set_target(**fields)
return self

def change_field_is_target(data_set, field_name, new_target):
"""Change the flag of is_target in a field.
def update_vocab(self, **name_vocab):
"""using certain field data to update vocabulary.

:param data_set: a DataSet object
:param field_name: str, the name of the field
:param new_target: one of (True, False, None), representing this field is batch_x / is batch_y / neither.
e.g. ::

"""
for inst in data_set:
inst.fields[field_name].is_target = new_target
# update word vocab and label vocab separately
dataset.update_vocab(word_seq=word_vocab, label_seq=label_vocab)
"""
for field_name, vocab in name_vocab.items():
for ins in self:
vocab.update(ins[field_name].contents())
return self

def set_origin_len(self, origin_field, origin_len_name=None):
"""make dataset tensor output contain origin_len field.

class ProgressBar:
e.g. ::

def __init__(self, count=0, total=0, width=100):
self.count = count
self.total = total
self.width = width
# output "word_seq_origin_len", lengths based on "word_seq" field
dataset.set_origin_len("word_seq")
"""
if origin_field is None:
self.origin_len = None
else:
self.origin_len = (origin_field + "_origin_len", origin_field) \
if origin_len_name is None else (origin_len_name, origin_field)
return self

def __getattribute__(self, name):
if name in _READERS:
# add read_*data() support
def _read(*args, **kwargs):
data = _READERS[name]().load(*args, **kwargs)
self.extend(data)
return self
return _read
else:
return object.__getattribute__(self, name)

def move(self):
self.count += 1
progress = self.width * self.count // self.total
sys.stdout.write('{0:3}/{1:3}: '.format(self.count, self.total))
sys.stdout.write('#' * progress + '-' * (self.width - progress) + '\r')
if progress == self.width:
sys.stdout.write('\n')
sys.stdout.flush()
@classmethod
def set_reader(cls, method_name):
"""decorator to add dataloader support
"""
assert isinstance(method_name, str)
def wrapper(read_cls):
_READERS[method_name] = read_cls
return read_cls
return wrapper
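
The rewritten DataSet replaces the old convert_*/load pipeline with small composable methods. A minimal sketch of how they chain together, assuming the API added in this file (the toy data is made up):

from fastNLP.core.dataset import DataSet
from fastNLP.core.field import TextField
from fastNLP.core.instance import Instance
from fastNLP.core.vocabulary import Vocabulary

ds = DataSet()
for words, tags in [(["I", "like", "NLP"], ["O", "O", "B"])]:        # toy example
    ins = Instance()
    ins.add_field("word_seq", TextField(words, is_target=False)) \
       .add_field("truth", TextField(tags, is_target=True))
    ds.append(ins)

word_vocab, label_vocab = Vocabulary(), Vocabulary()
ds.update_vocab(word_seq=word_vocab, truth=label_vocab)              # count tokens field by field
ds.index_field(["word_seq", "truth"], [word_vocab, label_vocab])     # index several fields at once
ds.set_target(truth=True)                                            # mark "truth" as part of batch_y
ds.set_origin_len("word_seq")                                        # emit "word_seq_origin_len" in tensors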

+38 -0 fastNLP/core/field.py

@@ -18,6 +18,8 @@ class Field(object):
def to_tensor(self, padding_length):
raise NotImplementedError

def contents(self):
raise NotImplementedError

class TextField(Field):
def __init__(self, text, is_target):
@@ -57,6 +59,8 @@ class TextField(Field):
pads = [0] * (padding_length - self.get_length())
return torch.LongTensor(self._index + pads)

def contents(self):
return self.text.copy()

class LabelField(Field):
"""The Field representing a single label. Can be a string or integer.
@@ -92,6 +96,40 @@ class LabelField(Field):
else:
return torch.LongTensor([self._index])

def contents(self):
return [self.label]

class SeqLabelField(Field):
def __init__(self, label_seq, is_target=True):
super(SeqLabelField, self).__init__(is_target)
self.label_seq = label_seq
self._index = None

def get_length(self):
return len(self.label_seq)

def index(self, vocab):
if self._index is None:
self._index = [vocab[c] for c in self.label_seq]
return self._index

def to_tensor(self, padding_length):
pads = [0] * (padding_length - self.get_length())
if self._index is None:
if self.get_length() == 0:
return torch.LongTensor(pads)
elif isinstance(self.label_seq[0], int):
return torch.LongTensor(self.label_seq + pads)
elif isinstance(self.label_seq[0], str):
raise RuntimeError("Field {} not indexed. Call index method.".format(self.label))
else:
raise RuntimeError(
"Not support type for SeqLabelField. Expect str or int, got {}.".format(type(self.label)))
else:
return torch.LongTensor(self._index + pads)

def contents(self):
return self.label_seq.copy()

if __name__ == "__main__":
tf = TextField("test the code".split(), is_target=False)
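
The contents() hooks above are what DataSet.update_vocab reads, and SeqLabelField pads integer label sequences with zeros. A small behaviour sketch, assuming this commit's code:

from fastNLP.core.field import SeqLabelField, TextField

words = TextField("a tiny test".split(), is_target=False)
print(words.contents())                    # ['a', 'tiny', 'test'] - raw tokens for vocabulary building

tags = SeqLabelField([1, 0, 1], is_target=True)
print(tags.get_length())                   # 3
print(tags.to_tensor(padding_length=5))    # tensor([1, 0, 1, 0, 0]) - integer labels padded with 0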

+29 -1 fastNLP/core/instance.py

@@ -1,3 +1,5 @@
import torch

class Instance(object):
"""An instance which consists of Fields is an example in the DataSet.

@@ -10,6 +12,28 @@ class Instance(object):

def add_field(self, field_name, field):
self.fields[field_name] = field
return self

def rename_field(self, old_name, new_name):
if old_name in self.fields:
self.fields[new_name] = self.fields.pop(old_name)
if old_name in self.indexes:
self.indexes[new_name] = self.indexes.pop(old_name)
else:
raise KeyError("error, no such field: {}".format(old_name))
return self

def set_target(self, **fields):
for name, val in fields.items():
if name in self.fields:
self.fields[name].is_target = val
return self

def __getitem__(self, name):
if name in self.fields:
return self.fields[name]
else:
raise KeyError("{} not found".format(name))

def get_length(self):
"""Fetch the length of all fields in the instance.
@@ -24,6 +48,7 @@ class Instance(object):
"""use `vocab` to index certain field
"""
self.indexes[field_name] = self.fields[field_name].index(vocab)
return self

def index_all(self, vocab):
"""use `vocab` to index all fields
@@ -35,7 +60,7 @@ class Instance(object):
self.indexes = indexes
return indexes

def to_tensor(self, padding_length: dict):
def to_tensor(self, padding_length: dict, origin_len=None):
"""Convert instance to tensor.

:param padding_length: dict of (str: int), which means (field name: padding_length of this field)
@@ -53,4 +78,7 @@ class Instance(object):
else:
# is_target is None
continue
if origin_len is not None:
name, field_name = origin_len
tensor_x[name] = torch.LongTensor([self.fields[field_name].get_length()])
return tensor_x, tensor_y
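
Instance now supports chained add_field, rename_field and keyword-based set_target, which the new DataSet methods rely on. A short sketch with made-up fields:

from fastNLP.core.field import LabelField, TextField
from fastNLP.core.instance import Instance

ins = Instance()
ins.add_field("word_seq", TextField(["fast", "NLP"], is_target=False)) \
   .add_field("label", LabelField("positive", is_target=False))
ins.rename_field("label", "truth")        # the field and any cached index move to the new name
ins.set_target(truth=True)                # flip is_target by keyword; names not present are ignored
print(ins["truth"].is_target)             # True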

+14 -0 fastNLP/core/metrics.py

@@ -57,6 +57,20 @@ class SeqLabelEvaluator(Evaluator):
return {"accuracy": float(accuracy)}


class SNLIEvaluator(Evaluator):
def __init__(self):
super(SNLIEvaluator, self).__init__()

def __call__(self, predict, truth):
y_prob = [torch.nn.functional.softmax(y_logit, dim=-1) for y_logit in predict]
y_prob = torch.cat(y_prob, dim=0)
y_pred = torch.argmax(y_prob, dim=-1)
truth = [t['truth'] for t in truth]
y_true = torch.cat(truth, dim=0).view(-1)
acc = float(torch.sum(y_pred == y_true)) / y_true.size(0)
return {"accuracy": acc}


def _conver_numpy(x):
"""convert input data to numpy array



+3 -2 fastNLP/core/predictor.py

@@ -2,9 +2,9 @@ import numpy as np
import torch

from fastNLP.core.batch import Batch
from fastNLP.core.dataset import create_dataset_from_lists
from fastNLP.core.preprocess import load_pickle
from fastNLP.core.sampler import SequentialSampler
from fastNLP.loader.dataset_loader import convert_seq2seq_dataset, convert_seq2tag_dataset, convert_seq_dataset


class Predictor(object):
@@ -79,7 +79,8 @@ class Predictor(object):
:return data_set: a DataSet instance.
"""
assert isinstance(data, list)
return create_dataset_from_lists(data, self.word_vocab, has_target=False)
data = convert_seq_dataset(data)
data.index_field("word_seq", self.word_vocab)


class SeqLabelInfer(Predictor):


+0 -262 fastNLP/core/preprocess.py

@@ -1,13 +1,6 @@
import _pickle
import os

import numpy as np

from fastNLP.core.dataset import DataSet
from fastNLP.core.field import TextField, LabelField
from fastNLP.core.instance import Instance
from fastNLP.core.vocabulary import Vocabulary


# the first vocab in dict with the index = 5

@@ -53,258 +46,3 @@ def pickle_exist(pickle_path, pickle_name):
return True
else:
return False


class Preprocessor(object):
"""Preprocessors are responsible for converting data of strings into data of indices.
During the pre-processing, the following pickle files will be built:

- "word2id.pkl", a Vocabulary object, mapping words to indices.
- "class2id.pkl", a Vocabulary object, mapping labels to indices.
- "data_train.pkl", a DataSet object for training
- "data_dev.pkl", a DataSet object for validation, if train_dev_split > 0.
- "data_test.pkl", a DataSet object for testing, if test_data is not None.

These four pickle files are expected to be saved in the given pickle directory once they are constructed.
Preprocessors will check if those files are already in the directory and will reuse them in future calls.
"""

def __init__(self, label_is_seq=False, share_vocab=False, add_char_field=False):
"""

:param label_is_seq: bool, whether label is a sequence. If True, label vocabulary will preserve
several special tokens for sequence processing.
:param share_vocab: bool, whether word sequence and label sequence share the same vocabulary. Typically, this
is only available when label_is_seq is True. Default: False.
:param add_char_field: bool, whether to add character representations to all TextFields. Default: False.
"""
print("Preprocessor is about to deprecate. Please use DataSet class.")
self.data_vocab = Vocabulary()
if label_is_seq is True:
if share_vocab is True:
self.label_vocab = self.data_vocab
else:
self.label_vocab = Vocabulary()
else:
self.label_vocab = Vocabulary(need_default=False)

self.character_vocab = Vocabulary(need_default=False)
self.add_char_field = add_char_field

@property
def vocab_size(self):
return len(self.data_vocab)

@property
def num_classes(self):
return len(self.label_vocab)

@property
def char_vocab_size(self):
if self.character_vocab is None:
self.build_char_dict()
return len(self.character_vocab)

def run(self, train_dev_data, test_data=None, pickle_path="./", train_dev_split=0, cross_val=False, n_fold=10):
"""Main pre-processing pipeline.

:param train_dev_data: three-level list, with either single label or multiple labels in a sample.
:param test_data: three-level list, with either single label or multiple labels in a sample. (optional)
:param pickle_path: str, the path to save the pickle files.
:param train_dev_split: float, between [0, 1]. The ratio of training data used as validation set.
:param cross_val: bool, whether to do cross validation.
:param n_fold: int, the number of folds of cross validation. Only useful when cross_val is True.
:return results: multiple datasets after pre-processing. If test_data is provided, return one more dataset.
If train_dev_split > 0, return one more dataset - the dev set. If cross_val is True, each dataset
is a list of DataSet objects; Otherwise, each dataset is a DataSet object.
"""
if pickle_exist(pickle_path, "word2id.pkl") and pickle_exist(pickle_path, "class2id.pkl"):
self.data_vocab = load_pickle(pickle_path, "word2id.pkl")
self.label_vocab = load_pickle(pickle_path, "class2id.pkl")
else:
self.data_vocab, self.label_vocab = self.build_dict(train_dev_data)
save_pickle(self.data_vocab, pickle_path, "word2id.pkl")
save_pickle(self.label_vocab, pickle_path, "class2id.pkl")

self.build_reverse_dict()

train_set = []
dev_set = []
if not cross_val:
if not pickle_exist(pickle_path, "data_train.pkl"):
if train_dev_split > 0 and not pickle_exist(pickle_path, "data_dev.pkl"):
split = int(len(train_dev_data) * train_dev_split)
data_dev = train_dev_data[: split]
data_train = train_dev_data[split:]
train_set = self.convert_to_dataset(data_train, self.data_vocab, self.label_vocab)
dev_set = self.convert_to_dataset(data_dev, self.data_vocab, self.label_vocab)

save_pickle(dev_set, pickle_path, "data_dev.pkl")
print("{} of the training data is split for validation. ".format(train_dev_split))
else:
train_set = self.convert_to_dataset(train_dev_data, self.data_vocab, self.label_vocab)
save_pickle(train_set, pickle_path, "data_train.pkl")
else:
train_set = load_pickle(pickle_path, "data_train.pkl")
if pickle_exist(pickle_path, "data_dev.pkl"):
dev_set = load_pickle(pickle_path, "data_dev.pkl")
else:
# cross_val is True
if not pickle_exist(pickle_path, "data_train_0.pkl"):
# cross validation
data_cv = self.cv_split(train_dev_data, n_fold)
for i, (data_train_cv, data_dev_cv) in enumerate(data_cv):
data_train_cv = self.convert_to_dataset(data_train_cv, self.data_vocab, self.label_vocab)
data_dev_cv = self.convert_to_dataset(data_dev_cv, self.data_vocab, self.label_vocab)
save_pickle(
data_train_cv, pickle_path,
"data_train_{}.pkl".format(i))
save_pickle(
data_dev_cv, pickle_path,
"data_dev_{}.pkl".format(i))
train_set.append(data_train_cv)
dev_set.append(data_dev_cv)
print("{}-fold cross validation.".format(n_fold))
else:
for i in range(n_fold):
data_train_cv = load_pickle(pickle_path, "data_train_{}.pkl".format(i))
data_dev_cv = load_pickle(pickle_path, "data_dev_{}.pkl".format(i))
train_set.append(data_train_cv)
dev_set.append(data_dev_cv)

# prepare test data if provided
test_set = []
if test_data is not None:
if not pickle_exist(pickle_path, "data_test.pkl"):
test_set = self.convert_to_dataset(test_data, self.data_vocab, self.label_vocab)
save_pickle(test_set, pickle_path, "data_test.pkl")

# return preprocessed results
results = [train_set]
if cross_val or train_dev_split > 0:
results.append(dev_set)
if test_data:
results.append(test_set)
if len(results) == 1:
return results[0]
else:
return tuple(results)

def build_dict(self, data):
for example in data:
word, label = example
self.data_vocab.update(word)
self.label_vocab.update(label)
return self.data_vocab, self.label_vocab

def build_char_dict(self):
char_collection = set()
for word in self.data_vocab.word2idx:
if len(word) == 0:
continue
for ch in word:
if ch not in char_collection:
char_collection.add(ch)
self.character_vocab.update(list(char_collection))

def build_reverse_dict(self):
self.data_vocab.build_reverse_vocab()
self.label_vocab.build_reverse_vocab()

def data_split(self, data, train_dev_split):
"""Split data into train and dev set."""
split = int(len(data) * train_dev_split)
data_dev = data[: split]
data_train = data[split:]
return data_train, data_dev

def cv_split(self, data, n_fold):
"""Split data for cross validation.

:param data: list of string
:param n_fold: int
:return data_cv:

::
[
(data_train, data_dev), # 1st fold
(data_train, data_dev), # 2nd fold
...
]

"""
data_copy = data.copy()
np.random.shuffle(data_copy)
fold_size = round(len(data_copy) / n_fold)
data_cv = []
for i in range(n_fold - 1):
start = i * fold_size
end = (i + 1) * fold_size
data_dev = data_copy[start:end]
data_train = data_copy[:start] + data_copy[end:]
data_cv.append((data_train, data_dev))
start = (n_fold - 1) * fold_size
data_dev = data_copy[start:]
data_train = data_copy[:start]
data_cv.append((data_train, data_dev))
return data_cv

def convert_to_dataset(self, data, vocab, label_vocab):
"""Convert list of indices into a DataSet object.

:param data: list. Entries are strings.
:param vocab: a dict, mapping string (token) to index (int).
:param label_vocab: a dict, mapping string (label) to index (int).
:return data_set: a DataSet object
"""
use_word_seq = False
use_label_seq = False
use_label_str = False

# construct a DataSet object and fill it with Instances
data_set = DataSet()
for example in data:
words, label = example[0], example[1]
instance = Instance()

if isinstance(words, list):
x = TextField(words, is_target=False)
instance.add_field("word_seq", x)
use_word_seq = True
else:
raise NotImplementedError("words is a {}".format(type(words)))

if isinstance(label, list):
y = TextField(label, is_target=True)
instance.add_field("label_seq", y)
use_label_seq = True
elif isinstance(label, str):
y = LabelField(label, is_target=True)
instance.add_field("label", y)
use_label_str = True
else:
raise NotImplementedError("label is a {}".format(type(label)))
data_set.append(instance)

# convert strings to indices
if use_word_seq:
data_set.index_field("word_seq", vocab)
if use_label_seq:
data_set.index_field("label_seq", label_vocab)
if use_label_str:
data_set.index_field("label", label_vocab)

return data_set


class SeqLabelPreprocess(Preprocessor):
def __init__(self):
print("[FastNLP warning] SeqLabelPreprocess is about to deprecate. Please use Preprocess directly.")
super(SeqLabelPreprocess, self).__init__()


class ClassPreprocess(Preprocessor):
def __init__(self):
print("[FastNLP warning] ClassPreprocess is about to deprecate. Please use Preprocess directly.")
super(ClassPreprocess, self).__init__()


+8 -0 fastNLP/core/tester.py

@@ -83,6 +83,7 @@ class Tester(object):
truth_list.append(batch_y)
eval_results = self.evaluate(output_list, truth_list)
print("[tester] {}".format(self.print_eval_results(eval_results)))
logger.info("[tester] {}".format(self.print_eval_results(eval_results)))

def mode(self, model, is_test=False):
"""Train mode or Test mode. This is for PyTorch currently.
@@ -131,3 +132,10 @@ class ClassificationTester(Tester):
print(
"[FastNLP Warning] ClassificationTester will be deprecated. Please use Tester directly.")
super(ClassificationTester, self).__init__(**test_args)


class SNLITester(Tester):
def __init__(self, **test_args):
print(
"[FastNLP Warning] SNLITester will be deprecated. Please use Tester directly.")
super(SNLITester, self).__init__(**test_args)

+14 -2 fastNLP/core/trainer.py

@@ -10,7 +10,7 @@ from fastNLP.core.loss import Loss
from fastNLP.core.metrics import Evaluator
from fastNLP.core.optimizer import Optimizer
from fastNLP.core.sampler import RandomSampler
from fastNLP.core.tester import SeqLabelTester, ClassificationTester
from fastNLP.core.tester import SeqLabelTester, ClassificationTester, SNLITester
from fastNLP.saver.logger import create_logger
from fastNLP.saver.model_saver import ModelSaver

@@ -162,7 +162,7 @@ class Trainer(object):
if kwargs["n_print"] > 0 and step % kwargs["n_print"] == 0:
end = time.time()
diff = timedelta(seconds=round(end - kwargs["start"]))
print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.2} time: {}".format(
print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.6} time: {}".format(
kwargs["epoch"], step, loss.data, diff)
print(print_output)
logger.info(print_output)
@@ -292,3 +292,15 @@ class ClassificationTrainer(Trainer):

def _create_validator(self, valid_args):
return ClassificationTester(**valid_args)


class SNLITrainer(Trainer):
"""Trainer for text SNLI."""

def __init__(self, **train_args):
print(
"[FastNLP Warning] SNLITrainer will be deprecated. Please use Trainer directly.")
super(SNLITrainer, self).__init__(**train_args)

def _create_validator(self, valid_args):
return SNLITester(**valid_args)

+77 -27 fastNLP/core/vocabulary.py

@@ -19,6 +19,17 @@ def isiterable(p_object):
return True


def check_build_vocab(func):
def _wrapper(self, *args, **kwargs):
if self.word2idx is None:
self.build_vocab()
self.build_reverse_vocab()
elif self.idx2word is None:
self.build_reverse_vocab()
return func(self, *args, **kwargs)
return _wrapper


class Vocabulary(object):
"""Use for word and index one to one mapping

@@ -30,13 +41,41 @@ class Vocabulary(object):
vocab["word"]
vocab.to_word(5)
"""

def __init__(self, need_default=True):
def __init__(self, need_default=True, max_size=None, min_freq=None):
"""
:param bool need_default: set if the Vocabulary has default labels reserved for sequences. Default: True.
:param int max_size: set the max number of words in Vocabulary. Default: None
:param int min_freq: set the min occur frequency of words in Vocabulary. Default: None
"""
self.max_size = max_size
self.min_freq = min_freq
self.word_count = {}
self.has_default = need_default
self.word2idx = None
self.idx2word = None

def update(self, word):
"""add word or list of words into Vocabulary

:param word: a list of string or a single string
"""
if need_default:
if not isinstance(word, str) and isiterable(word):
# it's a nested list
for w in word:
self.update(w)
else:
# it's a word to be added
if word not in self.word_count:
self.word_count[word] = 1
else:
self.word_count[word] += 1
self.word2idx = None
return self

def build_vocab(self):
"""build 'word to index' dict, and filter the word using `max_size` and `min_freq`
"""
if self.has_default:
self.word2idx = deepcopy(DEFAULT_WORD_TO_INDEX)
self.padding_label = DEFAULT_PADDING_LABEL
self.unknown_label = DEFAULT_UNKNOWN_LABEL
@@ -45,28 +84,28 @@ class Vocabulary(object):
self.padding_label = None
self.unknown_label = None

self.has_default = need_default
self.idx2word = None
words = sorted(self.word_count.items(), key=lambda kv: kv[1], reverse=True)
if self.min_freq is not None:
words = list(filter(lambda kv: kv[1] >= self.min_freq, words))
if self.max_size is not None and len(words) > self.max_size:
words = words[:self.max_size]
for w, _ in words:
self.word2idx[w] = len(self.word2idx)

def build_reverse_vocab(self):
"""build 'index to word' dict based on 'word to index' dict
"""
self.idx2word = {self.word2idx[w] : w for w in self.word2idx}

@check_build_vocab
def __len__(self):
return len(self.word2idx)

def update(self, word):
"""add word or list of words into Vocabulary
:param word: a list of string or a single string
"""
if not isinstance(word, str) and isiterable(word):
# it's a nested list
for w in word:
self.update(w)
else:
# it's a word to be added
if word not in self.word2idx:
self.word2idx[word] = len(self)
if self.idx2word is not None:
self.idx2word = None
@check_build_vocab
def has_word(self, w):
return w in self.word2idx

@check_build_vocab
def __getitem__(self, w):
"""To support usage like::

@@ -74,32 +113,35 @@ class Vocabulary(object):
"""
if w in self.word2idx:
return self.word2idx[w]
else:
elif self.has_default:
return self.word2idx[DEFAULT_UNKNOWN_LABEL]
else:
raise ValueError("word {} not in vocabulary".format(w))

@check_build_vocab
def to_index(self, w):
""" like to_index(w) function, turn a word to the index
if w is not in Vocabulary, return the unknown label
:param str w:
"""
return self[w]

@property
@check_build_vocab
def unknown_idx(self):
if self.unknown_label is None:
return None
return self.word2idx[self.unknown_label]

@property
@check_build_vocab
def padding_idx(self):
if self.padding_label is None:
return None
return self.word2idx[self.padding_label]

def build_reverse_vocab(self):
"""build 'index to word' dict based on 'word to index' dict
"""
self.idx2word = {self.word2idx[w]: w for w in self.word2idx}

@check_build_vocab
def to_word(self, idx):
"""given a word's index, return the word itself

@@ -122,3 +164,11 @@ class Vocabulary(object):
"""
self.__dict__.update(state)
self.idx2word = None

def __contains__(self, item):
"""Check if a word in vocabulary.

:param item: the word
:return: True or False
"""
return self.has_word(item)
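
Vocabulary now accumulates word counts first and builds the index lazily through check_build_vocab, so max_size and min_freq can be applied at build time. A short sketch of the new behaviour with toy tokens:

from fastNLP.core.vocabulary import Vocabulary

vocab = Vocabulary(max_size=10000, min_freq=2)
vocab.update(["the", "fox", "the", "jumps"])   # only frequency counts are updated here
print(vocab["the"])                            # first lookup triggers build_vocab(); "the" passes min_freq=2
print("fox" in vocab)                          # False: "fox" occurred once, below min_freq
print(vocab.to_word(vocab["the"]))             # 'the' - the reverse dict is also built on demand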

+7 -8 fastNLP/fastnlp.py

@@ -1,6 +1,7 @@
import os

from fastNLP.core.dataset import SeqLabelDataSet, TextClassifyDataSet
from fastNLP.core.dataset import DataSet
from fastNLP.loader.dataset_loader import convert_seq_dataset
from fastNLP.core.predictor import SeqLabelInfer, ClassificationInfer
from fastNLP.core.preprocess import load_pickle
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
@@ -178,13 +179,11 @@ class FastNLP(object):
:param infer_input: 2-D lists of strings
:return data_set: a DataSet object
"""
if self.infer_type == "seq_label":
data_set = SeqLabelDataSet()
data_set.load_raw(infer_input, {"word_vocab": self.word_vocab})
return data_set
elif self.infer_type == "text_class":
data_set = TextClassifyDataSet()
data_set.load_raw(infer_input, {"word_vocab": self.word_vocab})
if self.infer_type in ["seq_label", "text_class"]:
data_set = convert_seq_dataset(infer_input)
data_set.index_field("word_seq", self.word_vocab)
if self.infer_type == "seq_label":
data_set.set_origin_len("word_seq")
return data_set
else:
raise RuntimeError("fail to make outputs with infer type {}".format(self.infer_type))


+3 -2 fastNLP/loader/config_loader.py

@@ -8,9 +8,10 @@ from fastNLP.loader.base_loader import BaseLoader
class ConfigLoader(BaseLoader):
"""loader for configuration files"""

def __int__(self, data_path):
def __init__(self, data_path=None):
super(ConfigLoader, self).__init__()
self.config = self.parse(super(ConfigLoader, self).load(data_path))
if data_path is not None:
self.config = self.parse(super(ConfigLoader, self).load(data_path))

@staticmethod
def parse(string):


+196 -9 fastNLP/loader/dataset_loader.py

@@ -1,6 +1,74 @@
import os

from fastNLP.loader.base_loader import BaseLoader
from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance
from fastNLP.core.field import *


def convert_seq_dataset(data):
"""Create an DataSet instance that contains no labels.

:param data: list of list of strings, [num_examples, *].
::
[
[word_11, word_12, ...],
...
]

:return: a DataSet.
"""
dataset = DataSet()
for word_seq in data:
x = TextField(word_seq, is_target=False)
dataset.append(Instance(word_seq=x))
return dataset


def convert_seq2tag_dataset(data):
"""Convert list of data into DataSet

:param data: list of list of strings, [num_examples, *].
::
[
[ [word_11, word_12, ...], label_1 ],
[ [word_21, word_22, ...], label_2 ],
...
]

:return: a DataSet.
"""
dataset = DataSet()
for sample in data:
word_seq, label = sample[0], sample[1]
ins = Instance()
ins.add_field("word_seq", TextField(word_seq, is_target=False)) \
.add_field("label", LabelField(label, is_target=True))
dataset.append(ins)
return dataset


def convert_seq2seq_dataset(data):
"""Convert list of data into DataSet

:param data: list of list of strings, [num_examples, *].
::
[
[ [word_11, word_12, ...], [label_1, label_1, ...] ],
[ [word_21, word_22, ...], [label_2, label_1, ...] ],
...
]

:return: a DataSet.
"""
dataset = DataSet()
for sample in data:
word_seq, label_seq = sample[0], sample[1]
ins = Instance()
ins.add_field("word_seq", TextField(word_seq, is_target=False)) \
.add_field("label_seq", TextField(label_seq, is_target=True))
dataset.append(ins)
return dataset


class DataSetLoader(BaseLoader):
@@ -10,9 +78,33 @@ class DataSetLoader(BaseLoader):
super(DataSetLoader, self).__init__()

def load(self, path):
""" load data in `path` into a dataset
"""
raise NotImplementedError

def convert(self, data):
"""convert list of data into dataset
"""
raise NotImplementedError


@DataSet.set_reader('read_raw')
class RawDataSetLoader(DataSetLoader):
def __init__(self):
super(RawDataSetLoader, self).__init__()

def load(self, data_path, split=None):
with open(data_path, "r", encoding="utf-8") as f:
lines = f.readlines()
lines = lines if split is None else [l.split(split) for l in lines]
lines = list(filter(lambda x: len(x) > 0, lines))
return self.convert(lines)

def convert(self, data):
return convert_seq_dataset(data)


@DataSet.set_reader('read_pos')
class POSDataSetLoader(DataSetLoader):
"""Dataset Loader for POS Tag datasets.

@@ -48,7 +140,8 @@ class POSDataSetLoader(DataSetLoader):
"""
with open(data_path, "r", encoding="utf-8") as f:
lines = f.readlines()
return self.parse(lines)
data = self.parse(lines)
return self.convert(data)

@staticmethod
def parse(lines):
@@ -75,7 +168,13 @@ class POSDataSetLoader(DataSetLoader):
data.append([words, labels])
return data

def convert(self, data):
"""Convert lists of strings into Instances with Fields.
"""
return convert_seq2seq_dataset(data)


@DataSet.set_reader('read_tokenize')
class TokenizeDataSetLoader(DataSetLoader):
"""
Data set loader for tokenization data sets
@@ -84,8 +183,7 @@ class TokenizeDataSetLoader(DataSetLoader):
def __init__(self):
super(TokenizeDataSetLoader, self).__init__()

@staticmethod
def load(data_path, max_seq_len=32):
def load(self, data_path, max_seq_len=32):
"""
load pku dataset for Chinese word segmentation
CWS (Chinese Word Segmentation) pku training dataset format:
@@ -130,9 +228,13 @@ class TokenizeDataSetLoader(DataSetLoader):
seq_words = words[start:end]
seq_labels = labels[start:end]
data.append([seq_words, seq_labels])
return data
return self.convert(data)

def convert(self, data):
return convert_seq2seq_dataset(data)


@DataSet.set_reader('read_class')
class ClassDataSetLoader(DataSetLoader):
"""Loader for classification data sets"""

@@ -143,7 +245,8 @@ class ClassDataSetLoader(DataSetLoader):
assert os.path.exists(data_path)
with open(data_path, "r", encoding="utf-8") as f:
lines = f.readlines()
return self.parse(lines)
data = self.parse(lines)
return self.convert(data)

@staticmethod
def parse(lines):
@@ -166,16 +269,19 @@ class ClassDataSetLoader(DataSetLoader):
dataset.append(sentence)
return dataset

def convert(self, data):
return convert_seq2tag_dataset(data)


@DataSet.set_reader('read_conll')
class ConllLoader(DataSetLoader):
"""loader for conll format files"""

def __int__(self, data_path):
def __init__(self):
"""
:param str data_path: the path to the conll data set
"""
super(ConllLoader, self).__init__()
self.data_set = self.parse(self.load(data_path))

def load(self, data_path):
"""
@@ -183,7 +289,8 @@ class ConllLoader(DataSetLoader):
"""
with open(data_path, "r", encoding="utf-8") as f:
lines = f.readlines()
return lines
data = self.parse(lines)
return self.convert(data)

@staticmethod
def parse(lines):
@@ -204,7 +311,11 @@ class ConllLoader(DataSetLoader):
tokens.append(line.split())
return sentences

def convert(self, data):
pass


@DataSet.set_reader('read_lm')
class LMDataSetLoader(DataSetLoader):
"""Language Model Dataset Loader

@@ -222,7 +333,8 @@ class LMDataSetLoader(DataSetLoader):
with open(data_path, "r", encoding="utf=8") as f:
text = " ".join(f.readlines())
tokens = text.strip().split()
return self.sentence_cut(tokens)
data = self.sentence_cut(tokens)
return self.convert(data)

def sentence_cut(self, tokens, sentence_length=15):
start_idx = 0
@@ -236,7 +348,11 @@ class LMDataSetLoader(DataSetLoader):
data_set.append([x, y])
return data_set

def convert(self, data):
pass


@DataSet.set_reader('read_people_daily')
class PeopleDailyCorpusLoader(DataSetLoader):
"""
People Daily Corpus: Chinese word segmentation, POS tag, NER
@@ -286,3 +402,74 @@ class PeopleDailyCorpusLoader(DataSetLoader):
ner_examples.append([sent_words, sent_ner])
return pos_tag_examples, ner_examples

def convert(self, data):
pass


class SNLIDataSetLoader(DataSetLoader):
"""A data set loader for SNLI data set.

"""

def __init__(self):
super(SNLIDataSetLoader, self).__init__()

def load(self, path_list):
"""

:param path_list: A list of file name, in the order of premise file, hypothesis file, and label file.
:return: data_set: A DataSet object.
"""
assert len(path_list) == 3
line_set = []
for file in path_list:
if not os.path.exists(file):
raise FileNotFoundError("file {} NOT found".format(file))

with open(file, 'r', encoding='utf-8') as f:
lines = f.readlines()
line_set.append(lines)

premise_lines, hypothesis_lines, label_lines = line_set
assert len(premise_lines) == len(hypothesis_lines) and len(premise_lines) == len(label_lines)

data_set = []
for premise, hypothesis, label in zip(premise_lines, hypothesis_lines, label_lines):
p = premise.strip().split()
h = hypothesis.strip().split()
l = label.strip()
data_set.append([p, h, l])

return self.convert(data_set)

def convert(self, data):
"""Convert a 3D list to a DataSet object.

:param data: A 3D tensor.
[
[ [premise_word_11, premise_word_12, ...], [hypothesis_word_11, hypothesis_word_12, ...], [label_1] ],
[ [premise_word_21, premise_word_22, ...], [hypothesis_word_21, hypothesis_word_22, ...], [label_2] ],
...
]
:return: data_set: A DataSet object.
"""

data_set = DataSet()

for example in data:
p, h, l = example
# list, list, str
x1 = TextField(p, is_target=False)
x2 = TextField(h, is_target=False)
x1_len = TextField([1] * len(p), is_target=False)
x2_len = TextField([1] * len(h), is_target=False)
y = LabelField(l, is_target=True)
instance = Instance()
instance.add_field("premise", x1)
instance.add_field("hypothesis", x2)
instance.add_field("premise_len", x1_len)
instance.add_field("hypothesis_len", x2_len)
instance.add_field("truth", y)
data_set.append(instance)

return data_set
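
dataset_loader.py now owns the list-to-DataSet conversion helpers, and @DataSet.set_reader exposes each loader as a read_* method on DataSet itself. A minimal sketch; the file paths are placeholders, not part of this commit:

from fastNLP.core.dataset import DataSet
from fastNLP.loader.dataset_loader import SNLIDataSetLoader, convert_seq2seq_dataset

# direct conversion of already-parsed [word_seq, label_seq] pairs
ds = convert_seq2seq_dataset([[["Jack", "London"], ["B-PER", "I-PER"]]])

# loaders registered with @DataSet.set_reader('read_pos') are reachable through DataSet
pos_data = DataSet().read_pos("path/to/pos_corpus.txt")                               # placeholder path

# the SNLI loader takes the premise, hypothesis and label files in that order
snli_data = SNLIDataSetLoader().load(["premise.txt", "hypothesis.txt", "label.txt"])  # placeholder files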

+61 -26 fastNLP/loader/embed_loader.py

@@ -1,50 +1,85 @@
import _pickle
import os

import numpy as np
import torch

from fastNLP.loader.base_loader import BaseLoader
from fastNLP.core.vocabulary import Vocabulary


class EmbedLoader(BaseLoader):
"""docstring for EmbedLoader"""

def __init__(self, data_path):
super(EmbedLoader, self).__init__(data_path)
def __init__(self):
super(EmbedLoader, self).__init__()

@staticmethod
def load_embedding(emb_dim, emb_file, word_dict, emb_pkl):
def _load_glove(emb_file):
"""Read file as a glove embedding

file format:
embeddings are split by line,
for one embedding, word and numbers split by space
Example::

word_1 float_1 float_2 ... float_emb_dim
word_2 float_1 float_2 ... float_emb_dim
...
"""
emb = {}
with open(emb_file, 'r', encoding='utf-8') as f:
for line in f:
line = list(filter(lambda w: len(w)>0, line.strip().split(' ')))
if len(line) > 0:
emb[line[0]] = torch.Tensor(list(map(float, line[1:])))
return emb
@staticmethod
def _load_pretrain(emb_file, emb_type):
"""Read txt data from embedding file and convert to np.array as pre-trained embedding

:param emb_file: str, the pre-trained embedding file path
:param emb_type: str, the pre-trained embedding data format
:return dict: {str: np.array}
"""
if emb_type == 'glove':
return EmbedLoader._load_glove(emb_file)
else:
raise Exception("embedding type {} not support yet".format(emb_type))

@staticmethod
def load_embedding(emb_dim, emb_file, emb_type, vocab, emb_pkl):
"""Load the pre-trained embedding and combine with the given dictionary.

:param emb_file: str, the pre-trained embedding.
The embedding file should have the following format:
Each line is a word embedding, where a word string is followed by multiple floats.
Floats are separated by space. The word and the first float are separated by space.
:param word_dict: dict, a mapping from word to index.
:param emb_dim: int, the dimension of the embedding. Should be the same as pre-trained embedding.
:param emb_file: str, the pre-trained embedding file path.
:param emb_type: str, the pre-trained embedding format, support glove now
:param vocab: Vocabulary, a mapping from word to index, can be provided by user or built from pre-trained embedding
:param emb_pkl: str, the embedding pickle file.
:return embedding_np: numpy array of shape (len(word_dict), emb_dim)

:return embedding_tensor: Tensor of shape (len(word_dict), emb_dim)
vocab: input vocab or vocab built by pre-train
TODO: fragile code
"""
# If the embedding pickle exists, load it and return.
if os.path.exists(emb_pkl):
with open(emb_pkl, "rb") as f:
embedding_np = _pickle.load(f)
return embedding_np
embedding_tensor, vocab = _pickle.load(f)
return embedding_tensor, vocab
# Otherwise, load the pre-trained embedding.
with open(emb_file, "r", encoding="utf-8") as f:
# begin with a random embedding
embedding_np = np.random.uniform(-1, 1, size=(len(word_dict), emb_dim))
for line in f:
line = line.strip().split()
if len(line) != emb_dim + 1:
# skip this line if two embedding dimension not match
continue
if line[0] in word_dict:
# find the word and replace its embedding with a pre-trained one
embedding_np[word_dict[line[0]]] = [float(i) for i in line[1:]]
pretrain = EmbedLoader._load_pretrain(emb_file, emb_type)
if vocab is None:
# build vocabulary from pre-trained embedding
vocab = Vocabulary()
for w in pretrain.keys():
vocab.update(w)
embedding_tensor = torch.randn(len(vocab), emb_dim)
for w, v in pretrain.items():
if len(v.shape) > 1 or emb_dim != v.shape[0]:
raise ValueError('pretrian embedding dim is {}, dismatching required {}'.format(v.shape, (emb_dim,)))
if vocab.has_word(w):
embedding_tensor[vocab[w]] = v

# save and return the result
with open(emb_pkl, "wb") as f:
_pickle.dump(embedding_np, f)
return embedding_np
_pickle.dump((embedding_tensor, vocab), f)
return embedding_tensor, vocab
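
EmbedLoader.load_embedding now takes an embedding format and a Vocabulary (or None to build one from the pre-trained file) and returns a (tensor, vocab) pair, caching both in emb_pkl. A usage sketch with placeholder file names:

from fastNLP.core.vocabulary import Vocabulary
from fastNLP.loader.embed_loader import EmbedLoader

vocab = Vocabulary()
vocab.update(["the", "quick", "fox"])
embedding, vocab = EmbedLoader.load_embedding(
    emb_dim=50,
    emb_file="glove.6B.50d.txt",      # placeholder path to a GloVe-format text file
    emb_type="glove",
    vocab=vocab,                      # pass None to build the vocabulary from the embedding file
    emb_pkl="embedding_cache.pkl",    # written on the first call, reused afterwards
)
print(embedding.shape)                # (len(vocab), 50); words missing from the file keep random vectors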

+5 -65 fastNLP/models/base_model.py

@@ -1,5 +1,7 @@
import torch

from fastNLP.core.trainer import Trainer


class BaseModel(torch.nn.Module):
"""Base PyTorch model for all models.
@@ -8,68 +10,6 @@ class BaseModel(torch.nn.Module):
def __init__(self):
super(BaseModel, self).__init__()


class Vocabulary(object):
"""A look-up table that allows you to access `Lexeme` objects. The `Vocab`
instance also provides access to the `StringStore`, and owns underlying
data that is shared between `Doc` objects.
"""

def __init__(self):
"""Create the vocabulary.
RETURNS (Vocab): The newly constructed object.
"""
self.data_frame = None


class Document(object):
"""A sequence of Token objects. Access sentences and named entities, export
annotations to numpy arrays, losslessly serialize to compressed binary
strings. The `Doc` object holds an array of `Token` objects. The
Python-level `Token` and `Span` objects are views of this array, i.e.
they don't own the data themselves. -- spacy
"""

def __init__(self, vocab, words=None, spaces=None):
"""Create a Doc object.
vocab (Vocab): A vocabulary object, which must match any models you
want to use (e.g. tokenizer, parser, entity recognizer).
words (list or None): A list of unicode strings, to add to the document
as words. If `None`, defaults to empty list.
spaces (list or None): A list of boolean values, of the same length as
words. True means that the word is followed by a space, False means
it is not. If `None`, defaults to `[True]*len(words)`
user_data (dict or None): Optional extra data to attach to the Doc.
RETURNS (Doc): The newly constructed object.
"""
self.vocab = vocab
self.spaces = spaces
self.words = words
if spaces is None:
self.spaces = [True] * len(self.words)
elif len(spaces) != len(self.words):
raise ValueError("dismatch spaces and words")

def get_chunker(self, vocab):
return None

def push_back(self, vocab):
pass


class Token(object):
"""An individual token – i.e. a word, punctuation symbol, whitespace,
etc.
"""

def __init__(self, vocab, doc, offset):
"""Construct a `Token` object.
vocab (Vocabulary): A storage container for lexical types.
doc (Document): The parent document.
offset (int): The index of the token within the document.
"""
self.vocab = vocab
self.doc = doc
self.token = doc[offset]
self.i = offset

def fit(self, train_data, dev_data=None, **train_args):
trainer = Trainer(**train_args)
trainer.train(self, train_data, dev_data)

+364 -0 fastNLP/models/biaffine_parser.py

@@ -0,0 +1,364 @@
import sys, os
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
import copy
import numpy as np
import torch
from collections import defaultdict
from torch import nn
from torch.nn import functional as F
from fastNLP.modules.utils import initial_parameter
from fastNLP.modules.encoder.variational_rnn import VarLSTM
from fastNLP.modules.dropout import TimestepDropout

def mst(scores):
"""
with some modification to support parser output for MST decoding
https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/models/nn.py#L692
"""
length = scores.shape[0]
min_score = -np.inf
mask = np.zeros((length, length))
np.fill_diagonal(mask, -np.inf)
scores = scores + mask
heads = np.argmax(scores, axis=1)
heads[0] = 0
tokens = np.arange(1, length)
roots = np.where(heads[tokens] == 0)[0] + 1
if len(roots) < 1:
root_scores = scores[tokens, 0]
head_scores = scores[tokens, heads[tokens]]
new_root = tokens[np.argmax(root_scores / head_scores)]
heads[new_root] = 0
elif len(roots) > 1:
root_scores = scores[roots, 0]
scores[roots, 0] = 0
new_heads = np.argmax(scores[roots][:, tokens], axis=1) + 1
new_root = roots[np.argmin(
scores[roots, new_heads] / root_scores)]
heads[roots] = new_heads
heads[new_root] = 0

edges = defaultdict(set)
vertices = set((0,))
for dep, head in enumerate(heads[tokens]):
vertices.add(dep + 1)
edges[head].add(dep + 1)
for cycle in _find_cycle(vertices, edges):
dependents = set()
to_visit = set(cycle)
while len(to_visit) > 0:
node = to_visit.pop()
if node not in dependents:
dependents.add(node)
to_visit.update(edges[node])
cycle = np.array(list(cycle))
old_heads = heads[cycle]
old_scores = scores[cycle, old_heads]
non_heads = np.array(list(dependents))
scores[np.repeat(cycle, len(non_heads)),
np.repeat([non_heads], len(cycle), axis=0).flatten()] = min_score
new_heads = np.argmax(scores[cycle][:, tokens], axis=1) + 1
new_scores = scores[cycle, new_heads] / old_scores
change = np.argmax(new_scores)
changed_cycle = cycle[change]
old_head = old_heads[change]
new_head = new_heads[change]
heads[changed_cycle] = new_head
edges[new_head].add(changed_cycle)
edges[old_head].remove(changed_cycle)

return heads


def _find_cycle(vertices, edges):
"""
https://en.wikipedia.org/wiki/Tarjan%27s_strongly_connected_components_algorithm
https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/etc/tarjan.py
"""
_index = 0
_stack = []
_indices = {}
_lowlinks = {}
_onstack = defaultdict(lambda: False)
_SCCs = []

def _strongconnect(v):
nonlocal _index
_indices[v] = _index
_lowlinks[v] = _index
_index += 1
_stack.append(v)
_onstack[v] = True

for w in edges[v]:
if w not in _indices:
_strongconnect(w)
_lowlinks[v] = min(_lowlinks[v], _lowlinks[w])
elif _onstack[w]:
_lowlinks[v] = min(_lowlinks[v], _indices[w])

if _lowlinks[v] == _indices[v]:
SCC = set()
while True:
w = _stack.pop()
_onstack[w] = False
SCC.add(w)
if w == v:
break
_SCCs.append(SCC)

for v in vertices:
if v not in _indices:
_strongconnect(v)

return [SCC for SCC in _SCCs if len(SCC) > 1]

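For reference, a minimal sketch (not part of the diff, assuming this PR's fastNLP is importable) of how `mst` is meant to be called: the input is a square score matrix where entry `[dep, head]` scores `head` as the parent of `dep`, row/column 0 standing for the artificial root, and the return value is the decoded head index of every position.

import numpy as np
from fastNLP.models.biaffine_parser import mst  # module added in this PR

# scores[dep, head]; position 0 is the artificial <root>
scores = np.array([
    [0.0, 0.0, 0.0, 0.0],   # row 0 (<root>) is ignored by the decoder
    [9.0, 0.0, 1.0, 1.0],   # token 1 -> <root>
    [1.0, 8.0, 0.0, 1.0],   # token 2 -> token 1
    [1.0, 7.0, 1.0, 0.0],   # token 3 -> token 1
])
print(mst(scores))          # expected: [0 0 1 1]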

class GraphParser(nn.Module):
"""Graph based Parser helper class, support greedy decoding and MST(Maximum Spanning Tree) decoding
"""
def __init__(self):
super(GraphParser, self).__init__()

def forward(self, x):
raise NotImplementedError

def _greedy_decoder(self, arc_matrix, seq_mask=None):
_, seq_len, _ = arc_matrix.shape
matrix = arc_matrix + torch.diag(arc_matrix.new(seq_len).fill_(-np.inf))
_, heads = torch.max(matrix, dim=2)
if seq_mask is not None:
heads *= seq_mask.long()
return heads

def _mst_decoder(self, arc_matrix, seq_mask=None):
batch_size, seq_len, _ = arc_matrix.shape
matrix = torch.zeros_like(arc_matrix).copy_(arc_matrix)
ans = matrix.new_zeros(batch_size, seq_len).long()
for i, graph in enumerate(matrix):
ans[i] = torch.as_tensor(mst(graph.cpu().numpy()), device=ans.device)
if seq_mask is not None:
ans *= seq_mask.long()
return ans


class ArcBiaffine(nn.Module):
"""helper module for Biaffine Dependency Parser predicting arc
"""
def __init__(self, hidden_size, bias=True):
super(ArcBiaffine, self).__init__()
self.U = nn.Parameter(torch.Tensor(hidden_size, hidden_size), requires_grad=True)
self.has_bias = bias
if self.has_bias:
self.bias = nn.Parameter(torch.Tensor(hidden_size), requires_grad=True)
else:
self.register_parameter("bias", None)
initial_parameter(self)

def forward(self, head, dep):
"""
:param head: arc-head tensor, shape [batch, length, emb_dim]
:param dep: arc-dependent tensor, shape [batch, length, emb_dim]

:return: output tensor, shape [batch, length, length]
"""
output = dep.matmul(self.U)
output = output.bmm(head.transpose(-1, -2))
if self.has_bias:
output += head.matmul(self.bias).unsqueeze(1)
return output

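A plain-torch sketch (illustrative only; the tensor sizes below are made up) of the score that ArcBiaffine.forward computes: s[b, i, j] = dep[b, i] · U · head[b, j] + head[b, j] · bias, giving one [length, length] arc score matrix per batch element.

import torch

B, L, H = 2, 5, 4                     # hypothetical batch, length, hidden sizes
U = torch.randn(H, H)
bias = torch.randn(H)
head = torch.randn(B, L, H)
dep = torch.randn(B, L, H)
# same computation as ArcBiaffine.forward above
scores = dep.matmul(U).bmm(head.transpose(-1, -2)) + head.matmul(bias).unsqueeze(1)
print(scores.shape)                   # torch.Size([2, 5, 5])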

class LabelBilinear(nn.Module):
"""helper module for Biaffine Dependency Parser predicting label
"""
def __init__(self, in1_features, in2_features, num_label, bias=True):
super(LabelBilinear, self).__init__()
self.bilinear = nn.Bilinear(in1_features, in2_features, num_label, bias=bias)
self.lin1 = nn.Linear(in1_features, num_label, bias=False)
self.lin2 = nn.Linear(in2_features, num_label, bias=False)

def forward(self, x1, x2):
output = self.bilinear(x1, x2)
output += self.lin1(x1) + self.lin2(x2)
return output


class BiaffineParser(GraphParser):
"""Biaffine Dependency Parser implemantation.
refer to ` Deep Biaffine Attention for Neural Dependency Parsing (Dozat and Manning, 2016)
<https://arxiv.org/abs/1611.01734>`_ .
"""
def __init__(self,
word_vocab_size,
word_emb_dim,
pos_vocab_size,
pos_emb_dim,
rnn_layers,
rnn_hidden_size,
arc_mlp_size,
label_mlp_size,
num_label,
dropout,
use_var_lstm=False,
use_greedy_infer=False):

super(BiaffineParser, self).__init__()
self.word_embedding = nn.Embedding(num_embeddings=word_vocab_size, embedding_dim=word_emb_dim)
self.pos_embedding = nn.Embedding(num_embeddings=pos_vocab_size, embedding_dim=pos_emb_dim)
if use_var_lstm:
self.lstm = VarLSTM(input_size=word_emb_dim + pos_emb_dim,
hidden_size=rnn_hidden_size,
num_layers=rnn_layers,
bias=True,
batch_first=True,
input_dropout=dropout,
hidden_dropout=dropout,
bidirectional=True)
else:
self.lstm = nn.LSTM(input_size=word_emb_dim + pos_emb_dim,
hidden_size=rnn_hidden_size,
num_layers=rnn_layers,
bias=True,
batch_first=True,
dropout=dropout,
bidirectional=True)

rnn_out_size = 2 * rnn_hidden_size
self.arc_head_mlp = nn.Sequential(nn.Linear(rnn_out_size, arc_mlp_size),
nn.ELU())
self.arc_dep_mlp = copy.deepcopy(self.arc_head_mlp)
self.label_head_mlp = nn.Sequential(nn.Linear(rnn_out_size, label_mlp_size),
nn.ELU())
self.label_dep_mlp = copy.deepcopy(self.label_head_mlp)
self.arc_predictor = ArcBiaffine(arc_mlp_size, bias=True)
self.label_predictor = LabelBilinear(label_mlp_size, label_mlp_size, num_label, bias=True)
self.normal_dropout = nn.Dropout(p=dropout)
self.timestep_dropout = TimestepDropout(p=dropout)
self.use_greedy_infer = use_greedy_infer
initial_parameter(self)

def forward(self, word_seq, pos_seq, seq_mask, gold_heads=None, **_):
"""
:param word_seq: [batch_size, seq_len] sequence of word indices
:param pos_seq: [batch_size, seq_len] sequence of POS tag indices
:param seq_mask: [batch_size, seq_len] sequence of length masks
:param gold_heads: [batch_size, seq_len] sequence of golden heads
:return dict: parsing results
arc_pred: [batch_size, seq_len, seq_len]
label_pred: [batch_size, seq_len, seq_len]
seq_mask: [batch_size, seq_len]
head_pred: [batch_size, seq_len], the predicted heads (only returned when gold_heads is not provided)
"""
# prepare embeddings
batch_size, seq_len = word_seq.shape
# print('forward {} {}'.format(batch_size, seq_len))
batch_range = torch.arange(start=0, end=batch_size, dtype=torch.long, device=word_seq.device).unsqueeze(1)

# get sequence mask
seq_mask = seq_mask.long()

word = self.normal_dropout(self.word_embedding(word_seq)) # [N,L] -> [N,L,C_0]
pos = self.normal_dropout(self.pos_embedding(pos_seq)) # [N,L] -> [N,L,C_1]
x = torch.cat([word, pos], dim=2) # -> [N,L,C]

# lstm, extract features
feat, _ = self.lstm(x) # -> [N,L,C]

# for arc biaffine
# mlp, reduce dim
arc_dep = self.timestep_dropout(self.arc_dep_mlp(feat))
arc_head = self.timestep_dropout(self.arc_head_mlp(feat))
label_dep = self.timestep_dropout(self.label_dep_mlp(feat))
label_head = self.timestep_dropout(self.label_head_mlp(feat))

# biaffine arc classifier
arc_pred = self.arc_predictor(arc_head, arc_dep) # [N, L, L]
flip_mask = (seq_mask == 0)
arc_pred.masked_fill_(flip_mask.unsqueeze(1), -np.inf)

# use gold or predicted arc to predict label
if gold_heads is None:
# use greedy decoding in training
if self.training or self.use_greedy_infer:
heads = self._greedy_decoder(arc_pred, seq_mask)
else:
heads = self._mst_decoder(arc_pred, seq_mask)
head_pred = heads
else:
head_pred = None
heads = gold_heads

label_head = label_head[batch_range, heads].contiguous()
label_pred = self.label_predictor(label_head, label_dep) # [N, L, num_label]
res_dict = {'arc_pred': arc_pred, 'label_pred': label_pred, 'seq_mask': seq_mask}
if head_pred is not None:
res_dict['head_pred'] = head_pred
return res_dict

def loss(self, arc_pred, label_pred, head_indices, head_labels, seq_mask, **_):
"""
Compute loss.

:param arc_pred: [batch_size, seq_len, seq_len]
:param label_pred: [batch_size, seq_len, seq_len]
:param head_indices: [batch_size, seq_len]
:param head_labels: [batch_size, seq_len]
:param seq_mask: [batch_size, seq_len]
:return: loss value, the arc and label negative log-likelihood averaged over all non-root, non-padding tokens
"""

batch_size, seq_len, _ = arc_pred.shape
arc_logits = F.log_softmax(arc_pred, dim=2)
label_logits = F.log_softmax(label_pred, dim=2)
batch_index = torch.arange(start=0, end=batch_size, device=arc_logits.device).long().unsqueeze(1)
child_index = torch.arange(start=0, end=seq_len, device=arc_logits.device).long().unsqueeze(0)
arc_loss = arc_logits[batch_index, child_index, head_indices]
label_loss = label_logits[batch_index, child_index, head_labels]

arc_loss = arc_loss[:, 1:]
label_loss = label_loss[:, 1:]

float_mask = seq_mask[:, 1:].float()
length = (seq_mask.sum() - batch_size).float()
arc_nll = -(arc_loss*float_mask).sum() / length
label_nll = -(label_loss*float_mask).sum() / length
return arc_nll + label_nll

def evaluate(self, arc_pred, label_pred, head_indices, head_labels, seq_mask, **kwargs):
"""
Evaluate the performance of prediction.

:return dict: performance results.
head_pred_correct: number of correctly predicted heads.
label_pred_correct: number of correctly predicted labels.
total_tokens: number of predicted tokens.
"""
if 'head_pred' in kwargs:
head_pred = kwargs['head_pred']
elif self.use_greedy_infer:
head_pred = self._greedy_decoder(arc_pred, seq_mask)
else:
head_pred = self._mst_decoder(arc_pred, seq_mask)

head_pred_correct = (head_pred == head_indices).long() * seq_mask
_, label_preds = torch.max(label_pred, dim=2)
label_pred_correct = (label_preds == head_labels).long() * head_pred_correct
return {"head_pred_correct": head_pred_correct.sum(dim=1),
"label_pred_correct": label_pred_correct.sum(dim=1),
"total_tokens": seq_mask.sum(dim=1)}

def metrics(self, head_pred_correct, label_pred_correct, total_tokens, **_):
"""
Compute the metrics of the model.

:param head_pred_correct: number of correctly predicted heads.
:param label_pred_correct: number of correctly predicted labels.
:param total_tokens: number of predicted tokens.
:return dict: the metric results
    UAS: head prediction accuracy (unlabeled attachment score)
    LAS: label prediction accuracy (labeled attachment score)
"""
return {"UAS": head_pred_correct.sum().float() / total_tokens.sum().float() * 100,
"LAS": label_pred_correct.sum().float() / total_tokens.sum().float() * 100}

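To make the new model's interface concrete, here is a hedged end-to-end sketch (the sizes below are made up; in the reproduction script they come from cfg.cfg and the built vocabularies):

import torch
from fastNLP.models.biaffine_parser import BiaffineParser

model = BiaffineParser(word_vocab_size=100, word_emb_dim=100,
                       pos_vocab_size=20, pos_emb_dim=100,
                       rnn_layers=1, rnn_hidden_size=200,
                       arc_mlp_size=500, label_mlp_size=100,
                       num_label=40, dropout=0.33,
                       use_var_lstm=False, use_greedy_infer=True)

word_seq = torch.randint(0, 100, (2, 7))        # [batch_size, seq_len] word indices
pos_seq = torch.randint(0, 20, (2, 7))          # [batch_size, seq_len] POS indices
seq_mask = torch.ones(2, 7, dtype=torch.long)   # 1 = real token, 0 = padding
out = model(word_seq, pos_seq, seq_mask)        # no gold_heads -> 'head_pred' is returned too
print(out['arc_pred'].shape, out['label_pred'].shape, out['head_pred'].shape)
# torch.Size([2, 7, 7]) torch.Size([2, 7, 40]) torch.Size([2, 7])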

+ 1
- 1
fastNLP/models/char_language_model.py View File

@@ -103,7 +103,7 @@ class CharLM(nn.Module):
x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1)
# [num_seq, seq_len, total_num_filters]

x, hidden = self.lstm(x)
x = self.lstm(x)
# [seq_len, num_seq, hidden_size]

x = self.dropout(x)


+ 161
- 0
fastNLP/models/snli.py View File

@@ -0,0 +1,161 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from fastNLP.models.base_model import BaseModel
from fastNLP.modules import decoder as Decoder, encoder as Encoder


my_inf = 10e12


class SNLI(BaseModel):
"""
PyTorch Network for SNLI.
"""

def __init__(self, args, init_embedding=None):
super(SNLI, self).__init__()
self.vocab_size = args["vocab_size"]
self.embed_dim = args["embed_dim"]
self.hidden_size = args["hidden_size"]
self.batch_first = args["batch_first"]
self.dropout = args["dropout"]
self.n_labels = args["num_classes"]
self.gpu = args["gpu"] and torch.cuda.is_available()

self.embedding = Encoder.embedding.Embedding(self.vocab_size, self.embed_dim, init_emb=init_embedding,
dropout=self.dropout)

self.embedding_layer = Encoder.Linear(self.embed_dim, self.hidden_size)

self.encoder = Encoder.LSTM(
input_size=self.embed_dim, hidden_size=self.hidden_size, num_layers=1, bias=True,
batch_first=self.batch_first, bidirectional=True
)

self.inference_layer = Encoder.Linear(self.hidden_size * 4, self.hidden_size)

self.decoder = Encoder.LSTM(
input_size=self.hidden_size, hidden_size=self.hidden_size, num_layers=1, bias=True,
batch_first=self.batch_first, bidirectional=True
)

self.output = Decoder.MLP([4 * self.hidden_size, self.hidden_size, self.n_labels], 'tanh')

def forward(self, premise, hypothesis, premise_len, hypothesis_len):
""" Forward function

:param premise: A Tensor of word indices of the premise: [batch size(B), premise seq len(PL)].
:param hypothesis: A Tensor of word indices of the hypothesis: [B, hypothesis seq len(HL)].
:param premise_len: A Tensor marking which positions in the premise are real words and which are padding: [B, PL].
:param hypothesis_len: A Tensor marking which positions in the hypothesis are real words and which are padding: [B, HL].
:return: prediction: A Tensor with the classification result: [B, n_labels(N)].
"""

premise0 = self.embedding_layer(self.embedding(premise))
hypothesis0 = self.embedding_layer(self.embedding(hypothesis))

_BP, _PSL, _HP = premise0.size()
_BH, _HSL, _HH = hypothesis0.size()
_BPL, _PLL = premise_len.size()
_HPL, _HLL = hypothesis_len.size()

assert _BP == _BH and _BPL == _HPL and _BP == _BPL
assert _HP == _HH
assert _PSL == _PLL and _HSL == _HLL

B, PL, H = premise0.size()
B, HL, H = hypothesis0.size()

# a0, (ah0, ac0) = self.encoder(premise) # a0: [B, PL, H * 2], ah0: [2, B, H]
# b0, (bh0, bc0) = self.encoder(hypothesis) # b0: [B, HL, H * 2]

a0 = self.encoder(premise0) # a0: [B, PL, H * 2]
b0 = self.encoder(hypothesis0) # b0: [B, HL, H * 2]

a = torch.mean(a0.view(B, PL, -1, H), dim=2) # a: [B, PL, H]
b = torch.mean(b0.view(B, HL, -1, H), dim=2) # b: [B, HL, H]

ai, bi = self.calc_bi_attention(a, b, premise_len, hypothesis_len)

ma = torch.cat((a, ai, a - ai, a * ai), dim=2) # ma: [B, PL, 4 * H]
mb = torch.cat((b, bi, b - bi, b * bi), dim=2) # mb: [B, HL, 4 * H]

f_ma = self.inference_layer(ma)
f_mb = self.inference_layer(mb)

vat = self.decoder(f_ma)
vbt = self.decoder(f_mb)

va = torch.mean(vat.view(B, PL, -1, H), dim=2) # va: [B, PL, H]
vb = torch.mean(vbt.view(B, HL, -1, H), dim=2) # vb: [B, HL, H]

# va_ave = torch.mean(va, dim=1) # va_ave: [B, H]
# va_max, va_arg_max = torch.max(va, dim=1) # va_max: [B, H]
# vb_ave = torch.mean(vb, dim=1) # vb_ave: [B, H]
# vb_max, vb_arg_max = torch.max(vb, dim=1) # vb_max: [B, H]

va_ave = self.mean_pooling(va, premise_len, dim=1) # va_ave: [B, H]
va_max, va_arg_max = self.max_pooling(va, premise_len, dim=1) # va_max: [B, H]
vb_ave = self.mean_pooling(vb, hypothesis_len, dim=1) # vb_ave: [B, H]
vb_max, vb_arg_max = self.max_pooling(vb, hypothesis_len, dim=1) # vb_max: [B, H]

v = torch.cat((va_ave, va_max, vb_ave, vb_max), dim=1) # v: [B, 4 * H]

# v_mlp = F.tanh(self.mlp_layer1(v)) # v_mlp: [B, H]
# prediction = self.mlp_layer2(v_mlp) # prediction: [B, N]

prediction = F.tanh(self.output(v)) # prediction: [B, N]

return prediction

@staticmethod
def calc_bi_attention(in_x1, in_x2, x1_len, x2_len):

# in_x1: [batch_size, x1_seq_len, hidden_size]
# in_x2: [batch_size, x2_seq_len, hidden_size]
# x1_len: [batch_size, x1_seq_len]
# x2_len: [batch_size, x2_seq_len]

assert in_x1.size()[0] == in_x2.size()[0]
assert in_x1.size()[2] == in_x2.size()[2]
# The batch size and hidden size must be equal.
assert in_x1.size()[1] == x1_len.size()[1] and in_x2.size()[1] == x2_len.size()[1]
# The seq len in in_x and x_len must be equal.
assert in_x1.size()[0] == x1_len.size()[0] and x1_len.size()[0] == x2_len.size()[0]

batch_size = in_x1.size()[0]
x1_max_len = in_x1.size()[1]
x2_max_len = in_x2.size()[1]

in_x2_t = torch.transpose(in_x2, 1, 2) # [batch_size, hidden_size, x2_seq_len]

attention_matrix = torch.bmm(in_x1, in_x2_t) # [batch_size, x1_seq_len, x2_seq_len]

a_mask = x1_len.le(0.5).float() * -my_inf # [batch_size, x1_seq_len]
a_mask = a_mask.view(batch_size, x1_max_len, -1)
a_mask = a_mask.expand(-1, -1, x2_max_len) # [batch_size, x1_seq_len, x2_seq_len]
b_mask = x2_len.le(0.5).float() * -my_inf
b_mask = b_mask.view(batch_size, -1, x2_max_len)
b_mask = b_mask.expand(-1, x1_max_len, -1) # [batch_size, x1_seq_len, x2_seq_len]

attention_a = F.softmax(attention_matrix + a_mask, dim=2) # [batch_size, x1_seq_len, x2_seq_len]
attention_b = F.softmax(attention_matrix + b_mask, dim=1) # [batch_size, x1_seq_len, x2_seq_len]

out_x1 = torch.bmm(attention_a, in_x2) # [batch_size, x1_seq_len, hidden_size]
attention_b_t = torch.transpose(attention_b, 1, 2)
out_x2 = torch.bmm(attention_b_t, in_x1) # [batch_size, x2_seq_len, hidden_size]

return out_x1, out_x2

@staticmethod
def mean_pooling(tensor, mask, dim=0):
masks = mask.view(mask.size(0), mask.size(1), -1).float()
return torch.sum(tensor * masks, dim=dim) / torch.sum(masks, dim=1)

@staticmethod
def max_pooling(tensor, mask, dim=0):
masks = mask.view(mask.size(0), mask.size(1), -1)
masks = masks.expand(-1, -1, tensor.size(2)).float()
return torch.max(tensor + masks.le(0.5).float() * -my_inf, dim=dim)

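calc_bi_attention is a static method, so it can be exercised on its own; a small shape check (tensor sizes are illustrative, and the length tensors are assumed to be 0/1 masks):

import torch
from fastNLP.models.snli import SNLI

a = torch.randn(2, 4, 8)          # premise states    [B, PL, H]
b = torch.randn(2, 6, 8)          # hypothesis states [B, HL, H]
a_len = torch.ones(2, 4)          # 1 = real token, 0 = padding
b_len = torch.ones(2, 6)
ai, bi = SNLI.calc_bi_attention(a, b, a_len, b_len)
print(ai.shape, bi.shape)         # torch.Size([2, 4, 8]) torch.Size([2, 6, 8])
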
+ 10
- 9
fastNLP/modules/decoder/MLP.py View File

@@ -1,12 +1,15 @@
import torch
import torch.nn as nn
from fastNLP.modules.utils import initial_parameter


class MLP(nn.Module):
def __init__(self, size_layer, activation='relu' , initial_method = None):
def __init__(self, size_layer, activation='relu', initial_method=None):
"""Multilayer Perceptrons as a decoder

:param size_layer: list of int, define the size of MLP layers
:param activation: str or function, the activation function for hidden layers
:param size_layer: list of int, define the size of MLP layers.
:param activation: str or function, the activation function for hidden layers.
:param initial_method: str, the name of init method.

.. note::
There is no activation function applied to the output layer.
@@ -23,7 +26,7 @@ class MLP(nn.Module):

actives = {
'relu': nn.ReLU(),
'tanh': nn.Tanh()
'tanh': nn.Tanh(),
}
if activation in actives:
self.hidden_active = actives[activation]
@@ -31,7 +34,7 @@ class MLP(nn.Module):
self.hidden_active = activation
else:
raise ValueError("should set activation correctly: {}".format(activation))
initial_parameter(self, initial_method )
initial_parameter(self, initial_method)

def forward(self, x):
for layer in self.hiddens:
@@ -40,13 +43,11 @@ class MLP(nn.Module):
return x



if __name__ == '__main__':
net1 = MLP([5,10,5])
net2 = MLP([5,10,5], 'tanh')
net1 = MLP([5, 10, 5])
net2 = MLP([5, 10, 5], 'tanh')
for net in [net1, net2]:
x = torch.randn(5, 5)
y = net(x)
print(x)
print(y)

+ 15
- 0
fastNLP/modules/dropout.py View File

@@ -0,0 +1,15 @@
import torch

class TimestepDropout(torch.nn.Dropout):
"""This module accepts a `[batch_size, num_timesteps, embedding_dim)]` and use a single
dropout mask of shape `(batch_size, embedding_dim)` to apply on every time step.
"""
def forward(self, x):
dropout_mask = x.new_ones(x.shape[0], x.shape[-1])
torch.nn.functional.dropout(dropout_mask, self.p, self.training, inplace=True)
dropout_mask = dropout_mask.unsqueeze(1) # [batch_size, 1, embedding_dim]
if self.inplace:
x *= dropout_mask
return
else:
return x * dropout_mask

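A quick check (illustrative) that TimestepDropout shares one mask across time steps, unlike ordinary nn.Dropout:

import torch
from fastNLP.modules.dropout import TimestepDropout

drop = TimestepDropout(p=0.5)
drop.train()                               # dropout is only active in training mode
x = torch.ones(2, 4, 6)                    # [batch_size, num_timesteps, embedding_dim]
y = drop(x)
# every time step of an example shares the same mask over the feature dimension
print(torch.equal(y[:, 0], y[:, 1]))       # True
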
+ 7
- 21
fastNLP/modules/encoder/char_embedding.py View File

@@ -1,12 +1,14 @@
import torch
import torch.nn.functional as F
from torch import nn
# from torch.nn.init import xavier_uniform

from fastNLP.modules.utils import initial_parameter


# from torch.nn.init import xavier_uniform
class ConvCharEmbedding(nn.Module):

def __init__(self, char_emb_size=50, feature_maps=(40, 30, 30), kernels=(3, 4, 5),initial_method = None):
def __init__(self, char_emb_size=50, feature_maps=(40, 30, 30), kernels=(3, 4, 5), initial_method=None):
"""
Character Level Word Embedding
:param char_emb_size: the size of character level embedding. Default: 50
@@ -21,7 +23,7 @@ class ConvCharEmbedding(nn.Module):
nn.Conv2d(1, feature_maps[i], kernel_size=(char_emb_size, kernels[i]), bias=True, padding=(0, 4))
for i in range(len(kernels))])

initial_parameter(self,initial_method)
initial_parameter(self, initial_method)

def forward(self, x):
"""
@@ -56,7 +58,7 @@ class LSTMCharEmbedding(nn.Module):
:param hidden_size: int, the number of hidden units. Default: equal to char_emb_size.
"""

def __init__(self, char_emb_size=50, hidden_size=None , initial_method= None):
def __init__(self, char_emb_size=50, hidden_size=None, initial_method=None):
super(LSTMCharEmbedding, self).__init__()
self.hidden_size = char_emb_size if hidden_size is None else hidden_size

@@ -66,6 +68,7 @@ class LSTMCharEmbedding(nn.Module):
bias=True,
batch_first=True)
initial_parameter(self, initial_method)

def forward(self, x):
"""
:param x:[ n_batch*n_word, word_length, char_emb_size]
@@ -79,20 +82,3 @@ class LSTMCharEmbedding(nn.Module):

_, hidden = self.lstm(x, (h0, c0))
return hidden[0].squeeze().unsqueeze(2)


if __name__ == "__main__":
batch_size = 128
char_emb = 100
word_length = 1
x = torch.Tensor(batch_size, char_emb, word_length)
x = x.transpose(1, 2)
cce = ConvCharEmbedding(char_emb)
y = cce(x)
print("CNN Char Emb input: ", x.shape)
print("CNN Char Emb output: ", y.shape) # [128, 100]

lce = LSTMCharEmbedding(char_emb)
o = lce(x)
print("LSTM Char Emb input: ", x.shape)
print("LSTM Char Emb size: ", o.shape)

+ 4
- 1
fastNLP/modules/encoder/linear.py View File

@@ -1,6 +1,8 @@
import torch.nn as nn

from fastNLP.modules.utils import initial_parameter


class Linear(nn.Module):
"""
Linear module
@@ -12,10 +14,11 @@ class Linear(nn.Module):
bidirectional : If True, becomes a bidirectional RNN
"""

def __init__(self, input_size, output_size, bias=True,initial_method = None ):
def __init__(self, input_size, output_size, bias=True, initial_method=None):
super(Linear, self).__init__()
self.linear = nn.Linear(input_size, output_size, bias)
initial_parameter(self, initial_method)

def forward(self, x):
x = self.linear(x)
return x

+ 13
- 6
fastNLP/modules/encoder/lstm.py View File

@@ -14,16 +14,23 @@ class LSTM(nn.Module):
bidirectional : If True, becomes a bidirectional RNN. Default: False.
"""

def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0.0, bidirectional=False,
initial_method=None):
def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0.0, batch_first=True,
bidirectional=False, bias=True, initial_method=None, get_hidden=False):
super(LSTM, self).__init__()
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=True,
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=bias, batch_first=batch_first,
dropout=dropout, bidirectional=bidirectional)
self.get_hidden = get_hidden
initial_parameter(self, initial_method)

def forward(self, x):
x, _ = self.lstm(x)
return x
def forward(self, x, h0=None, c0=None):
if h0 is not None and c0 is not None:
x, (ht, ct) = self.lstm(x, (h0, c0))
else:
x, (ht, ct) = self.lstm(x)
if self.get_hidden:
return x, (ht, ct)
else:
return x


if __name__ == "__main__":


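With the new signature the encoder can optionally return the final hidden state; a hedged usage sketch (sizes are made up):

import torch
from fastNLP.modules.encoder.lstm import LSTM

enc = LSTM(input_size=10, hidden_size=16, bidirectional=True, get_hidden=True)
x = torch.randn(2, 5, 10)                  # [batch, seq_len, input_size]
out, (h, c) = enc(x)
print(out.shape, h.shape)                  # torch.Size([2, 5, 32]) torch.Size([2, 2, 16])
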
+ 123
- 354
fastNLP/modules/encoder/variational_rnn.py View File

@@ -2,384 +2,153 @@ import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend
from torch.nn.parameter import Parameter
from torch.nn.utils.rnn import PackedSequence

from fastNLP.modules.utils import initial_parameter

def default_initializer(hidden_size):
stdv = 1.0 / math.sqrt(hidden_size)

def forward(tensor):
nn.init.uniform_(tensor, -stdv, stdv)

return forward


def VarMaskedRecurrent(reverse=False):
def forward(input, hidden, cell, mask):
output = []
steps = range(input.size(0) - 1, -1, -1) if reverse else range(input.size(0))
for i in steps:
if mask is None or mask[i].data.min() > 0.5:
hidden = cell(input[i], hidden)
elif mask[i].data.max() > 0.5:
hidden_next = cell(input[i], hidden)
# hack to handle LSTM
if isinstance(hidden, tuple):
hx, cx = hidden
hp1, cp1 = hidden_next
hidden = (hx + (hp1 - hx) * mask[i], cx + (cp1 - cx) * mask[i])
else:
hidden = hidden + (hidden_next - hidden) * mask[i]
# hack to handle LSTM
output.append(hidden[0] if isinstance(hidden, tuple) else hidden)

if reverse:
output.reverse()
output = torch.cat(output, 0).view(input.size(0), *output[0].size())

return hidden, output

return forward


def StackedRNN(inners, num_layers, lstm=False):
num_directions = len(inners)
total_layers = num_layers * num_directions

def forward(input, hidden, cells, mask):
assert (len(cells) == total_layers)
next_hidden = []

if lstm:
hidden = list(zip(*hidden))

for i in range(num_layers):
all_output = []
for j, inner in enumerate(inners):
l = i * num_directions + j
hy, output = inner(input, hidden[l], cells[l], mask)
next_hidden.append(hy)
all_output.append(output)

input = torch.cat(all_output, input.dim() - 1)

if lstm:
next_h, next_c = zip(*next_hidden)
next_hidden = (
torch.cat(next_h, 0).view(total_layers, *next_h[0].size()),
torch.cat(next_c, 0).view(total_layers, *next_c[0].size())
)
else:
next_hidden = torch.cat(next_hidden, 0).view(total_layers, *next_hidden[0].size())

return next_hidden, input

return forward


def AutogradVarMaskedRNN(num_layers=1, batch_first=False, bidirectional=False, lstm=False):
rec_factory = VarMaskedRecurrent

if bidirectional:
layer = (rec_factory(), rec_factory(reverse=True))
else:
layer = (rec_factory(),)

func = StackedRNN(layer,
num_layers,
lstm=lstm)

def forward(input, cells, hidden, mask):
if batch_first:
input = input.transpose(0, 1)
if mask is not None:
mask = mask.transpose(0, 1)

nexth, output = func(input, hidden, cells, mask)

if batch_first:
output = output.transpose(0, 1)

return output, nexth

return forward

try:
from torch import flip
except ImportError:
def flip(x, dims):
indices = [slice(None)] * x.dim()
for dim in dims:
indices[dim] = torch.arange(x.size(dim) - 1, -1, -1, dtype=torch.long, device=x.device)
return x[tuple(indices)]

class VarRnnCellWrapper(nn.Module):
"""Wrapper for normal RNN Cells, make it support variational dropout
"""
def __init__(self, cell, hidden_size, input_p, hidden_p):
super(VarRnnCellWrapper, self).__init__()
self.cell = cell
self.hidden_size = hidden_size
self.input_p = input_p
self.hidden_p = hidden_p

def VarMaskedStep():
def forward(input, hidden, cell, mask):
if mask is None or mask.data.min() > 0.5:
hidden = cell(input, hidden)
elif mask.data.max() > 0.5:
hidden_next = cell(input, hidden)
# hack to handle LSTM
if isinstance(hidden, tuple):
def forward(self, input, hidden, mask_x=None, mask_h=None):
"""
:param input: [seq_len, batch_size, input_size]
:param hidden: for LSTM, tuple of (h_0, c_0), [batch_size, hidden_size]
for other RNN, h_0, [batch_size, hidden_size]
:param mask_x: [batch_size, input_size] dropout mask for input
:param mask_h: [batch_size, hidden_size] dropout mask for hidden
:return output: [seq_len, batch_size, hidden_size]
hidden: for LSTM, tuple of (h_n, c_n), [batch_size, hidden_size]
for other RNN, h_n, [batch_size, hidden_size]
"""
is_lstm = isinstance(hidden, tuple)
input = input * mask_x.unsqueeze(0) if mask_x is not None else input
output_list = []
for x in input:
if is_lstm:
hx, cx = hidden
hp1, cp1 = hidden_next
hidden = (hx + (hp1 - hx) * mask, cx + (cp1 - cx) * mask)
hidden = (hx * mask_h, cx) if mask_h is not None else (hx, cx)
else:
hidden = hidden + (hidden_next - hidden) * mask
# hack to handle LSTM
output = hidden[0] if isinstance(hidden, tuple) else hidden

return hidden, output

return forward


def StackedStep(layer, num_layers, lstm=False):
def forward(input, hidden, cells, mask):
assert (len(cells) == num_layers)
next_hidden = []

if lstm:
hidden = list(zip(*hidden))

for l in range(num_layers):
hy, output = layer(input, hidden[l], cells[l], mask)
next_hidden.append(hy)
input = output

if lstm:
next_h, next_c = zip(*next_hidden)
next_hidden = (
torch.cat(next_h, 0).view(num_layers, *next_h[0].size()),
torch.cat(next_c, 0).view(num_layers, *next_c[0].size())
)
else:
next_hidden = torch.cat(next_hidden, 0).view(num_layers, *next_hidden[0].size())

return next_hidden, input

return forward


def AutogradVarMaskedStep(num_layers=1, lstm=False):
layer = VarMaskedStep()

func = StackedStep(layer,
num_layers,
lstm=lstm)

def forward(input, cells, hidden, mask):
nexth, output = func(input, hidden, cells, mask)
return output, nexth

return forward

hidden *= mask_h if mask_h is not None else hidden
hidden = self.cell(x, hidden)
output_list.append(hidden[0] if is_lstm else hidden)
output = torch.stack(output_list, dim=0)
return output, hidden

class VarMaskedRNNBase(nn.Module):
def __init__(self, Cell, input_size, hidden_size,
num_layers=1, bias=True, batch_first=False,
dropout=(0, 0), bidirectional=False, initializer=None,initial_method = None, **kwargs):

super(VarMaskedRNNBase, self).__init__()
self.Cell = Cell
class VarRNNBase(nn.Module):
"""Implementation of Variational Dropout RNN network.
refer to `A Theoretically Grounded Application of Dropout in Recurrent Neural Networks (Yarin Gal and Zoubin Ghahramani, 2016)
https://arxiv.org/abs/1512.05287`.
"""
def __init__(self, mode, Cell, input_size, hidden_size, num_layers=1,
bias=True, batch_first=False,
input_dropout=0, hidden_dropout=0, bidirectional=False):
super(VarRNNBase, self).__init__()
self.mode = mode
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.bias = bias
self.batch_first = batch_first
self.input_dropout = input_dropout
self.hidden_dropout = hidden_dropout
self.bidirectional = bidirectional
self.lstm = False
num_directions = 2 if bidirectional else 1

self.all_cells = []
for layer in range(num_layers):
for direction in range(num_directions):
layer_input_size = input_size if layer == 0 else hidden_size * num_directions

cell = self.Cell(layer_input_size, hidden_size, self.bias, p=dropout, initializer=initializer, **kwargs)
self.all_cells.append(cell)
self.add_module('cell%d' % (layer * num_directions + direction), cell)
initial_parameter(self, initial_method)
def reset_parameters(self):
for cell in self.all_cells:
cell.reset_parameters()

def reset_noise(self, batch_size):
for cell in self.all_cells:
cell.reset_noise(batch_size)
self.num_directions = 2 if bidirectional else 1
self._all_cells = nn.ModuleList()
for layer in range(self.num_layers):
for direction in range(self.num_directions):
input_size = self.input_size if layer == 0 else self.hidden_size * self.num_directions
cell = Cell(input_size, self.hidden_size, bias)
self._all_cells.append(VarRnnCellWrapper(cell, self.hidden_size, input_dropout, hidden_dropout))
initial_parameter(self)

def forward(self, input, hx=None):
is_packed = isinstance(input, PackedSequence)
is_lstm = (self.mode == "LSTM")
if is_packed:
input, batch_sizes = input
max_batch_size = int(batch_sizes[0])
else:
batch_sizes = None
max_batch_size = input.size(0) if self.batch_first else input.size(1)

def forward(self, input, mask=None, hx=None):
batch_size = input.size(0) if self.batch_first else input.size(1)
if hx is None:
num_directions = 2 if self.bidirectional else 1
hx = torch.tensor(input.data.new(self.num_layers * num_directions, batch_size, self.hidden_size).zero_(),
requires_grad=True)
if self.lstm:
hx = input.new_zeros(self.num_layers * self.num_directions,
max_batch_size, self.hidden_size,
requires_grad=False)
if is_lstm:
hx = (hx, hx)

func = AutogradVarMaskedRNN(num_layers=self.num_layers,
batch_first=self.batch_first,
bidirectional=self.bidirectional,
lstm=self.lstm)

self.reset_noise(batch_size)

output, hidden = func(input, self.all_cells, hx, None if mask is None else mask.view(mask.size() + (1,)))
return output, hidden

def step(self, input, hx=None, mask=None):
'''
execute one step forward (only for one-directional RNN).
Args:
input (batch, input_size): input tensor of this step.
hx (num_layers, batch, hidden_size): the hidden state of last step.
mask (batch): the mask tensor of this step.
Returns:
output (batch, hidden_size): tensor containing the output of this step from the last layer of RNN.
hn (num_layers, batch, hidden_size): tensor containing the hidden state of this step
'''
assert not self.bidirectional, "step only cannot be applied to bidirectional RNN."
batch_size = input.size(0)
if hx is None:
hx = torch.tensor(input.data.new(self.num_layers, batch_size, self.hidden_size).zero_(), requires_grad=True)
if self.lstm:
hx = (hx, hx)
if self.batch_first:
input = input.transpose(0, 1)
batch_size = input.shape[1]

mask_x = input.new_ones((batch_size, self.input_size))
mask_out = input.new_ones((batch_size, self.hidden_size * self.num_directions))
mask_h = input.new_ones((batch_size, self.hidden_size))
nn.functional.dropout(mask_x, p=self.input_dropout, training=self.training, inplace=True)
nn.functional.dropout(mask_out, p=self.hidden_dropout, training=self.training, inplace=True)
nn.functional.dropout(mask_h, p=self.hidden_dropout, training=self.training, inplace=True)

hidden_list = []
for layer in range(self.num_layers):
output_list = []
for direction in range(self.num_directions):
input_x = input if direction == 0 else flip(input, [0])
idx = self.num_directions * layer + direction
cell = self._all_cells[idx]
hi = (hx[0][idx], hx[1][idx]) if is_lstm else hx[idx]
mask_xi = mask_x if layer == 0 else mask_out
output_x, hidden_x = cell(input_x, hi, mask_xi, mask_h)
output_list.append(output_x if direction == 0 else flip(output_x, [0]))
hidden_list.append(hidden_x)
input = torch.cat(output_list, dim=-1)

output = input.transpose(0, 1) if self.batch_first else input
if is_lstm:
h_list, c_list = zip(*hidden_list)
hn = torch.stack(h_list, dim=0)
cn = torch.stack(c_list, dim=0)
hidden = (hn, cn)
else:
hidden = torch.stack(hidden_list, dim=0)

func = AutogradVarMaskedStep(num_layers=self.num_layers, lstm=self.lstm)
if is_packed:
output = PackedSequence(output, batch_sizes)

output, hidden = func(input, self.all_cells, hx, mask)
return output, hidden


class VarMaskedFastLSTM(VarMaskedRNNBase):
class VarLSTM(VarRNNBase):
"""Variational Dropout LSTM.
"""
def __init__(self, *args, **kwargs):
super(VarMaskedFastLSTM, self).__init__(VarFastLSTMCell, *args, **kwargs)
self.lstm = True


class VarRNNCellBase(nn.Module):
def __repr__(self):
s = '{name}({input_size}, {hidden_size}'
if 'bias' in self.__dict__ and self.bias is not True:
s += ', bias={bias}'
if 'nonlinearity' in self.__dict__ and self.nonlinearity != "tanh":
s += ', nonlinearity={nonlinearity}'
s += ')'
return s.format(name=self.__class__.__name__, **self.__dict__)
super(VarLSTM, self).__init__(mode="LSTM", Cell=nn.LSTMCell, *args, **kwargs)

def reset_noise(self, batch_size):
"""
Should be overriden by all subclasses.
Args:
batch_size: (int) batch size of input.
"""
raise NotImplementedError


class VarFastLSTMCell(VarRNNCellBase):
"""
A long short-term memory (LSTM) cell with variational dropout.
.. math::
\begin{array}{ll}
i = \mathrm{sigmoid}(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\
f = \mathrm{sigmoid}(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\
g = \tanh(W_{ig} x + b_{ig} + W_{hc} h + b_{hg}) \\
o = \mathrm{sigmoid}(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\
c' = f * c + i * g \\
h' = o * \tanh(c') \\
\end{array}
class VarRNN(VarRNNBase):
"""Variational Dropout RNN.
"""
def __init__(self, *args, **kwargs):
super(VarRNN, self).__init__(mode="RNN", Cell=nn.RNNCell, *args, **kwargs)

def __init__(self, input_size, hidden_size, bias=True, p=(0.5, 0.5), initializer=None,initial_method =None):
super(VarFastLSTMCell, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.bias = bias
self.weight_ih = Parameter(torch.Tensor(4 * hidden_size, input_size))
self.weight_hh = Parameter(torch.Tensor(4 * hidden_size, hidden_size))
if bias:
self.bias_ih = Parameter(torch.Tensor(4 * hidden_size))
self.bias_hh = Parameter(torch.Tensor(4 * hidden_size))
else:
self.register_parameter('bias_ih', None)
self.register_parameter('bias_hh', None)

self.initializer = default_initializer(self.hidden_size) if initializer is None else initializer
self.reset_parameters()
p_in, p_hidden = p
if p_in < 0 or p_in > 1:
raise ValueError("input dropout probability has to be between 0 and 1, "
"but got {}".format(p_in))
if p_hidden < 0 or p_hidden > 1:
raise ValueError("hidden state dropout probability has to be between 0 and 1, "
"but got {}".format(p_hidden))
self.p_in = p_in
self.p_hidden = p_hidden
self.noise_in = None
self.noise_hidden = None
initial_parameter(self, initial_method)
def reset_parameters(self):
for weight in self.parameters():
if weight.dim() == 1:
weight.data.zero_()
else:
self.initializer(weight.data)

def reset_noise(self, batch_size):
if self.training:
if self.p_in:
noise = self.weight_ih.data.new(batch_size, self.input_size)
self.noise_in = torch.tensor(noise.bernoulli_(1.0 - self.p_in) / (1.0 - self.p_in))
else:
self.noise_in = None

if self.p_hidden:
noise = self.weight_hh.data.new(batch_size, self.hidden_size)
self.noise_hidden = torch.tensor(noise.bernoulli_(1.0 - self.p_hidden) / (1.0 - self.p_hidden))
else:
self.noise_hidden = None
else:
self.noise_in = None
self.noise_hidden = None

def forward(self, input, hx):
return self.__forward(
input, hx,
self.weight_ih, self.weight_hh,
self.bias_ih, self.bias_hh,
self.noise_in, self.noise_hidden,
)

@staticmethod
def __forward(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
if noise_in is not None:
if input.is_cuda:
input = input * noise_in.cuda(input.get_device())
else:
input = input * noise_in

if input.is_cuda:
w_ih = w_ih.cuda(input.get_device())
w_hh = w_hh.cuda(input.get_device())
hidden = [h.cuda(input.get_device()) for h in hidden]
b_ih = b_ih.cuda(input.get_device())
b_hh = b_hh.cuda(input.get_device())
igates = F.linear(input, w_ih.cuda(input.get_device()))
hgates = F.linear(hidden[0], w_hh) if noise_hidden is None \
else F.linear(hidden[0] * noise_hidden.cuda(input.get_device()), w_hh)
state = fusedBackend.LSTMFused.apply
# print("use backend")
# use some magic function
return state(igates, hgates, hidden[1]) if b_ih is None else state(igates, hgates, hidden[1], b_ih, b_hh)

hx, cx = hidden
if noise_hidden is not None:
hx = hx * noise_hidden
gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh)

ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)

ingate = F.sigmoid(ingate)
forgetgate = F.sigmoid(forgetgate)
cellgate = F.tanh(cellgate)
outgate = F.sigmoid(outgate)

cy = (forgetgate * cx) + (ingate * cellgate)
hy = outgate * F.tanh(cy)

return hy, cy
class VarGRU(VarRNNBase):
"""Variational Dropout GRU.
"""
def __init__(self, *args, **kwargs):
super(VarGRU, self).__init__(mode="GRU", Cell=nn.GRUCell, *args, **kwargs)

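The rewritten VarLSTM keeps the nn.LSTM-style call convention (and is the variant BiaffineParser instantiates when use_var_lstm=true); a hedged shape check with made-up sizes:

import torch
from fastNLP.modules.encoder.variational_rnn import VarLSTM

rnn = VarLSTM(input_size=10, hidden_size=16, num_layers=2, batch_first=True,
              input_dropout=0.33, hidden_dropout=0.33, bidirectional=True)
x = torch.randn(2, 5, 10)                  # [batch, seq_len, input_size]
out, (h, c) = rnn(x)
print(out.shape, h.shape)                  # torch.Size([2, 5, 32]) torch.Size([4, 2, 16])
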
+ 37
- 0
reproduction/Biaffine_parser/cfg.cfg View File

@@ -0,0 +1,37 @@
[train]
epochs = 50
batch_size = 16
pickle_path = "./save/"
validate = true
save_best_dev = false
use_cuda = true
model_saved_path = "./save/"
task = "parse"


[test]
save_output = true
validate_in_training = true
save_dev_input = false
save_loss = true
batch_size = 16
pickle_path = "./save/"
use_cuda = true
task = "parse"

[model]
word_vocab_size = -1
word_emb_dim = 100
pos_vocab_size = -1
pos_emb_dim = 100
rnn_layers = 3
rnn_hidden_size = 400
arc_mlp_size = 500
label_mlp_size = 100
num_label = -1
dropout = 0.33
use_var_lstm=true
use_greedy_infer=false

[optim]
lr = 2e-3

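Note that word_vocab_size, pos_vocab_size and num_label are left as -1 placeholders here; run.py below overwrites them from the vocabularies it builds before constructing the model, roughly:

model_args['word_vocab_size'] = len(word_v)   # word vocabulary built from the training data
model_args['pos_vocab_size'] = len(pos_v)     # POS tag vocabulary
model_args['num_label'] = len(tag_v)          # dependency label vocabulary
model = BiaffineParser(**model_args.data)
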
+ 260
- 0
reproduction/Biaffine_parser/run.py View File

@@ -0,0 +1,260 @@
import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from collections import defaultdict
import math
import torch

from fastNLP.core.trainer import Trainer
from fastNLP.core.instance import Instance
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.core.dataset import DataSet
from fastNLP.core.batch import Batch
from fastNLP.core.sampler import SequentialSampler
from fastNLP.core.field import TextField, SeqLabelField
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
from fastNLP.core.tester import Tester
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.loader.embed_loader import EmbedLoader
from fastNLP.models.biaffine_parser import BiaffineParser
from fastNLP.saver.model_saver import ModelSaver

# change to this file's directory if launched from elsewhere
if len(os.path.dirname(__file__)) != 0:
os.chdir(os.path.dirname(__file__))

class MyDataLoader(object):
def __init__(self, pickle_path):
self.pickle_path = pickle_path

def load(self, path, word_v=None, pos_v=None, headtag_v=None):
datalist = []
with open(path, 'r', encoding='utf-8') as f:
sample = []
for line in f:
if line.startswith('\n'):
datalist.append(sample)
sample = []
elif line.startswith('#'):
continue
else:
sample.append(line.split('\t'))
if len(sample) > 0:
datalist.append(sample)

ds = DataSet(name='conll')
for sample in datalist:
# print(sample)
res = self.get_one(sample)
if word_v is not None:
word_v.update(res[0])
pos_v.update(res[1])
headtag_v.update(res[3])
ds.append(Instance(word_seq=TextField(res[0], is_target=False),
pos_seq=TextField(res[1], is_target=False),
head_indices=SeqLabelField(res[2], is_target=True),
head_labels=TextField(res[3], is_target=True),
seq_mask=SeqLabelField([1 for _ in range(len(res[0]))], is_target=False)))

return ds

def get_one(self, sample):
text = ['<root>']
pos_tags = ['<root>']
heads = [0]
head_tags = ['root']
for w in sample:
t1, t2, t3, t4 = w[1], w[3], w[6], w[7]
if t3 == '_':
continue
text.append(t1)
pos_tags.append(t2)
heads.append(int(t3))
head_tags.append(t4)
return (text, pos_tags, heads, head_tags)

def index_data(self, dataset, word_v, pos_v, tag_v):
dataset.index_field('word_seq', word_v)
dataset.index_field('pos_seq', pos_v)
dataset.index_field('head_labels', tag_v)

# datadir = "/mnt/c/Me/Dev/release-2.2-st-train-dev-data/ud-treebanks-v2.2/UD_English-EWT"
datadir = "/home/yfshao/UD_English-EWT"
cfgfile = './cfg.cfg'
train_data_name = "en_ewt-ud-train.conllu"
dev_data_name = "en_ewt-ud-dev.conllu"
emb_file_name = '/home/yfshao/glove.6B.100d.txt'
processed_datadir = './save'

# Config Loader
train_args = ConfigSection()
test_args = ConfigSection()
model_args = ConfigSection()
optim_args = ConfigSection()
ConfigLoader.load_config(cfgfile, {"train": train_args, "test": test_args, "model": model_args, "optim": optim_args})

# Data Loader
def save_data(dirpath, **kwargs):
import _pickle
if not os.path.exists(dirpath):
os.mkdir(dirpath)
for name, data in kwargs.items():
with open(os.path.join(dirpath, name+'.pkl'), 'wb') as f:
_pickle.dump(data, f)


def load_data(dirpath):
import _pickle
datas = {}
for f_name in os.listdir(dirpath):
if not f_name.endswith('.pkl'):
continue
name = f_name[:-4]
with open(os.path.join(dirpath, f_name), 'rb') as f:
datas[name] = _pickle.load(f)
return datas

class MyTester(object):
def __init__(self, batch_size, use_cuda=False, **kwargs):
self.batch_size = batch_size
self.use_cuda = use_cuda

def test(self, model, dataset):
self.model = model.cuda() if self.use_cuda else model
self.model.eval()
batchiter = Batch(dataset, self.batch_size, SequentialSampler(), self.use_cuda)
eval_res = defaultdict(list)
i = 0
for batch_x, batch_y in batchiter:
with torch.no_grad():
pred_y = self.model(**batch_x)
eval_one = self.model.evaluate(**pred_y, **batch_y)
i += self.batch_size
for eval_name, tensor in eval_one.items():
eval_res[eval_name].append(tensor)
tmp = {}
for eval_name, tensorlist in eval_res.items():
tmp[eval_name] = torch.cat(tensorlist, dim=0)

self.res = self.model.metrics(**tmp)

def show_metrics(self):
s = ""
for name, val in self.res.items():
s += '{}: {:.2f}\t'.format(name, val)
return s


loader = MyDataLoader('')
try:
data_dict = load_data(processed_datadir)
word_v = data_dict['word_v']
pos_v = data_dict['pos_v']
tag_v = data_dict['tag_v']
train_data = data_dict['train_data']
dev_data = data_dict['dev_data']
print('use saved pickles')

except Exception as _:
print('load raw data and preprocess')
word_v = Vocabulary(need_default=True, min_freq=2)
pos_v = Vocabulary(need_default=True)
tag_v = Vocabulary(need_default=False)
train_data = loader.load(os.path.join(datadir, train_data_name), word_v, pos_v, tag_v)
dev_data = loader.load(os.path.join(datadir, dev_data_name))
save_data(processed_datadir, word_v=word_v, pos_v=pos_v, tag_v=tag_v, train_data=train_data, dev_data=dev_data)

loader.index_data(train_data, word_v, pos_v, tag_v)
loader.index_data(dev_data, word_v, pos_v, tag_v)
print(len(train_data))
print(len(dev_data))
ep = train_args['epochs']
train_args['epochs'] = math.ceil(50000.0 / len(train_data) * train_args['batch_size']) if ep <= 0 else ep
model_args['word_vocab_size'] = len(word_v)
model_args['pos_vocab_size'] = len(pos_v)
model_args['num_label'] = len(tag_v)


def train():
# Trainer
trainer = Trainer(**train_args.data)

def _define_optim(obj):
obj._optimizer = torch.optim.Adam(obj._model.parameters(), **optim_args.data)
obj._scheduler = torch.optim.lr_scheduler.LambdaLR(obj._optimizer, lambda ep: .75 ** (ep / 5e4))

def _update(obj):
obj._scheduler.step()
obj._optimizer.step()

trainer.define_optimizer = lambda: _define_optim(trainer)
trainer.update = lambda: _update(trainer)
trainer.get_loss = lambda predict, truth: trainer._loss_func(**predict, **truth)
trainer._create_validator = lambda x: MyTester(**test_args.data)

# Model
model = BiaffineParser(**model_args.data)

# use pretrain embedding
embed, _ = EmbedLoader.load_embedding(model_args['word_emb_dim'], emb_file_name, 'glove', word_v, os.path.join(processed_datadir, 'word_emb.pkl'))
model.word_embedding = torch.nn.Embedding.from_pretrained(embed, freeze=False)
model.word_embedding.padding_idx = word_v.padding_idx
model.word_embedding.weight.data[word_v.padding_idx].fill_(0)
model.pos_embedding.padding_idx = pos_v.padding_idx
model.pos_embedding.weight.data[pos_v.padding_idx].fill_(0)

try:
ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
print('model parameter loaded!')
except Exception as _:
print("No saved model. Continue.")
pass

# Start training
trainer.train(model, train_data, dev_data)
print("Training finished!")

# Saver
saver = ModelSaver("./save/saved_model.pkl")
saver.save_pytorch(model)
print("Model saved!")


def test():
# Tester
tester = MyTester(**test_args.data)

# Model
model = BiaffineParser(**model_args.data)

try:
ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
print('model parameter loaded!')
except Exception as _:
print("No saved model. Abort test.")
raise

# Start testing
tester.test(model, dev_data)
print(tester.show_metrics())
print("Testing finished!")



if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description='Run the Biaffine dependency parser')
parser.add_argument('--mode', help='set the running mode', choices=['train', 'test', 'infer'])
args = parser.parse_args()
if args.mode == 'train':
train()
elif args.mode == 'test':
test()
elif args.mode == 'infer':
infer()
else:
print('no mode specified for model!')
parser.print_help()

+ 1
- 17
reproduction/Char-aware_NLM/main.py View File

@@ -1,24 +1,8 @@
from fastNLP.core.loss import Loss
from fastNLP.core.preprocess import Preprocessor
from fastNLP.core.trainer import Trainer
from fastNLP.loader.dataset_loader import LMDataSetLoader
from fastNLP.models.char_language_model import CharLM

PICKLE = "./save/"


def train():
loader = LMDataSetLoader()
train_data = loader.load()

pre = Preprocessor(label_is_seq=True, share_vocab=True)
train_set = pre.run(train_data, pickle_path=PICKLE)

model = CharLM(50, 50, pre.vocab_size, pre.char_vocab_size)

trainer = Trainer(task="language_model", loss=Loss("cross_entropy"))

trainer.train(model, train_set)
pass


if __name__ == "__main__":


+ 1
- 1
reproduction/chinese_word_segment/run.py View File

@@ -12,7 +12,7 @@ from fastNLP.loader.model_loader import ModelLoader
from fastNLP.core.tester import SeqLabelTester
from fastNLP.models.sequence_modeling import AdvSeqLabel
from fastNLP.core.predictor import SeqLabelInfer
from fastNLP.core.dataset import SeqLabelDataSet, change_field_is_target
from fastNLP.core.dataset import DataSet
from fastNLP.core.preprocess import save_pickle
from fastNLP.core.metrics import SeqLabelEvaluator



+ 1
- 12
test/core/test_batch.py View File

@@ -3,7 +3,7 @@ import unittest
import torch

from fastNLP.core.batch import Batch
from fastNLP.core.dataset import DataSet, create_dataset_from_lists
from fastNLP.core.dataset import DataSet
from fastNLP.core.field import TextField, LabelField
from fastNLP.core.instance import Instance

@@ -51,14 +51,3 @@ class TestCase1(unittest.TestCase):
self.assertTrue(isinstance(batch_x["text"], torch.LongTensor))
self.assertTrue(isinstance(batch_y, dict))
self.assertTrue(isinstance(batch_y["label"], torch.LongTensor))


class TestCase2(unittest.TestCase):
def test(self):
data = DataSet()
for text in texts:
x = TextField(text, is_target=False)
ins = Instance(text=x)
data.append(ins)
data_set = create_dataset_from_lists(texts, vocab, has_target=False)
self.assertTrue(type(data) == type(data_set))

+ 6
- 195
test/core/test_dataset.py View File

@@ -1,7 +1,6 @@
import unittest

from fastNLP.core.dataset import SeqLabelDataSet, TextClassifyDataSet
from fastNLP.core.dataset import create_dataset_from_lists
from fastNLP.loader.dataset_loader import convert_seq2seq_dataset, convert_seq_dataset


class TestDataSet(unittest.TestCase):
@@ -19,8 +18,9 @@ class TestDataSet(unittest.TestCase):
label_vocab = {"1": 1, "2": 2, "3": 3, "4": 4}

def test_case_1(self):
data_set = create_dataset_from_lists(self.labeled_data_list, self.word_vocab, has_target=True,
label_vocab=self.label_vocab)
data_set = convert_seq2seq_dataset(self.labeled_data_list)
data_set.index_field("word_seq", self.word_vocab)
data_set.index_field("label_seq", self.label_vocab)
self.assertEqual(len(data_set), len(self.labeled_data_list))
self.assertTrue(len(data_set) > 0)
self.assertTrue(hasattr(data_set[0], "fields"))
@@ -39,7 +39,8 @@ class TestDataSet(unittest.TestCase):
[self.label_vocab[c] for c in self.labeled_data_list[0][1]])

def test_case_2(self):
data_set = create_dataset_from_lists(self.unlabeled_data_list, self.word_vocab, has_target=False)
data_set = convert_seq_dataset(self.unlabeled_data_list)
data_set.index_field("word_seq", self.word_vocab)

self.assertEqual(len(data_set), len(self.unlabeled_data_list))
self.assertTrue(len(data_set) > 0)
@@ -51,193 +52,3 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(data_set[0].fields["word_seq"]._index,
[self.word_vocab[c] for c in self.unlabeled_data_list[0]])


class TestDataSetConvertion(unittest.TestCase):
labeled_data_list = [
[["a", "b", "e", "d"], ["1", "2", "3", "4"]],
[["a", "b", "e", "d"], ["1", "2", "3", "4"]],
[["a", "b", "e", "d"], ["1", "2", "3", "4"]],
]
unlabeled_data_list = [
["a", "b", "e", "d"],
["a", "b", "e", "d"],
["a", "b", "e", "d"]
]
word_vocab = {"a": 0, "b": 1, "e": 2, "d": 3}
label_vocab = {"1": 1, "2": 2, "3": 3, "4": 4}

def test_case_1(self):
def loader(path):
labeled_data_list = [
[["a", "b", "e", "d"], ["1", "2", "3", "4"]],
[["a", "b", "e", "d"], ["1", "2", "3", "4"]],
[["a", "b", "e", "d"], ["1", "2", "3", "4"]],
]
return labeled_data_list

data_set = SeqLabelDataSet(load_func=loader)
data_set.load("any_path")

self.assertEqual(len(data_set), len(self.labeled_data_list))
self.assertTrue(len(data_set) > 0)
self.assertTrue(hasattr(data_set[0], "fields"))
self.assertTrue("word_seq" in data_set[0].fields)

self.assertTrue(hasattr(data_set[0].fields["word_seq"], "text"))
self.assertTrue(hasattr(data_set[0].fields["word_seq"], "_index"))
self.assertEqual(data_set[0].fields["word_seq"].text, self.labeled_data_list[0][0])

self.assertTrue("truth" in data_set[0].fields)
self.assertTrue(hasattr(data_set[0].fields["truth"], "text"))
self.assertTrue(hasattr(data_set[0].fields["truth"], "_index"))
self.assertEqual(data_set[0].fields["truth"].text, self.labeled_data_list[0][1])

self.assertTrue("word_seq_origin_len" in data_set[0].fields)

def test_case_2(self):
def loader(path):
unlabeled_data_list = [
["a", "b", "e", "d"],
["a", "b", "e", "d"],
["a", "b", "e", "d"]
]
return unlabeled_data_list

data_set = SeqLabelDataSet(load_func=loader)
data_set.load("any_path", vocabs={"word_vocab": self.word_vocab}, infer=True)

self.assertEqual(len(data_set), len(self.labeled_data_list))
self.assertTrue(len(data_set) > 0)
self.assertTrue(hasattr(data_set[0], "fields"))
self.assertTrue("word_seq" in data_set[0].fields)
self.assertTrue(hasattr(data_set[0].fields["word_seq"], "text"))
self.assertTrue(hasattr(data_set[0].fields["word_seq"], "_index"))
self.assertEqual(data_set[0].fields["word_seq"].text, self.labeled_data_list[0][0])
self.assertEqual(data_set[0].fields["word_seq"]._index,
[self.word_vocab[c] for c in self.labeled_data_list[0][0]])

self.assertTrue("word_seq_origin_len" in data_set[0].fields)

def test_case_3(self):
def loader(path):
labeled_data_list = [
[["a", "b", "e", "d"], ["1", "2", "3", "4"]],
[["a", "b", "e", "d"], ["1", "2", "3", "4"]],
[["a", "b", "e", "d"], ["1", "2", "3", "4"]],
]
return labeled_data_list

data_set = SeqLabelDataSet(load_func=loader)
data_set.load("any_path", vocabs={"word_vocab": self.word_vocab, "label_vocab": self.label_vocab})

self.assertEqual(len(data_set), len(self.labeled_data_list))
self.assertTrue(len(data_set) > 0)
self.assertTrue(hasattr(data_set[0], "fields"))
self.assertTrue("word_seq" in data_set[0].fields)
self.assertTrue(hasattr(data_set[0].fields["word_seq"], "text"))
self.assertTrue(hasattr(data_set[0].fields["word_seq"], "_index"))
self.assertEqual(data_set[0].fields["word_seq"].text, self.labeled_data_list[0][0])
self.assertEqual(data_set[0].fields["word_seq"]._index,
[self.word_vocab[c] for c in self.labeled_data_list[0][0]])

self.assertTrue("truth" in data_set[0].fields)
self.assertTrue(hasattr(data_set[0].fields["truth"], "text"))
self.assertTrue(hasattr(data_set[0].fields["truth"], "_index"))
self.assertEqual(data_set[0].fields["truth"].text, self.labeled_data_list[0][1])
self.assertEqual(data_set[0].fields["truth"]._index,
[self.label_vocab[c] for c in self.labeled_data_list[0][1]])

self.assertTrue("word_seq_origin_len" in data_set[0].fields)


class TestDataSetConvertionHHH(unittest.TestCase):
labeled_data_list = [
[["a", "b", "e", "d"], "A"],
[["a", "b", "e", "d"], "C"],
[["a", "b", "e", "d"], "B"],
]
unlabeled_data_list = [
["a", "b", "e", "d"],
["a", "b", "e", "d"],
["a", "b", "e", "d"]
]
word_vocab = {"a": 0, "b": 1, "e": 2, "d": 3}
label_vocab = {"A": 1, "B": 2, "C": 3}

def test_case_1(self):
def loader(path):
labeled_data_list = [
[["a", "b", "e", "d"], "A"],
[["a", "b", "e", "d"], "C"],
[["a", "b", "e", "d"], "B"],
]
return labeled_data_list

data_set = TextClassifyDataSet(load_func=loader)
data_set.load("xxx")

self.assertEqual(len(data_set), len(self.labeled_data_list))
self.assertTrue(len(data_set) > 0)
self.assertTrue(hasattr(data_set[0], "fields"))
self.assertTrue("word_seq" in data_set[0].fields)

self.assertTrue(hasattr(data_set[0].fields["word_seq"], "text"))
self.assertTrue(hasattr(data_set[0].fields["word_seq"], "_index"))
self.assertEqual(data_set[0].fields["word_seq"].text, self.labeled_data_list[0][0])

self.assertTrue("label" in data_set[0].fields)
self.assertTrue(hasattr(data_set[0].fields["label"], "label"))
self.assertTrue(hasattr(data_set[0].fields["label"], "_index"))
self.assertEqual(data_set[0].fields["label"].label, self.labeled_data_list[0][1])

def test_case_2(self):
def loader(path):
labeled_data_list = [
[["a", "b", "e", "d"], "A"],
[["a", "b", "e", "d"], "C"],
[["a", "b", "e", "d"], "B"],
]
return labeled_data_list

data_set = TextClassifyDataSet(load_func=loader)
data_set.load("xxx", vocabs={"word_vocab": self.word_vocab, "label_vocab": self.label_vocab})

self.assertEqual(len(data_set), len(self.labeled_data_list))
self.assertTrue(len(data_set) > 0)
self.assertTrue(hasattr(data_set[0], "fields"))
self.assertTrue("word_seq" in data_set[0].fields)

self.assertTrue(hasattr(data_set[0].fields["word_seq"], "text"))
self.assertTrue(hasattr(data_set[0].fields["word_seq"], "_index"))
self.assertEqual(data_set[0].fields["word_seq"].text, self.labeled_data_list[0][0])
self.assertEqual(data_set[0].fields["word_seq"]._index,
[self.word_vocab[c] for c in self.labeled_data_list[0][0]])

self.assertTrue("label" in data_set[0].fields)
self.assertTrue(hasattr(data_set[0].fields["label"], "label"))
self.assertTrue(hasattr(data_set[0].fields["label"], "_index"))
self.assertEqual(data_set[0].fields["label"].label, self.labeled_data_list[0][1])
self.assertEqual(data_set[0].fields["label"]._index, self.label_vocab[self.labeled_data_list[0][1]])

def test_case_3(self):
def loader(path):
unlabeled_data_list = [
["a", "b", "e", "d"],
["a", "b", "e", "d"],
["a", "b", "e", "d"]
]
return unlabeled_data_list

data_set = TextClassifyDataSet(load_func=loader)
data_set.load("xxx", vocabs={"word_vocab": self.word_vocab}, infer=True)

self.assertEqual(len(data_set), len(self.labeled_data_list))
self.assertTrue(len(data_set) > 0)
self.assertTrue(hasattr(data_set[0], "fields"))
self.assertTrue("word_seq" in data_set[0].fields)

self.assertTrue(hasattr(data_set[0].fields["word_seq"], "text"))
self.assertTrue(hasattr(data_set[0].fields["word_seq"], "_index"))
self.assertEqual(data_set[0].fields["word_seq"].text, self.labeled_data_list[0][0])
self.assertEqual(data_set[0].fields["word_seq"]._index,
[self.word_vocab[c] for c in self.labeled_data_list[0][0]])

+ 6
- 7
test/core/test_predictor.py View File

@@ -1,11 +1,12 @@
import os
import unittest

from fastNLP.core.dataset import TextClassifyDataSet, SeqLabelDataSet
from fastNLP.core.dataset import DataSet
from fastNLP.core.predictor import Predictor
from fastNLP.core.preprocess import save_pickle
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.loader.base_loader import BaseLoader
from fastNLP.loader.dataset_loader import convert_seq_dataset
from fastNLP.models.cnn_text_classification import CNNText
from fastNLP.models.sequence_modeling import SeqLabeling

@@ -42,8 +43,8 @@ class TestPredictor(unittest.TestCase):
predictor = Predictor("./save/", pre.text_classify_post_processor)

# Load infer data
infer_data_set = TextClassifyDataSet(load_func=BaseLoader.load)
infer_data_set.convert_for_infer(infer_data, vocabs={"word_vocab": vocab.word2idx})
infer_data_set = convert_seq_dataset(infer_data)
infer_data_set.index_field("word_seq", vocab)

results = predictor.predict(network=model, data=infer_data_set)

@@ -54,14 +55,12 @@ class TestPredictor(unittest.TestCase):
self.assertTrue(isinstance(res, str))
self.assertTrue(res in class_vocab.word2idx)

del model, predictor, infer_data_set
del model, predictor
infer_data_set.set_origin_len("word_seq")

model = SeqLabeling(model_args)
predictor = Predictor("./save/", pre.seq_label_post_processor)

infer_data_set = SeqLabelDataSet(load_func=BaseLoader.load)
infer_data_set.convert_for_infer(infer_data, vocabs={"word_vocab": vocab.word2idx})

results = predictor.predict(network=model, data=infer_data_set)
self.assertTrue(isinstance(results, list))
self.assertEqual(len(results), len(infer_data))
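
Note: the hunk above replaces the old TextClassifyDataSet / SeqLabelDataSet wrappers with a generic conversion plus explicit indexing. A minimal sketch of that inference path in isolation follows; the sample sentences and vocabulary are made up for illustration, and treating convert_seq_dataset's input as a list of token lists is an assumption based on the surrounding tests:

from fastNLP.core.vocabulary import Vocabulary
from fastNLP.loader.dataset_loader import convert_seq_dataset

# Hypothetical raw inference data: a list of tokenised sentences.
infer_data = [["a", "b", "c"], ["d", "e"]]

vocab = Vocabulary()
vocab.update(infer_data)

infer_data_set = convert_seq_dataset(infer_data)   # wrap raw sequences as a DataSet
infer_data_set.index_field("word_seq", vocab)      # map tokens to vocabulary indices
infer_data_set.set_origin_len("word_seq")          # record original lengths, as the SeqLabeling branch above does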


+ 0
- 72
test/core/test_preprocess.py View File

@@ -1,72 +0,0 @@
import os
import unittest

from fastNLP.core.dataset import DataSet
from fastNLP.core.preprocess import SeqLabelPreprocess

data = [
[['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
[['Hello', 'world', '!'], ['a', 'n', '.']],
[['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
[['Hello', 'world', '!'], ['a', 'n', '.']],
[['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
[['Hello', 'world', '!'], ['a', 'n', '.']],
[['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
[['Hello', 'world', '!'], ['a', 'n', '.']],
[['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
[['Hello', 'world', '!'], ['a', 'n', '.']],
]


class TestCase1(unittest.TestCase):
def test(self):
if os.path.exists("./save"):
for root, dirs, files in os.walk("./save", topdown=False):
for name in files:
os.remove(os.path.join(root, name))
for name in dirs:
os.rmdir(os.path.join(root, name))
result = SeqLabelPreprocess().run(train_dev_data=data, train_dev_split=0.4,
pickle_path="./save")
self.assertEqual(len(result), 2)
self.assertEqual(type(result[0]), DataSet)
self.assertEqual(type(result[1]), DataSet)

os.system("rm -rf save")
print("pickle path deleted")


class TestCase2(unittest.TestCase):
def test(self):
if os.path.exists("./save"):
for root, dirs, files in os.walk("./save", topdown=False):
for name in files:
os.remove(os.path.join(root, name))
for name in dirs:
os.rmdir(os.path.join(root, name))
result = SeqLabelPreprocess().run(test_data=data, train_dev_data=data,
pickle_path="./save", train_dev_split=0.4,
cross_val=False)
self.assertEqual(len(result), 3)
self.assertEqual(type(result[0]), DataSet)
self.assertEqual(type(result[1]), DataSet)
self.assertEqual(type(result[2]), DataSet)

os.system("rm -rf save")
print("pickle path deleted")


class TestCase3(unittest.TestCase):
def test(self):
num_folds = 2
result = SeqLabelPreprocess().run(test_data=None, train_dev_data=data,
pickle_path="./save", train_dev_split=0.4,
cross_val=True, n_fold=num_folds)
self.assertEqual(len(result), 2)
self.assertEqual(len(result[0]), num_folds)
self.assertEqual(len(result[1]), num_folds)
for data_set in result[0] + result[1]:
self.assertEqual(type(data_set), DataSet)

os.system("rm -rf save")
print("pickle path deleted")

+ 2
- 2
test/core/test_tester.py View File

@@ -1,7 +1,7 @@
import os
import unittest

from fastNLP.core.dataset import SeqLabelDataSet
from fastNLP.core.dataset import DataSet
from fastNLP.core.metrics import SeqLabelEvaluator
from fastNLP.core.field import TextField, LabelField
from fastNLP.core.instance import Instance
@@ -35,7 +35,7 @@ class TestTester(unittest.TestCase):
vocab = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, '!': 5, '@': 6, '#': 7, '$': 8, '?': 9}
label_vocab = {'a': 0, '@': 1, 'c': 2, 'd': 3, 'e': 4}

data_set = SeqLabelDataSet()
data_set = DataSet()
for example in train_data:
text, label = example[0], example[1]
x = TextField(text, False)


+ 2
- 2
test/core/test_trainer.py View File

@@ -1,7 +1,7 @@
import os
import unittest

from fastNLP.core.dataset import SeqLabelDataSet
from fastNLP.core.dataset import DataSet
from fastNLP.core.metrics import SeqLabelEvaluator
from fastNLP.core.field import TextField, LabelField
from fastNLP.core.instance import Instance
@@ -36,7 +36,7 @@ class TestTrainer(unittest.TestCase):
vocab = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, '!': 5, '@': 6, '#': 7, '$': 8, '?': 9}
label_vocab = {'a': 0, '@': 1, 'c': 2, 'd': 3, 'e': 4}

data_set = SeqLabelDataSet()
data_set = DataSet()
for example in train_data:
text, label = example[0], example[1]
x = TextField(text, False)


+ 25
- 0
test/data_for_tests/config View File

@@ -45,3 +45,28 @@ use_cuda = true
learn_rate = 1e-3
momentum = 0.9
model_name = "class_model.pkl"

[snli_trainer]
epochs = 5
batch_size = 32
validate = true
save_best_dev = true
use_cuda = true
learn_rate = 1e-4
loss = "cross_entropy"
print_every_step = 1000

[snli_tester]
batch_size = 512
use_cuda = true

[snli_model]
model_name = "snli_model.pkl"
embed_dim = 300
hidden_size = 300
batch_first = true
dropout = 0.5
gpu = true
embed_file = "./../data_for_tests/glove.840B.300d.txt"
embed_pkl = "./snli/embed.pkl"
examples = 0
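
Note: the new [snli_*] sections can be read with the same ConfigLoader / ConfigSection pattern used by the model tests later in this diff. A minimal sketch; the fixture path and section names come from the file above, and constructing ConfigSection with no arguments is assumed from its usage elsewhere in the test suite:

from fastNLP.loader.config_loader import ConfigLoader, ConfigSection

# Empty sections to be filled from the fixture file shown above.
trainer_args = ConfigSection()
model_args = ConfigSection()
ConfigLoader().load_config("./test/data_for_tests/config",
                           {"snli_trainer": trainer_args, "snli_model": model_args})

# Loaded values are read with dict-style indexing, e.g.:
batch_size = trainer_args["batch_size"]   # 32, per the [snli_trainer] section
embed_dim = model_args["embed_dim"]       # 300, per the [snli_model] section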

+ 12
- 0
test/data_for_tests/glove.6B.50d_test.txt View File

@@ -0,0 +1,12 @@
the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581
, 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709 -0.42852 -0.55641 -0.364 -0.23938 0.13001 -0.063734 -0.39575 -0.48162 0.23291 0.090201 -0.13324 0.078639 -0.41634 -0.15428 0.10068 0.48891 0.31226 -0.1252 -0.037512 -1.5179 0.12612 -0.02442 -0.042961 -0.28351 3.5416 -0.11956 -0.014533 -0.1499 0.21864 -0.33412 -0.13872 0.31806 0.70358 0.44858 -0.080262 0.63003 0.32111 -0.46765 0.22786 0.36034 -0.37818 -0.56657 0.044691 0.30392
. 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973 -0.43478 -0.31086 -0.44999 -0.29486 0.16608 0.11963 -0.41328 -0.42353 0.59868 0.28825 -0.11547 -0.041848 -0.67989 -0.25063 0.18472 0.086876 0.46582 0.015035 0.043474 -1.4671 -0.30384 -0.023441 0.30589 -0.21785 3.746 0.0042284 -0.18436 -0.46209 0.098329 -0.11907 0.23919 0.1161 0.41705 0.056763 -6.3681e-05 0.068987 0.087939 -0.10285 -0.13931 0.22314 -0.080803 -0.35652 0.016413 0.10216
of 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603 0.18157 -0.52393 0.10381 -0.17566 0.078852 -0.36216 -0.11829 -0.83336 0.11917 -0.16605 0.061555 -0.012719 -0.56623 0.013616 0.22851 -0.14396 -0.067549 -0.38157 -0.23698 -1.7037 -0.86692 -0.26704 -0.2589 0.1767 3.8676 -0.1613 -0.13273 -0.68881 0.18444 0.0052464 -0.33874 -0.078956 0.24185 0.36576 -0.34727 0.28483 0.075693 -0.062178 -0.38988 0.22902 -0.21617 -0.22562 -0.093918 -0.80375
to 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246 -0.41376 0.13228 -0.29847 -0.085253 0.17118 0.22419 -0.10046 -0.43653 0.33418 0.67846 0.057204 -0.34448 -0.42785 -0.43275 0.55963 0.10032 0.18677 -0.26854 0.037334 -2.0932 0.22171 -0.39868 0.20912 -0.55725 3.8826 0.47466 -0.95658 -0.37788 0.20869 -0.32752 0.12751 0.088359 0.16351 -0.21634 -0.094375 0.018324 0.21048 -0.03088 -0.19722 0.082279 -0.09434 -0.073297 -0.064699 -0.26044
and 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923 -0.51332 -0.47368 -0.33075 -0.13834 0.2702 0.30938 -0.45012 -0.4127 -0.09932 0.038085 0.029749 0.10076 -0.25058 -0.51818 0.34558 0.44922 0.48791 -0.080866 -0.10121 -1.3777 -0.10866 -0.23201 0.012839 -0.46508 3.8463 0.31362 0.13643 -0.52244 0.3302 0.33707 -0.35601 0.32431 0.12041 0.3512 -0.069043 0.36885 0.25168 -0.24517 0.25381 0.1367 -0.31178 -0.6321 -0.25028 -0.38097
in 0.33042 0.24995 -0.60874 0.10923 0.036372 0.151 -0.55083 -0.074239 -0.092307 -0.32821 0.09598 -0.82269 -0.36717 -0.67009 0.42909 0.016496 -0.23573 0.12864 -1.0953 0.43334 0.57067 -0.1036 0.20422 0.078308 -0.42795 -1.7984 -0.27865 0.11954 -0.12689 0.031744 3.8631 -0.17786 -0.082434 -0.62698 0.26497 -0.057185 -0.073521 0.46103 0.30862 0.12498 -0.48609 -0.0080272 0.031184 -0.36576 -0.42699 0.42164 -0.11666 -0.50703 -0.027273 -0.53285
a 0.21705 0.46515 -0.46757 0.10082 1.0135 0.74845 -0.53104 -0.26256 0.16812 0.13182 -0.24909 -0.44185 -0.21739 0.51004 0.13448 -0.43141 -0.03123 0.20674 -0.78138 -0.20148 -0.097401 0.16088 -0.61836 -0.18504 -0.12461 -2.2526 -0.22321 0.5043 0.32257 0.15313 3.9636 -0.71365 -0.67012 0.28388 0.21738 0.14433 0.25926 0.23434 0.4274 -0.44451 0.13813 0.36973 -0.64289 0.024142 -0.039315 -0.26037 0.12017 -0.043782 0.41013 0.1796
" 0.25769 0.45629 -0.76974 -0.37679 0.59272 -0.063527 0.20545 -0.57385 -0.29009 -0.13662 0.32728 1.4719 -0.73681 -0.12036 0.71354 -0.46098 0.65248 0.48887 -0.51558 0.039951 -0.34307 -0.014087 0.86488 0.3546 0.7999 -1.4995 -1.8153 0.41128 0.23921 -0.43139 3.6623 -0.79834 -0.54538 0.16943 -0.82017 -0.3461 0.69495 -1.2256 -0.17992 -0.057474 0.030498 -0.39543 -0.38515 -1.0002 0.087599 -0.31009 -0.34677 -0.31438 0.75004 0.97065
's 0.23727 0.40478 -0.20547 0.58805 0.65533 0.32867 -0.81964 -0.23236 0.27428 0.24265 0.054992 0.16296 -1.2555 -0.086437 0.44536 0.096561 -0.16519 0.058378 -0.38598 0.086977 0.0033869 0.55095 -0.77697 -0.62096 0.092948 -2.5685 -0.67739 0.10151 -0.48643 -0.057805 3.1859 -0.017554 -0.16138 0.055486 -0.25885 -0.33938 -0.19928 0.26049 0.10478 -0.55934 -0.12342 0.65961 -0.51802 -0.82995 -0.082739 0.28155 -0.423 -0.27378 -0.007901 -0.030231



+ 12
- 2
test/loader/test_dataset_loader.py View File

@@ -3,7 +3,7 @@ import unittest

from fastNLP.loader.dataset_loader import POSDataSetLoader, LMDataSetLoader, TokenizeDataSetLoader, \
PeopleDailyCorpusLoader, ConllLoader
from fastNLP.core.dataset import DataSet

class TestDatasetLoader(unittest.TestCase):
def test_case_1(self):
@@ -15,13 +15,23 @@ class TestDatasetLoader(unittest.TestCase):

def test_case_TokenizeDatasetLoader(self):
loader = TokenizeDataSetLoader()
data = loader.load("./test/data_for_tests/cws_pku_utf_8", max_seq_len=32)
filepath = "./test/data_for_tests/cws_pku_utf_8"
data = loader.load(filepath, max_seq_len=32)
assert len(data) > 0

data1 = DataSet()
data1.read_tokenize(filepath, max_seq_len=32)
assert len(data1) > 0
print("pass TokenizeDataSetLoader test!")

def test_case_POSDatasetLoader(self):
loader = POSDataSetLoader()
filepath = "./test/data_for_tests/people.txt"
data = loader.load("./test/data_for_tests/people.txt")
datas = loader.load_lines("./test/data_for_tests/people.txt")

data1 = DataSet().read_pos(filepath)
assert len(data1) > 0
print("pass POSDataSetLoader test!")

def test_case_LMDatasetLoader(self):


+ 33
- 0
test/loader/test_embed_loader.py View File

@@ -0,0 +1,33 @@
import unittest
import os

import torch

from fastNLP.loader.embed_loader import EmbedLoader
from fastNLP.core.vocabulary import Vocabulary


class TestEmbedLoader(unittest.TestCase):
glove_path = './test/data_for_tests/glove.6B.50d_test.txt'
pkl_path = './save'
raw_texts = ["i am a cat",
"this is a test of new batch",
"ha ha",
"I am a good boy .",
"This is the most beautiful girl ."
]
texts = [text.strip().split() for text in raw_texts]
vocab = Vocabulary()
vocab.update(texts)
def test1(self):
emb, _ = EmbedLoader.load_embedding(50, self.glove_path, 'glove', self.vocab, self.pkl_path)
self.assertTrue(emb.shape[0] == (len(self.vocab)))
self.assertTrue(emb.shape[1] == 50)
os.remove(self.pkl_path)
def test2(self):
try:
_ = EmbedLoader.load_embedding(100, self.glove_path, 'glove', self.vocab, self.pkl_path)
self.fail(msg="load dismatch embedding")
except ValueError:
pass
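
Note: outside the unittest harness, the same EmbedLoader call looks as follows. The argument order (embedding dim, pretrained file, embedding type, vocabulary, pickle path) and the two-element return value are taken directly from the test above; the sentence used to build the vocabulary is made up:

from fastNLP.core.vocabulary import Vocabulary
from fastNLP.loader.embed_loader import EmbedLoader

vocab = Vocabulary()
vocab.update([["i", "am", "a", "cat"]])   # same list-of-token-lists pattern as the test

# Load pretrained vectors for the vocabulary; a pickle is also written to './save',
# which the test removes again afterwards.
embedding, _ = EmbedLoader.load_embedding(
    50, './test/data_for_tests/glove.6B.50d_test.txt', 'glove', vocab, './save')

# Per the assertions above: one 50-dimensional row per vocabulary entry.
assert embedding.shape == (len(vocab), 50)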

+ 15
- 3
test/model/seq_labeling.py View File

@@ -1,9 +1,9 @@
import os
import sys

sys.path.append("..")
import argparse
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.dataset_loader import BaseLoader
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
@@ -82,6 +82,7 @@ def train_and_test():
save_pickle(data_set.word_vocab, pickle_path, "word2id.pkl")
save_pickle(data_set.label_vocab, pickle_path, "label2id.pkl")

"""
trainer = SeqLabelTrainer(
epochs=trainer_args["epochs"],
batch_size=trainer_args["batch_size"],
@@ -92,12 +93,23 @@ def train_and_test():
model_name=model_name,
optimizer=Optimizer("SGD", lr=0.01, momentum=0.9),
)
"""

# Model
model = SeqLabeling(model_args)

model.fit(train_set, dev_set,
epochs=trainer_args["epochs"],
batch_size=trainer_args["batch_size"],
validate=False,
use_cuda=trainer_args["use_cuda"],
pickle_path=pickle_path,
save_best_dev=trainer_args["save_best_dev"],
model_name=model_name,
optimizer=Optimizer("SGD", lr=0.01, momentum=0.9))

# Start training
trainer.train(model, train_set, dev_set)
# trainer.train(model, train_set, dev_set)
print("Training finished!")

# Saver
@@ -105,7 +117,7 @@ def train_and_test():
saver.save_pytorch(model)
print("Model saved!")

del model, trainer
del model

change_field_is_target(dev_set, "truth", True)



+ 25
- 0
test/model/test_char_language_model.py View File

@@ -0,0 +1,25 @@
import unittest

import numpy as np
import torch

from fastNLP.models.char_language_model import CharLM


class TestCharLM(unittest.TestCase):
def test_case_1(self):
char_emb_dim = 50
word_emb_dim = 50
vocab_size = 1000
num_char = 24
max_word_len = 21
num_seq = 64
seq_len = 32

model = CharLM(char_emb_dim, word_emb_dim, vocab_size, num_char)

x = torch.from_numpy(np.random.randint(0, num_char, size=(num_seq, seq_len, max_word_len + 2)))

self.assertEqual(tuple(x.shape), (num_seq, seq_len, max_word_len + 2))
y = model(x)
self.assertEqual(tuple(y.shape), (num_seq * seq_len, vocab_size))

+ 19
- 13
test/model/test_cws.py View File

@@ -1,13 +1,14 @@
import os

from fastNLP.core.dataset import SeqLabelDataSet, change_field_is_target
from fastNLP.core.dataset import DataSet
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.core.metrics import SeqLabelEvaluator
from fastNLP.core.predictor import SeqLabelInfer
from fastNLP.core.preprocess import save_pickle, load_pickle
from fastNLP.core.tester import SeqLabelTester
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.loader.dataset_loader import TokenizeDataSetLoader, BaseLoader
from fastNLP.loader.dataset_loader import TokenizeDataSetLoader, BaseLoader, RawDataSetLoader
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.models.sequence_modeling import SeqLabeling
from fastNLP.saver.model_saver import ModelSaver
@@ -37,9 +38,9 @@ def infer():
print("model loaded!")

# Load infer data
infer_data = SeqLabelDataSet(load_func=BaseLoader.load)
infer_data.load(data_infer_path, vocabs={"word_vocab": word2index}, infer=True)
infer_data = RawDataSetLoader().load(data_infer_path)
infer_data.index_field("word_seq", word2index)
infer_data.set_origin_len("word_seq")
# inference
infer = SeqLabelInfer(pickle_path)
results = infer.predict(model, infer_data)
@@ -52,13 +53,18 @@ def train_test():
ConfigLoader().load_config(config_path, {"POS_infer": train_args})

# define dataset
data_train = SeqLabelDataSet(load_func=TokenizeDataSetLoader.load)
data_train.load(cws_data_path)
train_args["vocab_size"] = len(data_train.word_vocab)
train_args["num_classes"] = len(data_train.label_vocab)

save_pickle(data_train.word_vocab, pickle_path, "word2id.pkl")
save_pickle(data_train.label_vocab, pickle_path, "label2id.pkl")
data_train = TokenizeDataSetLoader().load(cws_data_path)
word_vocab = Vocabulary()
label_vocab = Vocabulary()
data_train.update_vocab(word_seq=word_vocab, label_seq=label_vocab)
data_train.index_field("word_seq", word_vocab).index_field("label_seq", label_vocab)
data_train.set_origin_len("word_seq")
data_train.rename_field("label_seq", "truth").set_target(truth=False)
train_args["vocab_size"] = len(word_vocab)
train_args["num_classes"] = len(label_vocab)

save_pickle(word_vocab, pickle_path, "word2id.pkl")
save_pickle(label_vocab, pickle_path, "label2id.pkl")

# Trainer
trainer = SeqLabelTrainer(**train_args.data)
@@ -90,7 +96,7 @@ def train_test():
tester = SeqLabelTester(**test_args.data)

# Start testing
change_field_is_target(data_train, "truth", True)
data_train.set_target(truth=True)
tester.test(model, data_train)
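
Note: the hunk above is the clearest picture of the new preprocessing chain that replaces SeqLabelDataSet. Pulled out of the test, the sequence of calls is as below; every call mirrors the diff, and the corpus path is the fixture already used in test_dataset_loader.py:

from fastNLP.core.vocabulary import Vocabulary
from fastNLP.loader.dataset_loader import TokenizeDataSetLoader

cws_data_path = "./test/data_for_tests/cws_pku_utf_8"

# 1. Load raw examples into a DataSet with "word_seq" / "label_seq" fields.
data_train = TokenizeDataSetLoader().load(cws_data_path)

# 2. Build vocabularies from the fields, then index the text fields with them.
word_vocab, label_vocab = Vocabulary(), Vocabulary()
data_train.update_vocab(word_seq=word_vocab, label_seq=label_vocab)
data_train.index_field("word_seq", word_vocab).index_field("label_seq", label_vocab)

# 3. Record sequence lengths and mark the gold field; set_target(truth=True) is
#    called later to switch the "truth" field on for evaluation, as above.
data_train.set_origin_len("word_seq")
data_train.rename_field("label_seq", "truth").set_target(truth=False)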




+ 14
- 8
test/model/test_seq_label.py View File

@@ -1,6 +1,7 @@
import os

from fastNLP.core.dataset import SeqLabelDataSet, change_field_is_target
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.loader.dataset_loader import TokenizeDataSetLoader
from fastNLP.core.metrics import SeqLabelEvaluator
from fastNLP.core.optimizer import Optimizer
from fastNLP.core.preprocess import save_pickle
@@ -25,14 +26,19 @@ def test_training():
ConfigLoader().load_config(config_dir, {
"test_seq_label_trainer": trainer_args, "test_seq_label_model": model_args})

data_set = SeqLabelDataSet()
data_set.load(data_path)
data_set = TokenizeDataSetLoader().load(data_path)
word_vocab = Vocabulary()
label_vocab = Vocabulary()
data_set.update_vocab(word_seq=word_vocab, label_seq=label_vocab)
data_set.index_field("word_seq", word_vocab).index_field("label_seq", label_vocab)
data_set.set_origin_len("word_seq")
data_set.rename_field("label_seq", "truth").set_target(truth=False)
data_train, data_dev = data_set.split(0.3, shuffle=True)
model_args["vocab_size"] = len(data_set.word_vocab)
model_args["num_classes"] = len(data_set.label_vocab)
model_args["vocab_size"] = len(word_vocab)
model_args["num_classes"] = len(label_vocab)

save_pickle(data_set.word_vocab, pickle_path, "word2id.pkl")
save_pickle(data_set.label_vocab, pickle_path, "label2id.pkl")
save_pickle(word_vocab, pickle_path, "word2id.pkl")
save_pickle(label_vocab, pickle_path, "label2id.pkl")

trainer = SeqLabelTrainer(
epochs=trainer_args["epochs"],
@@ -76,5 +82,5 @@ def test_training():
)

# Start testing with validation data
change_field_is_target(data_dev, "truth", True)
data_dev.set_target(truth=True)
tester.test(model, data_dev)

+ 28
- 0
test/modules/test_char_embedding.py View File

@@ -0,0 +1,28 @@
import unittest

import torch

from fastNLP.modules.encoder.char_embedding import ConvCharEmbedding, LSTMCharEmbedding


class TestCharEmbed(unittest.TestCase):
def test_case_1(self):
batch_size = 128
char_emb = 100
word_length = 1
x = torch.Tensor(batch_size, char_emb, word_length)
x = x.transpose(1, 2)

cce = ConvCharEmbedding(char_emb)
y = cce(x)
self.assertEqual(tuple(x.shape), (batch_size, word_length, char_emb))
print("CNN Char Emb input: ", x.shape)
self.assertEqual(tuple(y.shape), (batch_size, char_emb, 1))
print("CNN Char Emb output: ", y.shape) # [128, 100]

lce = LSTMCharEmbedding(char_emb)
o = lce(x)
self.assertEqual(tuple(x.shape), (batch_size, word_length, char_emb))
print("LSTM Char Emb input: ", x.shape)
self.assertEqual(tuple(o.shape), (batch_size, char_emb, 1))
print("LSTM Char Emb size: ", o.shape)

+ 14
- 17
test/modules/test_variational_rnn.py View File

@@ -1,28 +1,25 @@
import unittest

import numpy as np
import torch
import unittest

from fastNLP.modules.encoder.variational_rnn import VarMaskedFastLSTM
from fastNLP.modules.encoder.variational_rnn import VarLSTM


class TestMaskedRnn(unittest.TestCase):
def test_case_1(self):
masked_rnn = VarMaskedFastLSTM(input_size=1, hidden_size=1, bidirectional=True, batch_first=True)
masked_rnn = VarLSTM(input_size=1, hidden_size=1, bidirectional=True, batch_first=True)
x = torch.tensor([[[1.0], [2.0]]])
print(x.size())
y = masked_rnn(x)
mask = torch.tensor([[[1], [1]]])
y = masked_rnn(x, mask=mask)
mask = torch.tensor([[[1], [0]]])
y = masked_rnn(x, mask=mask)


def test_case_2(self):
masked_rnn = VarMaskedFastLSTM(input_size=1, hidden_size=1, bidirectional=False, batch_first=True)
x = torch.tensor([[[1.0], [2.0]]])
print(x.size())
y = masked_rnn(x)
mask = torch.tensor([[[1], [1]]])
y = masked_rnn(x, mask=mask)
xx = torch.tensor([[[1.0]]])
#y, hidden = masked_rnn.step(xx)
#step() still has a bug
#y, hidden = masked_rnn.step(xx, mask=mask)
input_size = 12
batch = 16
hidden = 10
masked_rnn = VarLSTM(input_size=input_size, hidden_size=hidden, bidirectional=False, batch_first=True)

xx = torch.randn((batch, 32, input_size))
y, _ = masked_rnn(xx)
self.assertEqual(tuple(y.shape), (batch, 32, hidden))
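
Note: for completeness, a bidirectional variant of the check above. This is a sketch only; the (batch, seq_len, 2 * hidden) output shape assumes VarLSTM follows the usual torch.nn.LSTM convention for bidirectional outputs:

import torch
from fastNLP.modules.encoder.variational_rnn import VarLSTM

batch, seq_len, input_size, hidden = 16, 32, 12, 10
bi_rnn = VarLSTM(input_size=input_size, hidden_size=hidden,
                 bidirectional=True, batch_first=True)
x = torch.randn((batch, seq_len, input_size))
y, _ = bi_rnn(x)
print(y.shape)   # expected torch.Size([16, 32, 20]) under the assumption above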
