diff --git a/README.md b/README.md index 8169520a..be73c356 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,8 @@ fastNLP is a modular Natural Language Processing system based on PyTorch, for fast development of NLP tools. It divides the NLP model based on deep learning into different modules. These modules fall into 4 categories: encoder, interaction, aggregation and decoder, while each category contains different implemented modules. Encoder modules encode the input into some abstract representation, interaction modules make the information in the representation interact with each other, aggregation modules aggregate and reduce information, and decoder modules decode the representation into the output. Most current NLP models could be built on these modules, which vastly simplifies the process of developing NLP models. The architecture of fastNLP is as the figure below: -![](https://github.com/fastnlp/fastNLP/raw/master/fastnlp-architecture.jpg) - +![](https://github.com/fastnlp/fastNLP/raw/master/docs/source/figures/procedures.PNG) +![](https://github.com/fastnlp/fastNLP/raw/master/docs/source/figures/text_classification.png) ## Requirements @@ -62,4 +62,4 @@ pip install fastNLP fastNLP.fastnlp a high-level interface for prediction - \ No newline at end of file + diff --git a/docs/source/fastNLP.core.rst b/docs/source/fastNLP.core.rst index 86793e5d..5c941e55 100644 --- a/docs/source/fastNLP.core.rst +++ b/docs/source/fastNLP.core.rst @@ -1,12 +1,6 @@ fastNLP.core ============= -fastNLP.core.action --------------------- - -.. automodule:: fastNLP.core.action - :members: - fastNLP.core.batch ------------------- @@ -61,6 +55,12 @@ fastNLP.core.preprocess .. automodule:: fastNLP.core.preprocess :members: +fastNLP.core.sampler +--------------------- + +.. automodule:: fastNLP.core.sampler + :members: + fastNLP.core.tester -------------------- @@ -73,6 +73,12 @@ fastNLP.core.trainer .. automodule:: fastNLP.core.trainer :members: +fastNLP.core.vocabulary +------------------------ + +.. automodule:: fastNLP.core.vocabulary + :members: + .. automodule:: fastNLP.core :members: diff --git a/docs/source/fastNLP.modules.aggregation.rst b/docs/source/fastNLP.modules.aggregation.rst deleted file mode 100644 index bfaf8646..00000000 --- a/docs/source/fastNLP.modules.aggregation.rst +++ /dev/null @@ -1,36 +0,0 @@ -fastNLP.modules.aggregation -============================ - -fastNLP.modules.aggregation.attention --------------------------------------- - -.. automodule:: fastNLP.modules.aggregation.attention - :members: - -fastNLP.modules.aggregation.avg\_pool --------------------------------------- - -.. automodule:: fastNLP.modules.aggregation.avg_pool - :members: - -fastNLP.modules.aggregation.kmax\_pool ---------------------------------------- - -.. automodule:: fastNLP.modules.aggregation.kmax_pool - :members: - -fastNLP.modules.aggregation.max\_pool --------------------------------------- - -.. automodule:: fastNLP.modules.aggregation.max_pool - :members: - -fastNLP.modules.aggregation.self\_attention --------------------------------------------- - -.. automodule:: fastNLP.modules.aggregation.self_attention - :members: - - -.. 
automodule:: fastNLP.modules.aggregation - :members: diff --git a/docs/source/fastNLP.modules.aggregator.rst b/docs/source/fastNLP.modules.aggregator.rst new file mode 100644 index 00000000..073da4a5 --- /dev/null +++ b/docs/source/fastNLP.modules.aggregator.rst @@ -0,0 +1,36 @@ +fastNLP.modules.aggregator +=========================== + +fastNLP.modules.aggregator.attention +------------------------------------- + +.. automodule:: fastNLP.modules.aggregator.attention + :members: + +fastNLP.modules.aggregator.avg\_pool +------------------------------------- + +.. automodule:: fastNLP.modules.aggregator.avg_pool + :members: + +fastNLP.modules.aggregator.kmax\_pool +-------------------------------------- + +.. automodule:: fastNLP.modules.aggregator.kmax_pool + :members: + +fastNLP.modules.aggregator.max\_pool +------------------------------------- + +.. automodule:: fastNLP.modules.aggregator.max_pool + :members: + +fastNLP.modules.aggregator.self\_attention +------------------------------------------- + +.. automodule:: fastNLP.modules.aggregator.self_attention + :members: + + +.. automodule:: fastNLP.modules.aggregator + :members: diff --git a/docs/source/fastNLP.modules.interaction.rst b/docs/source/fastNLP.modules.interaction.rst deleted file mode 100644 index 91a34268..00000000 --- a/docs/source/fastNLP.modules.interaction.rst +++ /dev/null @@ -1,5 +0,0 @@ -fastNLP.modules.interaction -============================ - -.. automodule:: fastNLP.modules.interaction - :members: diff --git a/docs/source/fastNLP.modules.interactor.rst b/docs/source/fastNLP.modules.interactor.rst new file mode 100644 index 00000000..5eb3bdef --- /dev/null +++ b/docs/source/fastNLP.modules.interactor.rst @@ -0,0 +1,5 @@ +fastNLP.modules.interactor +=========================== + +.. automodule:: fastNLP.modules.interactor + :members: diff --git a/docs/source/fastNLP.modules.rst b/docs/source/fastNLP.modules.rst index 6ccdc21a..eda85aa7 100644 --- a/docs/source/fastNLP.modules.rst +++ b/docs/source/fastNLP.modules.rst @@ -3,10 +3,10 @@ fastNLP.modules .. toctree:: - fastNLP.modules.aggregation + fastNLP.modules.aggregator fastNLP.modules.decoder fastNLP.modules.encoder - fastNLP.modules.interaction + fastNLP.modules.interactor fastNLP.modules.other\_modules ------------------------------- diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 13370969..c73e3fef 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -6,91 +6,45 @@ from copy import deepcopy from fastNLP.core.field import TextField, LabelField from fastNLP.core.instance import Instance from fastNLP.core.vocabulary import Vocabulary -from fastNLP.loader.dataset_loader import POSDataSetLoader, ClassDataSetLoader - - -def create_dataset_from_lists(str_lists: list, word_vocab: dict, has_target: bool = False, label_vocab: dict = None): - if has_target is True: - if label_vocab is None: - raise RuntimeError("Must provide label vocabulary to transform labels.") - return create_labeled_dataset_from_lists(str_lists, word_vocab, label_vocab) - else: - return create_unlabeled_dataset_from_lists(str_lists, word_vocab) - - -def create_labeled_dataset_from_lists(str_lists, word_vocab, label_vocab): - """Create an DataSet instance that contains labels. - - :param str_lists: list of list of strings, [num_examples, 2, *]. - :: - [ - [[word_11, word_12, ...], [label_11, label_12, ...]], - ... - ] - - :param word_vocab: dict of (str: int), which means (word: index). - :param label_vocab: dict of (str: int), which means (word: index). 
- :return data_set: a DataSet instance. - - """ - data_set = DataSet() - for example in str_lists: - word_seq, label_seq = example[0], example[1] - x = TextField(word_seq, is_target=False) - y = TextField(label_seq, is_target=True) - data_set.append(Instance(word_seq=x, label_seq=y)) - data_set.index_field("word_seq", word_vocab) - data_set.index_field("label_seq", label_vocab) - return data_set - - -def create_unlabeled_dataset_from_lists(str_lists, word_vocab): - """Create an DataSet instance that contains no labels. - - :param str_lists: list of list of strings, [num_examples, *]. - :: - [ - [word_11, word_12, ...], - ... - ] - - :param word_vocab: dict of (str: int), which means (word: index). - :return data_set: a DataSet instance. - - """ - data_set = DataSet() - for word_seq in str_lists: - x = TextField(word_seq, is_target=False) - data_set.append(Instance(word_seq=x)) - data_set.index_field("word_seq", word_vocab) - return data_set +_READERS = {} class DataSet(list): """A DataSet object is a list of Instance objects. """ - def __init__(self, name="", instances=None, load_func=None): + def __init__(self, name="", instances=None): """ :param name: str, the name of the dataset. (default: "") :param instances: list of Instance objects. (default: None) - :param load_func: a function that takes the dataset path (string) as input and returns multi-level lists. """ list.__init__([]) self.name = name + self.origin_len = None if instances is not None: self.extend(instances) - self.data_set_load_func = load_func def index_all(self, vocab): for ins in self: ins.index_all(vocab) + return self def index_field(self, field_name, vocab): - for ins in self: - ins.index_field(field_name, vocab) + if isinstance(field_name, str): + field_list = [field_name] + vocab_list = [vocab] + else: + classes = (list, tuple) + assert isinstance(field_name, classes) and isinstance(vocab, classes) and len(field_name) == len(vocab) + field_list = field_name + vocab_list = vocab + + for name, vocabs in zip(field_list, vocab_list): + for ins in self: + ins.index_field(name, vocabs) + return self def to_tensor(self, idx: int, padding_length: dict): """Convert an instance in a dataset to tensor. @@ -102,7 +56,7 @@ class DataSet(list): """ ins = self[idx] - return ins.to_tensor(padding_length) + return ins.to_tensor(padding_length, self.origin_len) def get_length(self): """Fetch lengths of all fields in all instances in a dataset. @@ -117,42 +71,9 @@ class DataSet(list): lengths[field_name].append(field_length) return lengths - def convert(self, data): - """Convert lists of strings into Instances with Fields, creating Vocabulary for labeled data. Used in Training.""" - raise NotImplementedError - - def convert_with_vocabs(self, data, vocabs): - """Convert lists of strings into Instances with Fields, using existing Vocabulary, with labels. Used in Testing.""" - raise NotImplementedError - - def convert_for_infer(self, data, vocabs): - """Convert lists of strings into Instances with Fields, using existing Vocabulary, without labels. Used in predicting.""" - - def load(self, data_path, vocabs=None, infer=False): - """Load data from the given files. - - :param data_path: str, the path to the data - :param infer: bool. If True, there is no label information in the data. Default: False. - :param vocabs: dict of (name: Vocabulary object), used to index data. If not provided, a new vocabulary will be constructed. 
- - """ - raw_data = self.data_set_load_func(data_path) - if infer is True: - self.convert_for_infer(raw_data, vocabs) - else: - if vocabs is not None: - self.convert_with_vocabs(raw_data, vocabs) - else: - self.convert(raw_data) - - def load_raw(self, raw_data, vocabs): - """Load raw data without loader. Used in FastNLP class. - - :param raw_data: - :param vocabs: - :return: - """ - self.convert_for_infer(raw_data, vocabs) + def shuffle(self): + random.shuffle(self) + return self def split(self, ratio, shuffle=True): """Train/dev splitting @@ -165,7 +86,7 @@ class DataSet(list): """ assert 0 < ratio < 1 if shuffle: - random.shuffle(self) + self.shuffle() split_idx = int(len(self) * ratio) dev_set = deepcopy(self) train_set = deepcopy(self) @@ -173,134 +94,67 @@ class DataSet(list): del dev_set[split_idx:] return train_set, dev_set - -class SeqLabelDataSet(DataSet): - def __init__(self, instances=None, load_func=POSDataSetLoader().load): - super(SeqLabelDataSet, self).__init__(name="", instances=instances, load_func=load_func) - self.word_vocab = Vocabulary() - self.label_vocab = Vocabulary() - - def convert(self, data): - """Convert lists of strings into Instances with Fields. - - :param data: 3-level lists. Entries are strings. + def rename_field(self, old_name, new_name): + """rename a field """ - bar = ProgressBar(total=len(data)) - for example in data: - word_seq, label_seq = example[0], example[1] - # list, list - self.word_vocab.update(word_seq) - self.label_vocab.update(label_seq) - x = TextField(word_seq, is_target=False) - x_len = LabelField(len(word_seq), is_target=False) - y = TextField(label_seq, is_target=False) - instance = Instance() - instance.add_field("word_seq", x) - instance.add_field("truth", y) - instance.add_field("word_seq_origin_len", x_len) - self.append(instance) - bar.move() - self.index_field("word_seq", self.word_vocab) - self.index_field("truth", self.label_vocab) - # no need to index "word_seq_origin_len" - - def convert_with_vocabs(self, data, vocabs): - for example in data: - word_seq, label_seq = example[0], example[1] - # list, list - x = TextField(word_seq, is_target=False) - x_len = LabelField(len(word_seq), is_target=False) - y = TextField(label_seq, is_target=False) - instance = Instance() - instance.add_field("word_seq", x) - instance.add_field("truth", y) - instance.add_field("word_seq_origin_len", x_len) - self.append(instance) - self.index_field("word_seq", vocabs["word_vocab"]) - self.index_field("truth", vocabs["label_vocab"]) - # no need to index "word_seq_origin_len" - - def convert_for_infer(self, data, vocabs): - for word_seq in data: - # list - x = TextField(word_seq, is_target=False) - x_len = LabelField(len(word_seq), is_target=False) - instance = Instance() - instance.add_field("word_seq", x) - instance.add_field("word_seq_origin_len", x_len) - self.append(instance) - self.index_field("word_seq", vocabs["word_vocab"]) - # no need to index "word_seq_origin_len" - - -class TextClassifyDataSet(DataSet): - def __init__(self, instances=None, load_func=ClassDataSetLoader().load): - super(TextClassifyDataSet, self).__init__(name="", instances=instances, load_func=load_func) - self.word_vocab = Vocabulary() - self.label_vocab = Vocabulary(need_default=False) - - def convert(self, data): - for example in data: - word_seq, label = example[0], example[1] - # list, str - self.word_vocab.update(word_seq) - self.label_vocab.update(label) - x = TextField(word_seq, is_target=False) - y = LabelField(label, is_target=True) - instance = Instance() - 
instance.add_field("word_seq", x) - instance.add_field("label", y) - self.append(instance) - self.index_field("word_seq", self.word_vocab) - self.index_field("label", self.label_vocab) - - def convert_with_vocabs(self, data, vocabs): - for example in data: - word_seq, label = example[0], example[1] - # list, str - x = TextField(word_seq, is_target=False) - y = LabelField(label, is_target=True) - instance = Instance() - instance.add_field("word_seq", x) - instance.add_field("label", y) - self.append(instance) - self.index_field("word_seq", vocabs["word_vocab"]) - self.index_field("label", vocabs["label_vocab"]) + for ins in self: + ins.rename_field(old_name, new_name) + return self - def convert_for_infer(self, data, vocabs): - for word_seq in data: - # list - x = TextField(word_seq, is_target=False) - instance = Instance() - instance.add_field("word_seq", x) - self.append(instance) - self.index_field("word_seq", vocabs["word_vocab"]) + def set_target(self, **fields): + """Change the flag of `is_target` for all instance. For fields not set here, leave their `is_target` unchanged. + :param key-value pairs for field-name and `is_target` value(True, False or None). + """ + for ins in self: + ins.set_target(**fields) + return self -def change_field_is_target(data_set, field_name, new_target): - """Change the flag of is_target in a field. + def update_vocab(self, **name_vocab): + """using certain field data to update vocabulary. - :param data_set: a DataSet object - :param field_name: str, the name of the field - :param new_target: one of (True, False, None), representing this field is batch_x / is batch_y / neither. + e.g. :: - """ - for inst in data_set: - inst.fields[field_name].is_target = new_target + # update word vocab and label vocab seperately + dataset.update_vocab(word_seq=word_vocab, label_seq=label_vocab) + """ + for field_name, vocab in name_vocab.items(): + for ins in self: + vocab.update(ins[field_name].contents()) + return self + def set_origin_len(self, origin_field, origin_len_name=None): + """make dataset tensor output contain origin_len field. -class ProgressBar: + e.g. 
:: - def __init__(self, count=0, total=0, width=100): - self.count = count - self.total = total - self.width = width + # output "word_seq_origin_len", lengths based on "word_seq" field + dataset.set_origin_len("word_seq") + """ + if origin_field is None: + self.origin_len = None + else: + self.origin_len = (origin_field + "_origin_len", origin_field) \ + if origin_len_name is None else (origin_len_name, origin_field) + return self + + def __getattribute__(self, name): + if name in _READERS: + # add read_*data() support + def _read(*args, **kwargs): + data = _READERS[name]().load(*args, **kwargs) + self.extend(data) + return self + return _read + else: + return object.__getattribute__(self, name) - def move(self): - self.count += 1 - progress = self.width * self.count // self.total - sys.stdout.write('{0:3}/{1:3}: '.format(self.count, self.total)) - sys.stdout.write('#' * progress + '-' * (self.width - progress) + '\r') - if progress == self.width: - sys.stdout.write('\n') - sys.stdout.flush() + @classmethod + def set_reader(cls, method_name): + """decorator to add dataloader support + """ + assert isinstance(method_name, str) + def wrapper(read_cls): + _READERS[method_name] = read_cls + return read_cls + return wrapper diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index b57b9bb6..64aafdd3 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -18,6 +18,8 @@ class Field(object): def to_tensor(self, padding_length): raise NotImplementedError + def contents(self): + raise NotImplementedError class TextField(Field): def __init__(self, text, is_target): @@ -57,6 +59,8 @@ class TextField(Field): pads = [0] * (padding_length - self.get_length()) return torch.LongTensor(self._index + pads) + def contents(self): + return self.text.copy() class LabelField(Field): """The Field representing a single label. Can be a string or integer. @@ -92,6 +96,40 @@ class LabelField(Field): else: return torch.LongTensor([self._index]) + def contents(self): + return [self.label] + +class SeqLabelField(Field): + def __init__(self, label_seq, is_target=True): + super(SeqLabelField, self).__init__(is_target) + self.label_seq = label_seq + self._index = None + + def get_length(self): + return len(self.label_seq) + + def index(self, vocab): + if self._index is None: + self._index = [vocab[c] for c in self.label_seq] + return self._index + + def to_tensor(self, padding_length): + pads = [0] * (padding_length - self.get_length()) + if self._index is None: + if self.get_length() == 0: + return torch.LongTensor(pads) + elif isinstance(self.label_seq[0], int): + return torch.LongTensor(self.label_seq + pads) + elif isinstance(self.label_seq[0], str): + raise RuntimeError("Field {} not indexed. Call index method.".format(self.label)) + else: + raise RuntimeError( + "Not support type for SeqLabelField. Expect str or int, got {}.".format(type(self.label))) + else: + return torch.LongTensor(self._index + pads) + + def contents(self): + return self.label_seq.copy() if __name__ == "__main__": tf = TextField("test the code".split(), is_target=False) diff --git a/fastNLP/core/instance.py b/fastNLP/core/instance.py index ebf01912..a4eca1aa 100644 --- a/fastNLP/core/instance.py +++ b/fastNLP/core/instance.py @@ -1,3 +1,5 @@ +import torch + class Instance(object): """An instance which consists of Fields is an example in the DataSet. 
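# Illustrative usage sketch for the reworked DataSet API above (the data path and
# variable names are hypothetical, not part of the patch). Readers registered with
# @DataSet.set_reader('read_*') become methods on DataSet through __getattribute__,
# and the mutating methods now return self so calls can be chained.
from fastNLP.core.dataset import DataSet
from fastNLP.core.vocabulary import Vocabulary
import fastNLP.loader.dataset_loader  # importing registers the read_* readers

word_vocab, label_vocab = Vocabulary(), Vocabulary()
ds = DataSet().read_pos("train.pos")  # hypothetical POS-tagging data file
ds.update_vocab(word_seq=word_vocab, label_seq=label_vocab) \
  .index_field(["word_seq", "label_seq"], [word_vocab, label_vocab]) \
  .set_origin_len("word_seq")  # tensors will also carry "word_seq_origin_len"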
@@ -10,6 +12,28 @@ class Instance(object): def add_field(self, field_name, field): self.fields[field_name] = field + return self + + def rename_field(self, old_name, new_name): + if old_name in self.fields: + self.fields[new_name] = self.fields.pop(old_name) + if old_name in self.indexes: + self.indexes[new_name] = self.indexes.pop(old_name) + else: + raise KeyError("error, no such field: {}".format(old_name)) + return self + + def set_target(self, **fields): + for name, val in fields.items(): + if name in self.fields: + self.fields[name].is_target = val + return self + + def __getitem__(self, name): + if name in self.fields: + return self.fields[name] + else: + raise KeyError("{} not found".format(name)) def get_length(self): """Fetch the length of all fields in the instance. @@ -24,6 +48,7 @@ class Instance(object): """use `vocab` to index certain field """ self.indexes[field_name] = self.fields[field_name].index(vocab) + return self def index_all(self, vocab): """use `vocab` to index all fields @@ -35,7 +60,7 @@ class Instance(object): self.indexes = indexes return indexes - def to_tensor(self, padding_length: dict): + def to_tensor(self, padding_length: dict, origin_len=None): """Convert instance to tensor. :param padding_length: dict of (str: int), which means (field name: padding_length of this field) @@ -53,4 +78,7 @@ class Instance(object): else: # is_target is None continue + if origin_len is not None: + name, field_name = origin_len + tensor_x[name] = torch.LongTensor([self.fields[field_name].get_length()]) return tensor_x, tensor_y diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 6eedd214..d4bf475a 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -57,6 +57,20 @@ class SeqLabelEvaluator(Evaluator): return {"accuracy": float(accuracy)} +class SNLIEvaluator(Evaluator): + def __init__(self): + super(SNLIEvaluator, self).__init__() + + def __call__(self, predict, truth): + y_prob = [torch.nn.functional.softmax(y_logit, dim=-1) for y_logit in predict] + y_prob = torch.cat(y_prob, dim=0) + y_pred = torch.argmax(y_prob, dim=-1) + truth = [t['truth'] for t in truth] + y_true = torch.cat(truth, dim=0).view(-1) + acc = float(torch.sum(y_pred == y_true)) / y_true.size(0) + return {"accuracy": acc} + + def _conver_numpy(x): """convert input data to numpy array diff --git a/fastNLP/core/predictor.py b/fastNLP/core/predictor.py index 14c4e8c1..c5d22df4 100644 --- a/fastNLP/core/predictor.py +++ b/fastNLP/core/predictor.py @@ -2,9 +2,9 @@ import numpy as np import torch from fastNLP.core.batch import Batch -from fastNLP.core.dataset import create_dataset_from_lists from fastNLP.core.preprocess import load_pickle from fastNLP.core.sampler import SequentialSampler +from fastNLP.loader.dataset_loader import convert_seq2seq_dataset, convert_seq2tag_dataset, convert_seq_dataset class Predictor(object): @@ -79,7 +79,8 @@ class Predictor(object): :return data_set: a DataSet instance. 
""" assert isinstance(data, list) - return create_dataset_from_lists(data, self.word_vocab, has_target=False) + data = convert_seq_dataset(data) + data.index_field("word_seq", self.word_vocab) class SeqLabelInfer(Predictor): diff --git a/fastNLP/core/preprocess.py b/fastNLP/core/preprocess.py index 5e77649e..12a7a987 100644 --- a/fastNLP/core/preprocess.py +++ b/fastNLP/core/preprocess.py @@ -1,13 +1,6 @@ import _pickle import os -import numpy as np - -from fastNLP.core.dataset import DataSet -from fastNLP.core.field import TextField, LabelField -from fastNLP.core.instance import Instance -from fastNLP.core.vocabulary import Vocabulary - # the first vocab in dict with the index = 5 @@ -53,258 +46,3 @@ def pickle_exist(pickle_path, pickle_name): return True else: return False - - -class Preprocessor(object): - """Preprocessors are responsible for converting data of strings into data of indices. - During the pre-processing, the following pickle files will be built: - - - "word2id.pkl", a Vocabulary object, mapping words to indices. - - "class2id.pkl", a Vocabulary object, mapping labels to indices. - - "data_train.pkl", a DataSet object for training - - "data_dev.pkl", a DataSet object for validation, if train_dev_split > 0. - - "data_test.pkl", a DataSet object for testing, if test_data is not None. - - These four pickle files are expected to be saved in the given pickle directory once they are constructed. - Preprocessors will check if those files are already in the directory and will reuse them in future calls. - """ - - def __init__(self, label_is_seq=False, share_vocab=False, add_char_field=False): - """ - - :param label_is_seq: bool, whether label is a sequence. If True, label vocabulary will preserve - several special tokens for sequence processing. - :param share_vocab: bool, whether word sequence and label sequence share the same vocabulary. Typically, this - is only available when label_is_seq is True. Default: False. - :param add_char_field: bool, whether to add character representations to all TextFields. Default: False. - """ - print("Preprocessor is about to deprecate. Please use DataSet class.") - self.data_vocab = Vocabulary() - if label_is_seq is True: - if share_vocab is True: - self.label_vocab = self.data_vocab - else: - self.label_vocab = Vocabulary() - else: - self.label_vocab = Vocabulary(need_default=False) - - self.character_vocab = Vocabulary(need_default=False) - self.add_char_field = add_char_field - - @property - def vocab_size(self): - return len(self.data_vocab) - - @property - def num_classes(self): - return len(self.label_vocab) - - @property - def char_vocab_size(self): - if self.character_vocab is None: - self.build_char_dict() - return len(self.character_vocab) - - def run(self, train_dev_data, test_data=None, pickle_path="./", train_dev_split=0, cross_val=False, n_fold=10): - """Main pre-processing pipeline. - - :param train_dev_data: three-level list, with either single label or multiple labels in a sample. - :param test_data: three-level list, with either single label or multiple labels in a sample. (optional) - :param pickle_path: str, the path to save the pickle files. - :param train_dev_split: float, between [0, 1]. The ratio of training data used as validation set. - :param cross_val: bool, whether to do cross validation. - :param n_fold: int, the number of folds of cross validation. Only useful when cross_val is True. - :return results: multiple datasets after pre-processing. If test_data is provided, return one more dataset. 
- If train_dev_split > 0, return one more dataset - the dev set. If cross_val is True, each dataset - is a list of DataSet objects; Otherwise, each dataset is a DataSet object. - """ - if pickle_exist(pickle_path, "word2id.pkl") and pickle_exist(pickle_path, "class2id.pkl"): - self.data_vocab = load_pickle(pickle_path, "word2id.pkl") - self.label_vocab = load_pickle(pickle_path, "class2id.pkl") - else: - self.data_vocab, self.label_vocab = self.build_dict(train_dev_data) - save_pickle(self.data_vocab, pickle_path, "word2id.pkl") - save_pickle(self.label_vocab, pickle_path, "class2id.pkl") - - self.build_reverse_dict() - - train_set = [] - dev_set = [] - if not cross_val: - if not pickle_exist(pickle_path, "data_train.pkl"): - if train_dev_split > 0 and not pickle_exist(pickle_path, "data_dev.pkl"): - split = int(len(train_dev_data) * train_dev_split) - data_dev = train_dev_data[: split] - data_train = train_dev_data[split:] - train_set = self.convert_to_dataset(data_train, self.data_vocab, self.label_vocab) - dev_set = self.convert_to_dataset(data_dev, self.data_vocab, self.label_vocab) - - save_pickle(dev_set, pickle_path, "data_dev.pkl") - print("{} of the training data is split for validation. ".format(train_dev_split)) - else: - train_set = self.convert_to_dataset(train_dev_data, self.data_vocab, self.label_vocab) - save_pickle(train_set, pickle_path, "data_train.pkl") - else: - train_set = load_pickle(pickle_path, "data_train.pkl") - if pickle_exist(pickle_path, "data_dev.pkl"): - dev_set = load_pickle(pickle_path, "data_dev.pkl") - else: - # cross_val is True - if not pickle_exist(pickle_path, "data_train_0.pkl"): - # cross validation - data_cv = self.cv_split(train_dev_data, n_fold) - for i, (data_train_cv, data_dev_cv) in enumerate(data_cv): - data_train_cv = self.convert_to_dataset(data_train_cv, self.data_vocab, self.label_vocab) - data_dev_cv = self.convert_to_dataset(data_dev_cv, self.data_vocab, self.label_vocab) - save_pickle( - data_train_cv, pickle_path, - "data_train_{}.pkl".format(i)) - save_pickle( - data_dev_cv, pickle_path, - "data_dev_{}.pkl".format(i)) - train_set.append(data_train_cv) - dev_set.append(data_dev_cv) - print("{}-fold cross validation.".format(n_fold)) - else: - for i in range(n_fold): - data_train_cv = load_pickle(pickle_path, "data_train_{}.pkl".format(i)) - data_dev_cv = load_pickle(pickle_path, "data_dev_{}.pkl".format(i)) - train_set.append(data_train_cv) - dev_set.append(data_dev_cv) - - # prepare test data if provided - test_set = [] - if test_data is not None: - if not pickle_exist(pickle_path, "data_test.pkl"): - test_set = self.convert_to_dataset(test_data, self.data_vocab, self.label_vocab) - save_pickle(test_set, pickle_path, "data_test.pkl") - - # return preprocessed results - results = [train_set] - if cross_val or train_dev_split > 0: - results.append(dev_set) - if test_data: - results.append(test_set) - if len(results) == 1: - return results[0] - else: - return tuple(results) - - def build_dict(self, data): - for example in data: - word, label = example - self.data_vocab.update(word) - self.label_vocab.update(label) - return self.data_vocab, self.label_vocab - - def build_char_dict(self): - char_collection = set() - for word in self.data_vocab.word2idx: - if len(word) == 0: - continue - for ch in word: - if ch not in char_collection: - char_collection.add(ch) - self.character_vocab.update(list(char_collection)) - - def build_reverse_dict(self): - self.data_vocab.build_reverse_vocab() - self.label_vocab.build_reverse_vocab() - - def 
data_split(self, data, train_dev_split): - """Split data into train and dev set.""" - split = int(len(data) * train_dev_split) - data_dev = data[: split] - data_train = data[split:] - return data_train, data_dev - - def cv_split(self, data, n_fold): - """Split data for cross validation. - - :param data: list of string - :param n_fold: int - :return data_cv: - - :: - [ - (data_train, data_dev), # 1st fold - (data_train, data_dev), # 2nd fold - ... - ] - - """ - data_copy = data.copy() - np.random.shuffle(data_copy) - fold_size = round(len(data_copy) / n_fold) - data_cv = [] - for i in range(n_fold - 1): - start = i * fold_size - end = (i + 1) * fold_size - data_dev = data_copy[start:end] - data_train = data_copy[:start] + data_copy[end:] - data_cv.append((data_train, data_dev)) - start = (n_fold - 1) * fold_size - data_dev = data_copy[start:] - data_train = data_copy[:start] - data_cv.append((data_train, data_dev)) - return data_cv - - def convert_to_dataset(self, data, vocab, label_vocab): - """Convert list of indices into a DataSet object. - - :param data: list. Entries are strings. - :param vocab: a dict, mapping string (token) to index (int). - :param label_vocab: a dict, mapping string (label) to index (int). - :return data_set: a DataSet object - """ - use_word_seq = False - use_label_seq = False - use_label_str = False - - # construct a DataSet object and fill it with Instances - data_set = DataSet() - for example in data: - words, label = example[0], example[1] - instance = Instance() - - if isinstance(words, list): - x = TextField(words, is_target=False) - instance.add_field("word_seq", x) - use_word_seq = True - else: - raise NotImplementedError("words is a {}".format(type(words))) - - if isinstance(label, list): - y = TextField(label, is_target=True) - instance.add_field("label_seq", y) - use_label_seq = True - elif isinstance(label, str): - y = LabelField(label, is_target=True) - instance.add_field("label", y) - use_label_str = True - else: - raise NotImplementedError("label is a {}".format(type(label))) - data_set.append(instance) - - # convert strings to indices - if use_word_seq: - data_set.index_field("word_seq", vocab) - if use_label_seq: - data_set.index_field("label_seq", label_vocab) - if use_label_str: - data_set.index_field("label", label_vocab) - - return data_set - - -class SeqLabelPreprocess(Preprocessor): - def __init__(self): - print("[FastNLP warning] SeqLabelPreprocess is about to deprecate. Please use Preprocess directly.") - super(SeqLabelPreprocess, self).__init__() - - -class ClassPreprocess(Preprocessor): - def __init__(self): - print("[FastNLP warning] ClassPreprocess is about to deprecate. Please use Preprocess directly.") - super(ClassPreprocess, self).__init__() - diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index 0e74145b..24aac951 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -83,6 +83,7 @@ class Tester(object): truth_list.append(batch_y) eval_results = self.evaluate(output_list, truth_list) print("[tester] {}".format(self.print_eval_results(eval_results))) + logger.info("[tester] {}".format(self.print_eval_results(eval_results))) def mode(self, model, is_test=False): """Train mode or Test mode. This is for PyTorch currently. @@ -131,3 +132,10 @@ class ClassificationTester(Tester): print( "[FastNLP Warning] ClassificationTester will be deprecated. 
Please use Tester directly.") super(ClassificationTester, self).__init__(**test_args) + + +class SNLITester(Tester): + def __init__(self, **test_args): + print( + "[FastNLP Warning] SNLITester will be deprecated. Please use Tester directly.") + super(SNLITester, self).__init__(**test_args) diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 957a4757..a180b10d 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -10,7 +10,7 @@ from fastNLP.core.loss import Loss from fastNLP.core.metrics import Evaluator from fastNLP.core.optimizer import Optimizer from fastNLP.core.sampler import RandomSampler -from fastNLP.core.tester import SeqLabelTester, ClassificationTester +from fastNLP.core.tester import SeqLabelTester, ClassificationTester, SNLITester from fastNLP.saver.logger import create_logger from fastNLP.saver.model_saver import ModelSaver @@ -162,7 +162,7 @@ class Trainer(object): if kwargs["n_print"] > 0 and step % kwargs["n_print"] == 0: end = time.time() diff = timedelta(seconds=round(end - kwargs["start"])) - print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.2} time: {}".format( + print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.6} time: {}".format( kwargs["epoch"], step, loss.data, diff) print(print_output) logger.info(print_output) @@ -292,3 +292,15 @@ class ClassificationTrainer(Trainer): def _create_validator(self, valid_args): return ClassificationTester(**valid_args) + + +class SNLITrainer(Trainer): + """Trainer for text SNLI.""" + + def __init__(self, **train_args): + print( + "[FastNLP Warning] SNLITrainer will be deprecated. Please use Trainer directly.") + super(SNLITrainer, self).__init__(**train_args) + + def _create_validator(self, valid_args): + return SNLITester(**valid_args) diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index 08c00644..26d2e837 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -19,6 +19,17 @@ def isiterable(p_object): return True +def check_build_vocab(func): + def _wrapper(self, *args, **kwargs): + if self.word2idx is None: + self.build_vocab() + self.build_reverse_vocab() + elif self.idx2word is None: + self.build_reverse_vocab() + return func(self, *args, **kwargs) + return _wrapper + + class Vocabulary(object): """Use for word and index one to one mapping @@ -30,13 +41,41 @@ class Vocabulary(object): vocab["word"] vocab.to_word(5) """ - - def __init__(self, need_default=True): + def __init__(self, need_default=True, max_size=None, min_freq=None): """ :param bool need_default: set if the Vocabulary has default labels reserved for sequences. Default: True. + :param int max_size: set the max number of words in Vocabulary. Default: None + :param int min_freq: set the min occur frequency of words in Vocabulary. 
Default: None + """ + self.max_size = max_size + self.min_freq = min_freq + self.word_count = {} + self.has_default = need_default + self.word2idx = None + self.idx2word = None + + def update(self, word): + """add word or list of words into Vocabulary + :param word: a list of string or a single string """ - if need_default: + if not isinstance(word, str) and isiterable(word): + # it's a nested list + for w in word: + self.update(w) + else: + # it's a word to be added + if word not in self.word_count: + self.word_count[word] = 1 + else: + self.word_count[word] += 1 + self.word2idx = None + return self + + def build_vocab(self): + """build 'word to index' dict, and filter the word using `max_size` and `min_freq` + """ + if self.has_default: self.word2idx = deepcopy(DEFAULT_WORD_TO_INDEX) self.padding_label = DEFAULT_PADDING_LABEL self.unknown_label = DEFAULT_UNKNOWN_LABEL @@ -45,28 +84,28 @@ class Vocabulary(object): self.padding_label = None self.unknown_label = None - self.has_default = need_default - self.idx2word = None + words = sorted(self.word_count.items(), key=lambda kv: kv[1], reverse=True) + if self.min_freq is not None: + words = list(filter(lambda kv: kv[1] >= self.min_freq, words)) + if self.max_size is not None and len(words) > self.max_size: + words = words[:self.max_size] + for w, _ in words: + self.word2idx[w] = len(self.word2idx) + def build_reverse_vocab(self): + """build 'index to word' dict based on 'word to index' dict + """ + self.idx2word = {self.word2idx[w] : w for w in self.word2idx} + + @check_build_vocab def __len__(self): return len(self.word2idx) - def update(self, word): - """add word or list of words into Vocabulary - - :param word: a list of string or a single string - """ - if not isinstance(word, str) and isiterable(word): - # it's a nested list - for w in word: - self.update(w) - else: - # it's a word to be added - if word not in self.word2idx: - self.word2idx[word] = len(self) - if self.idx2word is not None: - self.idx2word = None + @check_build_vocab + def has_word(self, w): + return w in self.word2idx + @check_build_vocab def __getitem__(self, w): """To support usage like:: @@ -74,32 +113,35 @@ class Vocabulary(object): """ if w in self.word2idx: return self.word2idx[w] - else: + elif self.has_default: return self.word2idx[DEFAULT_UNKNOWN_LABEL] + else: + raise ValueError("word {} not in vocabulary".format(w)) + @check_build_vocab def to_index(self, w): """ like to_index(w) function, turn a word to the index if w is not in Vocabulary, return the unknown label - + :param str w: """ return self[w] + @property + @check_build_vocab def unknown_idx(self): if self.unknown_label is None: return None return self.word2idx[self.unknown_label] + @property + @check_build_vocab def padding_idx(self): if self.padding_label is None: return None return self.word2idx[self.padding_label] - def build_reverse_vocab(self): - """build 'index to word' dict based on 'word to index' dict - """ - self.idx2word = {self.word2idx[w]: w for w in self.word2idx} - + @check_build_vocab def to_word(self, idx): """given a word's index, return the word itself @@ -122,3 +164,11 @@ class Vocabulary(object): """ self.__dict__.update(state) self.idx2word = None + + def __contains__(self, item): + """Check if a word in vocabulary. 
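# Illustrative sketch of the lazily-built Vocabulary above (the example words are
# hypothetical). update() only counts words; the first lookup triggers build_vocab()
# via the check_build_vocab decorator, which applies the max_size / min_freq filters.
from fastNLP.core.vocabulary import Vocabulary

vocab = Vocabulary(max_size=10000, min_freq=2)
vocab.update(["the", "quick", "brown", "fox", "the"])
idx = vocab["the"]         # builds word2idx and idx2word on first access
print("the" in vocab)      # True
print(vocab["quick"])      # below min_freq, falls back to the unknown index
print(vocab.to_word(idx))  # "the"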
+ + :param item: the word + :return: True or False + """ + return self.has_word(item) diff --git a/fastNLP/fastnlp.py b/fastNLP/fastnlp.py index 0bd56d18..92229d0d 100644 --- a/fastNLP/fastnlp.py +++ b/fastNLP/fastnlp.py @@ -1,6 +1,7 @@ import os -from fastNLP.core.dataset import SeqLabelDataSet, TextClassifyDataSet +from fastNLP.core.dataset import DataSet +from fastNLP.loader.dataset_loader import convert_seq_dataset from fastNLP.core.predictor import SeqLabelInfer, ClassificationInfer from fastNLP.core.preprocess import load_pickle from fastNLP.loader.config_loader import ConfigLoader, ConfigSection @@ -178,13 +179,11 @@ class FastNLP(object): :param infer_input: 2-D lists of strings :return data_set: a DataSet object """ - if self.infer_type == "seq_label": - data_set = SeqLabelDataSet() - data_set.load_raw(infer_input, {"word_vocab": self.word_vocab}) - return data_set - elif self.infer_type == "text_class": - data_set = TextClassifyDataSet() - data_set.load_raw(infer_input, {"word_vocab": self.word_vocab}) + if self.infer_type in ["seq_label", "text_class"]: + data_set = convert_seq_dataset(infer_input) + data_set.index_field("word_seq", self.word_vocab) + if self.infer_type == "seq_label": + data_set.set_origin_len("word_seq") return data_set else: raise RuntimeError("fail to make outputs with infer type {}".format(self.infer_type)) diff --git a/fastNLP/loader/config_loader.py b/fastNLP/loader/config_loader.py index 9818d411..cf3ac1a9 100644 --- a/fastNLP/loader/config_loader.py +++ b/fastNLP/loader/config_loader.py @@ -8,9 +8,10 @@ from fastNLP.loader.base_loader import BaseLoader class ConfigLoader(BaseLoader): """loader for configuration files""" - def __int__(self, data_path): + def __init__(self, data_path=None): super(ConfigLoader, self).__init__() - self.config = self.parse(super(ConfigLoader, self).load(data_path)) + if data_path is not None: + self.config = self.parse(super(ConfigLoader, self).load(data_path)) @staticmethod def parse(string): diff --git a/fastNLP/loader/dataset_loader.py b/fastNLP/loader/dataset_loader.py index a6a0fb77..91be0215 100644 --- a/fastNLP/loader/dataset_loader.py +++ b/fastNLP/loader/dataset_loader.py @@ -1,6 +1,74 @@ import os from fastNLP.loader.base_loader import BaseLoader +from fastNLP.core.dataset import DataSet +from fastNLP.core.instance import Instance +from fastNLP.core.field import * + + +def convert_seq_dataset(data): + """Create an DataSet instance that contains no labels. + + :param data: list of list of strings, [num_examples, *]. + :: + [ + [word_11, word_12, ...], + ... + ] + + :return: a DataSet. + """ + dataset = DataSet() + for word_seq in data: + x = TextField(word_seq, is_target=False) + dataset.append(Instance(word_seq=x)) + return dataset + + +def convert_seq2tag_dataset(data): + """Convert list of data into DataSet + + :param data: list of list of strings, [num_examples, *]. + :: + [ + [ [word_11, word_12, ...], label_1 ], + [ [word_21, word_22, ...], label_2 ], + ... + ] + + :return: a DataSet. + """ + dataset = DataSet() + for sample in data: + word_seq, label = sample[0], sample[1] + ins = Instance() + ins.add_field("word_seq", TextField(word_seq, is_target=False)) \ + .add_field("label", LabelField(label, is_target=True)) + dataset.append(ins) + return dataset + + +def convert_seq2seq_dataset(data): + """Convert list of data into DataSet + + :param data: list of list of strings, [num_examples, *]. + :: + [ + [ [word_11, word_12, ...], [label_1, label_1, ...] ], + [ [word_21, word_22, ...], [label_2, label_1, ...] 
], + ... + ] + + :return: a DataSet. + """ + dataset = DataSet() + for sample in data: + word_seq, label_seq = sample[0], sample[1] + ins = Instance() + ins.add_field("word_seq", TextField(word_seq, is_target=False)) \ + .add_field("label_seq", TextField(label_seq, is_target=True)) + dataset.append(ins) + return dataset class DataSetLoader(BaseLoader): @@ -10,9 +78,33 @@ class DataSetLoader(BaseLoader): super(DataSetLoader, self).__init__() def load(self, path): + """ load data in `path` into a dataset + """ raise NotImplementedError + def convert(self, data): + """convert list of data into dataset + """ + raise NotImplementedError + + +@DataSet.set_reader('read_raw') +class RawDataSetLoader(DataSetLoader): + def __init__(self): + super(RawDataSetLoader, self).__init__() + + def load(self, data_path, split=None): + with open(data_path, "r", encoding="utf-8") as f: + lines = f.readlines() + lines = lines if split is None else [l.split(split) for l in lines] + lines = list(filter(lambda x: len(x) > 0, lines)) + return self.convert(lines) + + def convert(self, data): + return convert_seq_dataset(data) + +@DataSet.set_reader('read_pos') class POSDataSetLoader(DataSetLoader): """Dataset Loader for POS Tag datasets. @@ -48,7 +140,8 @@ class POSDataSetLoader(DataSetLoader): """ with open(data_path, "r", encoding="utf-8") as f: lines = f.readlines() - return self.parse(lines) + data = self.parse(lines) + return self.convert(data) @staticmethod def parse(lines): @@ -75,7 +168,13 @@ class POSDataSetLoader(DataSetLoader): data.append([words, labels]) return data + def convert(self, data): + """Convert lists of strings into Instances with Fields. + """ + return convert_seq2seq_dataset(data) + +@DataSet.set_reader('read_tokenize') class TokenizeDataSetLoader(DataSetLoader): """ Data set loader for tokenization data sets @@ -84,8 +183,7 @@ class TokenizeDataSetLoader(DataSetLoader): def __init__(self): super(TokenizeDataSetLoader, self).__init__() - @staticmethod - def load(data_path, max_seq_len=32): + def load(self, data_path, max_seq_len=32): """ load pku dataset for Chinese word segmentation CWS (Chinese Word Segmentation) pku training dataset format: @@ -130,9 +228,13 @@ class TokenizeDataSetLoader(DataSetLoader): seq_words = words[start:end] seq_labels = labels[start:end] data.append([seq_words, seq_labels]) - return data + return self.convert(data) + + def convert(self, data): + return convert_seq2seq_dataset(data) +@DataSet.set_reader('read_class') class ClassDataSetLoader(DataSetLoader): """Loader for classification data sets""" @@ -143,7 +245,8 @@ class ClassDataSetLoader(DataSetLoader): assert os.path.exists(data_path) with open(data_path, "r", encoding="utf-8") as f: lines = f.readlines() - return self.parse(lines) + data = self.parse(lines) + return self.convert(data) @staticmethod def parse(lines): @@ -166,16 +269,19 @@ class ClassDataSetLoader(DataSetLoader): dataset.append(sentence) return dataset + def convert(self, data): + return convert_seq2tag_dataset(data) + +@DataSet.set_reader('read_conll') class ConllLoader(DataSetLoader): """loader for conll format files""" - def __int__(self, data_path): + def __init__(self): """ :param str data_path: the path to the conll data set """ super(ConllLoader, self).__init__() - self.data_set = self.parse(self.load(data_path)) def load(self, data_path): """ @@ -183,7 +289,8 @@ class ConllLoader(DataSetLoader): """ with open(data_path, "r", encoding="utf-8") as f: lines = f.readlines() - return lines + data = self.parse(lines) + return 
self.convert(data) @staticmethod def parse(lines): @@ -204,7 +311,11 @@ class ConllLoader(DataSetLoader): tokens.append(line.split()) return sentences + def convert(self, data): + pass + +@DataSet.set_reader('read_lm') class LMDataSetLoader(DataSetLoader): """Language Model Dataset Loader @@ -222,7 +333,8 @@ class LMDataSetLoader(DataSetLoader): with open(data_path, "r", encoding="utf=8") as f: text = " ".join(f.readlines()) tokens = text.strip().split() - return self.sentence_cut(tokens) + data = self.sentence_cut(tokens) + return self.convert(data) def sentence_cut(self, tokens, sentence_length=15): start_idx = 0 @@ -236,7 +348,11 @@ class LMDataSetLoader(DataSetLoader): data_set.append([x, y]) return data_set + def convert(self, data): + pass + +@DataSet.set_reader('read_people_daily') class PeopleDailyCorpusLoader(DataSetLoader): """ People Daily Corpus: Chinese word segmentation, POS tag, NER @@ -286,3 +402,74 @@ class PeopleDailyCorpusLoader(DataSetLoader): ner_examples.append([sent_words, sent_ner]) return pos_tag_examples, ner_examples + def convert(self, data): + pass + + +class SNLIDataSetLoader(DataSetLoader): + """A data set loader for SNLI data set. + + """ + + def __init__(self): + super(SNLIDataSetLoader, self).__init__() + + def load(self, path_list): + """ + + :param path_list: A list of file name, in the order of premise file, hypothesis file, and label file. + :return: data_set: A DataSet object. + """ + assert len(path_list) == 3 + line_set = [] + for file in path_list: + if not os.path.exists(file): + raise FileNotFoundError("file {} NOT found".format(file)) + + with open(file, 'r', encoding='utf-8') as f: + lines = f.readlines() + line_set.append(lines) + + premise_lines, hypothesis_lines, label_lines = line_set + assert len(premise_lines) == len(hypothesis_lines) and len(premise_lines) == len(label_lines) + + data_set = [] + for premise, hypothesis, label in zip(premise_lines, hypothesis_lines, label_lines): + p = premise.strip().split() + h = hypothesis.strip().split() + l = label.strip() + data_set.append([p, h, l]) + + return self.convert(data_set) + + def convert(self, data): + """Convert a 3D list to a DataSet object. + + :param data: A 3D tensor. + [ + [ [premise_word_11, premise_word_12, ...], [hypothesis_word_11, hypothesis_word_12, ...], [label_1] ], + [ [premise_word_21, premise_word_22, ...], [hypothesis_word_21, hypothesis_word_22, ...], [label_2] ], + ... + ] + :return: data_set: A DataSet object. 
+ """ + + data_set = DataSet() + + for example in data: + p, h, l = example + # list, list, str + x1 = TextField(p, is_target=False) + x2 = TextField(h, is_target=False) + x1_len = TextField([1] * len(p), is_target=False) + x2_len = TextField([1] * len(h), is_target=False) + y = LabelField(l, is_target=True) + instance = Instance() + instance.add_field("premise", x1) + instance.add_field("hypothesis", x2) + instance.add_field("premise_len", x1_len) + instance.add_field("hypothesis_len", x2_len) + instance.add_field("truth", y) + data_set.append(instance) + + return data_set diff --git a/fastNLP/loader/embed_loader.py b/fastNLP/loader/embed_loader.py index a84f6335..2f61830f 100644 --- a/fastNLP/loader/embed_loader.py +++ b/fastNLP/loader/embed_loader.py @@ -1,50 +1,85 @@ import _pickle import os -import numpy as np +import torch from fastNLP.loader.base_loader import BaseLoader +from fastNLP.core.vocabulary import Vocabulary class EmbedLoader(BaseLoader): """docstring for EmbedLoader""" - def __init__(self, data_path): - super(EmbedLoader, self).__init__(data_path) + def __init__(self): + super(EmbedLoader, self).__init__() @staticmethod - def load_embedding(emb_dim, emb_file, word_dict, emb_pkl): + def _load_glove(emb_file): + """Read file as a glove embedding + + file format: + embeddings are split by line, + for one embedding, word and numbers split by space + Example:: + + word_1 float_1 float_2 ... float_emb_dim + word_2 float_1 float_2 ... float_emb_dim + ... + """ + emb = {} + with open(emb_file, 'r', encoding='utf-8') as f: + for line in f: + line = list(filter(lambda w: len(w)>0, line.strip().split(' '))) + if len(line) > 0: + emb[line[0]] = torch.Tensor(list(map(float, line[1:]))) + return emb + + @staticmethod + def _load_pretrain(emb_file, emb_type): + """Read txt data from embedding file and convert to np.array as pre-trained embedding + + :param emb_file: str, the pre-trained embedding file path + :param emb_type: str, the pre-trained embedding data format + :return dict: {str: np.array} + """ + if emb_type == 'glove': + return EmbedLoader._load_glove(emb_file) + else: + raise Exception("embedding type {} not support yet".format(emb_type)) + + @staticmethod + def load_embedding(emb_dim, emb_file, emb_type, vocab, emb_pkl): """Load the pre-trained embedding and combine with the given dictionary. - :param emb_file: str, the pre-trained embedding. - The embedding file should have the following format: - Each line is a word embedding, where a word string is followed by multiple floats. - Floats are separated by space. The word and the first float are separated by space. - :param word_dict: dict, a mapping from word to index. :param emb_dim: int, the dimension of the embedding. Should be the same as pre-trained embedding. + :param emb_file: str, the pre-trained embedding file path. + :param emb_type: str, the pre-trained embedding format, support glove now + :param vocab: Vocabulary, a mapping from word to index, can be provided by user or built from pre-trained embedding :param emb_pkl: str, the embedding pickle file. - :return embedding_np: numpy array of shape (len(word_dict), emb_dim) - + :return embedding_tensor: Tensor of shape (len(word_dict), emb_dim) + vocab: input vocab or vocab built by pre-train TODO: fragile code """ # If the embedding pickle exists, load it and return. 
if os.path.exists(emb_pkl): with open(emb_pkl, "rb") as f: - embedding_np = _pickle.load(f) - return embedding_np + embedding_tensor, vocab = _pickle.load(f) + return embedding_tensor, vocab # Otherwise, load the pre-trained embedding. - with open(emb_file, "r", encoding="utf-8") as f: - # begin with a random embedding - embedding_np = np.random.uniform(-1, 1, size=(len(word_dict), emb_dim)) - for line in f: - line = line.strip().split() - if len(line) != emb_dim + 1: - # skip this line if two embedding dimension not match - continue - if line[0] in word_dict: - # find the word and replace its embedding with a pre-trained one - embedding_np[word_dict[line[0]]] = [float(i) for i in line[1:]] + pretrain = EmbedLoader._load_pretrain(emb_file, emb_type) + if vocab is None: + # build vocabulary from pre-trained embedding + vocab = Vocabulary() + for w in pretrain.keys(): + vocab.update(w) + embedding_tensor = torch.randn(len(vocab), emb_dim) + for w, v in pretrain.items(): + if len(v.shape) > 1 or emb_dim != v.shape[0]: + raise ValueError('pretrian embedding dim is {}, dismatching required {}'.format(v.shape, (emb_dim,))) + if vocab.has_word(w): + embedding_tensor[vocab[w]] = v + # save and return the result with open(emb_pkl, "wb") as f: - _pickle.dump(embedding_np, f) - return embedding_np + _pickle.dump((embedding_tensor, vocab), f) + return embedding_tensor, vocab diff --git a/fastNLP/models/base_model.py b/fastNLP/models/base_model.py index 0fcc14e1..c73bdfd9 100644 --- a/fastNLP/models/base_model.py +++ b/fastNLP/models/base_model.py @@ -1,5 +1,7 @@ import torch +from fastNLP.core.trainer import Trainer + class BaseModel(torch.nn.Module): """Base PyTorch model for all models. @@ -8,68 +10,6 @@ class BaseModel(torch.nn.Module): def __init__(self): super(BaseModel, self).__init__() - -class Vocabulary(object): - """A look-up table that allows you to access `Lexeme` objects. The `Vocab` - instance also provides access to the `StringStore`, and owns underlying - data that is shared between `Doc` objects. - """ - - def __init__(self): - """Create the vocabulary. - RETURNS (Vocab): The newly constructed object. - """ - self.data_frame = None - - -class Document(object): - """A sequence of Token objects. Access sentences and named entities, export - annotations to numpy arrays, losslessly serialize to compressed binary - strings. The `Doc` object holds an array of `Token` objects. The - Python-level `Token` and `Span` objects are views of this array, i.e. - they don't own the data themselves. -- spacy - """ - - def __init__(self, vocab, words=None, spaces=None): - """Create a Doc object. - vocab (Vocab): A vocabulary object, which must match any models you - want to use (e.g. tokenizer, parser, entity recognizer). - words (list or None): A list of unicode strings, to add to the document - as words. If `None`, defaults to empty list. - spaces (list or None): A list of boolean values, of the same length as - words. True means that the word is followed by a space, False means - it is not. If `None`, defaults to `[True]*len(words)` - user_data (dict or None): Optional extra data to attach to the Doc. - RETURNS (Doc): The newly constructed object. 
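# Illustrative call of the reworked EmbedLoader.load_embedding above; the GloVe file
# name and pickle path are assumptions, not part of the patch. With vocab=None a
# Vocabulary is built from the pre-trained file, and the (embedding_tensor, vocab)
# pair is cached in emb_pkl for later calls.
from fastNLP.loader.embed_loader import EmbedLoader

embedding_tensor, vocab = EmbedLoader.load_embedding(
    emb_dim=50,
    emb_file="glove.6B.50d.txt",   # assumed local GloVe file
    emb_type="glove",
    vocab=None,
    emb_pkl="./embedding.pkl")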
- """ - self.vocab = vocab - self.spaces = spaces - self.words = words - if spaces is None: - self.spaces = [True] * len(self.words) - elif len(spaces) != len(self.words): - raise ValueError("dismatch spaces and words") - - def get_chunker(self, vocab): - return None - - def push_back(self, vocab): - pass - - -class Token(object): - """An individual token – i.e. a word, punctuation symbol, whitespace, - etc. - """ - - def __init__(self, vocab, doc, offset): - """Construct a `Token` object. - vocab (Vocabulary): A storage container for lexical types. - doc (Document): The parent document. - offset (int): The index of the token within the document. - """ - self.vocab = vocab - self.doc = doc - self.token = doc[offset] - self.i = offset - + def fit(self, train_data, dev_data=None, **train_args): + trainer = Trainer(**train_args) + trainer.train(self, train_data, dev_data) diff --git a/fastNLP/models/biaffine_parser.py b/fastNLP/models/biaffine_parser.py new file mode 100644 index 00000000..a2a00a29 --- /dev/null +++ b/fastNLP/models/biaffine_parser.py @@ -0,0 +1,364 @@ +import sys, os +sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) +import copy +import numpy as np +import torch +from collections import defaultdict +from torch import nn +from torch.nn import functional as F +from fastNLP.modules.utils import initial_parameter +from fastNLP.modules.encoder.variational_rnn import VarLSTM +from fastNLP.modules.dropout import TimestepDropout + +def mst(scores): + """ + with some modification to support parser output for MST decoding + https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/models/nn.py#L692 + """ + length = scores.shape[0] + min_score = -np.inf + mask = np.zeros((length, length)) + np.fill_diagonal(mask, -np.inf) + scores = scores + mask + heads = np.argmax(scores, axis=1) + heads[0] = 0 + tokens = np.arange(1, length) + roots = np.where(heads[tokens] == 0)[0] + 1 + if len(roots) < 1: + root_scores = scores[tokens, 0] + head_scores = scores[tokens, heads[tokens]] + new_root = tokens[np.argmax(root_scores / head_scores)] + heads[new_root] = 0 + elif len(roots) > 1: + root_scores = scores[roots, 0] + scores[roots, 0] = 0 + new_heads = np.argmax(scores[roots][:, tokens], axis=1) + 1 + new_root = roots[np.argmin( + scores[roots, new_heads] / root_scores)] + heads[roots] = new_heads + heads[new_root] = 0 + + edges = defaultdict(set) + vertices = set((0,)) + for dep, head in enumerate(heads[tokens]): + vertices.add(dep + 1) + edges[head].add(dep + 1) + for cycle in _find_cycle(vertices, edges): + dependents = set() + to_visit = set(cycle) + while len(to_visit) > 0: + node = to_visit.pop() + if node not in dependents: + dependents.add(node) + to_visit.update(edges[node]) + cycle = np.array(list(cycle)) + old_heads = heads[cycle] + old_scores = scores[cycle, old_heads] + non_heads = np.array(list(dependents)) + scores[np.repeat(cycle, len(non_heads)), + np.repeat([non_heads], len(cycle), axis=0).flatten()] = min_score + new_heads = np.argmax(scores[cycle][:, tokens], axis=1) + 1 + new_scores = scores[cycle, new_heads] / old_scores + change = np.argmax(new_scores) + changed_cycle = cycle[change] + old_head = old_heads[change] + new_head = new_heads[change] + heads[changed_cycle] = new_head + edges[new_head].add(changed_cycle) + edges[old_head].remove(changed_cycle) + + return heads + + +def _find_cycle(vertices, edges): + """ + https://en.wikipedia.org/wiki/Tarjan%27s_strongly_connected_components_algorithm + 
https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/etc/tarjan.py + """ + _index = 0 + _stack = [] + _indices = {} + _lowlinks = {} + _onstack = defaultdict(lambda: False) + _SCCs = [] + + def _strongconnect(v): + nonlocal _index + _indices[v] = _index + _lowlinks[v] = _index + _index += 1 + _stack.append(v) + _onstack[v] = True + + for w in edges[v]: + if w not in _indices: + _strongconnect(w) + _lowlinks[v] = min(_lowlinks[v], _lowlinks[w]) + elif _onstack[w]: + _lowlinks[v] = min(_lowlinks[v], _indices[w]) + + if _lowlinks[v] == _indices[v]: + SCC = set() + while True: + w = _stack.pop() + _onstack[w] = False + SCC.add(w) + if not(w != v): + break + _SCCs.append(SCC) + + for v in vertices: + if v not in _indices: + _strongconnect(v) + + return [SCC for SCC in _SCCs if len(SCC) > 1] + + +class GraphParser(nn.Module): + """Graph based Parser helper class, support greedy decoding and MST(Maximum Spanning Tree) decoding + """ + def __init__(self): + super(GraphParser, self).__init__() + + def forward(self, x): + raise NotImplementedError + + def _greedy_decoder(self, arc_matrix, seq_mask=None): + _, seq_len, _ = arc_matrix.shape + matrix = arc_matrix + torch.diag(arc_matrix.new(seq_len).fill_(-np.inf)) + _, heads = torch.max(matrix, dim=2) + if seq_mask is not None: + heads *= seq_mask.long() + return heads + + def _mst_decoder(self, arc_matrix, seq_mask=None): + batch_size, seq_len, _ = arc_matrix.shape + matrix = torch.zeros_like(arc_matrix).copy_(arc_matrix) + ans = matrix.new_zeros(batch_size, seq_len).long() + for i, graph in enumerate(matrix): + ans[i] = torch.as_tensor(mst(graph.cpu().numpy()), device=ans.device) + if seq_mask is not None: + ans *= seq_mask.long() + return ans + + +class ArcBiaffine(nn.Module): + """helper module for Biaffine Dependency Parser predicting arc + """ + def __init__(self, hidden_size, bias=True): + super(ArcBiaffine, self).__init__() + self.U = nn.Parameter(torch.Tensor(hidden_size, hidden_size), requires_grad=True) + self.has_bias = bias + if self.has_bias: + self.bias = nn.Parameter(torch.Tensor(hidden_size), requires_grad=True) + else: + self.register_parameter("bias", None) + initial_parameter(self) + + def forward(self, head, dep): + """ + :param head arc-head tensor = [batch, length, emb_dim] + :param dep arc-dependent tensor = [batch, length, emb_dim] + + :return output tensor = [bacth, length, length] + """ + output = dep.matmul(self.U) + output = output.bmm(head.transpose(-1, -2)) + if self.has_bias: + output += head.matmul(self.bias).unsqueeze(1) + return output + + +class LabelBilinear(nn.Module): + """helper module for Biaffine Dependency Parser predicting label + """ + def __init__(self, in1_features, in2_features, num_label, bias=True): + super(LabelBilinear, self).__init__() + self.bilinear = nn.Bilinear(in1_features, in2_features, num_label, bias=bias) + self.lin1 = nn.Linear(in1_features, num_label, bias=False) + self.lin2 = nn.Linear(in2_features, num_label, bias=False) + + def forward(self, x1, x2): + output = self.bilinear(x1, x2) + output += self.lin1(x1) + self.lin2(x2) + return output + + +class BiaffineParser(GraphParser): + """Biaffine Dependency Parser implemantation. + refer to ` Deep Biaffine Attention for Neural Dependency Parsing (Dozat and Manning, 2016) + `_ . 
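To make the shape bookkeeping of `ArcBiaffine` above concrete: the arc score it produces is `dep_i · U · head_j + b · head_j`, a `[batch, length, length]` matrix whose entry `(i, j)` scores token `j` as the head of token `i`. A toy re-computation with random tensors (the sizes are illustrative only):

```python
import torch

N, L, H = 2, 5, 8                      # batch, sentence length, MLP size (toy values)
U = torch.randn(H, H)
b = torch.randn(H)
head = torch.randn(N, L, H)            # output of the arc-head MLP
dep = torch.randn(N, L, H)             # output of the arc-dep MLP

# same computation as ArcBiaffine.forward
scores = dep.matmul(U).bmm(head.transpose(-1, -2)) + head.matmul(b).unsqueeze(1)
print(scores.shape)                    # torch.Size([2, 5, 5])
```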
+ """ + def __init__(self, + word_vocab_size, + word_emb_dim, + pos_vocab_size, + pos_emb_dim, + rnn_layers, + rnn_hidden_size, + arc_mlp_size, + label_mlp_size, + num_label, + dropout, + use_var_lstm=False, + use_greedy_infer=False): + + super(BiaffineParser, self).__init__() + self.word_embedding = nn.Embedding(num_embeddings=word_vocab_size, embedding_dim=word_emb_dim) + self.pos_embedding = nn.Embedding(num_embeddings=pos_vocab_size, embedding_dim=pos_emb_dim) + if use_var_lstm: + self.lstm = VarLSTM(input_size=word_emb_dim + pos_emb_dim, + hidden_size=rnn_hidden_size, + num_layers=rnn_layers, + bias=True, + batch_first=True, + input_dropout=dropout, + hidden_dropout=dropout, + bidirectional=True) + else: + self.lstm = nn.LSTM(input_size=word_emb_dim + pos_emb_dim, + hidden_size=rnn_hidden_size, + num_layers=rnn_layers, + bias=True, + batch_first=True, + dropout=dropout, + bidirectional=True) + + rnn_out_size = 2 * rnn_hidden_size + self.arc_head_mlp = nn.Sequential(nn.Linear(rnn_out_size, arc_mlp_size), + nn.ELU()) + self.arc_dep_mlp = copy.deepcopy(self.arc_head_mlp) + self.label_head_mlp = nn.Sequential(nn.Linear(rnn_out_size, label_mlp_size), + nn.ELU()) + self.label_dep_mlp = copy.deepcopy(self.label_head_mlp) + self.arc_predictor = ArcBiaffine(arc_mlp_size, bias=True) + self.label_predictor = LabelBilinear(label_mlp_size, label_mlp_size, num_label, bias=True) + self.normal_dropout = nn.Dropout(p=dropout) + self.timestep_dropout = TimestepDropout(p=dropout) + self.use_greedy_infer = use_greedy_infer + initial_parameter(self) + + def forward(self, word_seq, pos_seq, seq_mask, gold_heads=None, **_): + """ + :param word_seq: [batch_size, seq_len] sequence of word's indices + :param pos_seq: [batch_size, seq_len] sequence of word's indices + :param seq_mask: [batch_size, seq_len] sequence of length masks + :param gold_heads: [batch_size, seq_len] sequence of golden heads + :return dict: parsing results + arc_pred: [batch_size, seq_len, seq_len] + label_pred: [batch_size, seq_len, seq_len] + seq_mask: [batch_size, seq_len] + head_pred: [batch_size, seq_len] if gold_heads is not provided, predicting the heads + """ + # prepare embeddings + batch_size, seq_len = word_seq.shape + # print('forward {} {}'.format(batch_size, seq_len)) + batch_range = torch.arange(start=0, end=batch_size, dtype=torch.long, device=word_seq.device).unsqueeze(1) + + # get sequence mask + seq_mask = seq_mask.long() + + word = self.normal_dropout(self.word_embedding(word_seq)) # [N,L] -> [N,L,C_0] + pos = self.normal_dropout(self.pos_embedding(pos_seq)) # [N,L] -> [N,L,C_1] + x = torch.cat([word, pos], dim=2) # -> [N,L,C] + + # lstm, extract features + feat, _ = self.lstm(x) # -> [N,L,C] + + # for arc biaffine + # mlp, reduce dim + arc_dep = self.timestep_dropout(self.arc_dep_mlp(feat)) + arc_head = self.timestep_dropout(self.arc_head_mlp(feat)) + label_dep = self.timestep_dropout(self.label_dep_mlp(feat)) + label_head = self.timestep_dropout(self.label_head_mlp(feat)) + + # biaffine arc classifier + arc_pred = self.arc_predictor(arc_head, arc_dep) # [N, L, L] + flip_mask = (seq_mask == 0) + arc_pred.masked_fill_(flip_mask.unsqueeze(1), -np.inf) + + # use gold or predicted arc to predict label + if gold_heads is None: + # use greedy decoding in training + if self.training or self.use_greedy_infer: + heads = self._greedy_decoder(arc_pred, seq_mask) + else: + heads = self._mst_decoder(arc_pred, seq_mask) + head_pred = heads + else: + head_pred = None + heads = gold_heads + + label_head = label_head[batch_range, 
heads].contiguous() + label_pred = self.label_predictor(label_head, label_dep) # [N, L, num_label] + res_dict = {'arc_pred': arc_pred, 'label_pred': label_pred, 'seq_mask': seq_mask} + if head_pred is not None: + res_dict['head_pred'] = head_pred + return res_dict + + def loss(self, arc_pred, label_pred, head_indices, head_labels, seq_mask, **_): + """ + Compute loss. + + :param arc_pred: [batch_size, seq_len, seq_len] + :param label_pred: [batch_size, seq_len, seq_len] + :param head_indices: [batch_size, seq_len] + :param head_labels: [batch_size, seq_len] + :param seq_mask: [batch_size, seq_len] + :return: loss value + """ + + batch_size, seq_len, _ = arc_pred.shape + arc_logits = F.log_softmax(arc_pred, dim=2) + label_logits = F.log_softmax(label_pred, dim=2) + batch_index = torch.arange(start=0, end=batch_size, device=arc_logits.device).long().unsqueeze(1) + child_index = torch.arange(start=0, end=seq_len, device=arc_logits.device).long().unsqueeze(0) + arc_loss = arc_logits[batch_index, child_index, head_indices] + label_loss = label_logits[batch_index, child_index, head_labels] + + arc_loss = arc_loss[:, 1:] + label_loss = label_loss[:, 1:] + + float_mask = seq_mask[:, 1:].float() + length = (seq_mask.sum() - batch_size).float() + arc_nll = -(arc_loss*float_mask).sum() / length + label_nll = -(label_loss*float_mask).sum() / length + return arc_nll + label_nll + + def evaluate(self, arc_pred, label_pred, head_indices, head_labels, seq_mask, **kwargs): + """ + Evaluate the performance of prediction. + + :return dict: performance results. + head_pred_corrct: number of correct predicted heads. + label_pred_correct: number of correct predicted labels. + total_tokens: number of predicted tokens + """ + if 'head_pred' in kwargs: + head_pred = kwargs['head_pred'] + elif self.use_greedy_infer: + head_pred = self._greedy_decoder(arc_pred, seq_mask) + else: + head_pred = self._mst_decoder(arc_pred, seq_mask) + + head_pred_correct = (head_pred == head_indices).long() * seq_mask + _, label_preds = torch.max(label_pred, dim=2) + label_pred_correct = (label_preds == head_labels).long() * head_pred_correct + return {"head_pred_correct": head_pred_correct.sum(dim=1), + "label_pred_correct": label_pred_correct.sum(dim=1), + "total_tokens": seq_mask.sum(dim=1)} + + def metrics(self, head_pred_correct, label_pred_correct, total_tokens, **_): + """ + Compute the metrics of model + + :param head_pred_corrct: number of correct predicted heads. + :param label_pred_correct: number of correct predicted labels. 
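The indexing in `loss()` above is compact; this toy fragment isolates the same advanced-indexing gather, which picks out, for every child token, the log-probability assigned to its gold head (shapes and values are made up):

```python
import torch
import torch.nn.functional as F

B, L = 2, 4
arc_pred = torch.randn(B, L, L)                    # raw arc scores
head_indices = torch.randint(0, L, (B, L))         # gold head of every token

arc_logits = F.log_softmax(arc_pred, dim=2)
batch_index = torch.arange(B).unsqueeze(1)         # [B, 1]
child_index = torch.arange(L).unsqueeze(0)         # [1, L]
gold_logp = arc_logits[batch_index, child_index, head_indices]
print(gold_logp.shape)                             # torch.Size([2, 4]): one log-prob per token
```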
+ :param total_tokens: number of predicted tokens + :return dict: the metrics results + UAS: the head predicted accuracy + LAS: the label predicted accuracy + """ + return {"UAS": head_pred_correct.sum().float() / total_tokens.sum().float() * 100, + "LAS": label_pred_correct.sum().float() / total_tokens.sum().float() * 100} + diff --git a/fastNLP/models/char_language_model.py b/fastNLP/models/char_language_model.py index 2ad49abe..5fbde3cc 100644 --- a/fastNLP/models/char_language_model.py +++ b/fastNLP/models/char_language_model.py @@ -103,7 +103,7 @@ class CharLM(nn.Module): x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1) # [num_seq, seq_len, total_num_filters] - x, hidden = self.lstm(x) + x = self.lstm(x) # [seq_len, num_seq, hidden_size] x = self.dropout(x) diff --git a/fastNLP/models/snli.py b/fastNLP/models/snli.py new file mode 100644 index 00000000..9be7f43d --- /dev/null +++ b/fastNLP/models/snli.py @@ -0,0 +1,161 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from fastNLP.models.base_model import BaseModel +from fastNLP.modules import decoder as Decoder, encoder as Encoder + + +my_inf = 10e12 + + +class SNLI(BaseModel): + """ + PyTorch Network for SNLI. + """ + + def __init__(self, args, init_embedding=None): + super(SNLI, self).__init__() + self.vocab_size = args["vocab_size"] + self.embed_dim = args["embed_dim"] + self.hidden_size = args["hidden_size"] + self.batch_first = args["batch_first"] + self.dropout = args["dropout"] + self.n_labels = args["num_classes"] + self.gpu = args["gpu"] and torch.cuda.is_available() + + self.embedding = Encoder.embedding.Embedding(self.vocab_size, self.embed_dim, init_emb=init_embedding, + dropout=self.dropout) + + self.embedding_layer = Encoder.Linear(self.embed_dim, self.hidden_size) + + self.encoder = Encoder.LSTM( + input_size=self.embed_dim, hidden_size=self.hidden_size, num_layers=1, bias=True, + batch_first=self.batch_first, bidirectional=True + ) + + self.inference_layer = Encoder.Linear(self.hidden_size * 4, self.hidden_size) + + self.decoder = Encoder.LSTM( + input_size=self.hidden_size, hidden_size=self.hidden_size, num_layers=1, bias=True, + batch_first=self.batch_first, bidirectional=True + ) + + self.output = Decoder.MLP([4 * self.hidden_size, self.hidden_size, self.n_labels], 'tanh') + + def forward(self, premise, hypothesis, premise_len, hypothesis_len): + """ Forward function + + :param premise: A Tensor represents premise: [batch size(B), premise seq len(PL), hidden size(H)]. + :param hypothesis: A Tensor represents hypothesis: [B, hypothesis seq len(HL), H]. + :param premise_len: A Tensor record which is a real word and which is a padding word in premise: [B, PL]. + :param hypothesis_len: A Tensor record which is a real word and which is a padding word in hypothesis: [B, HL]. + :return: prediction: A Tensor of classification result: [B, n_labels(N)]. 
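The SNLI model above is configured through a plain `args` dict read in `__init__`. A hypothetical instantiation (the hyper-parameter values are illustrative, not taken from any config in this repo):

```python
from fastNLP.models.snli import SNLI

args = {
    "vocab_size": 10000,
    "embed_dim": 300,
    "hidden_size": 300,
    "batch_first": True,
    "dropout": 0.5,
    "num_classes": 3,        # entailment / neutral / contradiction
    "gpu": False,
}
model = SNLI(args)           # optionally pass init_embedding=<pretrained tensor>
```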
+ """ + + premise0 = self.embedding_layer(self.embedding(premise)) + hypothesis0 = self.embedding_layer(self.embedding(hypothesis)) + + _BP, _PSL, _HP = premise0.size() + _BH, _HSL, _HH = hypothesis0.size() + _BPL, _PLL = premise_len.size() + _HPL, _HLL = hypothesis_len.size() + + assert _BP == _BH and _BPL == _HPL and _BP == _BPL + assert _HP == _HH + assert _PSL == _PLL and _HSL == _HLL + + B, PL, H = premise0.size() + B, HL, H = hypothesis0.size() + + # a0, (ah0, ac0) = self.encoder(premise) # a0: [B, PL, H * 2], ah0: [2, B, H] + # b0, (bh0, bc0) = self.encoder(hypothesis) # b0: [B, HL, H * 2] + + a0 = self.encoder(premise0) # a0: [B, PL, H * 2] + b0 = self.encoder(hypothesis0) # b0: [B, HL, H * 2] + + a = torch.mean(a0.view(B, PL, -1, H), dim=2) # a: [B, PL, H] + b = torch.mean(b0.view(B, HL, -1, H), dim=2) # b: [B, HL, H] + + ai, bi = self.calc_bi_attention(a, b, premise_len, hypothesis_len) + + ma = torch.cat((a, ai, a - ai, a * ai), dim=2) # ma: [B, PL, 4 * H] + mb = torch.cat((b, bi, b - bi, b * bi), dim=2) # mb: [B, HL, 4 * H] + + f_ma = self.inference_layer(ma) + f_mb = self.inference_layer(mb) + + vat = self.decoder(f_ma) + vbt = self.decoder(f_mb) + + va = torch.mean(vat.view(B, PL, -1, H), dim=2) # va: [B, PL, H] + vb = torch.mean(vbt.view(B, HL, -1, H), dim=2) # vb: [B, HL, H] + + # va_ave = torch.mean(va, dim=1) # va_ave: [B, H] + # va_max, va_arg_max = torch.max(va, dim=1) # va_max: [B, H] + # vb_ave = torch.mean(vb, dim=1) # vb_ave: [B, H] + # vb_max, vb_arg_max = torch.max(vb, dim=1) # vb_max: [B, H] + + va_ave = self.mean_pooling(va, premise_len, dim=1) # va_ave: [B, H] + va_max, va_arg_max = self.max_pooling(va, premise_len, dim=1) # va_max: [B, H] + vb_ave = self.mean_pooling(vb, hypothesis_len, dim=1) # vb_ave: [B, H] + vb_max, vb_arg_max = self.max_pooling(vb, hypothesis_len, dim=1) # vb_max: [B, H] + + v = torch.cat((va_ave, va_max, vb_ave, vb_max), dim=1) # v: [B, 4 * H] + + # v_mlp = F.tanh(self.mlp_layer1(v)) # v_mlp: [B, H] + # prediction = self.mlp_layer2(v_mlp) # prediction: [B, N] + + prediction = F.tanh(self.output(v)) # prediction: [B, N] + + return prediction + + @staticmethod + def calc_bi_attention(in_x1, in_x2, x1_len, x2_len): + + # in_x1: [batch_size, x1_seq_len, hidden_size] + # in_x2: [batch_size, x2_seq_len, hidden_size] + # x1_len: [batch_size, x1_seq_len] + # x2_len: [batch_size, x2_seq_len] + + assert in_x1.size()[0] == in_x2.size()[0] + assert in_x1.size()[2] == in_x2.size()[2] + # The batch size and hidden size must be equal. + assert in_x1.size()[1] == x1_len.size()[1] and in_x2.size()[1] == x2_len.size()[1] + # The seq len in in_x and x_len must be equal. 
+ assert in_x1.size()[0] == x1_len.size()[0] and x1_len.size()[0] == x2_len.size()[0] + + batch_size = in_x1.size()[0] + x1_max_len = in_x1.size()[1] + x2_max_len = in_x2.size()[1] + + in_x2_t = torch.transpose(in_x2, 1, 2) # [batch_size, hidden_size, x2_seq_len] + + attention_matrix = torch.bmm(in_x1, in_x2_t) # [batch_size, x1_seq_len, x2_seq_len] + + a_mask = x1_len.le(0.5).float() * -my_inf # [batch_size, x1_seq_len] + a_mask = a_mask.view(batch_size, x1_max_len, -1) + a_mask = a_mask.expand(-1, -1, x2_max_len) # [batch_size, x1_seq_len, x2_seq_len] + b_mask = x2_len.le(0.5).float() * -my_inf + b_mask = b_mask.view(batch_size, -1, x2_max_len) + b_mask = b_mask.expand(-1, x1_max_len, -1) # [batch_size, x1_seq_len, x2_seq_len] + + attention_a = F.softmax(attention_matrix + a_mask, dim=2) # [batch_size, x1_seq_len, x2_seq_len] + attention_b = F.softmax(attention_matrix + b_mask, dim=1) # [batch_size, x1_seq_len, x2_seq_len] + + out_x1 = torch.bmm(attention_a, in_x2) # [batch_size, x1_seq_len, hidden_size] + attention_b_t = torch.transpose(attention_b, 1, 2) + out_x2 = torch.bmm(attention_b_t, in_x1) # [batch_size, x2_seq_len, hidden_size] + + return out_x1, out_x2 + + @staticmethod + def mean_pooling(tensor, mask, dim=0): + masks = mask.view(mask.size(0), mask.size(1), -1).float() + return torch.sum(tensor * masks, dim=dim) / torch.sum(masks, dim=1) + + @staticmethod + def max_pooling(tensor, mask, dim=0): + masks = mask.view(mask.size(0), mask.size(1), -1) + masks = masks.expand(-1, -1, tensor.size(2)).float() + return torch.max(tensor + masks.le(0.5).float() * -my_inf, dim=dim) diff --git a/fastNLP/modules/decoder/MLP.py b/fastNLP/modules/decoder/MLP.py index 2a4193b1..766dc225 100644 --- a/fastNLP/modules/decoder/MLP.py +++ b/fastNLP/modules/decoder/MLP.py @@ -1,12 +1,15 @@ import torch import torch.nn as nn from fastNLP.modules.utils import initial_parameter + + class MLP(nn.Module): - def __init__(self, size_layer, activation='relu' , initial_method = None): + def __init__(self, size_layer, activation='relu', initial_method=None): """Multilayer Perceptrons as a decoder - :param size_layer: list of int, define the size of MLP layers - :param activation: str or function, the activation function for hidden layers + :param size_layer: list of int, define the size of MLP layers. + :param activation: str or function, the activation function for hidden layers. + :param initial_method: str, the name of init method. .. note:: There is no activation function applying on output layer. 
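The `mean_pooling` / `max_pooling` helpers in `snli.py` above keep padding positions from contaminating the sentence vector: the mean divides only by the number of real tokens, and the max pushes padded steps far below any real value before reducing. A minimal illustration of the same trick, with made-up shapes:

```python
import torch

B, L, H = 2, 4, 3
t = torch.randn(B, L, H)
mask = torch.tensor([[1, 1, 1, 0],
                     [1, 1, 0, 0]])                 # 1 = real token, 0 = padding

m = mask.unsqueeze(-1).float()                      # [B, L, 1]
mean_pooled = (t * m).sum(dim=1) / m.sum(dim=1)     # padded steps contribute nothing
max_pooled, _ = (t + (1 - m) * -1e12).max(dim=1)    # padding cannot win the max
print(mean_pooled.shape, max_pooled.shape)          # torch.Size([2, 3]) torch.Size([2, 3])
```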
@@ -23,7 +26,7 @@ class MLP(nn.Module): actives = { 'relu': nn.ReLU(), - 'tanh': nn.Tanh() + 'tanh': nn.Tanh(), } if activation in actives: self.hidden_active = actives[activation] @@ -31,7 +34,7 @@ class MLP(nn.Module): self.hidden_active = activation else: raise ValueError("should set activation correctly: {}".format(activation)) - initial_parameter(self, initial_method ) + initial_parameter(self, initial_method) def forward(self, x): for layer in self.hiddens: @@ -40,13 +43,11 @@ class MLP(nn.Module): return x - if __name__ == '__main__': - net1 = MLP([5,10,5]) - net2 = MLP([5,10,5], 'tanh') + net1 = MLP([5, 10, 5]) + net2 = MLP([5, 10, 5], 'tanh') for net in [net1, net2]: x = torch.randn(5, 5) y = net(x) print(x) print(y) - \ No newline at end of file diff --git a/fastNLP/modules/dropout.py b/fastNLP/modules/dropout.py new file mode 100644 index 00000000..9113a7e4 --- /dev/null +++ b/fastNLP/modules/dropout.py @@ -0,0 +1,15 @@ +import torch + +class TimestepDropout(torch.nn.Dropout): + """This module accepts a `[batch_size, num_timesteps, embedding_dim)]` and use a single + dropout mask of shape `(batch_size, embedding_dim)` to apply on every time step. + """ + def forward(self, x): + dropout_mask = x.new_ones(x.shape[0], x.shape[-1]) + torch.nn.functional.dropout(dropout_mask, self.p, self.training, inplace=True) + dropout_mask = dropout_mask.unsqueeze(1) # [batch_size, 1, embedding_dim] + if self.inplace: + x *= dropout_mask + return + else: + return x * dropout_mask diff --git a/fastNLP/modules/encoder/char_embedding.py b/fastNLP/modules/encoder/char_embedding.py index 1da63947..1ca3b5ba 100644 --- a/fastNLP/modules/encoder/char_embedding.py +++ b/fastNLP/modules/encoder/char_embedding.py @@ -1,12 +1,14 @@ import torch import torch.nn.functional as F from torch import nn -# from torch.nn.init import xavier_uniform from fastNLP.modules.utils import initial_parameter + + +# from torch.nn.init import xavier_uniform class ConvCharEmbedding(nn.Module): - def __init__(self, char_emb_size=50, feature_maps=(40, 30, 30), kernels=(3, 4, 5),initial_method = None): + def __init__(self, char_emb_size=50, feature_maps=(40, 30, 30), kernels=(3, 4, 5), initial_method=None): """ Character Level Word Embedding :param char_emb_size: the size of character level embedding. Default: 50 @@ -21,7 +23,7 @@ class ConvCharEmbedding(nn.Module): nn.Conv2d(1, feature_maps[i], kernel_size=(char_emb_size, kernels[i]), bias=True, padding=(0, 4)) for i in range(len(kernels))]) - initial_parameter(self,initial_method) + initial_parameter(self, initial_method) def forward(self, x): """ @@ -56,7 +58,7 @@ class LSTMCharEmbedding(nn.Module): :param hidden_size: int, the number of hidden units. Default: equal to char_emb_size. 
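`TimestepDropout`, added above in `fastNLP/modules/dropout.py`, samples one dropout mask per sample and broadcasts it over the time dimension, so the same feature positions are dropped at every step. A quick check of that behaviour, assuming the class is importable as in the diff:

```python
import torch
from fastNLP.modules.dropout import TimestepDropout

drop = TimestepDropout(p=0.5)
drop.train()                              # masks are only sampled in training mode
x = torch.ones(2, 6, 4)                   # [batch, time, features]
y = drop(x)
# every time step of a sample is zeroed at exactly the same feature positions
print((y[0, 0] == y[0, 3]).all())         # tensor(True)
```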
""" - def __init__(self, char_emb_size=50, hidden_size=None , initial_method= None): + def __init__(self, char_emb_size=50, hidden_size=None, initial_method=None): super(LSTMCharEmbedding, self).__init__() self.hidden_size = char_emb_size if hidden_size is None else hidden_size @@ -66,6 +68,7 @@ class LSTMCharEmbedding(nn.Module): bias=True, batch_first=True) initial_parameter(self, initial_method) + def forward(self, x): """ :param x:[ n_batch*n_word, word_length, char_emb_size] @@ -79,20 +82,3 @@ class LSTMCharEmbedding(nn.Module): _, hidden = self.lstm(x, (h0, c0)) return hidden[0].squeeze().unsqueeze(2) - - -if __name__ == "__main__": - batch_size = 128 - char_emb = 100 - word_length = 1 - x = torch.Tensor(batch_size, char_emb, word_length) - x = x.transpose(1, 2) - cce = ConvCharEmbedding(char_emb) - y = cce(x) - print("CNN Char Emb input: ", x.shape) - print("CNN Char Emb output: ", y.shape) # [128, 100] - - lce = LSTMCharEmbedding(char_emb) - o = lce(x) - print("LSTM Char Emb input: ", x.shape) - print("LSTM Char Emb size: ", o.shape) diff --git a/fastNLP/modules/encoder/linear.py b/fastNLP/modules/encoder/linear.py index a7c5f6c3..399e15d3 100644 --- a/fastNLP/modules/encoder/linear.py +++ b/fastNLP/modules/encoder/linear.py @@ -1,6 +1,8 @@ import torch.nn as nn from fastNLP.modules.utils import initial_parameter + + class Linear(nn.Module): """ Linear module @@ -12,10 +14,11 @@ class Linear(nn.Module): bidirectional : If True, becomes a bidirectional RNN """ - def __init__(self, input_size, output_size, bias=True,initial_method = None ): + def __init__(self, input_size, output_size, bias=True, initial_method=None): super(Linear, self).__init__() self.linear = nn.Linear(input_size, output_size, bias) initial_parameter(self, initial_method) + def forward(self, x): x = self.linear(x) return x diff --git a/fastNLP/modules/encoder/lstm.py b/fastNLP/modules/encoder/lstm.py index e48960a8..a0b42442 100644 --- a/fastNLP/modules/encoder/lstm.py +++ b/fastNLP/modules/encoder/lstm.py @@ -14,16 +14,23 @@ class LSTM(nn.Module): bidirectional : If True, becomes a bidirectional RNN. Default: False. 
""" - def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0.0, bidirectional=False, - initial_method=None): + def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0.0, batch_first=True, + bidirectional=False, bias=True, initial_method=None, get_hidden=False): super(LSTM, self).__init__() - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=True, + self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=bias, batch_first=batch_first, dropout=dropout, bidirectional=bidirectional) + self.get_hidden = get_hidden initial_parameter(self, initial_method) - def forward(self, x): - x, _ = self.lstm(x) - return x + def forward(self, x, h0=None, c0=None): + if h0 is not None and c0 is not None: + x, (ht, ct) = self.lstm(x, (h0, c0)) + else: + x, (ht, ct) = self.lstm(x) + if self.get_hidden: + return x, (ht, ct) + else: + return x if __name__ == "__main__": diff --git a/fastNLP/modules/encoder/variational_rnn.py b/fastNLP/modules/encoder/variational_rnn.py index fb75fabb..16bd4172 100644 --- a/fastNLP/modules/encoder/variational_rnn.py +++ b/fastNLP/modules/encoder/variational_rnn.py @@ -2,384 +2,153 @@ import math import torch import torch.nn as nn -import torch.nn.functional as F -from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend -from torch.nn.parameter import Parameter +from torch.nn.utils.rnn import PackedSequence from fastNLP.modules.utils import initial_parameter -def default_initializer(hidden_size): - stdv = 1.0 / math.sqrt(hidden_size) - - def forward(tensor): - nn.init.uniform_(tensor, -stdv, stdv) - - return forward - - -def VarMaskedRecurrent(reverse=False): - def forward(input, hidden, cell, mask): - output = [] - steps = range(input.size(0) - 1, -1, -1) if reverse else range(input.size(0)) - for i in steps: - if mask is None or mask[i].data.min() > 0.5: - hidden = cell(input[i], hidden) - elif mask[i].data.max() > 0.5: - hidden_next = cell(input[i], hidden) - # hack to handle LSTM - if isinstance(hidden, tuple): - hx, cx = hidden - hp1, cp1 = hidden_next - hidden = (hx + (hp1 - hx) * mask[i], cx + (cp1 - cx) * mask[i]) - else: - hidden = hidden + (hidden_next - hidden) * mask[i] - # hack to handle LSTM - output.append(hidden[0] if isinstance(hidden, tuple) else hidden) - - if reverse: - output.reverse() - output = torch.cat(output, 0).view(input.size(0), *output[0].size()) - - return hidden, output - - return forward - - -def StackedRNN(inners, num_layers, lstm=False): - num_directions = len(inners) - total_layers = num_layers * num_directions - - def forward(input, hidden, cells, mask): - assert (len(cells) == total_layers) - next_hidden = [] - - if lstm: - hidden = list(zip(*hidden)) - - for i in range(num_layers): - all_output = [] - for j, inner in enumerate(inners): - l = i * num_directions + j - hy, output = inner(input, hidden[l], cells[l], mask) - next_hidden.append(hy) - all_output.append(output) - - input = torch.cat(all_output, input.dim() - 1) - - if lstm: - next_h, next_c = zip(*next_hidden) - next_hidden = ( - torch.cat(next_h, 0).view(total_layers, *next_h[0].size()), - torch.cat(next_c, 0).view(total_layers, *next_c[0].size()) - ) - else: - next_hidden = torch.cat(next_hidden, 0).view(total_layers, *next_hidden[0].size()) - - return next_hidden, input - - return forward - - -def AutogradVarMaskedRNN(num_layers=1, batch_first=False, bidirectional=False, lstm=False): - rec_factory = VarMaskedRecurrent - - if bidirectional: - layer = (rec_factory(), rec_factory(reverse=True)) - else: 
- layer = (rec_factory(),) - - func = StackedRNN(layer, - num_layers, - lstm=lstm) - - def forward(input, cells, hidden, mask): - if batch_first: - input = input.transpose(0, 1) - if mask is not None: - mask = mask.transpose(0, 1) - - nexth, output = func(input, hidden, cells, mask) - - if batch_first: - output = output.transpose(0, 1) - - return output, nexth - - return forward - +try: + from torch import flip +except ImportError: + def flip(x, dims): + indices = [slice(None)] * x.dim() + for dim in dims: + indices[dim] = torch.arange(x.size(dim) - 1, -1, -1, dtype=torch.long, device=x.device) + return x[tuple(indices)] + +class VarRnnCellWrapper(nn.Module): + """Wrapper for normal RNN Cells, make it support variational dropout + """ + def __init__(self, cell, hidden_size, input_p, hidden_p): + super(VarRnnCellWrapper, self).__init__() + self.cell = cell + self.hidden_size = hidden_size + self.input_p = input_p + self.hidden_p = hidden_p -def VarMaskedStep(): - def forward(input, hidden, cell, mask): - if mask is None or mask.data.min() > 0.5: - hidden = cell(input, hidden) - elif mask.data.max() > 0.5: - hidden_next = cell(input, hidden) - # hack to handle LSTM - if isinstance(hidden, tuple): + def forward(self, input, hidden, mask_x=None, mask_h=None): + """ + :param input: [seq_len, batch_size, input_size] + :param hidden: for LSTM, tuple of (h_0, c_0), [batch_size, hidden_size] + for other RNN, h_0, [batch_size, hidden_size] + :param mask_x: [batch_size, input_size] dropout mask for input + :param mask_h: [batch_size, hidden_size] dropout mask for hidden + :return output: [seq_len, bacth_size, hidden_size] + hidden: for LSTM, tuple of (h_n, c_n), [batch_size, hidden_size] + for other RNN, h_n, [batch_size, hidden_size] + """ + is_lstm = isinstance(hidden, tuple) + input = input * mask_x.unsqueeze(0) if mask_x is not None else input + output_list = [] + for x in input: + if is_lstm: hx, cx = hidden - hp1, cp1 = hidden_next - hidden = (hx + (hp1 - hx) * mask, cx + (cp1 - cx) * mask) + hidden = (hx * mask_h, cx) if mask_h is not None else (hx, cx) else: - hidden = hidden + (hidden_next - hidden) * mask - # hack to handle LSTM - output = hidden[0] if isinstance(hidden, tuple) else hidden - - return hidden, output - - return forward - - -def StackedStep(layer, num_layers, lstm=False): - def forward(input, hidden, cells, mask): - assert (len(cells) == num_layers) - next_hidden = [] - - if lstm: - hidden = list(zip(*hidden)) - - for l in range(num_layers): - hy, output = layer(input, hidden[l], cells[l], mask) - next_hidden.append(hy) - input = output - - if lstm: - next_h, next_c = zip(*next_hidden) - next_hidden = ( - torch.cat(next_h, 0).view(num_layers, *next_h[0].size()), - torch.cat(next_c, 0).view(num_layers, *next_c[0].size()) - ) - else: - next_hidden = torch.cat(next_hidden, 0).view(num_layers, *next_hidden[0].size()) - - return next_hidden, input - - return forward - - -def AutogradVarMaskedStep(num_layers=1, lstm=False): - layer = VarMaskedStep() - - func = StackedStep(layer, - num_layers, - lstm=lstm) - - def forward(input, cells, hidden, mask): - nexth, output = func(input, hidden, cells, mask) - return output, nexth - - return forward - + hidden *= mask_h if mask_h is not None else hidden + hidden = self.cell(x, hidden) + output_list.append(hidden[0] if is_lstm else hidden) + output = torch.stack(output_list, dim=0) + return output, hidden -class VarMaskedRNNBase(nn.Module): - def __init__(self, Cell, input_size, hidden_size, - num_layers=1, bias=True, batch_first=False, - 
dropout=(0, 0), bidirectional=False, initializer=None,initial_method = None, **kwargs): - super(VarMaskedRNNBase, self).__init__() - self.Cell = Cell +class VarRNNBase(nn.Module): + """Implementation of Variational Dropout RNN network. + refer to `A Theoretically Grounded Application of Dropout in Recurrent Neural Networks (Yarin Gal and Zoubin Ghahramani, 2016) + https://arxiv.org/abs/1512.05287`. + """ + def __init__(self, mode, Cell, input_size, hidden_size, num_layers=1, + bias=True, batch_first=False, + input_dropout=0, hidden_dropout=0, bidirectional=False): + super(VarRNNBase, self).__init__() + self.mode = mode self.input_size = input_size self.hidden_size = hidden_size self.num_layers = num_layers self.bias = bias self.batch_first = batch_first + self.input_dropout = input_dropout + self.hidden_dropout = hidden_dropout self.bidirectional = bidirectional - self.lstm = False - num_directions = 2 if bidirectional else 1 - - self.all_cells = [] - for layer in range(num_layers): - for direction in range(num_directions): - layer_input_size = input_size if layer == 0 else hidden_size * num_directions - - cell = self.Cell(layer_input_size, hidden_size, self.bias, p=dropout, initializer=initializer, **kwargs) - self.all_cells.append(cell) - self.add_module('cell%d' % (layer * num_directions + direction), cell) - initial_parameter(self, initial_method) - def reset_parameters(self): - for cell in self.all_cells: - cell.reset_parameters() - - def reset_noise(self, batch_size): - for cell in self.all_cells: - cell.reset_noise(batch_size) + self.num_directions = 2 if bidirectional else 1 + self._all_cells = nn.ModuleList() + for layer in range(self.num_layers): + for direction in range(self.num_directions): + input_size = self.input_size if layer == 0 else self.hidden_size * self.num_directions + cell = Cell(input_size, self.hidden_size, bias) + self._all_cells.append(VarRnnCellWrapper(cell, self.hidden_size, input_dropout, hidden_dropout)) + initial_parameter(self) + + def forward(self, input, hx=None): + is_packed = isinstance(input, PackedSequence) + is_lstm = (self.mode == "LSTM") + if is_packed: + input, batch_sizes = input + max_batch_size = int(batch_sizes[0]) + else: + batch_sizes = None + max_batch_size = input.size(0) if self.batch_first else input.size(1) - def forward(self, input, mask=None, hx=None): - batch_size = input.size(0) if self.batch_first else input.size(1) if hx is None: - num_directions = 2 if self.bidirectional else 1 - hx = torch.tensor(input.data.new(self.num_layers * num_directions, batch_size, self.hidden_size).zero_(), - requires_grad=True) - if self.lstm: + hx = input.new_zeros(self.num_layers * self.num_directions, + max_batch_size, self.hidden_size, + requires_grad=False) + if is_lstm: hx = (hx, hx) - func = AutogradVarMaskedRNN(num_layers=self.num_layers, - batch_first=self.batch_first, - bidirectional=self.bidirectional, - lstm=self.lstm) - - self.reset_noise(batch_size) - - output, hidden = func(input, self.all_cells, hx, None if mask is None else mask.view(mask.size() + (1,))) - return output, hidden - - def step(self, input, hx=None, mask=None): - ''' - execute one step forward (only for one-directional RNN). - Args: - input (batch, input_size): input tensor of this step. - hx (num_layers, batch, hidden_size): the hidden state of last step. - mask (batch): the mask tensor of this step. - Returns: - output (batch, hidden_size): tensor containing the output of this step from the last layer of RNN. 
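`VarRnnCellWrapper` above implements the variational-dropout recurrence: one input mask and one hidden mask are sampled per sequence and then reused at every time step, rather than being resampled per step as in ordinary dropout. A stripped-down illustration of that reuse (sizes and the toy recurrence are made up):

```python
import torch
import torch.nn as nn

batch_size, hidden_size, p = 3, 5, 0.3
mask_h = torch.ones(batch_size, hidden_size)
nn.functional.dropout(mask_h, p=p, training=True, inplace=True)  # entries become 0 or 1/(1-p)

h = torch.randn(batch_size, hidden_size)
for _ in range(4):                        # toy recurrence over time steps
    h = torch.tanh(h * mask_h)            # the SAME units are dropped at every step
```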
- hn (num_layers, batch, hidden_size): tensor containing the hidden state of this step - ''' - assert not self.bidirectional, "step only cannot be applied to bidirectional RNN." - batch_size = input.size(0) - if hx is None: - hx = torch.tensor(input.data.new(self.num_layers, batch_size, self.hidden_size).zero_(), requires_grad=True) - if self.lstm: - hx = (hx, hx) + if self.batch_first: + input = input.transpose(0, 1) + batch_size = input.shape[1] + + mask_x = input.new_ones((batch_size, self.input_size)) + mask_out = input.new_ones((batch_size, self.hidden_size * self.num_directions)) + mask_h = input.new_ones((batch_size, self.hidden_size)) + nn.functional.dropout(mask_x, p=self.input_dropout, training=self.training, inplace=True) + nn.functional.dropout(mask_out, p=self.hidden_dropout, training=self.training, inplace=True) + nn.functional.dropout(mask_h, p=self.hidden_dropout, training=self.training, inplace=True) + + hidden_list = [] + for layer in range(self.num_layers): + output_list = [] + for direction in range(self.num_directions): + input_x = input if direction == 0 else flip(input, [0]) + idx = self.num_directions * layer + direction + cell = self._all_cells[idx] + hi = (hx[0][idx], hx[1][idx]) if is_lstm else hx[idx] + mask_xi = mask_x if layer == 0 else mask_out + output_x, hidden_x = cell(input_x, hi, mask_xi, mask_h) + output_list.append(output_x if direction == 0 else flip(output_x, [0])) + hidden_list.append(hidden_x) + input = torch.cat(output_list, dim=-1) + + output = input.transpose(0, 1) if self.batch_first else input + if is_lstm: + h_list, c_list = zip(*hidden_list) + hn = torch.stack(h_list, dim=0) + cn = torch.stack(c_list, dim=0) + hidden = (hn, cn) + else: + hidden = torch.stack(hidden_list, dim=0) - func = AutogradVarMaskedStep(num_layers=self.num_layers, lstm=self.lstm) + if is_packed: + output = PackedSequence(output, batch_sizes) - output, hidden = func(input, self.all_cells, hx, mask) return output, hidden -class VarMaskedFastLSTM(VarMaskedRNNBase): +class VarLSTM(VarRNNBase): + """Variational Dropout LSTM. + """ def __init__(self, *args, **kwargs): - super(VarMaskedFastLSTM, self).__init__(VarFastLSTMCell, *args, **kwargs) - self.lstm = True - - -class VarRNNCellBase(nn.Module): - def __repr__(self): - s = '{name}({input_size}, {hidden_size}' - if 'bias' in self.__dict__ and self.bias is not True: - s += ', bias={bias}' - if 'nonlinearity' in self.__dict__ and self.nonlinearity != "tanh": - s += ', nonlinearity={nonlinearity}' - s += ')' - return s.format(name=self.__class__.__name__, **self.__dict__) + super(VarLSTM, self).__init__(mode="LSTM", Cell=nn.LSTMCell, *args, **kwargs) - def reset_noise(self, batch_size): - """ - Should be overriden by all subclasses. - Args: - batch_size: (int) batch size of input. - """ - raise NotImplementedError - - -class VarFastLSTMCell(VarRNNCellBase): - """ - A long short-term memory (LSTM) cell with variational dropout. - .. math:: - \begin{array}{ll} - i = \mathrm{sigmoid}(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\ - f = \mathrm{sigmoid}(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\ - g = \tanh(W_{ig} x + b_{ig} + W_{hc} h + b_{hg}) \\ - o = \mathrm{sigmoid}(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\ - c' = f * c + i * g \\ - h' = o * \tanh(c') \\ - \end{array} +class VarRNN(VarRNNBase): + """Variational Dropout RNN. 
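`VarLSTM` / `VarRNN` defined here are thin subclasses of `VarRNNBase` that fix the cell type. A hypothetical construction mirroring how `BiaffineParser` uses it when `use_var_lstm` is set (sizes are illustrative):

```python
import torch
from fastNLP.modules.encoder.variational_rnn import VarLSTM

rnn = VarLSTM(input_size=20, hidden_size=32, num_layers=2,
              batch_first=True, bidirectional=True,
              input_dropout=0.33, hidden_dropout=0.33)
x = torch.randn(4, 9, 20)
out, (h_n, c_n) = rnn(x)
print(out.shape)    # torch.Size([4, 9, 64])  -> 2 directions * hidden_size
print(h_n.shape)    # torch.Size([4, 4, 32])  -> (num_layers * num_directions, batch, hidden)
```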
""" + def __init__(self, *args, **kwargs): + super(VarRNN, self).__init__(mode="RNN", Cell=nn.RNNCell, *args, **kwargs) - def __init__(self, input_size, hidden_size, bias=True, p=(0.5, 0.5), initializer=None,initial_method =None): - super(VarFastLSTMCell, self).__init__() - self.input_size = input_size - self.hidden_size = hidden_size - self.bias = bias - self.weight_ih = Parameter(torch.Tensor(4 * hidden_size, input_size)) - self.weight_hh = Parameter(torch.Tensor(4 * hidden_size, hidden_size)) - if bias: - self.bias_ih = Parameter(torch.Tensor(4 * hidden_size)) - self.bias_hh = Parameter(torch.Tensor(4 * hidden_size)) - else: - self.register_parameter('bias_ih', None) - self.register_parameter('bias_hh', None) - - self.initializer = default_initializer(self.hidden_size) if initializer is None else initializer - self.reset_parameters() - p_in, p_hidden = p - if p_in < 0 or p_in > 1: - raise ValueError("input dropout probability has to be between 0 and 1, " - "but got {}".format(p_in)) - if p_hidden < 0 or p_hidden > 1: - raise ValueError("hidden state dropout probability has to be between 0 and 1, " - "but got {}".format(p_hidden)) - self.p_in = p_in - self.p_hidden = p_hidden - self.noise_in = None - self.noise_hidden = None - initial_parameter(self, initial_method) - def reset_parameters(self): - for weight in self.parameters(): - if weight.dim() == 1: - weight.data.zero_() - else: - self.initializer(weight.data) - - def reset_noise(self, batch_size): - if self.training: - if self.p_in: - noise = self.weight_ih.data.new(batch_size, self.input_size) - self.noise_in = torch.tensor(noise.bernoulli_(1.0 - self.p_in) / (1.0 - self.p_in)) - else: - self.noise_in = None - - if self.p_hidden: - noise = self.weight_hh.data.new(batch_size, self.hidden_size) - self.noise_hidden = torch.tensor(noise.bernoulli_(1.0 - self.p_hidden) / (1.0 - self.p_hidden)) - else: - self.noise_hidden = None - else: - self.noise_in = None - self.noise_hidden = None - - def forward(self, input, hx): - return self.__forward( - input, hx, - self.weight_ih, self.weight_hh, - self.bias_ih, self.bias_hh, - self.noise_in, self.noise_hidden, - ) - - @staticmethod - def __forward(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None): - if noise_in is not None: - if input.is_cuda: - input = input * noise_in.cuda(input.get_device()) - else: - input = input * noise_in - - if input.is_cuda: - w_ih = w_ih.cuda(input.get_device()) - w_hh = w_hh.cuda(input.get_device()) - hidden = [h.cuda(input.get_device()) for h in hidden] - b_ih = b_ih.cuda(input.get_device()) - b_hh = b_hh.cuda(input.get_device()) - igates = F.linear(input, w_ih.cuda(input.get_device())) - hgates = F.linear(hidden[0], w_hh) if noise_hidden is None \ - else F.linear(hidden[0] * noise_hidden.cuda(input.get_device()), w_hh) - state = fusedBackend.LSTMFused.apply - # print("use backend") - # use some magic function - return state(igates, hgates, hidden[1]) if b_ih is None else state(igates, hgates, hidden[1], b_ih, b_hh) - - hx, cx = hidden - if noise_hidden is not None: - hx = hx * noise_hidden - gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh) - - ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1) - - ingate = F.sigmoid(ingate) - forgetgate = F.sigmoid(forgetgate) - cellgate = F.tanh(cellgate) - outgate = F.sigmoid(outgate) - - cy = (forgetgate * cx) + (ingate * cellgate) - hy = outgate * F.tanh(cy) - - return hy, cy +class VarGRU(VarRNNBase): + """Variational Dropout GRU. 
+ """ + def __init__(self, *args, **kwargs): + super(VarGRU, self).__init__(mode="GRU", Cell=nn.GRUCell, *args, **kwargs) diff --git a/reproduction/Biaffine_parser/cfg.cfg b/reproduction/Biaffine_parser/cfg.cfg new file mode 100644 index 00000000..946e4c51 --- /dev/null +++ b/reproduction/Biaffine_parser/cfg.cfg @@ -0,0 +1,37 @@ +[train] +epochs = 50 +batch_size = 16 +pickle_path = "./save/" +validate = true +save_best_dev = false +use_cuda = true +model_saved_path = "./save/" +task = "parse" + + +[test] +save_output = true +validate_in_training = true +save_dev_input = false +save_loss = true +batch_size = 16 +pickle_path = "./save/" +use_cuda = true +task = "parse" + +[model] +word_vocab_size = -1 +word_emb_dim = 100 +pos_vocab_size = -1 +pos_emb_dim = 100 +rnn_layers = 3 +rnn_hidden_size = 400 +arc_mlp_size = 500 +label_mlp_size = 100 +num_label = -1 +dropout = 0.33 +use_var_lstm=true +use_greedy_infer=false + +[optim] +lr = 2e-3 diff --git a/reproduction/Biaffine_parser/run.py b/reproduction/Biaffine_parser/run.py new file mode 100644 index 00000000..cc8e54ad --- /dev/null +++ b/reproduction/Biaffine_parser/run.py @@ -0,0 +1,260 @@ +import os +import sys + +sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) + +from collections import defaultdict +import math +import torch + +from fastNLP.core.trainer import Trainer +from fastNLP.core.instance import Instance +from fastNLP.core.vocabulary import Vocabulary +from fastNLP.core.dataset import DataSet +from fastNLP.core.batch import Batch +from fastNLP.core.sampler import SequentialSampler +from fastNLP.core.field import TextField, SeqLabelField +from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle +from fastNLP.core.tester import Tester +from fastNLP.loader.config_loader import ConfigLoader, ConfigSection +from fastNLP.loader.model_loader import ModelLoader +from fastNLP.loader.embed_loader import EmbedLoader +from fastNLP.models.biaffine_parser import BiaffineParser +from fastNLP.saver.model_saver import ModelSaver + +# not in the file's dir +if len(os.path.dirname(__file__)) != 0: + os.chdir(os.path.dirname(__file__)) + +class MyDataLoader(object): + def __init__(self, pickle_path): + self.pickle_path = pickle_path + + def load(self, path, word_v=None, pos_v=None, headtag_v=None): + datalist = [] + with open(path, 'r', encoding='utf-8') as f: + sample = [] + for line in f: + if line.startswith('\n'): + datalist.append(sample) + sample = [] + elif line.startswith('#'): + continue + else: + sample.append(line.split('\t')) + if len(sample) > 0: + datalist.append(sample) + + ds = DataSet(name='conll') + for sample in datalist: + # print(sample) + res = self.get_one(sample) + if word_v is not None: + word_v.update(res[0]) + pos_v.update(res[1]) + headtag_v.update(res[3]) + ds.append(Instance(word_seq=TextField(res[0], is_target=False), + pos_seq=TextField(res[1], is_target=False), + head_indices=SeqLabelField(res[2], is_target=True), + head_labels=TextField(res[3], is_target=True), + seq_mask=SeqLabelField([1 for _ in range(len(res[0]))], is_target=False))) + + return ds + + def get_one(self, sample): + text = [''] + pos_tags = [''] + heads = [0] + head_tags = ['root'] + for w in sample: + t1, t2, t3, t4 = w[1], w[3], w[6], w[7] + if t3 == '_': + continue + text.append(t1) + pos_tags.append(t2) + heads.append(int(t3)) + head_tags.append(t4) + return (text, pos_tags, heads, head_tags) + + def index_data(self, dataset, word_v, pos_v, tag_v): + dataset.index_field('word_seq', word_v) + dataset.index_field('pos_seq', 
pos_v) + dataset.index_field('head_labels', tag_v) + +# datadir = "/mnt/c/Me/Dev/release-2.2-st-train-dev-data/ud-treebanks-v2.2/UD_English-EWT" +datadir = "/home/yfshao/UD_English-EWT" +cfgfile = './cfg.cfg' +train_data_name = "en_ewt-ud-train.conllu" +dev_data_name = "en_ewt-ud-dev.conllu" +emb_file_name = '/home/yfshao/glove.6B.100d.txt' +processed_datadir = './save' + +# Config Loader +train_args = ConfigSection() +test_args = ConfigSection() +model_args = ConfigSection() +optim_args = ConfigSection() +ConfigLoader.load_config(cfgfile, {"train": train_args, "test": test_args, "model": model_args, "optim": optim_args}) + +# Data Loader +def save_data(dirpath, **kwargs): + import _pickle + if not os.path.exists(dirpath): + os.mkdir(dirpath) + for name, data in kwargs.items(): + with open(os.path.join(dirpath, name+'.pkl'), 'wb') as f: + _pickle.dump(data, f) + + +def load_data(dirpath): + import _pickle + datas = {} + for f_name in os.listdir(dirpath): + if not f_name.endswith('.pkl'): + continue + name = f_name[:-4] + with open(os.path.join(dirpath, f_name), 'rb') as f: + datas[name] = _pickle.load(f) + return datas + +class MyTester(object): + def __init__(self, batch_size, use_cuda=False, **kwagrs): + self.batch_size = batch_size + self.use_cuda = use_cuda + + def test(self, model, dataset): + self.model = model.cuda() if self.use_cuda else model + self.model.eval() + batchiter = Batch(dataset, self.batch_size, SequentialSampler(), self.use_cuda) + eval_res = defaultdict(list) + i = 0 + for batch_x, batch_y in batchiter: + with torch.no_grad(): + pred_y = self.model(**batch_x) + eval_one = self.model.evaluate(**pred_y, **batch_y) + i += self.batch_size + for eval_name, tensor in eval_one.items(): + eval_res[eval_name].append(tensor) + tmp = {} + for eval_name, tensorlist in eval_res.items(): + tmp[eval_name] = torch.cat(tensorlist, dim=0) + + self.res = self.model.metrics(**tmp) + + def show_metrics(self): + s = "" + for name, val in self.res.items(): + s += '{}: {:.2f}\t'.format(name, val) + return s + + +loader = MyDataLoader('') +try: + data_dict = load_data(processed_datadir) + word_v = data_dict['word_v'] + pos_v = data_dict['pos_v'] + tag_v = data_dict['tag_v'] + train_data = data_dict['train_data'] + dev_data = data_dict['dev_data'] + print('use saved pickles') + +except Exception as _: + print('load raw data and preprocess') + word_v = Vocabulary(need_default=True, min_freq=2) + pos_v = Vocabulary(need_default=True) + tag_v = Vocabulary(need_default=False) + train_data = loader.load(os.path.join(datadir, train_data_name), word_v, pos_v, tag_v) + dev_data = loader.load(os.path.join(datadir, dev_data_name)) + save_data(processed_datadir, word_v=word_v, pos_v=pos_v, tag_v=tag_v, train_data=train_data, dev_data=dev_data) + +loader.index_data(train_data, word_v, pos_v, tag_v) +loader.index_data(dev_data, word_v, pos_v, tag_v) +print(len(train_data)) +print(len(dev_data)) +ep = train_args['epochs'] +train_args['epochs'] = math.ceil(50000.0 / len(train_data) * train_args['batch_size']) if ep <= 0 else ep +model_args['word_vocab_size'] = len(word_v) +model_args['pos_vocab_size'] = len(pos_v) +model_args['num_label'] = len(tag_v) + + +def train(): + # Trainer + trainer = Trainer(**train_args.data) + + def _define_optim(obj): + obj._optimizer = torch.optim.Adam(obj._model.parameters(), **optim_args.data) + obj._scheduler = torch.optim.lr_scheduler.LambdaLR(obj._optimizer, lambda ep: .75 ** (ep / 5e4)) + + def _update(obj): + obj._scheduler.step() + obj._optimizer.step() + + 
trainer.define_optimizer = lambda: _define_optim(trainer) + trainer.update = lambda: _update(trainer) + trainer.get_loss = lambda predict, truth: trainer._loss_func(**predict, **truth) + trainer._create_validator = lambda x: MyTester(**test_args.data) + + # Model + model = BiaffineParser(**model_args.data) + + # use pretrain embedding + embed, _ = EmbedLoader.load_embedding(model_args['word_emb_dim'], emb_file_name, 'glove', word_v, os.path.join(processed_datadir, 'word_emb.pkl')) + model.word_embedding = torch.nn.Embedding.from_pretrained(embed, freeze=False) + model.word_embedding.padding_idx = word_v.padding_idx + model.word_embedding.weight.data[word_v.padding_idx].fill_(0) + model.pos_embedding.padding_idx = pos_v.padding_idx + model.pos_embedding.weight.data[pos_v.padding_idx].fill_(0) + + try: + ModelLoader.load_pytorch(model, "./save/saved_model.pkl") + print('model parameter loaded!') + except Exception as _: + print("No saved model. Continue.") + pass + + # Start training + trainer.train(model, train_data, dev_data) + print("Training finished!") + + # Saver + saver = ModelSaver("./save/saved_model.pkl") + saver.save_pytorch(model) + print("Model saved!") + + +def test(): + # Tester + tester = MyTester(**test_args.data) + + # Model + model = BiaffineParser(**model_args.data) + + try: + ModelLoader.load_pytorch(model, "./save/saved_model.pkl") + print('model parameter loaded!') + except Exception as _: + print("No saved model. Abort test.") + raise + + # Start training + tester.test(model, dev_data) + print(tester.show_metrics()) + print("Testing finished!") + + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser(description='Run a chinese word segmentation model') + parser.add_argument('--mode', help='set the model\'s model', choices=['train', 'test', 'infer']) + args = parser.parse_args() + if args.mode == 'train': + train() + elif args.mode == 'test': + test() + elif args.mode == 'infer': + infer() + else: + print('no mode specified for model!') + parser.print_help() diff --git a/reproduction/Char-aware_NLM/main.py b/reproduction/Char-aware_NLM/main.py index 03810650..6467d98d 100644 --- a/reproduction/Char-aware_NLM/main.py +++ b/reproduction/Char-aware_NLM/main.py @@ -1,24 +1,8 @@ -from fastNLP.core.loss import Loss -from fastNLP.core.preprocess import Preprocessor -from fastNLP.core.trainer import Trainer -from fastNLP.loader.dataset_loader import LMDataSetLoader -from fastNLP.models.char_language_model import CharLM - PICKLE = "./save/" def train(): - loader = LMDataSetLoader() - train_data = loader.load() - - pre = Preprocessor(label_is_seq=True, share_vocab=True) - train_set = pre.run(train_data, pickle_path=PICKLE) - - model = CharLM(50, 50, pre.vocab_size, pre.char_vocab_size) - - trainer = Trainer(task="language_model", loss=Loss("cross_entropy")) - - trainer.train(model, train_set) + pass if __name__ == "__main__": diff --git a/reproduction/chinese_word_segment/run.py b/reproduction/chinese_word_segment/run.py index f940c5b8..df597942 100644 --- a/reproduction/chinese_word_segment/run.py +++ b/reproduction/chinese_word_segment/run.py @@ -12,7 +12,7 @@ from fastNLP.loader.model_loader import ModelLoader from fastNLP.core.tester import SeqLabelTester from fastNLP.models.sequence_modeling import AdvSeqLabel from fastNLP.core.predictor import SeqLabelInfer -from fastNLP.core.dataset import SeqLabelDataSet, change_field_is_target +from fastNLP.core.dataset import DataSet from fastNLP.core.preprocess import save_pickle from fastNLP.core.metrics 
import SeqLabelEvaluator diff --git a/test/core/test_batch.py b/test/core/test_batch.py index 5de91da8..826167ac 100644 --- a/test/core/test_batch.py +++ b/test/core/test_batch.py @@ -3,7 +3,7 @@ import unittest import torch from fastNLP.core.batch import Batch -from fastNLP.core.dataset import DataSet, create_dataset_from_lists +from fastNLP.core.dataset import DataSet from fastNLP.core.field import TextField, LabelField from fastNLP.core.instance import Instance @@ -51,14 +51,3 @@ class TestCase1(unittest.TestCase): self.assertTrue(isinstance(batch_x["text"], torch.LongTensor)) self.assertTrue(isinstance(batch_y, dict)) self.assertTrue(isinstance(batch_y["label"], torch.LongTensor)) - - -class TestCase2(unittest.TestCase): - def test(self): - data = DataSet() - for text in texts: - x = TextField(text, is_target=False) - ins = Instance(text=x) - data.append(ins) - data_set = create_dataset_from_lists(texts, vocab, has_target=False) - self.assertTrue(type(data) == type(data_set)) diff --git a/test/core/test_dataset.py b/test/core/test_dataset.py index 9b79c840..c30cd37f 100644 --- a/test/core/test_dataset.py +++ b/test/core/test_dataset.py @@ -1,7 +1,6 @@ import unittest -from fastNLP.core.dataset import SeqLabelDataSet, TextClassifyDataSet -from fastNLP.core.dataset import create_dataset_from_lists +from fastNLP.loader.dataset_loader import convert_seq2seq_dataset, convert_seq_dataset class TestDataSet(unittest.TestCase): @@ -19,8 +18,9 @@ class TestDataSet(unittest.TestCase): label_vocab = {"1": 1, "2": 2, "3": 3, "4": 4} def test_case_1(self): - data_set = create_dataset_from_lists(self.labeled_data_list, self.word_vocab, has_target=True, - label_vocab=self.label_vocab) + data_set = convert_seq2seq_dataset(self.labeled_data_list) + data_set.index_field("word_seq", self.word_vocab) + data_set.index_field("label_seq", self.label_vocab) self.assertEqual(len(data_set), len(self.labeled_data_list)) self.assertTrue(len(data_set) > 0) self.assertTrue(hasattr(data_set[0], "fields")) @@ -39,7 +39,8 @@ class TestDataSet(unittest.TestCase): [self.label_vocab[c] for c in self.labeled_data_list[0][1]]) def test_case_2(self): - data_set = create_dataset_from_lists(self.unlabeled_data_list, self.word_vocab, has_target=False) + data_set = convert_seq_dataset(self.unlabeled_data_list) + data_set.index_field("word_seq", self.word_vocab) self.assertEqual(len(data_set), len(self.unlabeled_data_list)) self.assertTrue(len(data_set) > 0) @@ -51,193 +52,3 @@ class TestDataSet(unittest.TestCase): self.assertEqual(data_set[0].fields["word_seq"]._index, [self.word_vocab[c] for c in self.unlabeled_data_list[0]]) - -class TestDataSetConvertion(unittest.TestCase): - labeled_data_list = [ - [["a", "b", "e", "d"], ["1", "2", "3", "4"]], - [["a", "b", "e", "d"], ["1", "2", "3", "4"]], - [["a", "b", "e", "d"], ["1", "2", "3", "4"]], - ] - unlabeled_data_list = [ - ["a", "b", "e", "d"], - ["a", "b", "e", "d"], - ["a", "b", "e", "d"] - ] - word_vocab = {"a": 0, "b": 1, "e": 2, "d": 3} - label_vocab = {"1": 1, "2": 2, "3": 3, "4": 4} - - def test_case_1(self): - def loader(path): - labeled_data_list = [ - [["a", "b", "e", "d"], ["1", "2", "3", "4"]], - [["a", "b", "e", "d"], ["1", "2", "3", "4"]], - [["a", "b", "e", "d"], ["1", "2", "3", "4"]], - ] - return labeled_data_list - - data_set = SeqLabelDataSet(load_func=loader) - data_set.load("any_path") - - self.assertEqual(len(data_set), len(self.labeled_data_list)) - self.assertTrue(len(data_set) > 0) - self.assertTrue(hasattr(data_set[0], "fields")) - 
self.assertTrue("word_seq" in data_set[0].fields) - - self.assertTrue(hasattr(data_set[0].fields["word_seq"], "text")) - self.assertTrue(hasattr(data_set[0].fields["word_seq"], "_index")) - self.assertEqual(data_set[0].fields["word_seq"].text, self.labeled_data_list[0][0]) - - self.assertTrue("truth" in data_set[0].fields) - self.assertTrue(hasattr(data_set[0].fields["truth"], "text")) - self.assertTrue(hasattr(data_set[0].fields["truth"], "_index")) - self.assertEqual(data_set[0].fields["truth"].text, self.labeled_data_list[0][1]) - - self.assertTrue("word_seq_origin_len" in data_set[0].fields) - - def test_case_2(self): - def loader(path): - unlabeled_data_list = [ - ["a", "b", "e", "d"], - ["a", "b", "e", "d"], - ["a", "b", "e", "d"] - ] - return unlabeled_data_list - - data_set = SeqLabelDataSet(load_func=loader) - data_set.load("any_path", vocabs={"word_vocab": self.word_vocab}, infer=True) - - self.assertEqual(len(data_set), len(self.labeled_data_list)) - self.assertTrue(len(data_set) > 0) - self.assertTrue(hasattr(data_set[0], "fields")) - self.assertTrue("word_seq" in data_set[0].fields) - self.assertTrue(hasattr(data_set[0].fields["word_seq"], "text")) - self.assertTrue(hasattr(data_set[0].fields["word_seq"], "_index")) - self.assertEqual(data_set[0].fields["word_seq"].text, self.labeled_data_list[0][0]) - self.assertEqual(data_set[0].fields["word_seq"]._index, - [self.word_vocab[c] for c in self.labeled_data_list[0][0]]) - - self.assertTrue("word_seq_origin_len" in data_set[0].fields) - - def test_case_3(self): - def loader(path): - labeled_data_list = [ - [["a", "b", "e", "d"], ["1", "2", "3", "4"]], - [["a", "b", "e", "d"], ["1", "2", "3", "4"]], - [["a", "b", "e", "d"], ["1", "2", "3", "4"]], - ] - return labeled_data_list - - data_set = SeqLabelDataSet(load_func=loader) - data_set.load("any_path", vocabs={"word_vocab": self.word_vocab, "label_vocab": self.label_vocab}) - - self.assertEqual(len(data_set), len(self.labeled_data_list)) - self.assertTrue(len(data_set) > 0) - self.assertTrue(hasattr(data_set[0], "fields")) - self.assertTrue("word_seq" in data_set[0].fields) - self.assertTrue(hasattr(data_set[0].fields["word_seq"], "text")) - self.assertTrue(hasattr(data_set[0].fields["word_seq"], "_index")) - self.assertEqual(data_set[0].fields["word_seq"].text, self.labeled_data_list[0][0]) - self.assertEqual(data_set[0].fields["word_seq"]._index, - [self.word_vocab[c] for c in self.labeled_data_list[0][0]]) - - self.assertTrue("truth" in data_set[0].fields) - self.assertTrue(hasattr(data_set[0].fields["truth"], "text")) - self.assertTrue(hasattr(data_set[0].fields["truth"], "_index")) - self.assertEqual(data_set[0].fields["truth"].text, self.labeled_data_list[0][1]) - self.assertEqual(data_set[0].fields["truth"]._index, - [self.label_vocab[c] for c in self.labeled_data_list[0][1]]) - - self.assertTrue("word_seq_origin_len" in data_set[0].fields) - - -class TestDataSetConvertionHHH(unittest.TestCase): - labeled_data_list = [ - [["a", "b", "e", "d"], "A"], - [["a", "b", "e", "d"], "C"], - [["a", "b", "e", "d"], "B"], - ] - unlabeled_data_list = [ - ["a", "b", "e", "d"], - ["a", "b", "e", "d"], - ["a", "b", "e", "d"] - ] - word_vocab = {"a": 0, "b": 1, "e": 2, "d": 3} - label_vocab = {"A": 1, "B": 2, "C": 3} - - def test_case_1(self): - def loader(path): - labeled_data_list = [ - [["a", "b", "e", "d"], "A"], - [["a", "b", "e", "d"], "C"], - [["a", "b", "e", "d"], "B"], - ] - return labeled_data_list - - data_set = TextClassifyDataSet(load_func=loader) - data_set.load("xxx") - - 
self.assertEqual(len(data_set), len(self.labeled_data_list)) - self.assertTrue(len(data_set) > 0) - self.assertTrue(hasattr(data_set[0], "fields")) - self.assertTrue("word_seq" in data_set[0].fields) - - self.assertTrue(hasattr(data_set[0].fields["word_seq"], "text")) - self.assertTrue(hasattr(data_set[0].fields["word_seq"], "_index")) - self.assertEqual(data_set[0].fields["word_seq"].text, self.labeled_data_list[0][0]) - - self.assertTrue("label" in data_set[0].fields) - self.assertTrue(hasattr(data_set[0].fields["label"], "label")) - self.assertTrue(hasattr(data_set[0].fields["label"], "_index")) - self.assertEqual(data_set[0].fields["label"].label, self.labeled_data_list[0][1]) - - def test_case_2(self): - def loader(path): - labeled_data_list = [ - [["a", "b", "e", "d"], "A"], - [["a", "b", "e", "d"], "C"], - [["a", "b", "e", "d"], "B"], - ] - return labeled_data_list - - data_set = TextClassifyDataSet(load_func=loader) - data_set.load("xxx", vocabs={"word_vocab": self.word_vocab, "label_vocab": self.label_vocab}) - - self.assertEqual(len(data_set), len(self.labeled_data_list)) - self.assertTrue(len(data_set) > 0) - self.assertTrue(hasattr(data_set[0], "fields")) - self.assertTrue("word_seq" in data_set[0].fields) - - self.assertTrue(hasattr(data_set[0].fields["word_seq"], "text")) - self.assertTrue(hasattr(data_set[0].fields["word_seq"], "_index")) - self.assertEqual(data_set[0].fields["word_seq"].text, self.labeled_data_list[0][0]) - self.assertEqual(data_set[0].fields["word_seq"]._index, - [self.word_vocab[c] for c in self.labeled_data_list[0][0]]) - - self.assertTrue("label" in data_set[0].fields) - self.assertTrue(hasattr(data_set[0].fields["label"], "label")) - self.assertTrue(hasattr(data_set[0].fields["label"], "_index")) - self.assertEqual(data_set[0].fields["label"].label, self.labeled_data_list[0][1]) - self.assertEqual(data_set[0].fields["label"]._index, self.label_vocab[self.labeled_data_list[0][1]]) - - def test_case_3(self): - def loader(path): - unlabeled_data_list = [ - ["a", "b", "e", "d"], - ["a", "b", "e", "d"], - ["a", "b", "e", "d"] - ] - return unlabeled_data_list - - data_set = TextClassifyDataSet(load_func=loader) - data_set.load("xxx", vocabs={"word_vocab": self.word_vocab}, infer=True) - - self.assertEqual(len(data_set), len(self.labeled_data_list)) - self.assertTrue(len(data_set) > 0) - self.assertTrue(hasattr(data_set[0], "fields")) - self.assertTrue("word_seq" in data_set[0].fields) - - self.assertTrue(hasattr(data_set[0].fields["word_seq"], "text")) - self.assertTrue(hasattr(data_set[0].fields["word_seq"], "_index")) - self.assertEqual(data_set[0].fields["word_seq"].text, self.labeled_data_list[0][0]) - self.assertEqual(data_set[0].fields["word_seq"]._index, - [self.word_vocab[c] for c in self.labeled_data_list[0][0]]) diff --git a/test/core/test_predictor.py b/test/core/test_predictor.py index 8bd5a7ab..84275478 100644 --- a/test/core/test_predictor.py +++ b/test/core/test_predictor.py @@ -1,11 +1,12 @@ import os import unittest -from fastNLP.core.dataset import TextClassifyDataSet, SeqLabelDataSet +from fastNLP.core.dataset import DataSet from fastNLP.core.predictor import Predictor from fastNLP.core.preprocess import save_pickle from fastNLP.core.vocabulary import Vocabulary from fastNLP.loader.base_loader import BaseLoader +from fastNLP.loader.dataset_loader import convert_seq_dataset from fastNLP.models.cnn_text_classification import CNNText from fastNLP.models.sequence_modeling import SeqLabeling @@ -42,8 +43,8 @@ class 
TestPredictor(unittest.TestCase): predictor = Predictor("./save/", pre.text_classify_post_processor) # Load infer data - infer_data_set = TextClassifyDataSet(load_func=BaseLoader.load) - infer_data_set.convert_for_infer(infer_data, vocabs={"word_vocab": vocab.word2idx}) + infer_data_set = convert_seq_dataset(infer_data) + infer_data_set.index_field("word_seq", vocab) results = predictor.predict(network=model, data=infer_data_set) @@ -54,14 +55,12 @@ class TestPredictor(unittest.TestCase): self.assertTrue(isinstance(res, str)) self.assertTrue(res in class_vocab.word2idx) - del model, predictor, infer_data_set + del model, predictor + infer_data_set.set_origin_len("word_seq") model = SeqLabeling(model_args) predictor = Predictor("./save/", pre.seq_label_post_processor) - infer_data_set = SeqLabelDataSet(load_func=BaseLoader.load) - infer_data_set.convert_for_infer(infer_data, vocabs={"word_vocab": vocab.word2idx}) - results = predictor.predict(network=model, data=infer_data_set) self.assertTrue(isinstance(results, list)) self.assertEqual(len(results), len(infer_data)) diff --git a/test/core/test_preprocess.py b/test/core/test_preprocess.py deleted file mode 100644 index 05c04ce9..00000000 --- a/test/core/test_preprocess.py +++ /dev/null @@ -1,72 +0,0 @@ -import os -import unittest - -from fastNLP.core.dataset import DataSet -from fastNLP.core.preprocess import SeqLabelPreprocess - -data = [ - [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']], - [['Hello', 'world', '!'], ['a', 'n', '.']], - [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']], - [['Hello', 'world', '!'], ['a', 'n', '.']], - [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']], - [['Hello', 'world', '!'], ['a', 'n', '.']], - [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']], - [['Hello', 'world', '!'], ['a', 'n', '.']], - [['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']], - [['Hello', 'world', '!'], ['a', 'n', '.']], -] - - -class TestCase1(unittest.TestCase): - def test(self): - if os.path.exists("./save"): - for root, dirs, files in os.walk("./save", topdown=False): - for name in files: - os.remove(os.path.join(root, name)) - for name in dirs: - os.rmdir(os.path.join(root, name)) - result = SeqLabelPreprocess().run(train_dev_data=data, train_dev_split=0.4, - pickle_path="./save") - self.assertEqual(len(result), 2) - self.assertEqual(type(result[0]), DataSet) - self.assertEqual(type(result[1]), DataSet) - - os.system("rm -rf save") - print("pickle path deleted") - - -class TestCase2(unittest.TestCase): - def test(self): - if os.path.exists("./save"): - for root, dirs, files in os.walk("./save", topdown=False): - for name in files: - os.remove(os.path.join(root, name)) - for name in dirs: - os.rmdir(os.path.join(root, name)) - result = SeqLabelPreprocess().run(test_data=data, train_dev_data=data, - pickle_path="./save", train_dev_split=0.4, - cross_val=False) - self.assertEqual(len(result), 3) - self.assertEqual(type(result[0]), DataSet) - self.assertEqual(type(result[1]), DataSet) - self.assertEqual(type(result[2]), DataSet) - - os.system("rm -rf save") - print("pickle path deleted") - - -class TestCase3(unittest.TestCase): - def test(self): - num_folds = 2 - result = SeqLabelPreprocess().run(test_data=None, train_dev_data=data, - pickle_path="./save", train_dev_split=0.4, - cross_val=True, n_fold=num_folds) - self.assertEqual(len(result), 2) - self.assertEqual(len(result[0]), num_folds) - self.assertEqual(len(result[1]), num_folds) - for data_set in result[0] + result[1]: - self.assertEqual(type(data_set), DataSet) 
- - os.system("rm -rf save") - print("pickle path deleted") diff --git a/test/core/test_tester.py b/test/core/test_tester.py index 1118f284..5ae67e3f 100644 --- a/test/core/test_tester.py +++ b/test/core/test_tester.py @@ -1,7 +1,7 @@ import os import unittest -from fastNLP.core.dataset import SeqLabelDataSet +from fastNLP.core.dataset import DataSet from fastNLP.core.metrics import SeqLabelEvaluator from fastNLP.core.field import TextField, LabelField from fastNLP.core.instance import Instance @@ -35,7 +35,7 @@ class TestTester(unittest.TestCase): vocab = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, '!': 5, '@': 6, '#': 7, '$': 8, '?': 9} label_vocab = {'a': 0, '@': 1, 'c': 2, 'd': 3, 'e': 4} - data_set = SeqLabelDataSet() + data_set = DataSet() for example in train_data: text, label = example[0], example[1] x = TextField(text, False) diff --git a/test/core/test_trainer.py b/test/core/test_trainer.py index b4a9178f..98ef879f 100644 --- a/test/core/test_trainer.py +++ b/test/core/test_trainer.py @@ -1,7 +1,7 @@ import os import unittest -from fastNLP.core.dataset import SeqLabelDataSet +from fastNLP.core.dataset import DataSet from fastNLP.core.metrics import SeqLabelEvaluator from fastNLP.core.field import TextField, LabelField from fastNLP.core.instance import Instance @@ -36,7 +36,7 @@ class TestTrainer(unittest.TestCase): vocab = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, '!': 5, '@': 6, '#': 7, '$': 8, '?': 9} label_vocab = {'a': 0, '@': 1, 'c': 2, 'd': 3, 'e': 4} - data_set = SeqLabelDataSet() + data_set = DataSet() for example in train_data: text, label = example[0], example[1] x = TextField(text, False) diff --git a/test/data_for_tests/config b/test/data_for_tests/config index 3f4ff7af..1180c97a 100644 --- a/test/data_for_tests/config +++ b/test/data_for_tests/config @@ -45,3 +45,28 @@ use_cuda = true learn_rate = 1e-3 momentum = 0.9 model_name = "class_model.pkl" + +[snli_trainer] +epochs = 5 +batch_size = 32 +validate = true +save_best_dev = true +use_cuda = true +learn_rate = 1e-4 +loss = "cross_entropy" +print_every_step = 1000 + +[snli_tester] +batch_size = 512 +use_cuda = true + +[snli_model] +model_name = "snli_model.pkl" +embed_dim = 300 +hidden_size = 300 +batch_first = true +dropout = 0.5 +gpu = true +embed_file = "./../data_for_tests/glove.840B.300d.txt" +embed_pkl = "./snli/embed.pkl" +examples = 0 diff --git a/test/data_for_tests/glove.6B.50d_test.txt b/test/data_for_tests/glove.6B.50d_test.txt new file mode 100644 index 00000000..cd71b26e --- /dev/null +++ b/test/data_for_tests/glove.6B.50d_test.txt @@ -0,0 +1,12 @@ +the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581 +, 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709 -0.42852 -0.55641 -0.364 -0.23938 0.13001 -0.063734 -0.39575 -0.48162 0.23291 0.090201 -0.13324 0.078639 -0.41634 -0.15428 0.10068 0.48891 0.31226 -0.1252 -0.037512 -1.5179 0.12612 -0.02442 -0.042961 -0.28351 3.5416 -0.11956 -0.014533 -0.1499 0.21864 -0.33412 -0.13872 0.31806 0.70358 0.44858 -0.080262 0.63003 0.32111 -0.46765 0.22786 0.36034 -0.37818 -0.56657 0.044691 0.30392 +. 
0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973 -0.43478 -0.31086 -0.44999 -0.29486 0.16608 0.11963 -0.41328 -0.42353 0.59868 0.28825 -0.11547 -0.041848 -0.67989 -0.25063 0.18472 0.086876 0.46582 0.015035 0.043474 -1.4671 -0.30384 -0.023441 0.30589 -0.21785 3.746 0.0042284 -0.18436 -0.46209 0.098329 -0.11907 0.23919 0.1161 0.41705 0.056763 -6.3681e-05 0.068987 0.087939 -0.10285 -0.13931 0.22314 -0.080803 -0.35652 0.016413 0.10216 +of 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603 0.18157 -0.52393 0.10381 -0.17566 0.078852 -0.36216 -0.11829 -0.83336 0.11917 -0.16605 0.061555 -0.012719 -0.56623 0.013616 0.22851 -0.14396 -0.067549 -0.38157 -0.23698 -1.7037 -0.86692 -0.26704 -0.2589 0.1767 3.8676 -0.1613 -0.13273 -0.68881 0.18444 0.0052464 -0.33874 -0.078956 0.24185 0.36576 -0.34727 0.28483 0.075693 -0.062178 -0.38988 0.22902 -0.21617 -0.22562 -0.093918 -0.80375 +to 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246 -0.41376 0.13228 -0.29847 -0.085253 0.17118 0.22419 -0.10046 -0.43653 0.33418 0.67846 0.057204 -0.34448 -0.42785 -0.43275 0.55963 0.10032 0.18677 -0.26854 0.037334 -2.0932 0.22171 -0.39868 0.20912 -0.55725 3.8826 0.47466 -0.95658 -0.37788 0.20869 -0.32752 0.12751 0.088359 0.16351 -0.21634 -0.094375 0.018324 0.21048 -0.03088 -0.19722 0.082279 -0.09434 -0.073297 -0.064699 -0.26044 +and 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923 -0.51332 -0.47368 -0.33075 -0.13834 0.2702 0.30938 -0.45012 -0.4127 -0.09932 0.038085 0.029749 0.10076 -0.25058 -0.51818 0.34558 0.44922 0.48791 -0.080866 -0.10121 -1.3777 -0.10866 -0.23201 0.012839 -0.46508 3.8463 0.31362 0.13643 -0.52244 0.3302 0.33707 -0.35601 0.32431 0.12041 0.3512 -0.069043 0.36885 0.25168 -0.24517 0.25381 0.1367 -0.31178 -0.6321 -0.25028 -0.38097 +in 0.33042 0.24995 -0.60874 0.10923 0.036372 0.151 -0.55083 -0.074239 -0.092307 -0.32821 0.09598 -0.82269 -0.36717 -0.67009 0.42909 0.016496 -0.23573 0.12864 -1.0953 0.43334 0.57067 -0.1036 0.20422 0.078308 -0.42795 -1.7984 -0.27865 0.11954 -0.12689 0.031744 3.8631 -0.17786 -0.082434 -0.62698 0.26497 -0.057185 -0.073521 0.46103 0.30862 0.12498 -0.48609 -0.0080272 0.031184 -0.36576 -0.42699 0.42164 -0.11666 -0.50703 -0.027273 -0.53285 +a 0.21705 0.46515 -0.46757 0.10082 1.0135 0.74845 -0.53104 -0.26256 0.16812 0.13182 -0.24909 -0.44185 -0.21739 0.51004 0.13448 -0.43141 -0.03123 0.20674 -0.78138 -0.20148 -0.097401 0.16088 -0.61836 -0.18504 -0.12461 -2.2526 -0.22321 0.5043 0.32257 0.15313 3.9636 -0.71365 -0.67012 0.28388 0.21738 0.14433 0.25926 0.23434 0.4274 -0.44451 0.13813 0.36973 -0.64289 0.024142 -0.039315 -0.26037 0.12017 -0.043782 0.41013 0.1796 +" 0.25769 0.45629 -0.76974 -0.37679 0.59272 -0.063527 0.20545 -0.57385 -0.29009 -0.13662 0.32728 1.4719 -0.73681 -0.12036 0.71354 -0.46098 0.65248 0.48887 -0.51558 0.039951 -0.34307 -0.014087 0.86488 0.3546 0.7999 -1.4995 -1.8153 0.41128 0.23921 -0.43139 3.6623 -0.79834 -0.54538 0.16943 -0.82017 -0.3461 0.69495 -1.2256 -0.17992 -0.057474 0.030498 -0.39543 -0.38515 -1.0002 0.087599 -0.31009 -0.34677 -0.31438 0.75004 0.97065 +'s 0.23727 0.40478 -0.20547 0.58805 0.65533 0.32867 -0.81964 -0.23236 0.27428 0.24265 0.054992 0.16296 -1.2555 -0.086437 0.44536 0.096561 -0.16519 0.058378 -0.38598 0.086977 0.0033869 0.55095 -0.77697 -0.62096 0.092948 -2.5685 -0.67739 0.10151 -0.48643 -0.057805 3.1859 -0.017554 -0.16138 0.055486 -0.25885 -0.33938 -0.19928 0.26049 0.10478 -0.55934 -0.12342 0.65961 -0.51802 -0.82995 -0.082739 0.28155 -0.423 -0.27378 -0.007901 -0.030231 + + diff --git a/test/loader/test_dataset_loader.py 
b/test/loader/test_dataset_loader.py index 94a7fa71..1914bce9 100644 --- a/test/loader/test_dataset_loader.py +++ b/test/loader/test_dataset_loader.py @@ -3,7 +3,7 @@ import unittest from fastNLP.loader.dataset_loader import POSDataSetLoader, LMDataSetLoader, TokenizeDataSetLoader, \ PeopleDailyCorpusLoader, ConllLoader - +from fastNLP.core.dataset import DataSet class TestDatasetLoader(unittest.TestCase): def test_case_1(self): @@ -15,13 +15,23 @@ class TestDatasetLoader(unittest.TestCase): def test_case_TokenizeDatasetLoader(self): loader = TokenizeDataSetLoader() - data = loader.load("./test/data_for_tests/cws_pku_utf_8", max_seq_len=32) + filepath = "./test/data_for_tests/cws_pku_utf_8" + data = loader.load(filepath, max_seq_len=32) + assert len(data) > 0 + + data1 = DataSet() + data1.read_tokenize(filepath, max_seq_len=32) + assert len(data1) > 0 print("pass TokenizeDataSetLoader test!") def test_case_POSDatasetLoader(self): loader = POSDataSetLoader() + filepath = "./test/data_for_tests/people.txt" data = loader.load("./test/data_for_tests/people.txt") datas = loader.load_lines("./test/data_for_tests/people.txt") + + data1 = DataSet().read_pos(filepath) + assert len(data1) > 0 print("pass POSDataSetLoader test!") def test_case_LMDatasetLoader(self): diff --git a/test/loader/test_embed_loader.py b/test/loader/test_embed_loader.py new file mode 100644 index 00000000..560dd29e --- /dev/null +++ b/test/loader/test_embed_loader.py @@ -0,0 +1,33 @@ +import unittest +import os + +import torch + +from fastNLP.loader.embed_loader import EmbedLoader +from fastNLP.core.vocabulary import Vocabulary + + +class TestEmbedLoader(unittest.TestCase): + glove_path = './test/data_for_tests/glove.6B.50d_test.txt' + pkl_path = './save' + raw_texts = ["i am a cat", + "this is a test of new batch", + "ha ha", + "I am a good boy .", + "This is the most beautiful girl ." 
+ ] + texts = [text.strip().split() for text in raw_texts] + vocab = Vocabulary() + vocab.update(texts) + def test1(self): + emb, _ = EmbedLoader.load_embedding(50, self.glove_path, 'glove', self.vocab, self.pkl_path) + self.assertTrue(emb.shape[0] == (len(self.vocab))) + self.assertTrue(emb.shape[1] == 50) + os.remove(self.pkl_path) + + def test2(self): + try: + _ = EmbedLoader.load_embedding(100, self.glove_path, 'glove', self.vocab, self.pkl_path) + self.fail(msg="load dismatch embedding") + except ValueError: + pass diff --git a/test/model/seq_labeling.py b/test/model/seq_labeling.py index 06c67fa7..64561a4b 100644 --- a/test/model/seq_labeling.py +++ b/test/model/seq_labeling.py @@ -1,9 +1,9 @@ import os import sys + sys.path.append("..") import argparse from fastNLP.loader.config_loader import ConfigLoader, ConfigSection -from fastNLP.core.trainer import SeqLabelTrainer from fastNLP.loader.dataset_loader import BaseLoader from fastNLP.saver.model_saver import ModelSaver from fastNLP.loader.model_loader import ModelLoader @@ -82,6 +82,7 @@ def train_and_test(): save_pickle(data_set.word_vocab, pickle_path, "word2id.pkl") save_pickle(data_set.label_vocab, pickle_path, "label2id.pkl") + """ trainer = SeqLabelTrainer( epochs=trainer_args["epochs"], batch_size=trainer_args["batch_size"], @@ -92,12 +93,23 @@ def train_and_test(): model_name=model_name, optimizer=Optimizer("SGD", lr=0.01, momentum=0.9), ) + """ # Model model = SeqLabeling(model_args) + model.fit(train_set, dev_set, + epochs=trainer_args["epochs"], + batch_size=trainer_args["batch_size"], + validate=False, + use_cuda=trainer_args["use_cuda"], + pickle_path=pickle_path, + save_best_dev=trainer_args["save_best_dev"], + model_name=model_name, + optimizer=Optimizer("SGD", lr=0.01, momentum=0.9)) + # Start training - trainer.train(model, train_set, dev_set) + # trainer.train(model, train_set, dev_set) print("Training finished!") # Saver @@ -105,7 +117,7 @@ def train_and_test(): saver.save_pytorch(model) print("Model saved!") - del model, trainer + del model change_field_is_target(dev_set, "truth", True) diff --git a/test/model/test_char_language_model.py b/test/model/test_char_language_model.py new file mode 100644 index 00000000..5a7bc835 --- /dev/null +++ b/test/model/test_char_language_model.py @@ -0,0 +1,25 @@ +import unittest + +import numpy as np +import torch + +from fastNLP.models.char_language_model import CharLM + + +class TestCharLM(unittest.TestCase): + def test_case_1(self): + char_emb_dim = 50 + word_emb_dim = 50 + vocab_size = 1000 + num_char = 24 + max_word_len = 21 + num_seq = 64 + seq_len = 32 + + model = CharLM(char_emb_dim, word_emb_dim, vocab_size, num_char) + + x = torch.from_numpy(np.random.randint(0, num_char, size=(num_seq, seq_len, max_word_len + 2))) + + self.assertEqual(tuple(x.shape), (num_seq, seq_len, max_word_len + 2)) + y = model(x) + self.assertEqual(tuple(y.shape), (num_seq * seq_len, vocab_size)) diff --git a/test/model/test_cws.py b/test/model/test_cws.py index 0c43bbff..7f248dce 100644 --- a/test/model/test_cws.py +++ b/test/model/test_cws.py @@ -1,13 +1,14 @@ import os -from fastNLP.core.dataset import SeqLabelDataSet, change_field_is_target +from fastNLP.core.dataset import DataSet +from fastNLP.core.vocabulary import Vocabulary from fastNLP.core.metrics import SeqLabelEvaluator from fastNLP.core.predictor import SeqLabelInfer from fastNLP.core.preprocess import save_pickle, load_pickle from fastNLP.core.tester import SeqLabelTester from fastNLP.core.trainer import SeqLabelTrainer from 
fastNLP.loader.config_loader import ConfigLoader, ConfigSection -from fastNLP.loader.dataset_loader import TokenizeDataSetLoader, BaseLoader +from fastNLP.loader.dataset_loader import TokenizeDataSetLoader, BaseLoader, RawDataSetLoader from fastNLP.loader.model_loader import ModelLoader from fastNLP.models.sequence_modeling import SeqLabeling from fastNLP.saver.model_saver import ModelSaver @@ -37,9 +38,9 @@ def infer(): print("model loaded!") # Load infer data - infer_data = SeqLabelDataSet(load_func=BaseLoader.load) - infer_data.load(data_infer_path, vocabs={"word_vocab": word2index}, infer=True) - + infer_data = RawDataSetLoader().load(data_infer_path) + infer_data.index_field("word_seq", word2index) + infer_data.set_origin_len("word_seq") # inference infer = SeqLabelInfer(pickle_path) results = infer.predict(model, infer_data) @@ -52,13 +53,18 @@ def train_test(): ConfigLoader().load_config(config_path, {"POS_infer": train_args}) # define dataset - data_train = SeqLabelDataSet(load_func=TokenizeDataSetLoader.load) - data_train.load(cws_data_path) - train_args["vocab_size"] = len(data_train.word_vocab) - train_args["num_classes"] = len(data_train.label_vocab) - - save_pickle(data_train.word_vocab, pickle_path, "word2id.pkl") - save_pickle(data_train.label_vocab, pickle_path, "label2id.pkl") + data_train = TokenizeDataSetLoader().load(cws_data_path) + word_vocab = Vocabulary() + label_vocab = Vocabulary() + data_train.update_vocab(word_seq=word_vocab, label_seq=label_vocab) + data_train.index_field("word_seq", word_vocab).index_field("label_seq", label_vocab) + data_train.set_origin_len("word_seq") + data_train.rename_field("label_seq", "truth").set_target(truth=False) + train_args["vocab_size"] = len(word_vocab) + train_args["num_classes"] = len(label_vocab) + + save_pickle(word_vocab, pickle_path, "word2id.pkl") + save_pickle(label_vocab, pickle_path, "label2id.pkl") # Trainer trainer = SeqLabelTrainer(**train_args.data) @@ -90,7 +96,7 @@ def train_test(): tester = SeqLabelTester(**test_args.data) # Start testing - change_field_is_target(data_train, "truth", True) + data_train.set_target(truth=True) tester.test(model, data_train) diff --git a/test/model/test_seq_label.py b/test/model/test_seq_label.py index ebb62f99..09d43008 100644 --- a/test/model/test_seq_label.py +++ b/test/model/test_seq_label.py @@ -1,6 +1,7 @@ import os -from fastNLP.core.dataset import SeqLabelDataSet, change_field_is_target +from fastNLP.core.vocabulary import Vocabulary +from fastNLP.loader.dataset_loader import TokenizeDataSetLoader from fastNLP.core.metrics import SeqLabelEvaluator from fastNLP.core.optimizer import Optimizer from fastNLP.core.preprocess import save_pickle @@ -25,14 +26,19 @@ def test_training(): ConfigLoader().load_config(config_dir, { "test_seq_label_trainer": trainer_args, "test_seq_label_model": model_args}) - data_set = SeqLabelDataSet() - data_set.load(data_path) + data_set = TokenizeDataSetLoader().load(data_path) + word_vocab = Vocabulary() + label_vocab = Vocabulary() + data_set.update_vocab(word_seq=word_vocab, label_seq=label_vocab) + data_set.index_field("word_seq", word_vocab).index_field("label_seq", label_vocab) + data_set.set_origin_len("word_seq") + data_set.rename_field("label_seq", "truth").set_target(truth=False) data_train, data_dev = data_set.split(0.3, shuffle=True) - model_args["vocab_size"] = len(data_set.word_vocab) - model_args["num_classes"] = len(data_set.label_vocab) + model_args["vocab_size"] = len(word_vocab) + model_args["num_classes"] = len(label_vocab) - 
save_pickle(data_set.word_vocab, pickle_path, "word2id.pkl") - save_pickle(data_set.label_vocab, pickle_path, "label2id.pkl") + save_pickle(word_vocab, pickle_path, "word2id.pkl") + save_pickle(label_vocab, pickle_path, "label2id.pkl") trainer = SeqLabelTrainer( epochs=trainer_args["epochs"], @@ -76,5 +82,5 @@ def test_training(): ) # Start testing with validation data - change_field_is_target(data_dev, "truth", True) + data_dev.set_target(truth=True) tester.test(model, data_dev) diff --git a/test/modules/test_char_embedding.py b/test/modules/test_char_embedding.py new file mode 100644 index 00000000..07def64a --- /dev/null +++ b/test/modules/test_char_embedding.py @@ -0,0 +1,28 @@ +import unittest + +import torch + +from fastNLP.modules.encoder.char_embedding import ConvCharEmbedding, LSTMCharEmbedding + + +class TestCharEmbed(unittest.TestCase): + def test_case_1(self): + batch_size = 128 + char_emb = 100 + word_length = 1 + x = torch.Tensor(batch_size, char_emb, word_length) + x = x.transpose(1, 2) + + cce = ConvCharEmbedding(char_emb) + y = cce(x) + self.assertEqual(tuple(x.shape), (batch_size, word_length, char_emb)) + print("CNN Char Emb input: ", x.shape) + self.assertEqual(tuple(y.shape), (batch_size, char_emb, 1)) + print("CNN Char Emb output: ", y.shape) # [128, 100] + + lce = LSTMCharEmbedding(char_emb) + o = lce(x) + self.assertEqual(tuple(x.shape), (batch_size, word_length, char_emb)) + print("LSTM Char Emb input: ", x.shape) + self.assertEqual(tuple(o.shape), (batch_size, char_emb, 1)) + print("LSTM Char Emb size: ", o.shape) diff --git a/test/modules/test_variational_rnn.py b/test/modules/test_variational_rnn.py index cd265109..c3806f60 100644 --- a/test/modules/test_variational_rnn.py +++ b/test/modules/test_variational_rnn.py @@ -1,28 +1,25 @@ +import unittest +import numpy as np import torch -import unittest -from fastNLP.modules.encoder.variational_rnn import VarMaskedFastLSTM +from fastNLP.modules.encoder.variational_rnn import VarLSTM + class TestMaskedRnn(unittest.TestCase): def test_case_1(self): - masked_rnn = VarMaskedFastLSTM(input_size=1, hidden_size=1, bidirectional=True, batch_first=True) + masked_rnn = VarLSTM(input_size=1, hidden_size=1, bidirectional=True, batch_first=True) x = torch.tensor([[[1.0], [2.0]]]) print(x.size()) y = masked_rnn(x) - mask = torch.tensor([[[1], [1]]]) - y = masked_rnn(x, mask=mask) - mask = torch.tensor([[[1], [0]]]) - y = masked_rnn(x, mask=mask) + def test_case_2(self): - masked_rnn = VarMaskedFastLSTM(input_size=1, hidden_size=1, bidirectional=False, batch_first=True) - x = torch.tensor([[[1.0], [2.0]]]) - print(x.size()) - y = masked_rnn(x) - mask = torch.tensor([[[1], [1]]]) - y = masked_rnn(x, mask=mask) - xx = torch.tensor([[[1.0]]]) - #y, hidden = masked_rnn.step(xx) - #step() still has a bug - #y, hidden = masked_rnn.step(xx, mask=mask) \ No newline at end of file + input_size = 12 + batch = 16 + hidden = 10 + masked_rnn = VarLSTM(input_size=input_size, hidden_size=hidden, bidirectional=False, batch_first=True) + + xx = torch.randn((batch, 32, input_size)) + y, _ = masked_rnn(xx) + self.assertEqual(tuple(y.shape), (batch, 32, hidden))