
- add document

tags/v0.4.10
yunfan · 5 years ago
commit 001586fa3e
18 changed files with 655 additions and 327 deletions
  1. fastNLP/api/api.py (+2, -2)
  2. fastNLP/core/__init__.py (+1, -1)
  3. fastNLP/core/batch.py (+36, -26)
  4. fastNLP/core/dataset.py (+1, -1)
  5. fastNLP/core/sampler.py (+18, -41)
  6. fastNLP/core/trainer.py (+3, -3)
  7. fastNLP/core/vocabulary.py (+92, -40)
  8. fastNLP/io/dataset_loader.py (+80, -40)
  9. fastNLP/io/file_reader.py (+12, -9)
  10. fastNLP/models/biaffine_parser.py (+135, -53)
  11. fastNLP/models/star_transformer.py (+127, -28)
  12. fastNLP/modules/encoder/lstm.py (+23, -13)
  13. fastNLP/modules/encoder/star_transformer.py (+26, -21)
  14. fastNLP/modules/encoder/transformer.py (+9, -8)
  15. fastNLP/modules/encoder/variational_rnn.py (+72, -23)
  16. test/core/test_sampler.py (+1, -7)
  17. test/models/test_biaffine_parser.py (+10, -10)
  18. test/test_tutorials.py (+7, -1)

fastNLP/api/api.py (+2, -2)

@@ -47,7 +47,7 @@ from fastNLP.core.dataset import DataSet

from fastNLP.api.utils import load_url
from fastNLP.api.processor import ModelProcessor
from fastNLP.io.dataset_loader import cut_long_sentence, ConllLoader
from fastNLP.io.dataset_loader import _cut_long_sentence, ConllLoader
from fastNLP.core.instance import Instance
from fastNLP.api.pipeline import Pipeline
from fastNLP.core.metrics import SpanFPreRecMetric
@@ -107,7 +107,7 @@ class ConllCWSReader(object):
continue
line = ' '.join(res)
if cut_long_sent:
sents = cut_long_sentence(line)
sents = _cut_long_sentence(line)
else:
sents = [line]
for raw_sentence in sents:


fastNLP/core/__init__.py (+1, -1)

@@ -5,7 +5,7 @@ from .instance import Instance
from .losses import LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward
from .metrics import AccuracyMetric
from .optimizer import Optimizer, SGD, Adam
from .sampler import SequentialSampler, BucketSampler, RandomSampler, BaseSampler
from .sampler import SequentialSampler, BucketSampler, RandomSampler, Sampler
from .tester import Tester
from .trainer import Trainer
from .vocabulary import Vocabulary


fastNLP/core/batch.py (+36, -26)

@@ -2,7 +2,7 @@ import numpy as np
import torch
import atexit

from fastNLP.core.sampler import RandomSampler
from fastNLP.core.sampler import RandomSampler, Sampler
import torch.multiprocessing as mp

_python_is_exit = False
@@ -12,19 +12,25 @@ def _set_python_is_exit():
atexit.register(_set_python_is_exit)

class Batch(object):
"""Batch is an iterable object which iterates over mini-batches.

Example::

for batch_x, batch_y in Batch(data_set, batch_size=16, sampler=SequentialSampler()):
# ...

:param DataSet dataset: a DataSet object
:param int batch_size: the size of the batch
:param Sampler sampler: a Sampler object. If None, use fastNLP.sampler.RandomSampler
:param bool as_numpy: If True, return Numpy array. Otherwise, return torch tensors.
:param bool prefetch: If True, use multiprocessing to fetch next batch when training.
:param str or torch.device device: the batch's device, if as_numpy is True, device is ignored.
"""
Batch 用于从 `DataSet` 中按一定的顺序, 依次按 ``batch_size`` 的大小将数据取出.
组成 `x` 和 `y`

Example::

batch = Batch(data_set, batch_size=16, sampler=SequentialSampler())
num_batch = len(batch)
for batch_x, batch_y in batch:
# do stuff ...

:param DataSet dataset: `DataSet` 对象, 数据集
:param int batch_size: 取出的batch大小
:param Sampler sampler: 规定使用的 Sample 方式. 若为 ``None`` , 使用 RandomSampler.
Default: ``None``
:param bool as_numpy: 若为 ``True`` , 输出batch为 numpy.array. 否则为 torch.Tensor.
Default: ``False``
:param bool prefetch: 若为 ``True`` 使用多进程预先取出下一batch.
Default: ``False``
"""

def __init__(self, dataset, batch_size, sampler=None, as_numpy=False, prefetch=False):
@@ -41,7 +47,7 @@ class Batch(object):
self.prefetch = prefetch
self.lengths = 0

def fetch_one(self):
def _fetch_one(self):
if self.curidx >= len(self.idx_list):
return None
else:
@@ -55,7 +61,7 @@ class Batch(object):
if field.is_target or field.is_input:
batch = field.get(indices)
if not self.as_numpy and field.padder is not None:
batch = to_tensor(batch, field.dtype)
batch = _to_tensor(batch, field.dtype)
if field.is_target:
batch_y[field_name] = batch
if field.is_input:
@@ -70,17 +76,17 @@ class Batch(object):
:return:
"""
if self.prefetch:
return run_batch_iter(self)
return _run_batch_iter(self)
def batch_iter():
self.init_iter()
self._init_iter()
while 1:
res = self.fetch_one()
res = self._fetch_one()
if res is None:
break
yield res
return batch_iter()

def init_iter(self):
def _init_iter(self):
self.idx_list = self.sampler(self.dataset)
self.curidx = 0
self.lengths = self.dataset.get_length()
@@ -89,10 +95,14 @@ class Batch(object):
return self.num_batches

def get_batch_indices(self):
"""取得当前batch在DataSet中所在的index下标序列

:return list(int) indexes: 下标序列
"""
return self.cur_batch_indices


def to_tensor(batch, dtype):
def _to_tensor(batch, dtype):
try:
if dtype in (int, np.int8, np.int16, np.int32, np.int64):
batch = torch.LongTensor(batch)
@@ -103,12 +113,12 @@ def to_tensor(batch, dtype):
return batch


def run_fetch(batch, q):
def _run_fetch(batch, q):
global _python_is_exit
batch.init_iter()
batch._init_iter()
# print('start fetch')
while 1:
res = batch.fetch_one()
res = batch._fetch_one()
# print('fetch one')
while 1:
try:
@@ -124,9 +134,9 @@ def run_fetch(batch, q):
# print('fetch exit')


def run_batch_iter(batch):
def _run_batch_iter(batch):
q = mp.JoinableQueue(maxsize=10)
fetch_p = mp.Process(target=run_fetch, args=(batch, q))
fetch_p = mp.Process(target=_run_fetch, args=(batch, q))
fetch_p.daemon = True
fetch_p.start()
# print('fork fetch process')
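To make the renamed private helpers easier to follow, here is a minimal usage sketch of the Batch class as documented above; the toy DataSet and field names are illustrative, not part of this commit::

    from fastNLP.core.dataset import DataSet
    from fastNLP.core.batch import Batch
    from fastNLP.core.sampler import SequentialSampler

    # toy DataSet: 'x' is marked as input, 'y' as target
    data_set = DataSet({'x': [[1, 2], [3, 4], [5, 6], [7, 8]], 'y': [0, 1, 0, 1]})
    data_set.set_input('x')
    data_set.set_target('y')

    batch = Batch(data_set, batch_size=2, sampler=SequentialSampler(), as_numpy=False)
    num_batch = len(batch)                  # number of mini-batches per epoch
    for batch_x, batch_y in batch:          # dicts: {'x': tensor}, {'y': tensor}
        pass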


fastNLP/core/dataset.py (+1, -1)

@@ -482,7 +482,7 @@ class DataSet(object):

"""
import warnings
warnings.warn('read_csv is deprecated, use CSVLoader instead',
warnings.warn('DataSet.read_csv is deprecated, use CSVLoader instead',
category=DeprecationWarning)
with open(csv_path, "r", encoding='utf-8') as f:
start_idx = 0


fastNLP/core/sampler.py (+18, -41)

@@ -3,72 +3,49 @@ from itertools import chain
import numpy as np
import torch

class Sampler(object):
""" `Sampler` 类的基类. 规定以何种顺序取出data中的元素

def convert_to_torch_tensor(data_list, use_cuda):
"""Convert lists into (cuda) Tensors.

:param data_list: 2-level lists
:param use_cuda: bool, whether to use GPU or not
:return data_list: PyTorch Tensor of shape [batch_size, max_seq_len]
"""
data_list = torch.Tensor(data_list).long()
if torch.cuda.is_available() and use_cuda:
data_list = data_list.cuda()
return data_list


class BaseSampler(object):
"""The base class of all samplers.

Sub-classes must implement the ``__call__`` method.
``__call__`` takes a DataSet object and returns a list of int - the sampling indices.
子类必须实现 ``__call__`` 方法. 输入 `DataSet` 对象, 返回其中元素的下标序列
"""

def __call__(self, *args, **kwargs):
def __call__(self, data_set):
"""
:param DataSet data_set: `DataSet` 对象, 需要Sample的数据
:return result: list(int) 其中元素的下标序列, ``data_set`` 中元素会按 ``result`` 中顺序取出
"""
raise NotImplementedError


class SequentialSampler(BaseSampler):
"""Sample data in the original order.
class SequentialSampler(Sampler):
"""顺序取出元素的 `Sampler`

"""
def __call__(self, data_set):
"""

:param DataSet data_set:
:return result: a list of integers.
"""
return list(range(len(data_set)))


class RandomSampler(BaseSampler):
"""Sample data in random permutation order.
class RandomSampler(Sampler):
"""随机化取元素的 `Sampler`

"""
def __call__(self, data_set):
"""

:param DataSet data_set:
:return result: a list of integers.
"""
return list(np.random.permutation(len(data_set)))


class BucketSampler(BaseSampler):
"""

:param int num_buckets: the number of buckets to use.
:param int batch_size: batch size per epoch.
:param str seq_lens_field_name: the field name indicating the field about sequence length.
class BucketSampler(Sampler):
"""带Bucket的 `Random Sampler`. 可以随机地取出长度相似的元素

:param int num_buckets: bucket的数量
:param int batch_size: batch的大小
:param str seq_lens_field_name: 对应序列长度的 `field` 的名字
"""
def __init__(self, num_buckets=10, batch_size=32, seq_lens_field_name='seq_lens'):
def __init__(self, num_buckets=10, batch_size=32, seq_lens_field_name='seq_len'):
self.num_buckets = num_buckets
self.batch_size = batch_size
self.seq_lens_field_name = seq_lens_field_name

def __call__(self, data_set):

seq_lens = data_set.get_all_fields()[self.seq_lens_field_name].content
total_sample_num = len(seq_lens)
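The new base class only requires ``__call__``; a hedged sketch of a custom sampler built on that contract (the ``SortedSampler`` name and the ``'seq_len'`` field are hypothetical)::

    import numpy as np
    from fastNLP.core.sampler import Sampler

    class SortedSampler(Sampler):
        # hypothetical example: yield indices sorted by the 'seq_len' field, shortest first
        def __call__(self, data_set):
            seq_lens = data_set.get_all_fields()['seq_len'].content
            return list(np.argsort(seq_lens))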



fastNLP/core/trainer.py (+3, -3)

@@ -18,7 +18,7 @@ from fastNLP.core.dataset import DataSet
from fastNLP.core.losses import _prepare_losser
from fastNLP.core.metrics import _prepare_metrics
from fastNLP.core.optimizer import Adam
from fastNLP.core.sampler import BaseSampler
from fastNLP.core.sampler import Sampler
from fastNLP.core.sampler import RandomSampler
from fastNLP.core.sampler import SequentialSampler
from fastNLP.core.tester import Tester
@@ -57,7 +57,7 @@ class Trainer(object):
smaller, add "-" in front of the string. For example::

metric_key="-PPL" # language model gets better as perplexity gets smaller
:param BaseSampler sampler: method used to generate batch data.
:param Sampler sampler: method used to generate batch data.
:param prefetch: bool, 是否使用额外的进程对产生batch数据。
:param bool use_tqdm: whether to use tqdm to show train progress.
:param callbacks: List[Callback]. 用于在train过程中起调节作用的回调函数。比如early stop,negative sampling等可以
@@ -102,7 +102,7 @@ class Trainer(object):
losser = _prepare_losser(loss)

# sampler check
if sampler is not None and not isinstance(sampler, BaseSampler):
if sampler is not None and not isinstance(sampler, Sampler):
raise ValueError("The type of sampler should be fastNLP.BaseSampler, got {}.".format(type(sampler)))

if check_code_level > -1:
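A small sketch of what the type check above now accepts, using the renamed ``BucketSampler`` with its new default ``seq_lens_field_name='seq_len'``::

    from fastNLP.core.sampler import Sampler, BucketSampler

    sampler = BucketSampler(num_buckets=10, batch_size=32, seq_lens_field_name='seq_len')
    assert isinstance(sampler, Sampler)   # passes the check in Trainer.__init__ above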


fastNLP/core/vocabulary.py (+92, -40)

@@ -1,3 +1,4 @@
from functools import wraps
from collections import Counter
from fastNLP.core.dataset import DataSet

@@ -5,7 +6,7 @@ def check_build_vocab(func):
"""A decorator to make sure the indexing is built before used.

"""
@wraps(func) # to solve missing docstring
def _wrapper(self, *args, **kwargs):
if self.word2idx is None or self.rebuild is True:
self.build_vocab()
@@ -18,7 +19,7 @@ def check_build_status(func):
"""A decorator to check whether the vocabulary updates after the last build.

"""
@wraps(func) # to solve missing docstring
def _wrapper(self, *args, **kwargs):
if self.rebuild is False:
self.rebuild = True
@@ -32,23 +33,28 @@ def check_build_status(func):


class Vocabulary(object):
"""Use for word and index one to one mapping
"""
用于构建, 存储和使用 `str` 到 `int` 的一一映射

Example::

vocab = Vocabulary()
word_list = "this is a word list".split()
vocab.update(word_list)
vocab["word"]
vocab.to_word(5)

:param int max_size: set the max number of words in Vocabulary. Default: None
:param int min_freq: set the min occur frequency of words in Vocabulary. Default: None
:param padding: str, padding的字符,默认为<pad>。如果设置为None,则vocabulary中不考虑padding,为None的情况多在为label建立
Vocabulary的情况。
:param unknown: str, unknown的字符,默认为<unk>。如果设置为None,则vocabulary中不考虑unknown,为None的情况多在为label建立
Vocabulary的情况。

vocab["word"] # str to int
vocab.to_word(5) # int to str

:param int max_size: `Vocabulary` 的最大大小, 即能存储词的最大数量
若为 ``None`` , 则不限制大小. Default: ``None``
:param int min_freq: 能被记录下的词在文本中的最小出现频率, 应大于或等于 1.
若小于该频率, 词语将被视为 `unknown`. 若为 ``None`` , 所有文本中的词都被记录. Default: ``None``
:param str padding: padding的字符. 如果设置为 ``None`` ,
则vocabulary中不考虑padding, 也不计入词表大小,为 ``None`` 的情况多在为label建立Vocabulary的情况.
Default: '<pad>'
:param str unknown: unknown的字符,所有未被记录的词在转为 `int` 时将被视为unknown.
如果设置为 ``None`` ,则vocabulary中不考虑unknown, 也不计入词表大小.
为 ``None`` 的情况多在为label建立Vocabulary的情况.
Default: '<unk>'
"""

def __init__(self, max_size=None, min_freq=None, padding='<pad>', unknown='<unk>'):
@@ -63,7 +69,7 @@ class Vocabulary(object):

@check_build_status
def update(self, word_lst):
"""Add a list of words into the vocabulary.
"""依次增加序列中词在词典中的出现频率

:param list word_lst: a list of strings
"""
@@ -71,32 +77,35 @@ class Vocabulary(object):

@check_build_status
def add(self, word):
"""Add a single word into the vocabulary.
"""
增加一个新词在词典中的出现频率

:param str word: a word or token.
:param str word: 新词
"""
self.word_count[word] += 1

@check_build_status
def add_word(self, word):
"""Add a single word into the vocabulary.

:param str word: a word or token.
"""
增加一个新词在词典中的出现频率

:param str word: 新词
"""
self.add(word)

@check_build_status
def add_word_lst(self, word_lst):
"""Add a list of words into the vocabulary.

:param list word_lst: a list of strings
"""
依次增加序列中词在词典中的出现频率

:param list(str) word_lst: 词的序列
"""
self.update(word_lst)

def build_vocab(self):
"""Build a mapping from word to index, and filter the word using ``max_size`` and ``min_freq``.
"""
根据已经出现的词和出现频率构建词典. 注意: 重复构建可能会改变词典的大小,
但已经记录在词典中的词, 不会改变对应的 `int`

"""
self.word2idx = {}
@@ -117,7 +126,8 @@ class Vocabulary(object):
self.rebuild = False

def build_reverse_vocab(self):
"""Build "index to word" dict based on "word to index" dict.
"""
基于 "word to index" dict, 构建 "index to word" dict.

"""
self.idx2word = {i: w for w, i in self.word2idx.items()}
@@ -128,7 +138,8 @@ class Vocabulary(object):

@check_build_vocab
def __contains__(self, item):
"""Check if a word in vocabulary.
"""
检查词是否被记录

:param item: the word
:return: True or False
@@ -136,11 +147,24 @@ class Vocabulary(object):
return item in self.word2idx

def has_word(self, w):
"""
检查词是否被记录

Example::

has_abc = vocab.has_word('abc')
# equals to
has_abc = 'abc' in vocab

:param item: the word
:return: ``True`` or ``False``
"""
return self.__contains__(w)

@check_build_vocab
def __getitem__(self, w):
"""To support usage like::
"""
To support usage like::

vocab[w]
"""
@@ -154,14 +178,19 @@ class Vocabulary(object):
@check_build_vocab
def index_dataset(self, *datasets, field_name, new_field_name=None):
"""
example:
# remember to use `field_name`
vocab.index_dataset(tr_data, dev_data, te_data, field_name='words')
将DataSet中对应field的词转为数字.
Example::

:param datasets: fastNLP Dataset type. you can pass multiple datasets
:param field_name: str, what field to index. Only support 0,1,2 dimension.
:param new_field_name: str. What the indexed field should be named, default is to overwrite field_name
:return:
# remember to use `field_name`
vocab.index_dataset(train_data, dev_data, test_data, field_name='words')

:param DataSet datasets: 需要转index的 DataSet, 支持一个或多个
:param str field_name: 需要转index的field, 若有多个 DataSet, 每个DataSet都必须有此 field.
目前仅支持 ``str`` , ``list(str)`` , ``list(list(str))``
:param str new_field_name: 保存结果的field_name. 若为 ``None`` , 将覆盖原field.
Default: ``None``
:return self:
"""
def index_instance(ins):
"""
@@ -194,11 +223,18 @@ class Vocabulary(object):

def from_dataset(self, *datasets, field_name):
"""
Construct vocab from dataset.
使用dataset的对应field中词构建词典

Example::

# remember to use `field_name`
vocab.from_dataset(train_data1, train_data2, field_name='words')

:param datasets: DataSet.
:param field_name: str, what field is used to construct dataset.
:return:
:param DataSet datasets: 需要转index的 DataSet, 支持一个或多个.
:param str field_name: 构建词典所使用的 field.
若有多个 DataSet, 每个DataSet都必须有此 field.
目前仅支持 ``str`` , ``list(str)`` , ``list(list(str))``
:return self:
"""
def construct_vocab(ins):
field = ins[field_name]
@@ -223,15 +259,27 @@ class Vocabulary(object):
return self

def to_index(self, w):
""" Turn a word to an index. If w is not in Vocabulary, return the unknown label.
"""
将词转为数字. 若词不在词典中被记录, 将视为 unknown, 若 ``unknown=None`` , 将抛出
``ValueError``

Example::

index = vocab.to_index('abc')
# equals to
index = vocab['abc']

:param str w: a word
:return int index: the number
"""
return self.__getitem__(w)

@property
@check_build_vocab
def unknown_idx(self):
"""
unknown 对应的数字.
"""
if self.unknown is None:
return None
return self.word2idx[self.unknown]
@@ -239,16 +287,20 @@ class Vocabulary(object):
@property
@check_build_vocab
def padding_idx(self):
"""
padding 对应的数字
"""
if self.padding is None:
return None
return self.word2idx[self.padding]

@check_build_vocab
def to_word(self, idx):
"""given a word's index, return the word itself
"""
给定一个数字, 将其转为对应的词.

:param int idx: the index
:return str word: the indexed word
:return str word: the word
"""
return self.idx2word[idx]
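Putting the documented methods together, a minimal sketch of the Vocabulary workflow; the toy DataSet and the ``'words'`` field name are illustrative::

    from fastNLP.core.dataset import DataSet
    from fastNLP.core.vocabulary import Vocabulary

    train_data = DataSet({'words': [['this', 'is', 'a', 'word', 'list'],
                                    ['another', 'list']]})

    vocab = Vocabulary(max_size=None, min_freq=1, padding='<pad>', unknown='<unk>')
    vocab.from_dataset(train_data, field_name='words')    # count words from the DataSet
    vocab.index_dataset(train_data, field_name='words')   # str fields -> int fields in place
    idx = vocab.to_index('word')                          # same as vocab['word']
    word = vocab.to_word(idx)                             # int -> str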



fastNLP/io/dataset_loader.py (+80, -40)

@@ -4,7 +4,7 @@ from nltk.tree import Tree

from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance
from fastNLP.io.file_reader import read_csv, read_json, read_conll
from fastNLP.io.file_reader import _read_csv, _read_json, _read_conll


def _download_from_url(url, path):
@@ -55,12 +55,12 @@ def _uncompress(src, dst):


class DataSetLoader:
"""Interface for all DataSetLoaders.
"""所有`DataSetLoader`的接口

"""

def load(self, path):
"""Load data from a given file.
"""从指定 ``path`` 的文件中读取数据,返回DataSet

:param str path: file path
:return: a DataSet object
@@ -68,7 +68,7 @@ class DataSetLoader:
raise NotImplementedError

def convert(self, data):
"""Optional operation to build a DataSet.
"""用Python数据对象创建DataSet

:param data: inner data structure (user-defined) to represent the data.
:return: a DataSet object
@@ -77,7 +77,7 @@ class DataSetLoader:


class PeopleDailyCorpusLoader(DataSetLoader):
"""人民日报数据集
"""读取人民日报数据集
"""
def __init__(self):
super(PeopleDailyCorpusLoader, self).__init__()
@@ -154,8 +154,35 @@ class PeopleDailyCorpusLoader(DataSetLoader):
return data_set


class ConllLoader:
class ConllLoader(DataSetLoader):
"""
读取Conll格式的数据. 数据格式详见 http://conll.cemantix.org/2012/data.html

列号从0开始, 每列对应内容为::

Column Type
0 Document ID
1 Part number
2 Word number
3 Word itself
4 Part-of-Speech
5 Parse bit
6 Predicate lemma
7 Predicate Frameset ID
8 Word sense
9 Speaker/Author
10 Named Entities
11:N Predicate Arguments
N Coreference

:param headers: 每一列数据的名称,需为List or Tuple of str。``headers`` 与 ``indexs`` 一一对应
:param indexs: 需要保留的数据列下标,从0开始。若为 ``None`` ,则所有列都保留。Default: ``None``
:param dropna: 是否忽略非法数据,若 ``False`` ,遇到非法数据时抛出 ``ValueError`` 。Default: ``True``
"""
def __init__(self, headers, indexs=None, dropna=True):
super(ConllLoader, self).__init__()
if not isinstance(headers, (list, tuple)):
raise TypeError('invalid headers: {}, should be list of strings'.format(headers))
self.headers = headers
self.dropna = dropna
if indexs is None:
@@ -167,24 +194,17 @@ class ConllLoader:

def load(self, path):
ds = DataSet()
for idx, data in read_conll(path, indexes=self.indexs, dropna=self.dropna):
ins = {h:data[idx] for h, idx in zip(self.headers, self.indexs)}
for idx, data in _read_conll(path, indexes=self.indexs, dropna=self.dropna):
ins = {h:data[i] for i, h in enumerate(self.headers)}
ds.append(Instance(**ins))
return ds

def get_one(self, sample):
sample = list(map(list, zip(*sample)))
for field in sample:
if len(field) <= 0:
return None
return sample


class Conll2003Loader(ConllLoader):
"""Loader for conll2003 dataset
"""读取Conll2003数据
More information about the given dataset cound be found on
https://sites.google.com/site/ermasoftware/getting-started/ne-tagging-conll2003-data
关于数据集的更多信息,参考:
https://sites.google.com/site/ermasoftware/getting-started/ne-tagging-conll2003-data
"""
def __init__(self):
headers = [
@@ -193,9 +213,10 @@ class Conll2003Loader(ConllLoader):
super(Conll2003Loader, self).__init__(headers=headers)


def cut_long_sentence(sent, max_sample_length=200):
def _cut_long_sentence(sent, max_sample_length=200):
"""
将长于max_sample_length的sentence截成多段,只会在有空格的地方发生截断。所以截取的句子可能长于或者短于max_sample_length
将长于max_sample_length的sentence截成多段,只会在有空格的地方发生截断。
所以截取的句子可能长于或者短于max_sample_length

:param sent: str.
:param max_sample_length: int.
@@ -223,8 +244,15 @@ def cut_long_sentence(sent, max_sample_length=200):


class SSTLoader(DataSetLoader):
"""load SST data in PTB tree format
data source: https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip
"""读取SST数据集, DataSet包含fields::

words: list(str) 需要分类的文本
target: str 文本的标签

数据来源: https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip

:param subtree: 是否将数据展开为子树,扩充数据量. Default: ``False``
:param fine_grained: 是否使用SST-5标准,若 ``False`` , 使用SST-2。Default: ``False``
"""
def __init__(self, subtree=False, fine_grained=False):
self.subtree = subtree
@@ -247,14 +275,14 @@ class SSTLoader(DataSetLoader):
datas = []
for l in f:
datas.extend([(s, self.tag_v[t])
for s, t in self.get_one(l, self.subtree)])
for s, t in self._get_one(l, self.subtree)])
ds = DataSet()
for words, tag in datas:
ds.append(Instance(words=words, raw_tag=tag))
ds.append(Instance(words=words, target=tag))
return ds

@staticmethod
def get_one(data, subtree):
def _get_one(data, subtree):
tree = Tree.fromstring(data)
if subtree:
return [(t.leaves(), t.label()) for t in tree.subtrees()]
@@ -262,11 +290,17 @@ class SSTLoader(DataSetLoader):


class JsonLoader(DataSetLoader):
"""Load json-format data,
every line contains a json obj, like a dict
fields is the dict key that need to be load
"""
def __init__(self, dropna=False, fields=None):
读取json格式数据.数据必须按行存储,每行是一个包含各类属性的json对象

:param dict fields: 需要读入的json属性名称, 和读入后在DataSet中存储的field_name
``fields`` 的`key`必须是json对象的属性名. ``fields`` 的`value`为读入后在DataSet存储的`field_name`,
`value`也可为 ``None`` , 这时读入后的`field_name`与json对象对应属性同名
``fields`` 可为 ``None`` , 这时,json对象所有属性都保存在DataSet中. Default: ``None``
:param bool dropna: 是否忽略非法数据,若 ``True`` 则忽略,若 ``False`` ,在遇到非法数据时,抛出 ``ValueError`` .
Default: ``False``
"""
def __init__(self, fields=None, dropna=False):
super(JsonLoader, self).__init__()
self.dropna = dropna
self.fields = None
@@ -279,7 +313,7 @@ class JsonLoader(DataSetLoader):

def load(self, path):
ds = DataSet()
for idx, d in read_json(path, fields=self.fields_list, dropna=self.dropna):
for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna):
ins = {self.fields[k]:v for k,v in d.items()}
ds.append(Instance(**ins))
return ds
@@ -287,7 +321,13 @@ class JsonLoader(DataSetLoader):

class SNLILoader(JsonLoader):
"""
data source: https://nlp.stanford.edu/projects/snli/snli_1.0.zip
读取SNLI数据集,读取的DataSet包含fields::

words1: list(str),第一句文本, premise
words2: list(str), 第二句文本, hypothesis
target: str, 真实标签

数据来源: https://nlp.stanford.edu/projects/snli/snli_1.0.zip
"""
def __init__(self):
fields = {
@@ -309,14 +349,14 @@ class SNLILoader(JsonLoader):


class CSVLoader(DataSetLoader):
"""Load data from a CSV file and return a DataSet object.

:param str csv_path: path to the CSV file
:param List[str] or Tuple[str] headers: headers of the CSV file
:param str sep: delimiter in CSV file. Default: ","
:param bool dropna: If True, drop rows that have less entries than headers.
:return dataset: the read data set
"""
读取CSV格式的数据集。返回 ``DataSet``

:param List[str] headers: CSV文件的文件头.定义每一列的属性名称,即返回的DataSet中`field`的名称
若为 ``None`` ,则将读入文件的第一行视作 ``headers`` . Default: ``None``
:param str sep: CSV文件中列与列之间的分隔符. Default: ","
:param bool dropna: 是否忽略非法数据,若 ``True`` 则忽略,若 ``False`` ,在遇到非法数据时,抛出 ``ValueError`` .
Default: ``True``
"""
def __init__(self, headers=None, sep=",", dropna=True):
self.headers = headers
@@ -325,8 +365,8 @@ class CSVLoader(DataSetLoader):

def load(self, path):
ds = DataSet()
for idx, data in read_csv(path, headers=self.headers,
sep=self.sep, dropna=self.dropna):
for idx, data in _read_csv(path, headers=self.headers,
sep=self.sep, dropna=self.dropna):
ds.append(Instance(**data))
return ds
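A hedged sketch of the loader interfaces documented above; the file paths, headers, and column indices are placeholders, not part of this commit::

    from fastNLP.io.dataset_loader import ConllLoader, CSVLoader, SSTLoader

    # ConllLoader: keep only the word (column 3) and POS (column 4) columns
    conll_ds = ConllLoader(headers=['words', 'pos'], indexs=[3, 4]).load('train.conll')

    # CSVLoader: the first line of the file is used as headers when headers=None
    csv_ds = CSVLoader(headers=None, sep=',', dropna=True).load('train.csv')

    # SSTLoader: produces a DataSet with 'words' and 'target' fields
    sst_ds = SSTLoader(subtree=False, fine_grained=False).load('trees/train.txt')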



fastNLP/io/file_reader.py (+12, -9)

@@ -1,15 +1,16 @@
import json


def read_csv(path, encoding='utf-8', headers=None, sep=',', dropna=True):
def _read_csv(path, encoding='utf-8', headers=None, sep=',', dropna=True):
"""
Construct a generator to read csv items
Construct a generator to read csv items.

:param path: file path
:param encoding: file's encoding, default: utf-8
:param headers: file's headers, if None, make file's first line as headers. default: None
:param sep: separator for each column. default: ','
:param dropna: whether to ignore and drop invalid data,
if False, raise ValueError when reading invalid data. default: True
:if False, raise ValueError when reading invalid data. default: True
:return: generator, every time yield (line number, csv item)
"""
with open(path, 'r', encoding=encoding) as f:
@@ -35,14 +36,15 @@ def read_csv(path, encoding='utf-8', headers=None, sep=',', dropna=True):
yield line_idx, _dict


def read_json(path, encoding='utf-8', fields=None, dropna=True):
def _read_json(path, encoding='utf-8', fields=None, dropna=True):
"""
Construct a generator to read json items
Construct a generator to read json items.

:param path: file path
:param encoding: file's encoding, default: utf-8
:param fields: json object's fields that needed, if None, all fields are needed. default: None
:param dropna: whether to ignore and drop invalid data,
if False, raise ValueError when reading invalid data. default: True
:if False, raise ValueError when reading invalid data. default: True
:return: generator, every time yield (line number, json item)
"""
if fields:
@@ -65,14 +67,15 @@ def read_json(path, encoding='utf-8', fields=None, dropna=True):
yield line_idx, _res


def read_conll(path, encoding='utf-8', indexes=None, dropna=True):
def _read_conll(path, encoding='utf-8', indexes=None, dropna=True):
"""
Construct a generator to read conll items
Construct a generator to read conll items.

:param path: file path
:param encoding: file's encoding, default: utf-8
:param indexes: conll object's column indexes that needed, if None, all columns are needed. default: None
:param dropna: whether to ignore and drop invalid data,
if False, raise ValueError when reading invalid data. default: True
:if False, raise ValueError when reading invalid data. default: True
:return: generator, every time yield (line number, conll item)
"""
def parse_conll(sample):
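These readers are now private helpers consumed by the loaders; a sketch of how their generators are driven directly (the path is a placeholder)::

    from fastNLP.io.file_reader import _read_csv

    for line_idx, item in _read_csv('data.csv', headers=None, sep=',', dropna=True):
        # item is a dict mapping each header to the value in that row
        print(line_idx, item)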


fastNLP/models/biaffine_parser.py (+135, -53)

@@ -16,7 +16,7 @@ from fastNLP.modules.utils import initial_parameter
from fastNLP.modules.utils import seq_mask


def mst(scores):
def _mst(scores):
"""
with some modification to support parser output for MST decoding
https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/models/nn.py#L692
@@ -120,12 +120,22 @@ def _find_cycle(vertices, edges):


class GraphParser(BaseModel):
"""Graph based Parser helper class, support greedy decoding and MST(Maximum Spanning Tree) decoding
"""
基于图的parser base class, 支持贪婪解码和最大生成树解码
"""
def __init__(self):
super(GraphParser, self).__init__()

def _greedy_decoder(self, arc_matrix, mask=None):
@staticmethod
def greedy_decoder(arc_matrix, mask=None):
"""
贪心解码方式, 输入图, 输出贪心解码的parsing结果, 不保证合法的构成树

:param arc_matrix: [batch, seq_len, seq_len] 输入图矩阵
:param mask: [batch, seq_len] 输入图的padding mask, 有内容的部分为 1, 否则为 0.
若为 ``None`` 时, 默认为全1向量. Default: ``None``
:return heads: [batch, seq_len] 每个元素在树中对应的head(parent)预测结果
"""
_, seq_len, _ = arc_matrix.shape
matrix = arc_matrix + torch.diag(arc_matrix.new(seq_len).fill_(-np.inf))
flip_mask = (mask == 0).byte()
@@ -135,22 +145,34 @@ class GraphParser(BaseModel):
heads *= mask.long()
return heads

def _mst_decoder(self, arc_matrix, mask=None):
@staticmethod
def mst_decoder(arc_matrix, mask=None):
"""
用最大生成树算法, 计算parsing结果, 保证输出合法的树结构

:param arc_matrix: [batch, seq_len, seq_len] 输入图矩阵
:param mask: [batch, seq_len] 输入图的padding mask, 有内容的部分为 1, 否则为 0.
若为 ``None`` 时, 默认为全1向量. Default: ``None``
:return heads: [batch, seq_len] 每个元素在树中对应的head(parent)预测结果
"""
batch_size, seq_len, _ = arc_matrix.shape
matrix = arc_matrix.clone()
ans = matrix.new_zeros(batch_size, seq_len).long()
lens = (mask.long()).sum(1) if mask is not None else torch.zeros(batch_size) + seq_len
batch_idx = torch.arange(batch_size, dtype=torch.long, device=lens.device)
for i, graph in enumerate(matrix):
len_i = lens[i]
ans[i, :len_i] = torch.as_tensor(mst(graph.detach()[:len_i, :len_i].cpu().numpy()), device=ans.device)
ans[i, :len_i] = torch.as_tensor(_mst(graph.detach()[:len_i, :len_i].cpu().numpy()), device=ans.device)
if mask is not None:
ans *= mask.long()
return ans


class ArcBiaffine(nn.Module):
"""helper module for Biaffine Dependency Parser predicting arc
"""
Biaffine Dependency Parser 的子模块, 用于构建预测边的图

:param hidden_size: 输入的特征维度
:param bias: 是否使用bias. Default: ``True``
"""
def __init__(self, hidden_size, bias=True):
super(ArcBiaffine, self).__init__()
@@ -164,10 +186,10 @@ class ArcBiaffine(nn.Module):

def forward(self, head, dep):
"""
:param head arc-head tensor = [batch, length, emb_dim]
:param dep arc-dependent tensor = [batch, length, emb_dim]

:return output tensor = [bacth, length, length]
:param head: arc-head tensor [batch, length, hidden]
:param dep: arc-dependent tensor [batch, length, hidden]
:return output: tensor [batch, length, length]
"""
output = dep.matmul(self.U)
output = output.bmm(head.transpose(-1, -2))
@@ -177,7 +199,13 @@ class ArcBiaffine(nn.Module):


class LabelBilinear(nn.Module):
"""helper module for Biaffine Dependency Parser predicting label
"""
Biaffine Dependency Parser 的子模块, 用于构建预测边类别的图

:param in1_features: 输入的特征1维度
:param in2_features: 输入的特征2维度
:param num_label: 边类别的个数
:param bias: 是否使用bias. Default: ``True``
"""
def __init__(self, in1_features, in2_features, num_label, bias=True):
super(LabelBilinear, self).__init__()
@@ -185,14 +213,34 @@ class LabelBilinear(nn.Module):
self.lin = nn.Linear(in1_features + in2_features, num_label, bias=False)

def forward(self, x1, x2):
"""

:param x1: [batch, seq_len, hidden] 输入特征1, 即label-head
:param x2: [batch, seq_len, hidden] 输入特征2, 即label-dep
:return output: [batch, seq_len, num_cls] 每个元素对应类别的概率图
"""
output = self.bilinear(x1, x2)
output += self.lin(torch.cat([x1, x2], dim=2))
return output

class BiaffineParser(GraphParser):
"""Biaffine Dependency Parser implemantation.
refer to ` Deep Biaffine Attention for Neural Dependency Parsing (Dozat and Manning, 2016)
"""Biaffine Dependency Parser 实现.
论文参考 ` Deep Biaffine Attention for Neural Dependency Parsing (Dozat and Manning, 2016)
<https://arxiv.org/abs/1611.01734>`_ .

:param word_vocab_size: 单词词典大小
:param word_emb_dim: 单词词嵌入向量的维度
:param pos_vocab_size: part-of-speech 词典大小
:param pos_emb_dim: part-of-speech 向量维度
:param num_label: 边的类别个数
:param rnn_layers: rnn encoder的层数
:param rnn_hidden_size: rnn encoder 的隐状态维度
:param arc_mlp_size: 边预测的MLP维度
:param label_mlp_size: 类别预测的MLP维度
:param dropout: dropout概率.
:param encoder: encoder类别, 可选 ('lstm', 'var-lstm', 'transformer'). Default: lstm
:param use_greedy_infer: 是否在inference时使用贪心算法.
若 ``False`` , 使用更加精确但相对缓慢的MST算法. Default: ``False``
"""
def __init__(self,
word_vocab_size,
@@ -207,7 +255,6 @@ class BiaffineParser(GraphParser):
dropout=0.3,
encoder='lstm',
use_greedy_infer=False):

super(BiaffineParser, self).__init__()
rnn_out_size = 2 * rnn_hidden_size
word_hid_dim = pos_hid_dim = rnn_hidden_size
@@ -275,27 +322,31 @@ class BiaffineParser(GraphParser):
for p in m.parameters():
nn.init.normal_(p, 0, 0.1)

def forward(self, word_seq, pos_seq, seq_lens, gold_heads=None):
"""
:param word_seq: [batch_size, seq_len] sequence of word's indices
:param pos_seq: [batch_size, seq_len] sequence of word's indices
:param seq_lens: [batch_size, seq_len] sequence of length masks
:param gold_heads: [batch_size, seq_len] sequence of golden heads
:return dict: parsing results
arc_pred: [batch_size, seq_len, seq_len]
label_pred: [batch_size, seq_len, seq_len]
mask: [batch_size, seq_len]
head_pred: [batch_size, seq_len] if gold_heads is not provided, predicting the heads
def forward(self, words1, words2, seq_len, gold_heads=None):
"""模型forward阶段

:param words1: [batch_size, seq_len] 输入word序列
:param words2: [batch_size, seq_len] 输入pos序列
:param seq_len: [batch_size, seq_len] 输入序列长度
:param gold_heads: [batch_size, seq_len] 输入真实标注的heads, 仅在训练阶段有效,
用于训练label分类器. 若为 ``None`` , 使用预测的heads输入到label分类器
Default: ``None``
:return dict: parsing结果::

arc_pred: [batch_size, seq_len, seq_len] 边预测logits
label_pred: [batch_size, seq_len, num_label] label预测logits
mask: [batch_size, seq_len] 预测结果的mask
head_pred: [batch_size, seq_len] heads的预测结果, 在 ``gold_heads=None`` 时预测
"""
# prepare embeddings
batch_size, seq_len = word_seq.shape
batch_size, length = words1.shape
# print('forward {} {}'.format(batch_size, seq_len))

# get sequence mask
mask = seq_mask(seq_lens, seq_len).long()
mask = seq_mask(seq_len, length).long()

word = self.word_embedding(word_seq) # [N,L] -> [N,L,C_0]
pos = self.pos_embedding(pos_seq) # [N,L] -> [N,L,C_1]
word = self.word_embedding(words1) # [N,L] -> [N,L,C_0]
pos = self.pos_embedding(words2) # [N,L] -> [N,L,C_1]

word, pos = self.word_fc(word), self.pos_fc(pos)
word, pos = self.word_norm(word), self.pos_norm(pos)
@@ -303,7 +354,7 @@ class BiaffineParser(GraphParser):

# encoder, extract features
if self.encoder_name.endswith('lstm'):
sort_lens, sort_idx = torch.sort(seq_lens, dim=0, descending=True)
sort_lens, sort_idx = torch.sort(seq_len, dim=0, descending=True)
x = x[sort_idx]
x = nn.utils.rnn.pack_padded_sequence(x, sort_lens, batch_first=True)
feat, _ = self.encoder(x) # -> [N,L,C]
@@ -329,20 +380,20 @@ class BiaffineParser(GraphParser):
if gold_heads is None or not self.training:
# use greedy decoding in training
if self.training or self.use_greedy_infer:
heads = self._greedy_decoder(arc_pred, mask)
heads = self.greedy_decoder(arc_pred, mask)
else:
heads = self._mst_decoder(arc_pred, mask)
heads = self.mst_decoder(arc_pred, mask)
head_pred = heads
else:
assert self.training # must be training mode
if gold_heads is None:
heads = self._greedy_decoder(arc_pred, mask)
heads = self.greedy_decoder(arc_pred, mask)
head_pred = heads
else:
head_pred = None
heads = gold_heads

batch_range = torch.arange(start=0, end=batch_size, dtype=torch.long, device=word_seq.device).unsqueeze(1)
batch_range = torch.arange(start=0, end=batch_size, dtype=torch.long, device=words1.device).unsqueeze(1)
label_head = label_head[batch_range, heads].contiguous()
label_pred = self.label_predictor(label_head, label_dep) # [N, L, num_label]
res_dict = {'arc_pred': arc_pred, 'label_pred': label_pred, 'mask': mask}
@@ -355,11 +406,11 @@ class BiaffineParser(GraphParser):
"""
Compute loss.

:param arc_pred: [batch_size, seq_len, seq_len]
:param label_pred: [batch_size, seq_len, n_tags]
:param arc_true: [batch_size, seq_len]
:param label_true: [batch_size, seq_len]
:param mask: [batch_size, seq_len]
:param arc_pred: [batch_size, seq_len, seq_len] 边预测logits
:param label_pred: [batch_size, seq_len, num_label] label预测logits
:param arc_true: [batch_size, seq_len] 真实边的标注
:param label_true: [batch_size, seq_len] 真实类别的标注
:param mask: [batch_size, seq_len] 预测结果的mask
:return: loss value
"""

@@ -381,16 +432,23 @@ class BiaffineParser(GraphParser):
label_nll = -label_loss.mean()
return arc_nll + label_nll

def predict(self, word_seq, pos_seq, seq_lens):
"""

:param word_seq:
:param pos_seq:
:param seq_lens:
:return: arc_pred: [B, L]
label_pred: [B, L]
def predict(self, words1, words2, seq_len):
"""模型预测API

:param words1: [batch_size, seq_len] 输入word序列
:param words2: [batch_size, seq_len] 输入pos序列
:param seq_len: [batch_size, seq_len] 输入序列长度
:return dict: parsing结果::

    arc_pred: [batch_size, seq_len] heads的预测结果
    label_pred: [batch_size, seq_len] label的预测结果
"""
res = self(word_seq, pos_seq, seq_lens)
res = self(words1, words2, seq_len)
output = {}
output['arc_pred'] = res.pop('head_pred')
_, label_pred = res.pop('label_pred').max(2)
@@ -399,6 +457,16 @@ class BiaffineParser(GraphParser):


class ParserLoss(LossFunc):
"""
计算parser的loss

:param arc_pred: [batch_size, seq_len, seq_len] 边预测logits
:param label_pred: [batch_size, seq_len, num_label] label预测logits
:param arc_true: [batch_size, seq_len] 真实边的标注
:param label_true: [batch_size, seq_len] 真实类别的标注
:param mask: [batch_size, seq_len] 预测结果的mask
:return loss: scalar
"""
def __init__(self, arc_pred=None, label_pred=None, arc_true=None, label_true=None):
super(ParserLoss, self).__init__(BiaffineParser.loss,
arc_pred=arc_pred,
@@ -408,12 +476,26 @@ class ParserLoss(LossFunc):


class ParserMetric(MetricBase):
"""
评估parser的性能

:param arc_pred: 边预测logits
:param label_pred: label预测logits
:param arc_true: 真实边的标注
:param label_true: 真实类别的标注
:param seq_len: 序列长度
:return dict: 评估结果::

UAS: 不带label时, 边预测的准确率
LAS: 同时预测边和label的准确率
"""
def __init__(self, arc_pred=None, label_pred=None,
arc_true=None, label_true=None, seq_lens=None):
arc_true=None, label_true=None, seq_len=None):

super().__init__()
self._init_param_map(arc_pred=arc_pred, label_pred=label_pred,
arc_true=arc_true, label_true=label_true,
seq_lens=seq_lens)
seq_len=seq_len)
self.num_arc = 0
self.num_label = 0
self.num_sample = 0
@@ -424,13 +506,13 @@ class ParserMetric(MetricBase):
self.num_sample = self.num_label = self.num_arc = 0
return res

def evaluate(self, arc_pred, label_pred, arc_true, label_true, seq_lens=None):
def evaluate(self, arc_pred, label_pred, arc_true, label_true, seq_len=None):
"""Evaluate the performance of prediction.
"""
if seq_lens is None:
if seq_len is None:
seq_mask = arc_pred.new_ones(arc_pred.size(), dtype=torch.long)
else:
seq_mask = seq_lens_to_masks(seq_lens.long(), float=False).long()
seq_mask = seq_lens_to_masks(seq_len.long(), float=False).long()
# mask out <root> tag
seq_mask[:,0] = 0
head_pred_correct = (arc_pred == arc_true).long() * seq_mask
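Since the decoders are now static methods, they can be exercised without building a full parser; a sketch with random scores, following the shapes in the docstrings above::

    import torch
    from fastNLP.models.biaffine_parser import GraphParser

    arc_scores = torch.randn(2, 5, 5)      # [batch, seq_len, seq_len] arc score matrix
    mask = torch.ones(2, 5).long()         # 1 = real token, 0 = padding
    heads_greedy = GraphParser.greedy_decoder(arc_scores, mask)  # [batch, seq_len], not necessarily a tree
    heads_mst = GraphParser.mst_decoder(arc_scores, mask)        # [batch, seq_len], guaranteed valid tree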


fastNLP/models/star_transformer.py (+127, -28)

@@ -7,6 +7,21 @@ import torch.nn.functional as F


class StarTransEnc(nn.Module):
"""
带word embedding的Star-Transformer Encoder

:param vocab_size: 词嵌入的词典大小
:param emb_dim: 每个词嵌入的特征维度
:param hidden_size: 模型中特征维度.
:param num_layers: 模型层数.
:param num_head: 模型中multi-head的head个数.
:param head_dim: 模型中multi-head中每个head特征维度.
:param max_len: 模型能接受的最大输入长度.
:param emb_dropout: 词嵌入的dropout概率.
:param dropout: 模型除词嵌入外的dropout概率.
"""
def __init__(self, vocab_size, emb_dim,
hidden_size,
num_layers,
@@ -27,15 +42,23 @@ class StarTransEnc(nn.Module):
max_len=max_len)

def forward(self, x, mask):
"""
:param FloatTensor data: [batch, length, hidden] 输入的序列
:param ByteTensor mask: [batch, length] 输入序列的padding mask, 在没有内容(padding 部分) 为 0,
否则为 1
:return: [batch, length, hidden] 编码后的输出序列

[batch, hidden] 全局 relay 节点, 详见论文
"""
x = self.embedding(x)
x = self.emb_fc(self.emb_drop(x))
nodes, relay = self.encoder(x, mask)
return nodes, relay


class Cls(nn.Module):
class _Cls(nn.Module):
def __init__(self, in_dim, num_cls, hid_dim, dropout=0.1):
super(Cls, self).__init__()
super(_Cls, self).__init__()
self.fc = nn.Sequential(
nn.Linear(in_dim, hid_dim),
nn.LeakyReLU(),
@@ -48,9 +71,9 @@ class Cls(nn.Module):
return h


class NLICls(nn.Module):
class _NLICls(nn.Module):
def __init__(self, in_dim, num_cls, hid_dim, dropout=0.1):
super(NLICls, self).__init__()
super(_NLICls, self).__init__()
self.fc = nn.Sequential(
nn.Dropout(dropout),
nn.Linear(in_dim*4, hid_dim), #4
@@ -65,7 +88,19 @@ class NLICls(nn.Module):
return h

class STSeqLabel(nn.Module):
"""star-transformer model for sequence labeling
"""用于序列标注的Star-Transformer模型

:param vocab_size: 词嵌入的词典大小
:param emb_dim: 每个词嵌入的特征维度
:param num_cls: 输出类别个数
:param hidden_size: 模型中特征维度. Default: 300
:param num_layers: 模型层数. Default: 4
:param num_head: 模型中multi-head的head个数. Default: 8
:param head_dim: 模型中multi-head中每个head特征维度. Default: 32
:param max_len: 模型能接受的最大输入长度. Default: 512
:param cls_hidden_size: 分类器隐层维度. Default: 600
:param emb_dropout: 词嵌入的dropout概率. Default: 0.1
:param dropout: 模型除词嵌入外的dropout概率. Default: 0.1
"""
def __init__(self, vocab_size, emb_dim, num_cls,
hidden_size=300,
@@ -86,23 +121,47 @@ class STSeqLabel(nn.Module):
max_len=max_len,
emb_dropout=emb_dropout,
dropout=dropout)
self.cls = Cls(hidden_size, num_cls, cls_hidden_size)
self.cls = _Cls(hidden_size, num_cls, cls_hidden_size)

def forward(self, words, seq_len):
"""

def forward(self, word_seq, seq_lens):
mask = seq_lens_to_masks(seq_lens)
nodes, _ = self.enc(word_seq, mask)
:param words: [batch, seq_len] 输入序列
:param seq_len: [batch,] 输入序列的长度
:return output: [batch, num_cls, seq_len] 输出序列中每个元素的分类的概率
"""
mask = seq_lens_to_masks(seq_len)
nodes, _ = self.enc(words, mask)
output = self.cls(nodes)
output = output.transpose(1,2) # make hidden to be dim 1
return {'output': output} # [bsz, n_cls, seq_len]

def predict(self, word_seq, seq_lens):
y = self.forward(word_seq, seq_lens)
def predict(self, words, seq_len):
"""

:param words: [batch, seq_len] 输入序列
:param seq_len: [batch,] 输入序列的长度
:return output: [batch, seq_len] 输出序列中每个元素的分类
"""
y = self.forward(words, seq_len)
_, pred = y['output'].max(1)
return {'output': pred, 'seq_lens': seq_lens}
return {'output': pred}


class STSeqCls(nn.Module):
"""star-transformer model for sequence classification
"""用于分类任务的Star-Transformer

:param vocab_size: 词嵌入的词典大小
:param emb_dim: 每个词嵌入的特征维度
:param num_cls: 输出类别个数
:param hidden_size: 模型中特征维度. Default: 300
:param num_layers: 模型层数. Default: 4
:param num_head: 模型中multi-head的head个数. Default: 8
:param head_dim: 模型中multi-head中每个head特征维度. Default: 32
:param max_len: 模型能接受的最大输入长度. Default: 512
:param cls_hidden_size: 分类器隐层维度. Default: 600
:param emb_dropout: 词嵌入的dropout概率. Default: 0.1
:param dropout: 模型除词嵌入外的dropout概率. Default: 0.1
"""

def __init__(self, vocab_size, emb_dim, num_cls,
@@ -124,23 +183,47 @@ class STSeqCls(nn.Module):
max_len=max_len,
emb_dropout=emb_dropout,
dropout=dropout)
self.cls = Cls(hidden_size, num_cls, cls_hidden_size)
self.cls = _Cls(hidden_size, num_cls, cls_hidden_size)

def forward(self, word_seq, seq_lens):
mask = seq_lens_to_masks(seq_lens)
nodes, relay = self.enc(word_seq, mask)
def forward(self, words, seq_len):
"""

:param words: [batch, seq_len] 输入序列
:param seq_len: [batch,] 输入序列的长度
:return output: [batch, num_cls] 输出序列的分类的概率
"""
mask = seq_lens_to_masks(seq_len)
nodes, relay = self.enc(words, mask)
y = 0.5 * (relay + nodes.max(1)[0])
output = self.cls(y) # [bsz, n_cls]
return {'output': output}

def predict(self, word_seq, seq_lens):
y = self.forward(word_seq, seq_lens)
def predict(self, words, seq_len):
"""

:param words: [batch, seq_len] 输入序列
:param seq_len: [batch,] 输入序列的长度
:return output: [batch,] 输出序列的分类
"""
y = self.forward(words, seq_len)
_, pred = y['output'].max(1)
return {'output': pred}


class STNLICls(nn.Module):
"""star-transformer model for NLI
"""用于自然语言推断(NLI)的Star-Transformer

:param vocab_size: 词嵌入的词典大小
:param emb_dim: 每个词嵌入的特征维度
:param num_cls: 输出类别个数
:param hidden_size: 模型中特征维度. Default: 300
:param num_layers: 模型层数. Default: 4
:param num_head: 模型中multi-head的head个数. Default: 8
:param head_dim: 模型中multi-head中每个head特征维度. Default: 32
:param max_len: 模型能接受的最大输入长度. Default: 512
:param cls_hidden_size: 分类器隐层维度. Default: 600
:param emb_dropout: 词嵌入的dropout概率. Default: 0.1
:param dropout: 模型除词嵌入外的dropout概率. Default: 0.1
"""

def __init__(self, vocab_size, emb_dim, num_cls,
@@ -162,20 +245,36 @@ class STNLICls(nn.Module):
max_len=max_len,
emb_dropout=emb_dropout,
dropout=dropout)
self.cls = NLICls(hidden_size, num_cls, cls_hidden_size)
self.cls = _NLICls(hidden_size, num_cls, cls_hidden_size)

def forward(self, words1, words2, seq_len1, seq_len2):
"""

def forward(self, word_seq1, word_seq2, seq_lens1, seq_lens2):
mask1 = seq_lens_to_masks(seq_lens1)
mask2 = seq_lens_to_masks(seq_lens2)
:param words1: [batch, seq_len] 输入序列1
:param words2: [batch, seq_len] 输入序列2
:param seq_len1: [batch,] 输入序列1的长度
:param seq_len2: [batch,] 输入序列2的长度
:return output: [batch, num_cls] 输出分类的概率
"""
mask1 = seq_lens_to_masks(seq_len1)
mask2 = seq_lens_to_masks(seq_len2)
def enc(seq, mask):
nodes, relay = self.enc(seq, mask)
return 0.5 * (relay + nodes.max(1)[0])
y1 = enc(word_seq1, mask1)
y2 = enc(word_seq2, mask2)
y1 = enc(words1, mask1)
y2 = enc(words2, mask2)
output = self.cls(y1, y2) # [bsz, n_cls]
return {'output': output}

def predict(self, word_seq1, word_seq2, seq_lens1, seq_lens2):
y = self.forward(word_seq1, word_seq2, seq_lens1, seq_lens2)
def predict(self, words1, words2, seq_len1, seq_len2):
"""

:param words1: [batch, seq_len] 输入序列1
:param words2: [batch, seq_len] 输入序列2
:param seq_len1: [batch,] 输入序列1的长度
:param seq_len2: [batch,] 输入序列2的长度
:return output: [batch,] 输出分类
"""
y = self.forward(words1, words2, seq_len1, seq_len2)
_, pred = y['output'].max(1)
return {'output': pred}
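A minimal sketch of the classification model with the renamed ``words``/``seq_len`` arguments; the vocabulary size, embedding size, and batch shapes are illustrative::

    import torch
    from fastNLP.models.star_transformer import STSeqCls

    model = STSeqCls(vocab_size=1000, emb_dim=64, num_cls=5)
    words = torch.randint(0, 1000, (4, 20))           # [batch, seq_len]
    seq_len = torch.full((4,), 20, dtype=torch.long)  # [batch,]
    scores = model(words, seq_len)['output']          # [batch, num_cls] unnormalized class scores
    pred = model.predict(words, seq_len)['output']    # [batch,] predicted class per sentence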

fastNLP/modules/encoder/lstm.py (+23, -13)

@@ -6,17 +6,17 @@ from fastNLP.modules.utils import initial_parameter


class LSTM(nn.Module):
"""Long Short Term Memory
"""LSTM 模块, 轻量封装的Pytorch LSTM

:param int input_size:
:param int hidden_size:
:param int num_layers:
:param float dropout:
:param bool batch_first:
:param bool bidirectional:
:param bool bias:
:param str initial_method:
:param bool get_hidden:
:param input_size: 输入 `x` 的特征维度
:param hidden_size: 隐状态 `h` 的特征维度
:param num_layers: rnn的层数. Default: 1
:param dropout: 层间dropout概率. Default: 0
:param bidirectional: 若为 ``True``, 使用双向的RNN. Default: ``False``
:param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为
:(batch, seq, feature). Default: ``False``
:param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True``
:param get_hidden: 是否返回隐状态 `h` . Default: ``False``
"""
def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0.0, batch_first=True,
bidirectional=False, bias=True, initial_method=None, get_hidden=False):
@@ -27,14 +27,24 @@ class LSTM(nn.Module):
self.get_hidden = get_hidden
initial_parameter(self, initial_method)

def forward(self, x, seq_lens=None, h0=None, c0=None):
def forward(self, x, seq_len=None, h0=None, c0=None):
"""

:param x: [batch, seq_len, input_size] 输入序列
:param seq_len: [batch, ] 序列长度, 若为 ``None``, 所有输入看做一样长. Default: ``None``
:param h0: [batch, hidden_size] 初始隐状态, 若为 ``None`` , 设为全0向量. Default: ``None``
:param c0: [batch, hidden_size] 初始Cell状态, 若为 ``None`` , 设为全0向量. Default: ``None``
:return (output, ht) 或 output: 若 ``get_hidden=True`` [batch, seq_len, hidden_size*num_direction] 输出序列
:和 [batch, hidden_size*num_direction] 最后时刻隐状态.
:若 ``get_hidden=False`` 仅返回输出序列.
"""
if h0 is not None and c0 is not None:
hx = (h0, c0)
else:
hx = None
if seq_lens is not None and not isinstance(x, rnn.PackedSequence):
if seq_len is not None and not isinstance(x, rnn.PackedSequence):
print('padding')
sort_lens, sort_idx = torch.sort(seq_lens, dim=0, descending=True)
sort_lens, sort_idx = torch.sort(seq_len, dim=0, descending=True)
if self.batch_first:
x = x[sort_idx]
else:
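A hedged usage sketch of the wrapped LSTM with the renamed ``seq_len`` argument; sizes are illustrative::

    import torch
    from fastNLP.modules.encoder.lstm import LSTM

    lstm = LSTM(input_size=50, hidden_size=100, num_layers=1, batch_first=True)
    x = torch.randn(4, 20, 50)                        # [batch, seq_len, input_size]
    seq_len = torch.full((4,), 20, dtype=torch.long)  # all sequences the same length here
    output = lstm(x, seq_len=seq_len)                 # [batch, seq_len, hidden_size], since get_hidden=False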


fastNLP/modules/encoder/star_transformer.py (+26, -21)

@@ -5,16 +5,19 @@ import numpy as NP


class StarTransformer(nn.Module):
"""Star-Transformer Encoder part。
"""
Star-Transformer 的encoder部分。 输入3d的文本输入, 返回相同长度的文本编码

paper: https://arxiv.org/abs/1902.09113
:param hidden_size: int, 输入维度的大小。同时也是输出维度的大小。
:param num_layers: int, star-transformer的层数
:param num_head: int,head的数量。
:param head_dim: int, 每个head的维度大小。
:param dropout: float dropout 概率
:param max_len: int or None, 如果为int,输入序列的最大长度,
模型会为属于序列加上position embedding。
若为None,忽略加上position embedding的步骤

:param int hidden_size: 输入维度的大小。同时也是输出维度的大小。
:param int num_layers: star-transformer的层数
:param int num_head: head的数量。
:param int head_dim: 每个head的维度大小。
:param float dropout: dropout 概率. Default: 0.1
:param int max_len: int or None, 如果为int,输入序列的最大长度,
模型会为输入序列加上position embedding。
若为`None`,忽略加上position embedding的步骤. Default: `None`
"""
def __init__(self, hidden_size, num_layers, num_head, head_dim, dropout=0.1, max_len=None):
super(StarTransformer, self).__init__()
@@ -22,11 +25,11 @@ class StarTransformer(nn.Module):

self.norm = nn.ModuleList([nn.LayerNorm(hidden_size) for _ in range(self.iters)])
self.ring_att = nn.ModuleList(
[MSA1(hidden_size, nhead=num_head, head_dim=head_dim, dropout=dropout)
for _ in range(self.iters)])
[_MSA1(hidden_size, nhead=num_head, head_dim=head_dim, dropout=dropout)
for _ in range(self.iters)])
self.star_att = nn.ModuleList(
[MSA2(hidden_size, nhead=num_head, head_dim=head_dim, dropout=dropout)
for _ in range(self.iters)])
[_MSA2(hidden_size, nhead=num_head, head_dim=head_dim, dropout=dropout)
for _ in range(self.iters)])

if max_len is not None:
self.pos_emb = self.pos_emb = nn.Embedding(max_len, hidden_size)
@@ -35,10 +38,12 @@ class StarTransformer(nn.Module):

def forward(self, data, mask):
"""
:param FloatTensor data: [batch, length, hidden] the input sequence
:param ByteTensor mask: [batch, length] the padding mask for input, in which padding pos is 0
:return: [batch, length, hidden] the output sequence
[batch, hidden] the global relay node
:param FloatTensor data: [batch, length, hidden] 输入的序列
:param ByteTensor mask: [batch, length] 输入序列的padding mask, 在没有内容(padding 部分) 为 0,
否则为 1
:return: [batch, length, hidden] 编码后的输出序列

[batch, hidden] 全局 relay 节点, 详见论文
"""
def norm_func(f, x):
# B, H, L, 1
@@ -70,9 +75,9 @@ class StarTransformer(nn.Module):
return nodes, relay.view(B, H)


class MSA1(nn.Module):
class _MSA1(nn.Module):
def __init__(self, nhid, nhead=10, head_dim=10, dropout=0.1):
super(MSA1, self).__init__()
super(_MSA1, self).__init__()
# Multi-head Self Attention Case 1, doing self-attention for small regions
# Due to the architecture of GPU, using hadamard production and summation are faster than dot production when unfold_size is very small
self.WQ = nn.Conv2d(nhid, nhead * head_dim, 1)
@@ -113,10 +118,10 @@ class MSA1(nn.Module):
return ret


class MSA2(nn.Module):
class _MSA2(nn.Module):
def __init__(self, nhid, nhead=10, head_dim=10, dropout=0.1):
# Multi-head Self Attention Case 2, a broadcastable query for a sequence key and value
super(MSA2, self).__init__()
super(_MSA2, self).__init__()
self.WQ = nn.Conv2d(nhid, nhead * head_dim, 1)
self.WK = nn.Conv2d(nhid, nhead * head_dim, 1)
self.WV = nn.Conv2d(nhid, nhead * head_dim, 1)
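A small sketch of the encoder itself, following the documented shapes; the hyperparameters are illustrative::

    import torch
    from fastNLP.modules.encoder.star_transformer import StarTransformer

    enc = StarTransformer(hidden_size=64, num_layers=2, num_head=8, head_dim=8,
                          dropout=0.1, max_len=128)
    data = torch.randn(4, 20, 64)      # [batch, length, hidden]
    mask = torch.ones(4, 20).byte()    # 1 = content, 0 = padding
    nodes, relay = enc(data, mask)     # [batch, length, hidden], [batch, hidden]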


fastNLP/modules/encoder/transformer.py (+9, -8)

@@ -7,13 +7,13 @@ from ..dropout import TimestepDropout
class TransformerEncoder(nn.Module):
"""transformer的encoder模块,不包含embedding层

:param num_layers: int, transformer的层数
:param model_size: int, 输入维度的大小。同时也是输出维度的大小。
:param inner_size: int, FFN层的hidden大小
:param key_size: int, 每个head的维度大小。
:param value_size: int,每个head中value的维度。
:param num_head: int,head的数量。
:param dropout: float。
:param int num_layers: transformer的层数
:param int model_size: 输入维度的大小。同时也是输出维度的大小。
:param int inner_size: FFN层的hidden大小
:param int key_size: 每个head的维度大小。
:param int value_size: 每个head中value的维度。
:param int num_head: head的数量。
:param float dropout: dropout概率. Default: 0.1
"""
class SubLayer(nn.Module):
def __init__(self, model_size, inner_size, key_size, value_size, num_head, dropout=0.1):
@@ -48,7 +48,8 @@ class TransformerEncoder(nn.Module):
def forward(self, x, seq_mask=None):
"""
:param x: [batch, seq_len, model_size] 输入序列
:param seq_mask: [batch, seq_len] 输入序列的padding mask
:param seq_mask: [batch, seq_len] 输入序列的padding mask, 若为 ``None`` , 生成全1向量.
Default: ``None``
:return: [batch, seq_len, model_size] 输出序列
"""
output = x
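For completeness, a sketch of the Transformer encoder built from the documented parameters; it assumes the constructor accepts them as keyword arguments, which this hunk does not show::

    import torch
    from fastNLP.modules.encoder.transformer import TransformerEncoder

    enc = TransformerEncoder(num_layers=2, model_size=64, inner_size=256,
                             key_size=16, value_size=16, num_head=4, dropout=0.1)
    x = torch.randn(4, 20, 64)   # [batch, seq_len, model_size]
    out = enc(x)                 # seq_mask=None -> an all-ones mask is used; output [batch, seq_len, model_size]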


fastNLP/modules/encoder/variational_rnn.py (+72, -23)

@@ -28,11 +28,11 @@ class VarRnnCellWrapper(nn.Module):
"""
:param PackedSequence input_x: [seq_len, batch_size, input_size]
:param hidden: for LSTM, tuple of (h_0, c_0), [batch_size, hidden_size]
for other RNN, h_0, [batch_size, hidden_size]
:for other RNN, h_0, [batch_size, hidden_size]
:param mask_x: [batch_size, input_size] dropout mask for input
:param mask_h: [batch_size, hidden_size] dropout mask for hidden
:return PackedSequence output: [seq_len, bacth_size, hidden_size]
hidden: for LSTM, tuple of (h_n, c_n), [batch_size, hidden_size]
:hidden: for LSTM, tuple of (h_n, c_n), [batch_size, hidden_size]
for other RNN, h_n, [batch_size, hidden_size]
"""
def get_hi(hi, h0, size):
@@ -84,9 +84,21 @@ class VarRnnCellWrapper(nn.Module):


class VarRNNBase(nn.Module):
"""Implementation of Variational Dropout RNN network.
refer to `A Theoretically Grounded Application of Dropout in Recurrent Neural Networks (Yarin Gal and Zoubin Ghahramani, 2016)
"""Variational Dropout RNN 实现.
论文参考: `A Theoretically Grounded Application of Dropout in Recurrent Neural Networks (Yarin Gal and Zoubin Ghahramani, 2016)
https://arxiv.org/abs/1512.05287`.

:param mode: rnn 模式, (lstm or not)
:param Cell: rnn cell 类型, (lstm, gru, etc)
:param input_size: 输入 `x` 的特征维度
:param hidden_size: 隐状态 `h` 的特征维度
:param num_layers: rnn的层数. Default: 1
:param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True``
:param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为
:(batch, seq, feature). Default: ``False``
:param input_dropout: 对输入的dropout概率. Default: 0
:param hidden_dropout: 对每个隐状态的dropout概率. Default: 0
:param bidirectional: 若为 ``True``, 使用双向的RNN. Default: ``False``
"""

def __init__(self, mode, Cell, input_size, hidden_size, num_layers=1,
@@ -120,36 +132,43 @@ class VarRNNBase(nn.Module):
output_x, hidden_x = cell(input, hi, mask_x, mask_h, is_reversed=(n_direction == 1))
return output_x, hidden_x

def forward(self, input, hx=None):
def forward(self, x, hx=None):
"""

:param x: [batch, seq_len, input_size] 输入序列
:param hx: [batch, hidden_size] 初始隐状态, 若为 ``None`` , 设为全0向量. Default: ``None``
:return (output, ht): [batch, seq_len, hidden_size*num_direction] 输出序列
:和 [batch, hidden_size*num_direction] 最后时刻隐状态
"""
is_lstm = self.is_lstm
is_packed = isinstance(input, PackedSequence)
is_packed = isinstance(x, PackedSequence)
if not is_packed:
seq_len = input.size(1) if self.batch_first else input.size(0)
max_batch_size = input.size(0) if self.batch_first else input.size(1)
seq_len = x.size(1) if self.batch_first else x.size(0)
max_batch_size = x.size(0) if self.batch_first else x.size(1)
seq_lens = torch.LongTensor([seq_len for _ in range(max_batch_size)])
input, batch_sizes = pack_padded_sequence(input, seq_lens, batch_first=self.batch_first)
x, batch_sizes = pack_padded_sequence(x, seq_lens, batch_first=self.batch_first)
else:
max_batch_size = int(input.batch_sizes[0])
input, batch_sizes = input
max_batch_size = int(x.batch_sizes[0])
x, batch_sizes = x

if hx is None:
hx = input.new_zeros(self.num_layers * self.num_directions,
max_batch_size, self.hidden_size, requires_grad=True)
hx = x.new_zeros(self.num_layers * self.num_directions,
max_batch_size, self.hidden_size, requires_grad=True)
if is_lstm:
hx = (hx, hx.new_zeros(hx.size(), requires_grad=True))

mask_x = input.new_ones((max_batch_size, self.input_size))
mask_out = input.new_ones((max_batch_size, self.hidden_size * self.num_directions))
mask_h_ones = input.new_ones((max_batch_size, self.hidden_size))
mask_x = x.new_ones((max_batch_size, self.input_size))
mask_out = x.new_ones((max_batch_size, self.hidden_size * self.num_directions))
mask_h_ones = x.new_ones((max_batch_size, self.hidden_size))
nn.functional.dropout(mask_x, p=self.input_dropout, training=self.training, inplace=True)
nn.functional.dropout(mask_out, p=self.hidden_dropout, training=self.training, inplace=True)

hidden = input.new_zeros((self.num_layers*self.num_directions, max_batch_size, self.hidden_size))
hidden = x.new_zeros((self.num_layers * self.num_directions, max_batch_size, self.hidden_size))
if is_lstm:
cellstate = input.new_zeros((self.num_layers*self.num_directions, max_batch_size, self.hidden_size))
cellstate = x.new_zeros((self.num_layers * self.num_directions, max_batch_size, self.hidden_size))
for layer in range(self.num_layers):
output_list = []
input_seq = PackedSequence(input, batch_sizes)
input_seq = PackedSequence(x, batch_sizes)
mask_h = nn.functional.dropout(mask_h_ones, p=self.hidden_dropout, training=self.training, inplace=False)
for direction in range(self.num_directions):
output_x, hidden_x = self._forward_one(layer, direction, input_seq, hx,
@@ -161,22 +180,32 @@ class VarRNNBase(nn.Module):
cellstate[idx] = hidden_x[1]
else:
hidden[idx] = hidden_x
input = torch.cat(output_list, dim=-1)
x = torch.cat(output_list, dim=-1)

if is_lstm:
hidden = (hidden, cellstate)

if is_packed:
output = PackedSequence(input, batch_sizes)
output = PackedSequence(x, batch_sizes)
else:
input = PackedSequence(input, batch_sizes)
output, _ = pad_packed_sequence(input, batch_first=self.batch_first)
x = PackedSequence(x, batch_sizes)
output, _ = pad_packed_sequence(x, batch_first=self.batch_first)

return output, hidden


class VarLSTM(VarRNNBase):
"""Variational Dropout LSTM.

:param input_size: 输入 `x` 的特征维度
:param hidden_size: 隐状态 `h` 的特征维度
:param num_layers: rnn的层数. Default: 1
:param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True``
:param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为
:(batch, seq, feature). Default: ``False``
:param input_dropout: 对输入的dropout概率. Default: 0
:param hidden_dropout: 对每个隐状态的dropout概率. Default: 0
:param bidirectional: 若为 ``True``, 使用双向的LSTM. Default: ``False``
"""

def __init__(self, *args, **kwargs):
@@ -185,6 +214,16 @@ class VarLSTM(VarRNNBase):

class VarRNN(VarRNNBase):
"""Variational Dropout RNN.

:param input_size: 输入 `x` 的特征维度
:param hidden_size: 隐状态 `h` 的特征维度
:param num_layers: rnn的层数. Default: 1
:param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True``
:param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为
:(batch, seq, feature). Default: ``False``
:param input_dropout: 对输入的dropout概率. Default: 0
:param hidden_dropout: 对每个隐状态的dropout概率. Default: 0
:param bidirectional: 若为 ``True``, 使用双向的RNN. Default: ``False``
"""

def __init__(self, *args, **kwargs):
@@ -193,6 +232,16 @@ class VarRNN(VarRNNBase):

class VarGRU(VarRNNBase):
"""Variational Dropout GRU.

:param input_size: 输入 `x` 的特征维度
:param hidden_size: 隐状态 `h` 的特征维度
:param num_layers: rnn的层数. Default: 1
:param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True``
:param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为
:(batch, seq, feature). Default: ``False``
:param input_dropout: 对输入的dropout概率. Default: 0
:param hidden_dropout: 对每个隐状态的dropout概率. Default: 0
:param bidirectional: 若为 ``True``, 使用双向的GRU. Default: ``False``
"""

def __init__(self, *args, **kwargs):
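A usage sketch of the variational-dropout LSTM with the parameters documented above; sizes and dropout values are illustrative::

    import torch
    from fastNLP.modules.encoder.variational_rnn import VarLSTM

    lstm = VarLSTM(input_size=50, hidden_size=100, num_layers=1, batch_first=True,
                   input_dropout=0.2, hidden_dropout=0.2, bidirectional=False)
    x = torch.randn(4, 20, 50)       # [batch, seq_len, input_size]
    output, (h_n, c_n) = lstm(x)     # output: [batch, seq_len, hidden_size]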


test/core/test_sampler.py (+1, -7)

@@ -4,17 +4,11 @@ import unittest
import torch

from fastNLP.core.dataset import DataSet
from fastNLP.core.sampler import convert_to_torch_tensor, SequentialSampler, RandomSampler, \
from fastNLP.core.sampler import SequentialSampler, RandomSampler, \
k_means_1d, k_means_bucketing, simple_sort_bucketing, BucketSampler


class TestSampler(unittest.TestCase):
def test_convert_to_torch_tensor(self):
data = [[1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [1, 3, 4, 5, 2]]
ans = convert_to_torch_tensor(data, False)
assert isinstance(ans, torch.Tensor)
assert tuple(ans.shape) == (3, 5)

def test_sequential_sampler(self):
sampler = SequentialSampler()
data = [1, 3, 5, 7, 9, 2, 4, 6, 8, 10]


test/models/test_biaffine_parser.py (+10, -10)

@@ -44,34 +44,34 @@ data_file = """

def init_data():
ds = fastNLP.DataSet()
v = {'word_seq': fastNLP.Vocabulary(),
'pos_seq': fastNLP.Vocabulary(),
v = {'words1': fastNLP.Vocabulary(),
'words2': fastNLP.Vocabulary(),
'label_true': fastNLP.Vocabulary()}
data = []
for line in data_file.split('\n'):
line = line.split()
if len(line) == 0 and len(data) > 0:
data = list(zip(*data))
ds.append(fastNLP.Instance(word_seq=data[1],
pos_seq=data[4],
ds.append(fastNLP.Instance(words1=data[1],
words2=data[4],
arc_true=data[6],
label_true=data[7]))
data = []
elif len(line) > 0:
data.append(line)

for name in ['word_seq', 'pos_seq', 'label_true']:
for name in ['words1', 'words2', 'label_true']:
ds.apply(lambda x: ['<st>'] + list(x[name]), new_field_name=name)
ds.apply(lambda x: v[name].add_word_lst(x[name]))

for name in ['word_seq', 'pos_seq', 'label_true']:
for name in ['words1', 'words2', 'label_true']:
ds.apply(lambda x: [v[name].to_index(w) for w in x[name]], new_field_name=name)

ds.apply(lambda x: [0] + list(map(int, x['arc_true'])), new_field_name='arc_true')
ds.apply(lambda x: len(x['word_seq']), new_field_name='seq_lens')
ds.set_input('word_seq', 'pos_seq', 'seq_lens', flag=True)
ds.set_target('arc_true', 'label_true', 'seq_lens', flag=True)
return ds, v['word_seq'], v['pos_seq'], v['label_true']
ds.apply(lambda x: len(x['words1']), new_field_name='seq_len')
ds.set_input('words1', 'words2', 'seq_len', flag=True)
ds.set_target('arc_true', 'label_true', 'seq_len', flag=True)
return ds, v['words1'], v['words2'], v['label_true']


class TestBiaffineParser(unittest.TestCase):


test/test_tutorials.py (+7, -1)

@@ -437,4 +437,10 @@ class TestTutorial(unittest.TestCase):
)
tester.test()

os.chdir("../..")
def setUp(self):
import os
self._init_wd = os.path.abspath(os.curdir)

def tearDown(self):
import os
os.chdir(self._init_wd)
