Merge remote-tracking branch 'private/dev' into dev

# Conflicts: # fastNLP/api/api.py # fastNLP/modules/encoder/variational_rnn.py
5 years ago · 62a7556a04
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,7 +4,8 @@ python:
 # command to install dependencies
 install:
  - pip install --quiet -r requirements.txt
  - pip install pytest pytest-cov
  # Quote the requirement: unquoted, the shell treats ">" as an output redirection,
  # so it would run a plain `pip install pytest` and write its output to a file named "=3.6".
  - pip install "pytest>=3.6"
  - pip install pytest-cov
 # command to run tests
 script:
  - pytest --cov=./
--- a/README.md
+++ b/README.md
@@ -48,8 +48,10 @@ For example:
 ## Resources

 - [Documentation](https://fastnlp.readthedocs.io/en/latest/)
 - [Tutorials](https://github.com/fastnlp/fastNLP/tutorials)
 - [Source Code](https://github.com/fastnlp/fastNLP)


 ## Installation
 Run the following commands to install fastNLP package.
 ```shell
@@ -70,7 +72,7 @@ pip install fastNLP
 </tr>
 <tr>
    <td><b> fastNLP.core </b></td>
    <td> data representation & train/test presedure </td>
    <td> data representation & train/test procedure </td>
 </tr>
 <tr>
    <td><b> fastNLP.models </b></td>
--- a/fastNLP/api/api.py
+++ b/fastNLP/api/api.py
@@ -13,9 +13,6 @@ from reproduction.chinese_word_segment.cws_io.cws_reader import ConllCWSReader
 from reproduction.pos_tag_model.pos_reader import ZhConllPOSReader
 from reproduction.Biaffine_parser.util import ConllxDataLoader, add_seg_tag
 from fastNLP.core.instance import Instance
 from fastNLP.core.sampler import SequentialSampler
 from fastNLP.core.batch import Batch
 from reproduction.chinese_word_segment.utils import calculate_pre_rec_f1
 from fastNLP.api.pipeline import Pipeline
 from fastNLP.core.metrics import SpanFPreRecMetric
 from fastNLP.api.processor import IndexerProcessor
@@ -23,10 +20,9 @@ from fastNLP.api.processor import IndexerProcessor

 # TODO add pretrain urls
 model_urls = {

    'cws': "http://123.206.98.91:8888/download/cws_crf_1_11-457fc899.pkl"
 }


 class API:
    def __init__(self):
        self.pipeline = None
@@ -139,6 +135,12 @@ class POS(API):

 class CWS(API):
    def __init__(self, model_path=None, device='cpu'):
        """
        中文分词高级接口。

        :param model_path: 当model_path为None，使用默认位置的model。如果默认位置不存在，则自动下载模型
        :param device: str，可以为'cpu', 'cuda'或'cuda:0'等。会将模型load到相应device进行推断。
        """
        super(CWS, self).__init__()
        if model_path is None:
            model_path = model_urls['cws']
@@ -146,7 +148,13 @@ class CWS(API):
        self.load(model_path, device)

    def predict(self, content):
        """
        分词接口。

        :param content: str或List[str], 例如: "中文分词很重要！"， 返回的结果是"中文 分词 很 重要 !"。 如果传入的为List[str]，比如
            [ "中文分词很重要！", ...], 返回的结果["中文 分词 很 重要 !", ...]。
        :return: str或List[str], 根据输入的的类型决定。
        """
        if not hasattr(self, 'pipeline'):
            raise ValueError("You have to load model first.")

@@ -164,17 +172,35 @@ class CWS(API):
        # 3. 使用pipeline
        self.pipeline(dataset)

        output = dataset['output'].content
        output = dataset.get_field('output').content
        if isinstance(content, str):
            return output[0]
        elif isinstance(content, list):
            return output

    def test(self, filepath):

        tag_proc = self._dict['tag_indexer']
        """
        传入一个分词文件路径，返回该数据集上分词f1, precision, recall。
        分词文件应该为:
            1	编者按	编者按	NN	O	11	nmod:topic
            2	：	：	PU	O	11	punct
            3	7月	7月	NT	DATE	4	compound:nn
            4	12日	12日	NT	DATE	11	nmod:tmod
            5	，	，	PU	O	11	punct

            1	这	这	DT	O	3	det
            2	款	款	M	O	1	mark:clf
            3	飞行	飞行	NN	O	8	nsubj
            4	从	从	P	O	5	case
            5	外型	外型	NN	O	8	nmod:prep
        以空行分割两个句子，有内容的每行有7列。

        :param filepath: str, 文件路径路径。
        :return: float, float, float. 分别f1, precision, recall.
        """
        tag_proc = self._dict['tag_proc']
        cws_model = self.pipeline.pipeline[-2].model
        pipeline = self.pipeline.pipeline[:5]
        pipeline = self.pipeline.pipeline[:-2]

        pipeline.insert(1, tag_proc)
        pp = Pipeline(pipeline)
@@ -185,12 +211,16 @@ class CWS(API):
        te_dataset = reader.load(filepath)
        pp(te_dataset)

        batch_size = 64
        te_batcher = Batch(te_dataset, batch_size, SequentialSampler(), use_cuda=False)
        pre, rec, f1 = calculate_pre_rec_f1(cws_model, te_batcher, type='bmes')
        f1 = round(f1 * 100, 2)
        pre = round(pre * 100, 2)
        rec = round(rec * 100, 2)
        from fastNLP.core.tester import Tester
        from fastNLP.core.metrics import BMESF1PreRecMetric

        tester = Tester(data=te_dataset, model=cws_model, metrics=BMESF1PreRecMetric(target='target'), batch_size=64,
                        verbose=0)
        eval_res = tester.test()

        f1 = eval_res['BMESF1PreRecMetric']['f']
        pre = eval_res['BMESF1PreRecMetric']['pre']
        rec = eval_res['BMESF1PreRecMetric']['rec']
        # print("f1:{:.2f}, pre:{:.2f}, rec:{:.2f}".format(f1, pre, rec))

        return f1, pre, rec
@@ -287,7 +317,7 @@ class Analyzer:

    def test(self, filepath):
        output_dict = {}
        if self.seg:
        if self.cws:
            seg_output = self.cws.test(filepath)
            output_dict['seg'] = seg_output
        if self.pos:
@@ -309,18 +339,24 @@ if __name__ == "__main__":
    # print(pos.test("/home/zyfeng/data/sample.conllx"))
    # print(pos.predict(s))

    # cws_model_path = '../../reproduction/chinese_word_segment/models/cws_crf.pkl'
    # cws = CWS(device='cpu')
    # s = ['本品是一个抗酸抗胆汁的胃黏膜保护剂' ,
    #     '这款飞行从外型上来看酷似电影中的太空飞行器，据英国方面介绍，可以实现洲际远程打击。',
    #      '那么这款无人机到底有多厉害？']
    # print(cws.test('/Users/yh/Desktop/test_data/cws_test.conll'))
    # print(cws.predict(s))

    # cws_model_path = '../../reproduction/chinese_word_segment/models/cws_crf_1_11.pkl'
    cws = CWS(device='cpu')
    s = ['本品是一个抗酸抗胆汁的胃黏膜保护剂' ,
        '这款飞行从外型上来看酷似电影中的太空飞行器，据英国方面介绍，可以实现洲际远程打击。',
    parser_path = '/home/yfshao/workdir/fastnlp/reproduction/Biaffine_parser/pipe.pkl'
    parser = Parser(parser_path, device='cpu')
    # print(parser.test('/Users/yh/Desktop/test_data/parser_test2.conll'))
    s = ['编者按：7月12日，英国航空航天系统公司公布了该公司研制的第一款高科技隐形无人机雷电之神。',
         '这款飞行从外型上来看酷似电影中的太空飞行器，据英国方面介绍，可以实现洲际远程打击。',
         '那么这款无人机到底有多厉害？']
    print(cws.test('/home/hyan/ctb3/test.conllx'))
    print(cws.predict(s))
    print(cws.predict('本品是一个抗酸抗胆汁的胃黏膜保护剂'))

    # parser = Parser(device='cpu')
    # print(parser.test('/Users/yh/Desktop/test_data/parser_test2.conll'))
    # s = ['编者按：7月12日，英国航空航天系统公司公布了该公司研制的第一款高科技隐形无人机雷电之神。',
    #      '这款飞行从外型上来看酷似电影中的太空飞行器，据英国方面介绍，可以实现洲际远程打击。',
    #      '那么这款无人机到底有多厉害？']
    # print(parser.predict(s))
    print(parser.predict(s))
--- a/fastNLP/api/processor.py
+++ b/fastNLP/api/processor.py
@@ -270,8 +270,8 @@ class ModelProcessor(Processor):
                        for idx, seq_len in enumerate(seq_lens):
                            tmp_batch.append(value[idx, :seq_len])
                        batch_output[key].extend(tmp_batch)

                batch_output[self.seq_len_field_name].extend(seq_lens)
                if not self.seq_len_field_name in prediction:
                    batch_output[self.seq_len_field_name].extend(seq_lens)

        # TODO 当前的实现会导致之后的processor需要知道model输出的output的key是什么
        for field_name, fields in batch_output.items():
--- a/fastNLP/core/batch.py
+++ b/fastNLP/core/batch.py
@@ -26,7 +26,8 @@ class Batch(object):
        self.as_numpy = as_numpy
        self.idx_list = None
        self.curidx = 0
        self.num_batches = len(dataset)//batch_size + int(len(dataset)%batch_size!=0)
        self.num_batches = len(dataset) // batch_size + int(len(dataset) % batch_size != 0)
        self.cur_batch_indices = None

    def __iter__(self):
        self.idx_list = self.sampler(self.dataset)
@@ -42,6 +43,7 @@ class Batch(object):
            batch_x, batch_y = {}, {}

            indices = self.idx_list[self.curidx:endidx]
            self.cur_batch_indices = indices

            for field_name, field in self.dataset.get_all_fields().items():
                if field.is_target or field.is_input:
@@ -60,6 +62,9 @@ class Batch(object):
    def __len__(self):
        return self.num_batches

    def get_batch_indices(self):
        """Return the dataset indices that make up the most recently produced batch.

        The value is ``None`` until a batch has been drawn from this iterator.
        """
        return self.cur_batch_indices


 def to_tensor(batch, dtype):
    if dtype in (int, np.int8, np.int16, np.int32, np.int64):
--- a/fastNLP/core/callback.py
+++ b/fastNLP/core/callback.py
@@ -12,34 +12,72 @@ class Callback(object):
        # before the main training loop
        pass

    def before_epoch(self):
    def before_epoch(self, cur_epoch, total_epoch):
        # at the beginning of each epoch
        pass

    def before_batch(self):
    def before_batch(self, batch_x, batch_y, indices):
        # at the beginning of each step/mini-batch
        pass

    def before_loss(self):
    def before_loss(self, batch_y, predict_y):
        # after data_forward, and before loss computation
        pass

    def before_backward(self):
    def before_backward(self, loss, model):
        # after loss computation, and before gradient backward
        pass

    def after_batch(self):
    def after_backward(self, model):
        # after gradient backward, and before the parameter update (e.g. gradient clipping)
        pass

    def after_step(self, optimizer):
        # after the parameter update of each step (e.g. learning-rate scheduling)
        pass

    def after_batch(self, *args):
        # at the end of each step/mini-batch
        pass

    def after_epoch(self):
        # at the end of each epoch
    def after_valid(self, eval_result, metric_key, optimizer):
        """
        Called each time an evaluation on the validation set has finished; receives eval_result.

        :param eval_result: Dict[str: Dict[str: float]], the evaluation result
        :param metric_key: str
        :param optimizer:
        :return:
        """
        pass

    def after_train(self):
        # after training loop
    def after_epoch(self, cur_epoch, n_epoch, optimizer):
        """
        Called at the end of every epoch.

        :param cur_epoch: int, the current epoch, counted from 1.
        :param n_epoch: int, the total number of epochs.
        :param optimizer: the optimizer passed to the Trainer.
        :return:
        """
        pass

    def after_train(self, model):
        """
        Called once training has finished.

        :param model: nn.Module, the model passed to the Trainer
        :return:
        """
        pass

    def on_exception(self, exception, model, indices):
        """
        Triggered when an exception occurs during training.

        :param exception: an Exception of some type, e.g. KeyboardInterrupt
        :param model: the model passed to the Trainer
        :param indices: the indices of the current batch
        :return:
        """
        pass

 def transfer(func):
    """装饰器，将对CallbackManager的调用转发到各个Callback子类.
@@ -48,12 +86,12 @@ def transfer(func):
    :return:
    """

    def wrapper(manager):
    def wrapper(manager, *arg):
        returns = []
        for callback in manager.callbacks:
            for env_name, env_value in manager.env.items():
                setattr(callback, env_name, env_value)
            returns.append(getattr(callback, func.__name__)())
            returns.append(getattr(callback, func.__name__)(*arg))
        return returns

    return wrapper
@@ -91,19 +129,27 @@ class CallbackManager(Callback):
        pass

    @transfer
    def before_epoch(self):
    def before_epoch(self, cur_epoch, total_epoch):
        pass

    @transfer
    def before_batch(self):
    def before_batch(self, batch_x, batch_y, indices):
        pass

    @transfer
    def before_loss(self):
    def before_loss(self, batch_y, predict_y):
        pass

    @transfer
    def before_backward(self):
    def before_backward(self, loss, model):
        pass

    @transfer
    def after_backward(self, model):
        pass

    @transfer
    def after_step(self, optimizer):
        pass

    @transfer
@@ -111,51 +157,86 @@ class CallbackManager(Callback):
        pass

    @transfer
    def after_epoch(self):
    def after_valid(self, eval_result, metric_key, optimizer):
        pass

    @transfer
    def after_train(self):
    def after_epoch(self, cur_epoch, n_epoch, optimizer):
        pass

    @transfer
    def after_train(self, model):
        pass

    @transfer
    def on_exception(self, exception, model, indices):
        pass


 class DummyCallback(Callback):
    def before_train(self):
        print("before train!!!")
        print(self.n_epoch)
    def before_train(self, *arg):
        print(arg)

    def after_epoch(self):
        print("after epoch!!!")
        return 12
    def after_epoch(self, cur_epoch, n_epoch, optimizer):
        print(cur_epoch, n_epoch, optimizer)


 class EchoCallback(Callback):
    def before_train(self):
        print("before_train")

    def before_epoch(self):
    def before_epoch(self, cur_epoch, total_epoch):
        print("before_epoch")

    def before_batch(self):
    def before_batch(self, batch_x, batch_y, indices):
        print("before_batch")

    def before_loss(self):
    def before_loss(self, batch_y, predict_y):
        print("before_loss")

    def before_backward(self):
    def before_backward(self, loss, model):
        print("before_backward")

    def after_batch(self):
        print("after_batch")

    def after_epoch(self):
    def after_epoch(self, cur_epoch, n_epoch, optimizer):
        print("after_epoch")

    def after_train(self):
    def after_train(self, model):
        print("after_train")

 class GradientClipCallback(Callback):
    def __init__(self, parameters=None, clip_value=1, clip_type='norm'):
        """
        Clip gradients after every backward pass, before the parameter update.

        :param parameters: None, torch.Tensor or an iterable of torch.Tensor, usually obtained from
            model.parameters(). If None, all parameters of the model handed to ``after_backward`` are clipped.
        :param clip_value: float, the clipping threshold; should be positive.
        :param clip_type: str, either 'norm' or 'value'.
            (1) 'norm': rescale the gradients so that their total norm is at most clip_value
            (2) 'value': clamp every gradient element to [-clip_value, clip_value]; elements below
                -clip_value become -clip_value and elements above clip_value become clip_value.
        :raises ValueError: if clip_type is neither 'norm' nor 'value'.
        """
        super().__init__()

        import torch
        from torch import nn
        if clip_type == 'norm':
            self.clip_fun = nn.utils.clip_grad_norm_
        elif clip_type == 'value':
            self.clip_fun = nn.utils.clip_grad_value_
        else:
            raise ValueError("Only supports `norm` or `value` right now.")
        # A generator such as model.parameters() is exhausted after a single pass; keep a list so
        # the same parameters can be clipped on every step.
        if parameters is not None and not isinstance(parameters, torch.Tensor):
            parameters = list(parameters)
        self.parameters = parameters
        self.clip_value = clip_value

    def after_backward(self, model):
        """
        Clip the gradients once they have been computed.

        :param model: the model being trained; its parameters are clipped when no explicit
            ``parameters`` were given at construction.
        """
        # Honor the parameters chosen at construction; fall back to the whole model otherwise.
        if self.parameters is None:
            self.clip_fun(model.parameters(), self.clip_value)
        else:
            self.clip_fun(self.parameters, self.clip_value)



 if __name__ == "__main__":
    manager = CallbackManager(env={"n_epoch": 3}, callbacks=[DummyCallback(), DummyCallback()])
    manager.before_train()
    print(manager.after_epoch())
    manager.before_train(10, 11, 12)
    # print(manager.after_epoch())
--- a/fastNLP/core/trainer.py
+++ b/fastNLP/core/trainer.py
@@ -7,7 +7,11 @@ import numpy as np
 import torch
 from tensorboardX import SummaryWriter
 from torch import nn
 from tqdm.autonotebook import tqdm

 try:
    from tqdm.autonotebook import tqdm
 except:
    from fastNLP.core.utils import pseudo_tqdm as tqdm

 from fastNLP.core.batch import Batch
 from fastNLP.core.callback import CallbackManager
@@ -108,7 +112,7 @@ class Trainer(object):
        self.use_cuda = bool(use_cuda)
        self.save_path = save_path
        self.print_every = int(print_every)
        self.validate_every = int(validate_every)
        self.validate_every = int(validate_every) if validate_every!=0 else -1
        self.best_metric_indicator = None
        self.sampler = sampler
        self.callback_manager = CallbackManager(env={"trainer": self}, callbacks=callbacks)
@@ -119,11 +123,7 @@ class Trainer(object):
            self.optimizer = optimizer.construct_from_pytorch(self.model.parameters())

        self.use_tqdm = use_tqdm
        if self.use_tqdm:
            tester_verbose = 0
            self.print_every = abs(self.print_every)
        else:
            tester_verbose = 1
        self.print_every = abs(self.print_every)

        if self.dev_data is not None:
            self.tester = Tester(model=self.model,
@@ -131,7 +131,7 @@ class Trainer(object):
                                 metrics=self.metrics,
                                 batch_size=self.batch_size,
                                 use_cuda=self.use_cuda,
                                 verbose=tester_verbose)
                                 verbose=0)

        self.step = 0
        self.start_time = None  # start timestamp
@@ -199,11 +199,8 @@ class Trainer(object):
                self._summary_writer = SummaryWriter(path)

            self.callback_manager.before_train()
            if self.use_tqdm:
                self._tqdm_train()
            else:
                self._print_train()
            self.callback_manager.after_train()
            self._train()
            self.callback_manager.after_train(self.model)

            if self.dev_data is not None:
                print("\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) +
@@ -225,28 +222,43 @@ class Trainer(object):

        return results

    def _tqdm_train(self):
    def _train(self):
        if not self.use_tqdm:
            from fastNLP.core.utils import pseudo_tqdm as inner_tqdm
        else:
            inner_tqdm = tqdm
        self.step = 0
        data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler,
                                as_numpy=False)
        total_steps = data_iterator.num_batches*self.n_epochs
        with tqdm(total=total_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar:
        start = time.time()
        data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False)
        total_steps = data_iterator.num_batches * self.n_epochs
        with inner_tqdm(total=total_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar:
            avg_loss = 0
            for epoch in range(1, self.n_epochs+1):
                pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs))
                self.callback_manager.before_epoch()
                # early stopping
                self.callback_manager.before_epoch(epoch, self.n_epochs)
                for batch_x, batch_y in data_iterator:
                    self.callback_manager.before_batch()
                    indices = data_iterator.get_batch_indices()
                    # negative sampling; replace unknown; re-weight batch_y
                    self.callback_manager.before_batch(batch_x, batch_y, indices)
                    _move_dict_value_to_device(batch_x, batch_y, device=self._model_device)
                    prediction = self._data_forward(self.model, batch_x)

                    self.callback_manager.before_loss()
                    # edit prediction
                    self.callback_manager.before_loss(batch_y, prediction)
                    loss = self._compute_loss(prediction, batch_y)
                    avg_loss += loss.item()

                    self.callback_manager.before_backward()
                    # Is loss NaN or inf? requires_grad = False
                    self.callback_manager.before_backward(loss, self.model)
                    self._grad_backward(loss)
                    # gradient clipping
                    self.callback_manager.after_backward(self.model)

                    self._update()
                    # lr scheduler; lr_finder; one_cycle
                    self.callback_manager.after_step(self.optimizer)

                    self._summary_writer.add_scalar("loss", loss.item(), global_step=self.step)
                    for name, param in self.model.named_parameters():
                        if param.requires_grad:
@@ -254,77 +266,41 @@ class Trainer(object):
                            # self._summary_writer.add_scalar(name + "_std", param.std(), global_step=self.step)
                            # self._summary_writer.add_scalar(name + "_grad_sum", param.sum(), global_step=self.step)
                    if (self.step+1) % self.print_every == 0:
                        pbar.set_postfix_str("loss:{0:<6.5f}".format(avg_loss / self.print_every))
                        if self.use_tqdm:
                            print_output = "loss:{0:<6.5f}".format(avg_loss / self.print_every)
                            pbar.update(self.print_every)
                        else:
                            end = time.time()
                            diff = timedelta(seconds=round(end - start))
                            print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.6} time: {}".format(
                                epoch, self.step, avg_loss, diff)
                        pbar.set_postfix_str(print_output)
                        avg_loss = 0
                        pbar.update(self.print_every)
                    self.step += 1
                    # do nothing
                    self.callback_manager.after_batch()

                    if self.validate_every > 0 and self.step % self.validate_every == 0 \
                    if ((self.validate_every > 0 and self.step % self.validate_every == 0) or
                        (self.validate_every < 0 and self.step % len(data_iterator)) == 0) \
                            and self.dev_data is not None:
                        eval_res = self._do_validation(epoch=epoch, step=self.step)
                        eval_str = "Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, total_steps) + \
                        eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step,
                                                                                    total_steps) + \
                                   self.tester._format_eval_results(eval_res)
                        pbar.write(eval_str)
                if self.validate_every < 0 and self.dev_data:
                    eval_res = self._do_validation(epoch=epoch, step=self.step)
                    eval_str = "Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, total_steps) + \
                               self.tester._format_eval_results(eval_res)
                    pbar.write(eval_str)
                if epoch!=self.n_epochs:

                # if self.validate_every < 0 and self.dev_data:
                #     eval_res = self._do_validation(epoch=epoch, step=self.step)
                #     eval_str = "Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, total_steps) + \
                #                self.tester._format_eval_results(eval_res)
                #     pbar.write(eval_str)
                if epoch != self.n_epochs:
                    data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler,
                                          as_numpy=False)
                self.callback_manager.after_epoch()
                # lr decay; early stopping
                self.callback_manager.after_epoch(epoch, self.n_epochs, self.optimizer)
            pbar.close()

    def _print_train(self):
        epoch = 1
        start = time.time()
        while epoch <= self.n_epochs:
            self.callback_manager.before_epoch()

            data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler,
                                  as_numpy=False)

            for batch_x, batch_y in data_iterator:
                self.callback_manager.before_batch()
                # TODO 这里可能会遇到问题，万一用户在model内部修改了prediction的device就会有问题
                _move_dict_value_to_device(batch_x, batch_y, device=self._model_device)
                prediction = self._data_forward(self.model, batch_x)

                self.callback_manager.before_loss()
                loss = self._compute_loss(prediction, batch_y)

                self.callback_manager.before_backward()
                self._grad_backward(loss)
                self._update()

                self._summary_writer.add_scalar("loss", loss.item(), global_step=self.step)
                for name, param in self.model.named_parameters():
                    if param.requires_grad:
                        self._summary_writer.add_scalar(name + "_mean", param.mean(), global_step=self.step)
                        # self._summary_writer.add_scalar(name + "_std", param.std(), global_step=self.step)
                        # self._summary_writer.add_scalar(name + "_grad_sum", param.sum(), global_step=self.step)
                if self.print_every > 0 and self.step % self.print_every == 0:
                    end = time.time()
                    diff = timedelta(seconds=round(end - start))
                    print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.6} time:  {}".format(
                        epoch, self.step, loss.data, diff)
                    print(print_output)

                if (self.validate_every > 0 and self.step % self.validate_every == 0 and
                        self.dev_data is not None):
                    self._do_validation(epoch=epoch, step=self.step)

                self.step += 1
                self.callback_manager.after_batch()

            # validate_every override validation at end of epochs
            if self.dev_data and self.validate_every <= 0:
                self._do_validation(epoch=epoch, step=self.step)
            epoch += 1
            self.callback_manager.after_epoch()

    def _do_validation(self, epoch, step):
        res = self.tester.test()
        for name, metric in res.items():
@@ -340,6 +316,8 @@ class Trainer(object):
            self.best_dev_perf = res
            self.best_dev_epoch = epoch
            self.best_dev_step = step
        # get validation results; adjust optimizer
        self.callback_manager.after_valid(res, self.metric_key, self.optimizer)
        return res

    def _mode(self, model, is_test=False):
--- a/fastNLP/core/utils.py
+++ b/fastNLP/core/utils.py
@@ -430,3 +430,30 @@ def seq_mask(seq_len, max_len):
    seq_len = seq_len.view(-1, 1).long()   # [batch_size, 1]
    seq_range = torch.arange(start=0, end=max_len, dtype=torch.long, device=seq_len.device).view(1, -1) # [1, max_len]
    return torch.gt(seq_len, seq_range) # [batch_size, max_len]


 class pseudo_tqdm:
    """
    Stand-in for tqdm, used when tqdm cannot be imported or when the Trainer is created with
    use_tqdm set to false; messages are printed instead of being drawn on a progress bar.
    """

    def __init__(self, **kwargs):
        # tqdm's keyword options are accepted and ignored
        pass

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # nothing to release; returning None lets any exception propagate
        return None

    def set_postfix_str(self, info):
        print(info)

    def write(self, info):
        print(info)

    def __getattr__(self, item):
        # every other tqdm method (update, close, ...) resolves to a no-op
        return lambda *args, **kwargs: None
--- a/fastNLP/io/dataset_loader.py
+++ b/fastNLP/io/dataset_loader.py
@@ -254,7 +254,7 @@ class TokenizeDataSetLoader(DataSetLoader):


 class ClassDataSetLoader(DataSetLoader):
    """Loader for classification data sets"""
    """Loader for a dummy classification data set"""

    def __init__(self):
        super(ClassDataSetLoader, self).__init__()
@@ -304,7 +304,7 @@ class ConllLoader(DataSetLoader):
    @staticmethod
    def parse(lines):
        """
        :param list lines:a list containing all lines in a conll file.
        :param list lines: a list containing all lines in a conll file.
        :return: a 3D list
        """
        sentences = list()
--- a/fastNLP/modules/aggregator/attention.py
+++ b/fastNLP/modules/aggregator/attention.py
@@ -1,12 +1,13 @@
 import math

 import torch
 from torch import nn
 import torch.nn.functional as F
 import math
 from torch import nn

 from fastNLP.modules.utils import mask_softmax


 class Attention(torch.nn.Module):

    def __init__(self, normalize=False):
        super(Attention, self).__init__()
        self.normalize = normalize
@@ -20,9 +21,9 @@ class Attention(torch.nn.Module):
    def _atten_forward(self, query, memory):
        raise NotImplementedError


 class DotAtte(nn.Module):
    def __init__(self, key_size, value_size):
        # TODO never test
        super(DotAtte, self).__init__()
        self.key_size = key_size
        self.value_size = value_size
@@ -42,10 +43,9 @@ class DotAtte(nn.Module):
        output = nn.functional.softmax(output, dim=2)
        return torch.matmul(output, V)


 class MultiHeadAtte(nn.Module):
    def __init__(self, input_size, output_size, key_size, value_size, num_atte):
        raise NotImplementedError
        # TODO never test
        super(MultiHeadAtte, self).__init__()
        self.in_linear = nn.ModuleList()
        for i in range(num_atte * 3):
--- a/fastNLP/modules/aggregator/self_attention.py
+++ b/fastNLP/modules/aggregator/self_attention.py
@@ -7,13 +7,14 @@ from fastNLP.modules.utils import initial_parameter


 class SelfAttention(nn.Module):
    """
    Self Attention Module.
    """Self Attention Module.

    Args:
    input_size: int, the size for the input vector
    dim: int, the width of weight matrix.
    num_vec: int, the number of encoded vectors
    :param int input_size:
    :param int attention_unit:
    :param int attention_hops:
    :param float drop:
    :param str initial_method:
    :param bool use_cuda:
    """

    def __init__(self, input_size, attention_unit=350, attention_hops=10, drop=0.5, initial_method=None,
@@ -48,7 +49,7 @@ class SelfAttention(nn.Module):
    def forward(self, input, input_origin):
        """
        :param input:  the matrix to do attention.              [baz, senLen, h_dim]
        :param inp:  then token index include pad token( 0 )   [baz , senLen]
        :param inp: then token index include pad token( 0 )   [baz , senLen]
        :return output1: the input matrix after attention operation   [baz, multi-head , h_dim]
        :return output2: the attention penalty term, a scalar  [1]
        """
@@ -59,8 +60,8 @@ class SelfAttention(nn.Module):
        input_origin = input_origin.transpose(0, 1).contiguous()  # [baz, hops,len]

        y1 = self.tanh(self.ws1(self.drop(input)))  # [baz,len,dim] -->[bsz,len, attention-unit]
        attention = self.ws2(y1).transpose(1,
                                           2).contiguous()  # [bsz,len, attention-unit]--> [bsz, len, hop]--> [baz,hop,len]
        attention = self.ws2(y1).transpose(1, 2).contiguous()
        # [bsz,len, attention-unit]--> [bsz, len, hop]--> [baz,hop,len]

        attention = attention + (-999999 * (input_origin == 0).float())  # remove the weight on padding token.
        attention = F.softmax(attention, 2)  # [baz ,hop, len]
--- a/fastNLP/modules/decoder/CRF.py
+++ b/fastNLP/modules/decoder/CRF.py
@@ -19,13 +19,14 @@ def seq_len_to_byte_mask(seq_lens):
    mask = broadcast_arange.float().lt(seq_lens.float().view(-1, 1))
    return mask


 def allowed_transitions(id2label, encoding_type='bio'):
    """

    :param id2label: dict, key是label的indices，value是str类型的tag或tag-label。value可以是只有tag的, 比如"B", "M"; 也可以是
    :param dict id2label: key是label的indices，value是str类型的tag或tag-label。value可以是只有tag的, 比如"B", "M"; 也可以是
        "B-NN", "M-NN", tag和label之间一定要用"-"隔开。一般可以通过Vocabulary.get_id2word()id2label。
    :param encoding_type: str, 支持"bio", "bmes"。
    :return:List[Tuple(int, int)]], 内部的Tuple是(from_tag_id, to_tag_id)。 返回的结果考虑了start和end，比如"BIO"中，B、O可以
    :return: List[Tuple(int, int)]], 内部的Tuple是(from_tag_id, to_tag_id)。 返回的结果考虑了start和end，比如"BIO"中，B、O可以
        位于序列的开端，而I不行。所以返回的结果中会包含(start_idx, B_idx), (start_idx, O_idx), 但是不包含(start_idx, I_idx).
        start_idx=len(id2label), end_idx=len(id2label)+1。
    """
@@ -57,6 +58,7 @@ def allowed_transitions(id2label, encoding_type='bio'):
                allowed_trans.append((from_id, to_id))
    return allowed_trans


 def is_transition_allowed(encoding_type, from_tag, from_label, to_tag, to_label):
    """

@@ -130,16 +132,16 @@ def is_transition_allowed(encoding_type, from_tag, from_label, to_tag, to_label)


 class ConditionalRandomField(nn.Module):
    def __init__(self, num_tags, include_start_end_trans=False, allowed_transitions=None, initial_method=None):
        """
    """

        :param num_tags: int, 标签的数量。
        :param include_start_end_trans: bool, 是否包含起始tag
        :param allowed_transitions: List[Tuple[from_tag_id(int), to_tag_id(int)]]. 允许的跃迁，可以通过allowed_transitions()得到。
            如果为None，则所有跃迁均为合法
        :param initial_method:
        """
    :param int num_tags: 标签的数量。
    :param bool include_start_end_trans: 是否包含起始tag
    :param list allowed_transitions: ``List[Tuple[from_tag_id(int), to_tag_id(int)]]``. 允许的跃迁，可以通过allowed_transitions()得到。
        如果为None，则所有跃迁均为合法
    :param str initial_method:
    """

    def __init__(self, num_tags, include_start_end_trans=False, allowed_transitions=None, initial_method=None):
        super(ConditionalRandomField, self).__init__()

        self.include_start_end_trans = include_start_end_trans
@@ -161,7 +163,6 @@ class ConditionalRandomField(nn.Module):

        # self.reset_parameter()
        initial_parameter(self, initial_method)

    def reset_parameter(self):
        nn.init.xavier_normal_(self.trans_m)
        if self.include_start_end_trans:
@@ -169,9 +170,9 @@ class ConditionalRandomField(nn.Module):
            nn.init.normal_(self.end_scores)

    def _normalizer_likelihood(self, logits, mask):
        """
        Computes the (batch_size,) denominator term for the log-likelihood, which is the
        """Computes the (batch_size,) denominator term for the log-likelihood, which is the
        sum of the likelihoods across all possible state sequences.

        :param logits:FloatTensor, max_len x batch_size x num_tags
        :param mask:ByteTensor, max_len x batch_size
        :return:FloatTensor, batch_size
@@ -236,8 +237,8 @@ class ConditionalRandomField(nn.Module):
        return all_path_score - gold_path_score

    def viterbi_decode(self, data, mask, get_score=False, unpad=False):
        """
        Given a feats matrix, return best decode path and best score.
        """Given a feats matrix, return best decode path and best score.

        :param data:FloatTensor, batch_size x max_len x num_tags
        :param mask:ByteTensor batch_size x max_len
        :param get_score: bool, whether to output the decode score.
--- a/fastNLP/modules/decoder/MLP.py
+++ b/fastNLP/modules/decoder/MLP.py
@@ -1,21 +1,23 @@
 import torch
 import torch.nn as nn

 from fastNLP.modules.utils import initial_parameter


 class MLP(nn.Module):
    def __init__(self, size_layer, activation='relu', initial_method=None, dropout=0.0):
        """Multilayer Perceptrons as a decoder
    """Multilayer Perceptrons as a decoder

        :param size_layer: list of int, define the size of MLP layers.
        :param activation: str or function, the activation function for hidden layers.
        :param initial_method: str, the name of init method.
        :param dropout: float, the probability of dropout.
    :param list size_layer: list of int, define the size of MLP layers.
    :param str activation: str or function, the activation function for hidden layers.
    :param str initial_method: the name of initialization method.
    :param float dropout: the probability of dropout.

        .. note::
            There is no activation function applying on output layer.
    .. note::
        There is no activation function applying on output layer.

        """
    """

    def __init__(self, size_layer, activation='relu', initial_method=None, dropout=0.0):
        super(MLP, self).__init__()
        self.hiddens = nn.ModuleList()
        self.output = None
--- a/fastNLP/modules/dropout.py
+++ b/fastNLP/modules/dropout.py
@@ -2,8 +2,8 @@ import torch


 class TimestepDropout(torch.nn.Dropout):
    """This module accepts a `[batch_size, num_timesteps, embedding_dim)]` and use a single
    dropout mask of shape `(batch_size, embedding_dim)` to apply on every time step.
    """This module accepts a ``[batch_size, num_timesteps, embedding_dim)]`` and use a single
    dropout mask of shape ``(batch_size, embedding_dim)`` to apply on every time step.
    """

    def forward(self, x):
--- a/fastNLP/modules/encoder/char_embedding.py
+++ b/fastNLP/modules/encoder/char_embedding.py
@@ -1,5 +1,4 @@
 import torch
 import torch.nn.functional as F
 from torch import nn

 from fastNLP.modules.utils import initial_parameter
@@ -7,17 +6,17 @@ from fastNLP.modules.utils import initial_parameter

 # from torch.nn.init import xavier_uniform
 class ConvCharEmbedding(nn.Module):
    """Character-level Embedding with CNN.

    :param int char_emb_size: the size of character level embedding. Default: 50
        say 26 characters, each embedded to 50 dim vector, then the input_size is 50.
    :param tuple feature_maps: tuple of int. The length of the tuple is the number of convolution operations
        over characters. The i-th integer is the number of filters (dim of out channels) for the i-th
        convolution.
    :param tuple kernels: tuple of int. The width of each kernel.
    """

    def __init__(self, char_emb_size=50, feature_maps=(40, 30, 30), kernels=(3, 4, 5), initial_method=None):
        """
        Character Level Word Embedding
        :param char_emb_size: the size of character level embedding. Default: 50
            say 26 characters, each embedded to 50 dim vector, then the input_size is 50.
        :param feature_maps: tuple of int. The length of the tuple is the number of convolution operations
            over characters. The i-th integer is the number of filters (dim of out channels) for the i-th
            convolution.
        :param kernels: tuple of int. The width of each kernel.
        """
        super(ConvCharEmbedding, self).__init__()
        self.convs = nn.ModuleList([
            nn.Conv2d(1, feature_maps[i], kernel_size=(char_emb_size, kernels[i]), bias=True, padding=(0, 4))
@@ -27,8 +26,8 @@ class ConvCharEmbedding(nn.Module):

    def forward(self, x):
        """
        :param x: [batch_size * sent_length, word_length, char_emb_size]
        :return: [batch_size * sent_length, sum(feature_maps), 1]
        :param x: ``[batch_size * sent_length, word_length, char_emb_size]``
        :return: feature map of shape [batch_size * sent_length, sum(feature_maps), 1]
        """
        x = x.contiguous().view(x.size(0), 1, x.size(1), x.size(2))
        # [batch_size*sent_length, channel, width, height]
@@ -51,13 +50,12 @@ class ConvCharEmbedding(nn.Module):


 class LSTMCharEmbedding(nn.Module):
    """
    Character Level Word Embedding with LSTM with a single layer.
    :param char_emb_size: int, the size of character level embedding. Default: 50
    """Character-level Embedding with LSTM.

    :param int char_emb_size: the size of character level embedding. Default: 50
        say 26 characters, each embedded to 50 dim vector, then the input_size is 50.
    :param hidden_size: int, the number of hidden units. Default:  equal to char_emb_size.
    :param int hidden_size: the number of hidden units. Default:  equal to char_emb_size.
    """

    def __init__(self, char_emb_size=50, hidden_size=None, initial_method=None):
        super(LSTMCharEmbedding, self).__init__()
        self.hidden_size = char_emb_size if hidden_size is None else hidden_size
@@ -71,7 +69,7 @@ class LSTMCharEmbedding(nn.Module):

    def forward(self, x):
        """
        :param x:[ n_batch*n_word, word_length, char_emb_size]
        :param x: ``[ n_batch*n_word, word_length, char_emb_size]``
        :return: [ n_batch*n_word, char_emb_size]
        """
        batch_size = x.shape[0]
--- a/fastNLP/modules/encoder/conv.py
+++ b/fastNLP/modules/encoder/conv.py
@@ -3,20 +3,30 @@

 import torch
 import torch.nn as nn
 from torch.nn.init import xavier_uniform_
 # import torch.nn.functional as F

 from fastNLP.modules.utils import initial_parameter


 # import torch.nn.functional as F


 class Conv(nn.Module):
    """
    Basic 1-d convolution module.
    initialize with xavier_uniform
    """
    """Basic 1-d convolution module, initialized with xavier_uniform.

    :param int in_channels:
    :param int out_channels:
    :param tuple kernel_size:
    :param int stride:
    :param int padding:
    :param int dilation:
    :param int groups:
    :param bool bias:
    :param str activation:
    :param str initial_method:
    """
    def __init__(self, in_channels, out_channels, kernel_size,
                 stride=1, padding=0, dilation=1,
                 groups=1, bias=True, activation='relu',initial_method = None ):
                 groups=1, bias=True, activation='relu', initial_method=None):
        super(Conv, self).__init__()
        self.conv = nn.Conv1d(
            in_channels=in_channels,
--- a/fastNLP/modules/encoder/conv_maxpool.py
+++ b/fastNLP/modules/encoder/conv_maxpool.py
@@ -4,17 +4,27 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn.init import xavier_uniform_

 from fastNLP.modules.utils import initial_parameter


 class ConvMaxpool(nn.Module):
    """
    Convolution and max-pooling module with multiple kernel sizes.
    """
    """Convolution and max-pooling module with multiple kernel sizes.

    :param int in_channels:
    :param int out_channels:
    :param tuple kernel_sizes:
    :param int stride:
    :param int padding:
    :param int dilation:
    :param int groups:
    :param bool bias:
    :param str activation:
    :param str initial_method:
    """
    def __init__(self, in_channels, out_channels, kernel_sizes,
                 stride=1, padding=0, dilation=1,
                 groups=1, bias=True, activation='relu',initial_method = None ):
                 groups=1, bias=True, activation="relu", initial_method=None):
        super(ConvMaxpool, self).__init__()

        # convolution
--- a/fastNLP/modules/encoder/embedding.py
+++ b/fastNLP/modules/encoder/embedding.py
@@ -2,16 +2,13 @@ import torch.nn as nn


 class Embedding(nn.Module):
    """
    A simple lookup table
    Args:
    nums : the size of the lookup table
    dims : the size of each vector
    padding_idx : pads the tensor with zeros whenever it encounters this index
    sparse : If True, gradient matrix will be a sparse tensor. In this case,
    only optim.SGD(cuda and cpu) and optim.Adagrad(cpu) can be used
    """
    """A simple lookup table.

    :param int nums: the size of the lookup table
    :param int dims: the size of each vector
    :param int padding_idx: pads the tensor with zeros whenever it encounters this index
    :param bool sparse: If True, gradient matrix will be a sparse tensor. In this case, only optim.SGD(cuda and cpu) and optim.Adagrad(cpu) can be used
    """
    def __init__(self, nums, dims, padding_idx=0, sparse=False, init_emb=None, dropout=0.0):
        super(Embedding, self).__init__()
        self.embed = nn.Embedding(nums, dims, padding_idx, sparse=sparse)
--- a/fastNLP/modules/encoder/linear.py
+++ b/fastNLP/modules/encoder/linear.py
@@ -5,15 +5,12 @@ from fastNLP.modules.utils import initial_parameter

 class Linear(nn.Module):
    """
    Linear module
    Args:
    input_size : input size
    hidden_size : hidden size
    num_layers : number of hidden layers
    dropout : dropout rate
    bidirectional : If True, becomes a bidirectional RNN
    """

    :param int input_size: input size
    :param int output_size: output size
    :param bool bias:
    :param str initial_method:
    """
    def __init__(self, input_size, output_size, bias=True, initial_method=None):
        super(Linear, self).__init__()
        self.linear = nn.Linear(input_size, output_size, bias)
--- a/fastNLP/modules/encoder/lstm.py
+++ b/fastNLP/modules/encoder/lstm.py
@@ -6,14 +6,16 @@ from fastNLP.modules.utils import initial_parameter
 class LSTM(nn.Module):
    """Long Short Term Memory

    Args:
    input_size : input size
    hidden_size : hidden size
    num_layers : number of hidden layers. Default: 1
    dropout : dropout rate. Default: 0.5
    bidirectional : If True, becomes a bidirectional RNN. Default: False.
    :param int input_size:
    :param int hidden_size:
    :param int num_layers:
    :param float dropout:
    :param bool batch_first:
    :param bool bidirectional:
    :param bool bias:
    :param str initial_method:
    :param bool get_hidden:
    """

    def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0.0, batch_first=True,
                 bidirectional=False, bias=True, initial_method=None, get_hidden=False):
        super(LSTM, self).__init__()
--- a/fastNLP/modules/encoder/masked_rnn.py
+++ b/fastNLP/modules/encoder/masked_rnn.py
@@ -5,6 +5,8 @@ import torch.nn as nn
 import torch.nn.functional as F

 from fastNLP.modules.utils import initial_parameter


 def MaskedRecurrent(reverse=False):
    def forward(input, hidden, cell, mask, train=True, dropout=0):
        """
@@ -254,16 +256,16 @@ class MaskedRNNBase(nn.Module):
        return output, hidden

    def step(self, input, hx=None, mask=None):
        '''
        execute one step forward (only for one-directional RNN).
        Args:
            input (batch, input_size): input tensor of this step.
            hx (num_layers, batch, hidden_size): the hidden state of last step.
            mask (batch): the mask tensor of this step.
        Returns:
            output (batch, hidden_size): tensor containing the output of this step from the last layer of RNN.
            hn (num_layers, batch, hidden_size): tensor containing the hidden state of this step
        '''
        """Execute one step forward (only for one-directional RNN).

        :param Tensor input: input tensor of this step. (batch, input_size)
        :param Tensor hx: the hidden state of last step. (num_layers, batch, hidden_size)
        :param Tensor mask: the mask tensor of this step. (batch, )
        :returns:
            **output** (batch, hidden_size), tensor containing the output of this step from the last layer of RNN.
            **hn** (num_layers, batch, hidden_size), tensor containing the hidden state of this step

        """
        assert not self.bidirectional, "step only cannot be applied to bidirectional RNN."  # aha, typo!
        batch_size = input.size(0)
        lstm = self.Cell is nn.LSTMCell
@@ -285,25 +287,23 @@ class MaskedRNN(MaskedRNNBase):
    r"""Applies a multi-layer Elman RNN with costomized non-linearity to an
    input sequence.
    For each element in the input sequence, each layer computes the following
    function:
    .. math::
        h_t = \tanh(w_{ih} * x_t + b_{ih}  +  w_{hh} * h_{(t-1)} + b_{hh})
    function. :math:`h_t = \tanh(w_{ih} * x_t + b_{ih}  +  w_{hh} * h_{(t-1)} + b_{hh})`

    where :math:`h_t` is the hidden state at time `t`, and :math:`x_t` is
    the hidden state of the previous layer at time `t` or :math:`input_t`
    for the first layer. If nonlinearity='relu', then `ReLU` is used instead
    of `tanh`.
    Args:
        input_size: The number of expected features in the input x
        hidden_size: The number of features in the hidden state h
        num_layers: Number of recurrent layers.
        nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'tanh'
        bias: If False, then the layer does not use bias weights b_ih and b_hh.
            Default: True
        batch_first: If True, then the input and output tensors are provided
            as (batch, seq, feature)
        dropout: If non-zero, introduces a dropout layer on the outputs of each
            RNN layer except the last layer
        bidirectional: If True, becomes a bidirectional RNN. Default: False


    :param int input_size: The number of expected features in the input x
    :param int hidden_size: The number of features in the hidden state h
    :param int num_layers: Number of recurrent layers.
    :param str nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'tanh'
    :param bool bias: If False, then the layer does not use bias weights b_ih and b_hh. Default: True
    :param bool batch_first: If True, then the input and output tensors are provided as (batch, seq, feature)
    :param float dropout: If non-zero, introduces a dropout layer on the outputs of each RNN layer except the last layer
    :param bool bidirectional: If True, becomes a bidirectional RNN. Default: False

    Inputs: input, mask, h_0
        - **input** (seq_len, batch, input_size): tensor containing the features
          of the input sequence.
@@ -327,32 +327,33 @@ class MaskedLSTM(MaskedRNNBase):
    r"""Applies a multi-layer long short-term memory (LSTM) RNN to an input
    sequence.
    For each element in the input sequence, each layer computes the following
    function:
    function.

    .. math::
            \begin{array}{ll}
            i_t = \mathrm{sigmoid}(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\
            f_t = \mathrm{sigmoid}(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\
            g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hc} h_{(t-1)} + b_{hg}) \\
            o_t = \mathrm{sigmoid}(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\
            c_t = f_t * c_{(t-1)} + i_t * g_t \\
            h_t = o_t * \tanh(c_t)
            \end{array}

        \begin{array}{ll}
        i_t = \mathrm{sigmoid}(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\
        f_t = \mathrm{sigmoid}(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\
        g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{(t-1)} + b_{hg}) \\
        o_t = \mathrm{sigmoid}(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\
        c_t = f_t * c_{(t-1)} + i_t * g_t \\
        h_t = o_t * \tanh(c_t)
        \end{array}

    where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell
    state at time `t`, :math:`x_t` is the hidden state of the previous layer at
    time `t` or :math:`input_t` for the first layer, and :math:`i_t`,
    :math:`f_t`, :math:`g_t`, :math:`o_t` are the input, forget, cell,
    and out gates, respectively.
    Args:
        input_size: The number of expected features in the input x
        hidden_size: The number of features in the hidden state h
        num_layers: Number of recurrent layers.
        bias: If False, then the layer does not use bias weights b_ih and b_hh.
            Default: True
        batch_first: If True, then the input and output tensors are provided
            as (batch, seq, feature)
        dropout: If non-zero, introduces a dropout layer on the outputs of each
            RNN layer except the last layer
        bidirectional: If True, becomes a bidirectional RNN. Default: False

    :param int input_size: The number of expected features in the input x
    :param int hidden_size: The number of features in the hidden state h
    :param int num_layers: Number of recurrent layers.
    :param bool bias: If False, then the layer does not use bias weights b_ih and b_hh. Default: True
    :param bool batch_first: If True, then the input and output tensors are provided as (batch, seq, feature)
    :param float dropout: If non-zero, introduces a dropout layer on the outputs of each RNN layer except the last layer
    :param bool bidirectional: If True, becomes a bidirectional RNN. Default: False

    Inputs: input, mask, (h_0, c_0)
        - **input** (seq_len, batch, input_size): tensor containing the features
          of the input sequence.
@@ -380,29 +381,30 @@ class MaskedGRU(MaskedRNNBase):
    r"""Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence.
    For each element in the input sequence, each layer computes the following
    function:

    .. math::

            \begin{array}{ll}
            r_t = \mathrm{sigmoid}(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
            z_t = \mathrm{sigmoid}(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\
            n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\
            h_t = (1 - z_t) * n_t + z_t * h_{(t-1)} \\
            \end{array}

    where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the hidden
    state of the previous layer at time `t` or :math:`input_t` for the first
    layer, and :math:`r_t`, :math:`z_t`, :math:`n_t` are the reset, update,
    and new gates, respectively.
    Args:
        input_size: The number of expected features in the input x
        hidden_size: The number of features in the hidden state h
        num_layers: Number of recurrent layers.
        nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'tanh'
        bias: If False, then the layer does not use bias weights b_ih and b_hh.
            Default: True
        batch_first: If True, then the input and output tensors are provided
            as (batch, seq, feature)
        dropout: If non-zero, introduces a dropout layer on the outputs of each
            RNN layer except the last layer
        bidirectional: If True, becomes a bidirectional RNN. Default: False

    :param int input_size: The number of expected features in the input x
    :param int hidden_size: The number of features in the hidden state h
    :param int num_layers: Number of recurrent layers.
    :param str nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'tanh'
    :param bool bias: If False, then the layer does not use bias weights b_ih and b_hh. Default: True
    :param bool batch_first: If True, then the input and output tensors are provided as (batch, seq, feature)
    :param float dropout: If non-zero, introduces a dropout layer on the outputs of each RNN layer except the last layer
    :param bool bidirectional: If True, becomes a bidirectional RNN. Default: False

    Inputs: input, mask, h_0
        - **input** (seq_len, batch, input_size): tensor containing the features
          of the input sequence.
--- a/fastNLP/modules/encoder/transformer.py
+++ b/fastNLP/modules/encoder/transformer.py
@@ -1,10 +1,9 @@
 import torch
 from torch import nn
 import torch.nn.functional as F

 from ..aggregator.attention import MultiHeadAtte
 from ..other_modules import LayerNormalization


 class TransformerEncoder(nn.Module):
    class SubLayer(nn.Module):
        def __init__(self, input_size, output_size, key_size, value_size, num_atte):
@@ -12,8 +11,8 @@ class TransformerEncoder(nn.Module):
            self.atte = MultiHeadAtte(input_size, output_size, key_size, value_size, num_atte)
            self.norm1 = LayerNormalization(output_size)
            self.ffn = nn.Sequential(nn.Linear(output_size, output_size),
                                    nn.ReLU(),
                                    nn.Linear(output_size, output_size))
                                     nn.ReLU(),
                                     nn.Linear(output_size, output_size))
            self.norm2 = LayerNormalization(output_size)

        def forward(self, input, seq_mask):
@@ -28,5 +27,3 @@ class TransformerEncoder(nn.Module):

    def forward(self, x, seq_mask=None):
        return self.layers(x, seq_mask)


--- a/fastNLP/modules/encoder/variational_rnn.py
+++ b/fastNLP/modules/encoder/variational_rnn.py
@@ -1,5 +1,3 @@
 import math

 import torch
 import torch.nn as nn
 from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence, pad_packed_sequence
@@ -8,15 +6,17 @@ from fastNLP.modules.utils import initial_parameter
 try:
    from torch import flip
 except ImportError:
   def flip(x, dims):
    def flip(x, dims):
        indices = [slice(None)] * x.dim()
        for dim in dims:
            indices[dim] = torch.arange(x.size(dim) - 1, -1, -1, dtype=torch.long, device=x.device)
        return x[tuple(indices)]


 class VarRnnCellWrapper(nn.Module):
    """Wrapper for normal RNN Cells, make it support variational dropout
    """

    def __init__(self, cell, hidden_size, input_p, hidden_p):
        super(VarRnnCellWrapper, self).__init__()
        self.cell = cell
@@ -88,6 +88,7 @@ class VarRNNBase(nn.Module):
    refer to `A Theoretically Grounded Application of Dropout in Recurrent Neural Networks (Yarin Gal and Zoubin Ghahramani, 2016)
    https://arxiv.org/abs/1512.05287`.
    """

    def __init__(self, mode, Cell, input_size, hidden_size, num_layers=1,
                 bias=True, batch_first=False,
                 input_dropout=0, hidden_dropout=0, bidirectional=False):
@@ -177,18 +178,23 @@ class VarRNNBase(nn.Module):
 class VarLSTM(VarRNNBase):
    """Variational Dropout LSTM.

    Thin wrapper that fixes ``mode="LSTM"`` and ``Cell=nn.LSTMCell`` and forwards
    every other positional and keyword argument to ``VarRNNBase.__init__``.
    """

    def __init__(self, *args, **kwargs):
        # ``mode`` and ``Cell`` are the first two parameters of VarRNNBase.__init__.
        # Passing them positionally (instead of as keywords before *args) lets
        # callers use positional arguments, e.g. ``VarLSTM(input_size, hidden_size)``,
        # without a "got multiple values for argument 'mode'" TypeError.
        super(VarLSTM, self).__init__("LSTM", nn.LSTMCell, *args, **kwargs)


 class VarRNN(VarRNNBase):
    """Variational Dropout RNN.

    Thin wrapper that fixes ``mode="RNN"`` and ``Cell=nn.RNNCell`` and forwards
    every other positional and keyword argument to ``VarRNNBase.__init__``.
    """

    def __init__(self, *args, **kwargs):
        # ``mode`` and ``Cell`` are the first two parameters of VarRNNBase.__init__.
        # Passing them positionally (instead of as keywords before *args) lets
        # callers use positional arguments, e.g. ``VarRNN(input_size, hidden_size)``,
        # without a "got multiple values for argument 'mode'" TypeError.
        super(VarRNN, self).__init__("RNN", nn.RNNCell, *args, **kwargs)


 class VarGRU(VarRNNBase):
    """Variational Dropout GRU.

    Thin wrapper that fixes ``mode="GRU"`` and ``Cell=nn.GRUCell`` and forwards
    every other positional and keyword argument to ``VarRNNBase.__init__``.
    """

    def __init__(self, *args, **kwargs):
        # ``mode`` and ``Cell`` are the first two parameters of VarRNNBase.__init__.
        # Passing them positionally (instead of as keywords before *args) lets
        # callers use positional arguments, e.g. ``VarGRU(input_size, hidden_size)``,
        # without a "got multiple values for argument 'mode'" TypeError.
        super(VarGRU, self).__init__("GRU", nn.GRUCell, *args, **kwargs)

--- a/fastNLP/modules/other_modules.py
+++ b/fastNLP/modules/other_modules.py
@@ -29,8 +29,11 @@ class GroupNorm(nn.Module):


 class LayerNormalization(nn.Module):
    """ Layer normalization module """
    """

    :param int layer_size:
    :param float eps: default=1e-3
    """
    def __init__(self, layer_size, eps=1e-3):
        super(LayerNormalization, self).__init__()

@@ -52,12 +55,11 @@ class LayerNormalization(nn.Module):
 class BiLinear(nn.Module):
    def __init__(self, n_left, n_right, n_out, bias=True):
        """
        Args:
            n_left: size of left input
            n_right: size of right input
            n_out: size of output
            bias: If set to False, the layer will not learn an additive bias.
                Default: True

        :param int n_left: size of left input
        :param int n_right: size of right input
        :param int n_out: size of output
        :param bool bias: If set to False, the layer will not learn an additive bias. Default: True
        """
        super(BiLinear, self).__init__()
        self.n_left = n_left
@@ -83,12 +85,9 @@ class BiLinear(nn.Module):

    def forward(self, input_left, input_right):
        """
        Args:
            input_left: Tensor
                the left input tensor with shape = [batch1, batch2, ..., left_features]
            input_right: Tensor
                the right input tensor with shape = [batch1, batch2, ..., right_features]
        Returns:
        :param Tensor input_left: the left input tensor with shape = [batch1, batch2, ..., left_features]
        :param Tensor input_right: the right input tensor with shape = [batch1, batch2, ..., right_features]

        """
        left_size = input_left.size()
        right_size = input_right.size()
@@ -118,16 +117,11 @@ class BiLinear(nn.Module):
 class BiAffine(nn.Module):
    def __init__(self, n_enc, n_dec, n_labels, biaffine=True, **kwargs):
        """
        Args:
            n_enc: int
                the dimension of the encoder input.
            n_dec: int
                the dimension of the decoder input.
            n_labels: int
                the number of labels of the crf layer
            biaffine: bool
                if apply bi-affine parameter.
            **kwargs:

        :param int n_enc: the dimension of the encoder input.
        :param int n_dec: the dimension of the decoder input.
        :param int n_labels: the number of labels of the crf layer
        :param bool biaffine: if apply bi-affine parameter.
        """
        super(BiAffine, self).__init__()
        self.n_enc = n_enc
@@ -154,17 +148,12 @@ class BiAffine(nn.Module):

    def forward(self, input_d, input_e, mask_d=None, mask_e=None):
        """
        Args:
            input_d: Tensor
                the decoder input tensor with shape = [batch, length_decoder, input_size]
            input_e: Tensor
                the child input tensor with shape = [batch, length_encoder, input_size]
            mask_d: Tensor or None
                the mask tensor for decoder with shape = [batch, length_decoder]
            mask_e: Tensor or None
                the mask tensor for encoder with shape = [batch, length_encoder]
        Returns: Tensor
            the energy tensor with shape = [batch, num_label, length, length]

        :param Tensor input_d: the decoder input tensor with shape = [batch, length_decoder, input_size]
        :param Tensor input_e: the encoder input tensor with shape = [batch, length_encoder, input_size]
        :param mask_d: Tensor or None, the mask tensor for decoder with shape = [batch, length_decoder]
        :param mask_e: Tensor or None, the mask tensor for encoder with shape = [batch, length_encoder]
        :returns: Tensor, the energy tensor with shape = [batch, num_label, length, length]
        """
        assert input_d.size(0) == input_e.size(0), 'batch sizes of encoder and decoder are requires to be equal.'
        batch, length_decoder, _ = input_d.size()
--- a/fastNLP/modules/utils.py
+++ b/fastNLP/modules/utils.py
@@ -15,7 +15,7 @@ def initial_parameter(net, initial_method=None):
    """A method used to initialize the weights of PyTorch models.

    :param net: a PyTorch model
    :param initial_method: str, one of the following initializations
    :param str initial_method: one of the following initializations.

            - xavier_uniform
            - xavier_normal (default)
@@ -79,7 +79,7 @@ def seq_mask(seq_len, max_len):

    :param seq_len: list or torch.Tensor, the lengths of sequences in a batch.
    :param max_len: int, the maximum sequence length in a batch.
    :return mask: torch.LongTensor, [batch_size, max_len]
    :return: mask, torch.LongTensor, [batch_size, max_len]

    """
    if not isinstance(seq_len, torch.Tensor):
--- a/reproduction/chinese_word_segment/models/cws_model.py
+++ b/reproduction/chinese_word_segment/models/cws_model.py
@@ -65,7 +65,7 @@ class CWSBiLSTMEncoder(BaseModel):

        x_tensor = self.char_embedding(chars)

        if not bigrams is None:
        if hasattr(self, 'bigram_embedding'):
            bigram_tensor = self.bigram_embedding(bigrams).view(batch_size, max_len, -1)
            x_tensor = torch.cat([x_tensor, bigram_tensor], dim=2)
        x_tensor = self.embedding_drop(x_tensor)
@@ -185,5 +185,5 @@ class CWSBiLSTMCRF(BaseModel):
        feats = self.decoder_model(feats)
        probs = self.crf.viterbi_decode(feats, masks, get_score=False)

        return {'pred': probs}
        return {'pred': probs, 'seq_lens':seq_lens}

--- a/reproduction/chinese_word_segment/process/cws_processor.py
+++ b/reproduction/chinese_word_segment/process/cws_processor.py
@@ -238,7 +238,7 @@ class VocabIndexerProcessor(Processor):

    """
    def __init__(self, field_name, new_added_filed_name=None, min_freq=1, max_size=None,
                 verbose=1, is_input=True):
                 verbose=0, is_input=True):
        """

        :param field_name: 从哪个field_name创建词表，以及对哪个field_name进行index操作
@@ -320,6 +320,15 @@ class VocabIndexerProcessor(Processor):
    def get_vocab_size(self):
        return len(self.vocab)

    def set_verbose(self, verbose):
        """
        Set the verbosity of this processor.

        :param verbose: int. 0 suppresses all output; 1 prints vocab information.
        :return:
        """
        self.verbose = verbose

 class VocabProcessor(Processor):
    def __init__(self, field_name, min_freq=1, max_size=None):

@@ -378,7 +387,7 @@ class BMES2OutputProcessor(Processor):
        prediction为BSEMS，会被认为是SSSSS.

    """
    def __init__(self, chars_field_name='chars_list', tag_field_name='pred_tags', new_added_field_name='output',
    def __init__(self, chars_field_name='chars_list', tag_field_name='pred', new_added_field_name='output',
            b_idx = 0, m_idx = 1, e_idx = 2, s_idx = 3):
        """

--- a/reproduction/chinese_word_segment/train_context.py
+++ b/reproduction/chinese_word_segment/train_context.py
@@ -11,7 +11,6 @@ from reproduction.chinese_word_segment.process.cws_processor import InputTargetP
 from reproduction.chinese_word_segment.cws_io.cws_reader import ConllCWSReader
 from reproduction.chinese_word_segment.models.cws_model import CWSBiLSTMCRF

 from reproduction.chinese_word_segment.utils import calculate_pre_rec_f1

 ds_name = 'msr'

@@ -39,8 +38,6 @@ bigram_vocab_proc = VocabIndexerProcessor('bigrams_lst', new_added_filed_name='b

 seq_len_proc = SeqLenProcessor('chars')

 input_target_proc = InputTargetProcessor(input_fields=['chars', 'bigrams', 'seq_lens', "target"],
                                         target_fields=['target', 'seq_lens'])
 # 2. Apply the processors
 fs2hs_proc(tr_dataset)

@@ -63,15 +60,15 @@ char_vocab_proc(dev_dataset)
 bigram_vocab_proc(dev_dataset)
 seq_len_proc(dev_dataset)

 input_target_proc(tr_dataset)
 input_target_proc(dev_dataset)
 dev_dataset.set_input('target')
 tr_dataset.set_input('target')


 print("Finish preparing data.")

 # 3. The datasets are now ready to be used for training
 # TODO how should pretrained embeddings be handled?

 import torch
 from torch import optim


@@ -79,8 +76,8 @@ tag_size = tag_proc.tag_size

 cws_model = CWSBiLSTMCRF(char_vocab_proc.get_vocab_size(), embed_dim=100,
                            bigram_vocab_num=bigram_vocab_proc.get_vocab_size(),
                            bigram_embed_dim=100, num_bigram_per_char=8,
                            hidden_size=200, bidirectional=True, embed_drop_p=0.2,
                            bigram_embed_dim=30, num_bigram_per_char=8,
                            hidden_size=200, bidirectional=True, embed_drop_p=0.3,
                            num_layers=1, tag_size=tag_size)
 cws_model.cuda()

@@ -108,7 +105,7 @@ pp.add_processor(bigram_proc)
 pp.add_processor(char_vocab_proc)
 pp.add_processor(bigram_vocab_proc)
 pp.add_processor(seq_len_proc)
 pp.add_processor(input_target_proc)
 # pp.add_processor(input_target_proc)

 # te_filename = '/hdd/fudanNLP/CWS/CWS_semiCRF/all_data/{}/middle_files/{}_test.txt'.format(ds_name, ds_name)
 te_filename = '/home/hyan/ctb3/test.conllx'
@@ -142,14 +139,16 @@ from fastNLP.api.processor import ModelProcessor
 from reproduction.chinese_word_segment.process.cws_processor import BMES2OutputProcessor

 model_proc = ModelProcessor(cws_model)
 output_proc = BMES2OutputProcessor()
 output_proc = BMES2OutputProcessor(chars_field_name='chars_lst', tag_field_name='pred')

 pp = Pipeline()
 pp.add_processor(fs2hs_proc)
 # pp.add_processor(sp_proc)
 pp.add_processor(char_proc)
 pp.add_processor(bigram_proc)
 char_vocab_proc.set_verbose(0)
 pp.add_processor(char_vocab_proc)
 bigram_vocab_proc.set_verbose(0)
 pp.add_processor(bigram_vocab_proc)
 pp.add_processor(seq_len_proc)

@@ -158,9 +157,11 @@ pp.add_processor(output_proc)


 # TODO it looks like the test pipeline and the infer pipeline need to be distinguished here

 infer_context_dict = {'pipeline': pp}
 # torch.save(infer_context_dict, 'models/cws_crf.pkl')
 import torch
 import datetime
 now = datetime.datetime.now()
 infer_context_dict = {'pipeline': pp, 'tag_proc': tag_proc}
 torch.save(infer_context_dict, 'models/cws_crf_{}_{}.pkl'.format(now.month, now.day))


 # TODO still need to consider how to map the output back onto the original text
--- a/test/core/test_dataset.py
+++ b/test/core/test_dataset.py
@@ -197,4 +197,4 @@ class TestDataSetIter(unittest.TestCase):
    def test__repr__(self):
        ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
        for iter in ds:
            self.assertEqual(iter.__repr__(), "{'x': [1, 2, 3, 4],\n'y': [5, 6]}")
            self.assertEqual(iter.__repr__(), "{'x': [1, 2, 3, 4] type=list,\n'y': [5, 6] type=list}")
--- a/test/core/test_metrics.py
+++ b/test/core/test_metrics.py
@@ -360,7 +360,8 @@ class TestBMESF1PreRecMetric(unittest.TestCase):

        metric = BMESF1PreRecMetric()
        metric(pred_dict, target_dict)
        self.assertDictEqual(metric.get_metric(), {'f1': 0.999999, 'precision': 1.0, 'recall': 1.0})
        self.assertDictEqual(metric.get_metric(), {'f': 1.0, 'pre': 1.0, 'rec': 1.0})


 class TestUsefulFunctions(unittest.TestCase):
    # Tests for some seemingly useful functions in metrics.py
--- a/tutorials/fastnlp_advanced_tutorial/advance_tutorial.ipynb
+++ b/tutorials/fastnlp_advanced_tutorial/advance_tutorial.ipynb
--- a/tutorials/fastnlp_advanced_tutorial/data/config
+++ b/tutorials/fastnlp_advanced_tutorial/data/config
@@ -0,0 +1,8 @@
 [esim_model]
 embed_dim = 300
 hidden_size = 300
 batch_first = true
 dropout = 0.3
 num_classes = 3
 gpu = true
 batch_size = 32
--- a/tutorials/fastnlp_advanced_tutorial/hypothesis
+++ b/tutorials/fastnlp_advanced_tutorial/hypothesis
@@ -0,0 +1,100 @@
 A person is training his horse for a competition .
 A person is at a diner , ordering an omelette .
 A person is outdoors , on a horse .
 They are smiling at their parents
 There are children present
 The kids are frowning
 The boy skates down the sidewalk .
 The boy does a skateboarding trick .
 The boy is wearing safety equipment .
 An older man drinks his juice as he waits for his daughter to get off work .
 A boy flips a burger .
 An elderly man sits in a small shop .
 Some women are hugging on vacation .
 The women are sleeping .
 There are women showing affection .
 The people are eating omelettes .
 The people are sitting at desks in school .
 The diners are at a restaurant .
 A man is drinking juice .
 Two women are at a restaurant drinking wine .
 A man in a restaurant is waiting for his meal to arrive .
 A blond man getting a drink of water from a fountain in the park .
 A blond man wearing a brown shirt is reading a book on a bench in the park
 A blond man drinking water from a fountain .
 The friends scowl at each other over a full dinner table .
 There are two woman in this picture .
 The friends have just met for the first time in 20 years , and have had a great time catching up .
 The two sisters saw each other across the crowded diner and shared a hug , both clutching their doggie bags .
 Two groups of rival gang members flipped each other off .
 Two women hug each other .
 A team is trying to score the games winning out .
 A team is trying to tag a runner out .
 A team is playing baseball on Saturn .
 A school hosts a basketball game .
 A high school is hosting an event .
 A school is hosting an event .
 The women do not care what clothes they wear .
 Women are waiting by a tram .
 The women enjoy having a good fashion sense .
 A child with mom and dad , on summer vacation at the beach .
 A family of three is at the beach .
 A family of three is at the mall shopping .
 The people waiting on the train are sitting .
 There are people just getting on a train
 There are people waiting on a train .
 A couple are playing with a young child outside .
 A couple are playing frisbee with a young child at the beach .
 A couple watch a little girl play by herself on the beach .
 The family is sitting down for dinner .
 The family is outside .
 The family is on vacation .
 The people are standing still on the curb .
 Near a couple of restaurants , two people walk across the street .
 The couple are walking across the street together .
 The woman is nake .
 The woman is cold .
 The woman is wearing green .
 The man with the sign is caucasian .
 They are protesting outside the capital .
 A woman in white .
 A man is advertising for a restaurant .
 The woman is wearing black .
 A man and a woman walk down a crowded city street .
 The woman is wearing white .
 They are working for John 's Pizza .
 Olympic swimming .
 A man and a soman are eating together at John 's Pizza and Gyro .
 They are walking with a sign .
 The woman is waiting for a friend .
 The man is sitting down while he has a sign for John 's Pizza and Gyro in his arms .
 The woman and man are outdoors .
 A woman ordering pizza .
 The people are related .
 Two adults run across the street to get away from a red shirted person chasing them .
 The adults are both male and female .
 Two people walk home after a tasty steak dinner .
 Two adults swimming in water
 Two adults walk across a street .
 Two people ride bicycles into a tunnel .
 Two people walk away from a restaurant across a street .
 Two adults walking across a road near the convicted prisoner dressed in red
 Two friends cross a street .
 Some people board a train .
 Two adults walk across the street .
 Two adults walking across a road
 There are no women in the picture .
 Two adults walk across the street to get away from a red shirted person who is chasing them .
 A married couple is sleeping .
 A female is next to a man .
 A married couple is walking next to each other .
 Nobody has food .
 A woman eats a banana and walks across a street , and there is a man trailing behind her .
 The woman and man are playing baseball together .
 two coworkers cross pathes on a street
 A woman eats ice cream walking down the sidewalk , and there is another woman in front of her with a purse .
 The mans briefcase is for work .
 A person eating .
 A person that is hungry .
 An actress and her favorite assistant talk a walk in the city .
 a woman eating a banana crosses a street
--- a/tutorials/fastnlp_advanced_tutorial/label
+++ b/tutorials/fastnlp_advanced_tutorial/label
@@ -0,0 +1,100 @@
 1
 2
 0
 1
 0
 2
 2
 0
 1
 1
 2
 1
 1
 2
 0
 1
 2
 0
 0
 2
 1
 1
 2
 0
 2
 0
 1
 1
 2
 0
 1
 0
 2
 2
 1
 0
 2
 0
 1
 1
 0
 2
 1
 0
 0
 0
 1
 2
 2
 0
 1
 2
 0
 1
 2
 1
 0
 1
 2
 0
 0
 2
 1
 0
 1
 2
 2
 0
 1
 2
 0
 1
 1
 2
 0
 1
 2
 0
 2
 0
 1
 1
 2
 0
 0
 2
 1
 2
 0
 1
 2
 0
 2
 1
 2
 1
 0
 1
 1
 0
--- a/tutorials/fastnlp_advanced_tutorial/premise
+++ b/tutorials/fastnlp_advanced_tutorial/premise
@@ -0,0 +1,100 @@
 A person on a horse jumps over a broken down airplane .
 A person on a horse jumps over a broken down airplane .
 A person on a horse jumps over a broken down airplane .
 Children smiling and waving at camera
 Children smiling and waving at camera
 Children smiling and waving at camera
 A boy is jumping on skateboard in the middle of a red bridge .
 A boy is jumping on skateboard in the middle of a red bridge .
 A boy is jumping on skateboard in the middle of a red bridge .
 An older man sits with his orange juice at a small table in a coffee shop while employees in bright colored shirts smile in the background .
 An older man sits with his orange juice at a small table in a coffee shop while employees in bright colored shirts smile in the background .
 An older man sits with his orange juice at a small table in a coffee shop while employees in bright colored shirts smile in the background .
 Two blond women are hugging one another .
 Two blond women are hugging one another .
 Two blond women are hugging one another .
 A few people in a restaurant setting , one of them is drinking orange juice .
 A few people in a restaurant setting , one of them is drinking orange juice .
 A few people in a restaurant setting , one of them is drinking orange juice .
 An older man is drinking orange juice at a restaurant .
 An older man is drinking orange juice at a restaurant .
 An older man is drinking orange juice at a restaurant .
 A man with blond-hair , and a brown shirt drinking out of a public water fountain .
 A man with blond-hair , and a brown shirt drinking out of a public water fountain .
 A man with blond-hair , and a brown shirt drinking out of a public water fountain .
 Two women who just had lunch hugging and saying goodbye .
 Two women who just had lunch hugging and saying goodbye .
 Two women who just had lunch hugging and saying goodbye .
 Two women , holding food carryout containers , hug .
 Two women , holding food carryout containers , hug .
 Two women , holding food carryout containers , hug .
 A Little League team tries to catch a runner sliding into a base in an afternoon game .
 A Little League team tries to catch a runner sliding into a base in an afternoon game .
 A Little League team tries to catch a runner sliding into a base in an afternoon game .
 The school is having a special event in order to show the american culture on how other cultures are dealt with in parties .
 The school is having a special event in order to show the american culture on how other cultures are dealt with in parties .
 The school is having a special event in order to show the american culture on how other cultures are dealt with in parties .
 High fashion ladies wait outside a tram beside a crowd of people in the city .
 High fashion ladies wait outside a tram beside a crowd of people in the city .
 High fashion ladies wait outside a tram beside a crowd of people in the city .
 A man , woman , and child enjoying themselves on a beach .
 A man , woman , and child enjoying themselves on a beach .
 A man , woman , and child enjoying themselves on a beach .
 People waiting to get on a train or just getting off .
 People waiting to get on a train or just getting off .
 People waiting to get on a train or just getting off .
 A couple playing with a little boy on the beach .
 A couple playing with a little boy on the beach .
 A couple playing with a little boy on the beach .
 A couple play in the tide with their young son .
 A couple play in the tide with their young son .
 A couple play in the tide with their young son .
 A man and a woman cross the street in front of a pizza and gyro restaurant .
 A man and a woman cross the street in front of a pizza and gyro restaurant .
 A man and a woman cross the street in front of a pizza and gyro restaurant .
 A woman in a green jacket and hood over her head looking towards a valley .
 A woman in a green jacket and hood over her head looking towards a valley .
 A woman in a green jacket and hood over her head looking towards a valley .
 Woman in white in foreground and a man slightly behind walking with a sign for John 's Pizza and Gyro in the background .
 Woman in white in foreground and a man slightly behind walking with a sign for John 's Pizza and Gyro in the background .
 Woman in white in foreground and a man slightly behind walking with a sign for John 's Pizza and Gyro in the background .
 Woman in white in foreground and a man slightly behind walking with a sign for John 's Pizza and Gyro in the background .
 Woman in white in foreground and a man slightly behind walking with a sign for John 's Pizza and Gyro in the background .
 Woman in white in foreground and a man slightly behind walking with a sign for John 's Pizza and Gyro in the background .
 Woman in white in foreground and a man slightly behind walking with a sign for John 's Pizza and Gyro in the background .
 Woman in white in foreground and a man slightly behind walking with a sign for John 's Pizza and Gyro in the background .
 Woman in white in foreground and a man slightly behind walking with a sign for John 's Pizza and Gyro in the background .
 Woman in white in foreground and a man slightly behind walking with a sign for John 's Pizza and Gyro in the background .
 Woman in white in foreground and a man slightly behind walking with a sign for John 's Pizza and Gyro in the background .
 Woman in white in foreground and a man slightly behind walking with a sign for John 's Pizza and Gyro in the background .
 Woman in white in foreground and a man slightly behind walking with a sign for John 's Pizza and Gyro in the background .
 Woman in white in foreground and a man slightly behind walking with a sign for John 's Pizza and Gyro in the background .
 Woman in white in foreground and a man slightly behind walking with a sign for John 's Pizza and Gyro in the background .
 Two adults , one female in white , with shades and one male , gray clothes , walking across a street , away from a eatery with a blurred image of a dark colored red shirted person in the foreground .
 Two adults , one female in white , with shades and one male , gray clothes , walking across a street , away from a eatery with a blurred image of a dark colored red shirted person in the foreground .
 Two adults , one female in white , with shades and one male , gray clothes , walking across a street , away from a eatery with a blurred image of a dark colored red shirted person in the foreground .
 Two adults , one female in white , with shades and one male , gray clothes , walking across a street , away from a eatery with a blurred image of a dark colored red shirted person in the foreground .
 Two adults , one female in white , with shades and one male , gray clothes , walking across a street , away from a eatery with a blurred image of a dark colored red shirted person in the foreground .
 Two adults , one female in white , with shades and one male , gray clothes , walking across a street , away from a eatery with a blurred image of a dark colored red shirted person in the foreground .
 Two adults , one female in white , with shades and one male , gray clothes , walking across a street , away from a eatery with a blurred image of a dark colored red shirted person in the foreground .
 Two adults , one female in white , with shades and one male , gray clothes , walking across a street , away from a eatery with a blurred image of a dark colored red shirted person in the foreground .
 Two adults , one female in white , with shades and one male , gray clothes , walking across a street , away from a eatery with a blurred image of a dark colored red shirted person in the foreground .
 Two adults , one female in white , with shades and one male , gray clothes , walking across a street , away from a eatery with a blurred image of a dark colored red shirted person in the foreground .
 Two adults , one female in white , with shades and one male , gray clothes , walking across a street , away from a eatery with a blurred image of a dark colored red shirted person in the foreground .
 Two adults , one female in white , with shades and one male , gray clothes , walking across a street , away from a eatery with a blurred image of a dark colored red shirted person in the foreground .
 Two adults , one female in white , with shades and one male , gray clothes , walking across a street , away from a eatery with a blurred image of a dark colored red shirted person in the foreground .
 Two adults , one female in white , with shades and one male , gray clothes , walking across a street , away from a eatery with a blurred image of a dark colored red shirted person in the foreground .
 Two adults , one female in white , with shades and one male , gray clothes , walking across a street , away from a eatery with a blurred image of a dark colored red shirted person in the foreground .
 A woman wearing all white and eating , walks next to a man holding a briefcase .
 A woman wearing all white and eating , walks next to a man holding a briefcase .
 A woman wearing all white and eating , walks next to a man holding a briefcase .
 A woman is walking across the street eating a banana , while a man is following with his briefcase .
 A woman is walking across the street eating a banana , while a man is following with his briefcase .
 A woman is walking across the street eating a banana , while a man is following with his briefcase .
 A woman is walking across the street eating a banana , while a man is following with his briefcase .
 A woman is walking across the street eating a banana , while a man is following with his briefcase .
 A woman is walking across the street eating a banana , while a man is following with his briefcase .
 A woman is walking across the street eating a banana , while a man is following with his briefcase .
 A woman is walking across the street eating a banana , while a man is following with his briefcase .
 A woman is walking across the street eating a banana , while a man is following with his briefcase .
 A woman is walking across the street eating a banana , while a man is following with his briefcase .
--- a/tutorials/fastnlp_advanced_tutorial/tutorial_sample_dataset.csv
+++ b/tutorials/fastnlp_advanced_tutorial/tutorial_sample_dataset.csv
@@ -0,0 +1,77 @@
 A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .	1
 This quiet , introspective and entertaining independent is worth seeking .	4
 Even fans of Ismail Merchant 's work , I suspect , would have a hard time sitting through this one .	1
 A positively thrilling combination of ethnography and all the intrigue , betrayal , deceit and murder of a Shakespearean tragedy or a juicy soap opera .	3
 Aggressive self-glorification and a manipulative whitewash .	1
 A comedy-drama of nearly epic proportions rooted in a sincere performance by the title character undergoing midlife crisis .	4
 Narratively , Trouble Every Day is a plodding mess .	1
 The Importance of Being Earnest , so thick with wit it plays like a reading from Bartlett 's Familiar Quotations	3
 But it does n't leave you with much .	1
 You could hate it for the same reason .	1
 There 's little to recommend Snow Dogs , unless one considers cliched dialogue and perverse escapism a source of high hilarity .	1
 Kung Pow is Oedekerk 's realization of his childhood dream to be in a martial-arts flick , and proves that sometimes the dreams of youth should remain just that .	1
 The performances are an absolute joy .	4
 Fresnadillo has something serious to say about the ways in which extravagant chance can distort our perspective and throw us off the path of good sense .	3
 I still like Moonlight Mile , better judgment be damned .	3
 A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story .	3
 a bilingual charmer , just like the woman who inspired it	3
 Like a less dizzily gorgeous companion to Mr. Wong 's In the Mood for Love -- very much a Hong Kong movie despite its mainland setting .	2
 As inept as big-screen remakes of The Avengers and The Wild Wild West .	1
 It 's everything you 'd expect -- but nothing more .	2
 Best indie of the year , so far .	4
 Hatfield and Hicks make the oddest of couples , and in this sense the movie becomes a study of the gambles of the publishing world , offering a case study that exists apart from all the movie 's political ramifications .	3
 It 's like going to a house party and watching the host defend himself against a frothing ex-girlfriend .	1
 That the Chuck Norris `` grenade gag '' occurs about 7 times during Windtalkers is a good indication of how serious-minded the film is .	2
 The plot is romantic comedy boilerplate from start to finish .	2
 It arrives with an impeccable pedigree , mongrel pep , and almost indecipherable plot complications .	2
 A film that clearly means to preach exclusively to the converted .	2
 While The Importance of Being Earnest offers opportunities for occasional smiles and chuckles , it does n't give us a reason to be in the theater beyond Wilde 's wit and the actors ' performances .	1
 The latest vapid actor 's exercise to appropriate the structure of Arthur Schnitzler 's Reigen .	1
 More vaudeville show than well-constructed narrative , but on those terms it 's inoffensive and actually rather sweet .	2
 Nothing more than a run-of-the-mill action flick .	2
 Hampered -- no , paralyzed -- by a self-indulgent script ... that aims for poetry and ends up sounding like satire .	0
 Ice Age is the first computer-generated feature cartoon to feel like other movies , and that makes for some glacial pacing early on .	2
 There 's very little sense to what 's going on here , but the makers serve up the cliches with considerable dash .	2
 Cattaneo should have followed the runaway success of his first film , The Full Monty , with something different .	2
 They 're the unnamed , easily substitutable forces that serve as whatever terror the heroes of horror movies try to avoid .	1
 It almost feels as if the movie is more interested in entertaining itself than in amusing us .	1
 The movie 's progression into rambling incoherence gives new meaning to the phrase ` fatal script error . '	0
 I still like Moonlight Mile , better judgment be damned .	3
 A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story .	3
 a bilingual charmer , just like the woman who inspired it	3
 Like a less dizzily gorgeous companion to Mr. Wong 's In the Mood for Love -- very much a Hong Kong movie despite its mainland setting .	2
 As inept as big-screen remakes of The Avengers and The Wild Wild West .	1
 It 's everything you 'd expect -- but nothing more .	2
 Best indie of the year , so far .	4
 Hatfield and Hicks make the oddest of couples , and in this sense the movie becomes a study of the gambles of the publishing world , offering a case study that exists apart from all the movie 's political ramifications .	3
 It 's like going to a house party and watching the host defend himself against a frothing ex-girlfriend .	1
 That the Chuck Norris `` grenade gag '' occurs about 7 times during Windtalkers is a good indication of how serious-minded the film is .	2
 The plot is romantic comedy boilerplate from start to finish .	2
 It arrives with an impeccable pedigree , mongrel pep , and almost indecipherable plot complications .	2
 A film that clearly means to preach exclusively to the converted .	2
 I still like Moonlight Mile , better judgment be damned .	3
 A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story .	3
 a bilingual charmer , just like the woman who inspired it	3
 Like a less dizzily gorgeous companion to Mr. Wong 's In the Mood for Love -- very much a Hong Kong movie despite its mainland setting .	2
 As inept as big-screen remakes of The Avengers and The Wild Wild West .	1
 It 's everything you 'd expect -- but nothing more .	2
 Best indie of the year , so far .	4
 Hatfield and Hicks make the oddest of couples , and in this sense the movie becomes a study of the gambles of the publishing world , offering a case study that exists apart from all the movie 's political ramifications .	3
 It 's like going to a house party and watching the host defend himself against a frothing ex-girlfriend .	1
 That the Chuck Norris `` grenade gag '' occurs about 7 times during Windtalkers is a good indication of how serious-minded the film is .	2
 The plot is romantic comedy boilerplate from start to finish .	2
 It arrives with an impeccable pedigree , mongrel pep , and almost indecipherable plot complications .	2
 A film that clearly means to preach exclusively to the converted .	2
 I still like Moonlight Mile , better judgment be damned .	3
 A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story .	3
 a bilingual charmer , just like the woman who inspired it	3
 Like a less dizzily gorgeous companion to Mr. Wong 's In the Mood for Love -- very much a Hong Kong movie despite its mainland setting .	2
 As inept as big-screen remakes of The Avengers and The Wild Wild West .	1
 It 's everything you 'd expect -- but nothing more .	2
 Best indie of the year , so far .	4
 Hatfield and Hicks make the oddest of couples , and in this sense the movie becomes a study of the gambles of the publishing world , offering a case study that exists apart from all the movie 's political ramifications .	3
 It 's like going to a house party and watching the host defend himself against a frothing ex-girlfriend .	1
 That the Chuck Norris `` grenade gag '' occurs about 7 times during Windtalkers is a good indication of how serious-minded the film is .	2
 The plot is romantic comedy boilerplate from start to finish .	2
 It arrives with an impeccable pedigree , mongrel pep , and almost indecipherable plot complications .	2
 A film that clearly means to preach exclusively to the converted .	2
--- a/tutorials/fastnlp_advanced_tutorial/vocab.txt
+++ b/tutorials/fastnlp_advanced_tutorial/vocab.txt