Merge pull request #5 from fastnlp/dev0.5.0

Dev0.5.0
Tag: v0.4.10
Committed by lyhuang18 via GitHub, 6 years ago
Parent commit: 2e4d84d950
59 changed files with 2621 additions and 1254 deletions
  1. fastNLP/__init__.py (+5 / -1)
  2. fastNLP/core/__init__.py (+1 / -1)
  3. fastNLP/core/batch.py (+154 / -149)
  4. fastNLP/core/callback.py (+1 / -1)
  5. fastNLP/core/dataset.py (+5 / -3)
  6. fastNLP/core/field.py (+42 / -31)
  7. fastNLP/core/losses.py (+50 / -42)
  8. fastNLP/core/metrics.py (+65 / -30)
  9. fastNLP/core/optimizer.py (+17 / -0)
  10. fastNLP/core/predictor.py (+2 / -3)
  11. fastNLP/core/tester.py (+15 / -4)
  12. fastNLP/core/trainer.py (+80 / -56)
  13. fastNLP/core/utils.py (+9 / -3)
  14. fastNLP/core/vocabulary.py (+56 / -14)
  15. fastNLP/io/__init__.py (+2 / -2)
  16. fastNLP/io/base_loader.py (+8 / -0)
  17. fastNLP/io/dataset_loader.py (+7 / -2)
  18. fastNLP/io/embed_loader.py (+25 / -18)
  19. fastNLP/io/file_reader.py (+8 / -5)
  20. fastNLP/io/file_utils.py (+33 / -2)
  21. fastNLP/modules/decoder/crf.py (+15 / -5)
  22. fastNLP/modules/encoder/__init__.py (+10 / -2)
  23. fastNLP/modules/encoder/_bert.py (+455 / -119)
  24. fastNLP/modules/encoder/_elmo.py (+16 / -2)
  25. fastNLP/modules/encoder/bert.py (+86 / -372)
  26. fastNLP/modules/encoder/embedding.py (+205 / -86)
  27. fastNLP/modules/encoder/lstm.py (+21 / -7)
  28. fastNLP/modules/utils.py (+2 / -0)
  29. reproduction/Biaffine_parser/run.py (+2 / -5)
  30. reproduction/POS_tagging/train_pos_tag.py (+5 / -5)
  31. reproduction/Star_transformer/train.py (+4 / -8)
  32. reproduction/matching/data/MatchingDataLoader.py (+326 / -0)
  33. reproduction/matching/data/SNLIDataLoader.py (+0 / -6)
  34. reproduction/matching/matching_esim.py (+65 / -0)
  35. reproduction/matching/model/esim.py (+197 / -0)
  36. reproduction/matching/snli.py (+0 / -97)
  37. reproduction/matching/test/test_snlidataloader.py (+2 / -2)
  38. reproduction/seqence_labelling/cws/train_shift_relay.py (+3 / -7)
  39. reproduction/seqence_labelling/ner/__init__.py (+0 / -0)
  40. reproduction/seqence_labelling/ner/data/Conll2003Loader.py (+93 / -0)
  41. reproduction/seqence_labelling/ner/data/OntoNoteLoader.py (+152 / -0)
  42. reproduction/seqence_labelling/ner/data/utils.py (+49 / -0)
  43. reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py (+56 / -0)
  44. reproduction/seqence_labelling/ner/test/__init__.py (+0 / -0)
  45. reproduction/seqence_labelling/ner/test/test.py (+33 / -0)
  46. reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py (+70 / -0)
  47. reproduction/seqence_labelling/ner/train_ontonote.py (+65 / -0)
  48. reproduction/utils.py (+3 / -2)
  49. setup.py (+1 / -1)
  50. test/core/test_batch.py (+10 / -10)
  51. test/core/test_callbacks.py (+27 / -73)
  52. test/core/test_metrics.py (+9 / -1)
  53. test/core/test_trainer.py (+11 / -48)
  54. test/core/test_utils.py (+9 / -0)
  55. test/core/test_vocabulary.py (+18 / -0)
  56. test/models/model_runner.py (+2 / -5)
  57. test/models/test_biaffine_parser.py (+0 / -1)
  58. test/modules/decoder/test_CRF.py (+5 / -5)
  59. test/test_tutorials.py (+9 / -18)

fastNLP/__init__.py (+5 / -1)

@@ -12,7 +12,11 @@ fastNLP 中最常用的组件可以直接从 fastNLP 包中 import ,他们的
__all__ = [
"Instance",
"FieldArray",
"Batch",

"DataSetIter",
"BatchIter",
"TorchLoaderIter",

"Vocabulary",
"DataSet",
"Const",


fastNLP/core/__init__.py (+1 / -1)

@@ -14,7 +14,7 @@ core 模块里实现了 fastNLP 的核心框架,常用的功能都可以从 fa
介绍core 的子模块的分工,好像必要性不大
"""
from .batch import Batch
from .batch import DataSetIter, BatchIter, TorchLoaderIter
from .callback import Callback, GradientClipCallback, EarlyStopCallback, TensorboardCallback, LRScheduler, ControlC
from .const import Const
from .dataset import DataSet


fastNLP/core/batch.py (+154 / -149)

@@ -3,7 +3,9 @@ batch 模块实现了 fastNLP 所需的 Batch 类。

"""
__all__ = [
"Batch"
"BatchIter",
"DataSetIter",
"TorchLoaderIter",
]

import atexit
@@ -12,9 +14,11 @@ from queue import Empty, Full
import numpy as np
import torch
import torch.multiprocessing as mp
import torch.utils.data
from numbers import Number

from .sampler import RandomSampler
from .sampler import SequentialSampler
from .dataset import DataSet

_python_is_exit = False

@@ -27,162 +31,163 @@ def _set_python_is_exit():
atexit.register(_set_python_is_exit)


class Batch(object):
"""
别名::class:`fastNLP.Batch` :class:`fastNLP.core.batch.Batch`

Batch 用于从 `DataSet` 中按一定的顺序, 依次按 ``batch_size`` 的大小将数据取出,
组成 `x` 和 `y`::

batch = Batch(data_set, batch_size=16, sampler=SequentialSampler())
num_batch = len(batch)
for batch_x, batch_y in batch:
# do stuff ...

:param dataset: :class:`~fastNLP.DataSet` 对象, 数据集
:param int batch_size: 取出的batch大小
:param sampler: 规定使用的 :class:`~fastNLP.Sampler` 方式. 若为 ``None`` , 使用 :class:`~fastNLP.RandomSampler`.
Default: ``None``
:param bool as_numpy: 若为 ``True`` , 输出batch为 numpy.array. 否则为 :class:`torch.Tensor`.
Default: ``False``
:param bool prefetch: 若为 ``True`` 使用多进程预先取出下一batch.
Default: ``False``
"""
def __init__(self, dataset, batch_size, sampler=None, as_numpy=False, prefetch=False):
class DataSetGetter:
def __init__(self, dataset: DataSet, as_numpy=False):
self.dataset = dataset
self.batch_size = batch_size
if sampler is None:
sampler = RandomSampler()
self.sampler = sampler
self.inputs = {n: f for n, f in dataset.get_all_fields().items() if f.is_input}
self.targets = {n: f for n, f in dataset.get_all_fields().items() if f.is_target}
self.as_numpy = as_numpy
self.idx_list = None
self.curidx = 0
self.num_batches = len(dataset) // batch_size + int(len(dataset) % batch_size != 0)
self.cur_batch_indices = None
self.prefetch = prefetch
self.lengths = 0
def fetch_one(self):
if self.curidx >= len(self.idx_list):
return None
self.idx_list = list(range(len(dataset)))

def __getitem__(self, idx: int):
# mapping idx to sampled idx
idx = self.idx_list[idx]
inputs = {n:f.get(idx) for n, f in self.inputs.items()}
targets = {n:f.get(idx) for n, f in self.targets.items()}
return idx, inputs, targets

def __len__(self):
return len(self.dataset)

def collate_fn(self, batch: list):
batch_x = {n:[] for n in self.inputs.keys()}
batch_y = {n:[] for n in self.targets.keys()}
indices = []
for idx, x, y in batch:
indices.append(idx)
for n, v in x.items():
batch_x[n].append(v)
for n, v in y.items():
batch_y[n].append(v)

def pad_batch(batch_dict, field_array):
for n, vlist in batch_dict.items():
f = field_array[n]
if f.padder is None:
batch_dict[n] = np.array(vlist)
else:
data = f.pad(vlist)
if not self.as_numpy:
try:
data, flag = _to_tensor(data, f.dtype)
except TypeError as e:
print(f"Field {n} cannot be converted to torch.tensor.")
raise e
batch_dict[n] = data
return batch_dict

return (indices,
pad_batch(batch_x, self.inputs),
pad_batch(batch_y, self.targets))

def set_idx_list(self, idx_list):
if len(idx_list) != len(self.idx_list):
raise ValueError
self.idx_list = idx_list

def __getattr__(self, item):
if hasattr(self.dataset, item):
return getattr(self.dataset, item)
else:
endidx = min(self.curidx + self.batch_size, len(self.idx_list))
batch_x, batch_y = {}, {}
indices = self.idx_list[self.curidx:endidx]
self.cur_batch_indices = indices
for field_name, field in self.dataset.get_all_fields().items():
if field.is_target or field.is_input:
batch = field.get(indices)
if not self.as_numpy and \
field.dtype is not None and \
issubclass(field.dtype, Number) and not isinstance(batch, torch.Tensor):
batch = _to_tensor(batch)
if field.is_target:
batch_y[field_name] = batch
if field.is_input:
batch_x[field_name] = batch
self.curidx = endidx
return batch_x, batch_y
raise AttributeError("'DataSetGetter' object has no attribute '{}'".format(item))


class SamplerAdapter(torch.utils.data.Sampler):
def __init__(self, sampler, dataset):
self.sampler = sampler
self.dataset = dataset

def __iter__(self):
"""
Iterate on dataset, fetch batch data. Fetch process don't block the iterate process
:return:
"""
if self.prefetch:
return self._run_batch_iter(self)
def batch_iter():
self.init_iter()
while 1:
res = self.fetch_one()
if res is None:
break
yield res
return batch_iter()
return iter(self.sampler(self.dataset))


class BatchIter:
def __init__(self):
self.dataiter = None
self.num_batches = None
self.cur_batch_indices = None
self.batch_size = None

def init_iter(self):
self.idx_list = self.sampler(self.dataset)
self.curidx = 0
self.lengths = self.dataset.get_length()
pass

@staticmethod
def get_num_batches(num_samples, batch_size, drop_last):
num_batches = num_samples // batch_size
if not drop_last and (num_samples % batch_size > 0):
num_batches += 1
return num_batches

def __iter__(self):
self.init_iter()
for indices, batch_x, batch_y in self.dataiter:
self.cur_batch_indices = indices
yield batch_x, batch_y

def get_batch_indices(self):
return self.cur_batch_indices

def __len__(self):
return self.num_batches
def get_batch_indices(self):
"""
取得当前batch在DataSet中所在的index下标序列

:return list(int) indexes: 下标序列
"""
return self.cur_batch_indices
@staticmethod
def _run_fetch(batch, q):
try:
global _python_is_exit
batch.init_iter()
# print('start fetch')
while 1:
res = batch.fetch_one()
# print('fetch one')
while 1:
try:
q.put(res, timeout=3)
break
except Full:
if _python_is_exit:
return
if res is None:
# print('fetch done, waiting processing')
break
# print('fetch exit')
except Exception as e:
q.put(e)
finally:
q.join()
@staticmethod
def _run_batch_iter(batch):
q = mp.JoinableQueue(maxsize=10)
fetch_p = mp.Process(target=Batch._run_fetch, args=(batch, q))
fetch_p.daemon = True
fetch_p.start()
# print('fork fetch process')
while 1:
try:
res = q.get(timeout=1)
q.task_done()
# print('get fetched')
if res is None:
break
elif isinstance(res, Exception):
raise res
yield res
except Empty as e:
if fetch_p.is_alive():
continue
else:
break
fetch_p.terminate()
fetch_p.join()
# print('iter done')
@property
def dataset(self):
return self.dataiter.dataset


class DataSetIter(BatchIter):
def __init__(self, dataset, batch_size=1, sampler=None, as_numpy=False,
num_workers=0, pin_memory=False, drop_last=False,
timeout=0, worker_init_fn=None):
super().__init__()
assert isinstance(dataset, DataSet)
sampler = SamplerAdapter(sampler=sampler or SequentialSampler(), dataset=dataset)
dataset = DataSetGetter(dataset, as_numpy)
collate_fn = dataset.collate_fn if hasattr(dataset, 'collate_fn') else None
self.dataiter = torch.utils.data.DataLoader(
dataset=dataset, batch_size=batch_size, sampler=sampler,
collate_fn=collate_fn, num_workers=num_workers,
pin_memory=pin_memory, drop_last=drop_last,
timeout=timeout, worker_init_fn=worker_init_fn)
self.num_batches = self.get_num_batches(len(dataset), batch_size, drop_last)
self.batch_size = batch_size


class TorchLoaderIter(BatchIter):
def __init__(self, dataset):
super().__init__()
assert isinstance(dataset, torch.utils.data.DataLoader)
self.dataiter = dataset
self.num_batches = self.get_num_batches(len(dataset), dataset.batch_size, dataset.drop_last)
self.batch_size = dataset.batch_size


def _to_tensor(batch):
class OnlineDataGettter:
# TODO
pass


class OnlineDataIter(BatchIter):
# TODO
def __init__(self, dataset, batch_size=1, buffer_size=10000, sampler=None, as_numpy=False,
num_workers=0, pin_memory=False, drop_last=False,
timeout=0, worker_init_fn=None, **kwargs):
super().__init__()


def _to_tensor(batch, field_dtype):
try:
if issubclass(batch.dtype.type, np.floating):
batch = torch.as_tensor(batch).float() # 默认使用float32
if field_dtype is not None and isinstance(field_dtype, type)\
and issubclass(field_dtype, Number) \
and not isinstance(batch, torch.Tensor):
if issubclass(batch.dtype.type, np.floating):
new_batch = torch.as_tensor(batch).float() # 默认使用float32
elif issubclass(batch.dtype.type, np.integer):
new_batch = torch.as_tensor(batch).long() # 复用内存地址,避免复制
else:
new_batch = torch.as_tensor(batch)
return new_batch, True
else:
batch = torch.as_tensor(batch) # 复用内存地址,避免复制
except:
pass
return batch
return batch, False
except Exception as e:
raise e
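
For reference, a minimal usage sketch of the new iterator API. The toy dataset and field names are hypothetical; DataSetIter is assumed to be importable from the top-level fastNLP package as shown in the __init__.py diff above.

from fastNLP import DataSet, DataSetIter
from fastNLP.core.sampler import SequentialSampler

# hypothetical toy dataset with one input field and one target field
ds = DataSet({'x': [[1, 2], [3, 4, 5]], 'y': [0, 1]})
ds.set_input('x')
ds.set_target('y')

batch_iter = DataSetIter(ds, batch_size=2, sampler=SequentialSampler(), as_numpy=False)
for batch_x, batch_y in batch_iter:
    # padded torch tensors, assembled by DataSetGetter.collate_fn above
    print(batch_x['x'].shape, batch_y['y'])

An existing torch.utils.data.DataLoader can likewise be wrapped with TorchLoaderIter so that the Trainer consumes it through the same BatchIter interface.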

fastNLP/core/callback.py (+1 / -1)

@@ -548,7 +548,7 @@ class LRScheduler(Callback):
else:
raise ValueError(f"Expect torch.optim.lr_scheduler for LRScheduler. Got {type(lr_scheduler)}.")
def on_epoch_begin(self):
def on_epoch_end(self):
self.scheduler.step(self.epoch)
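
The LRScheduler callback now steps its scheduler at on_epoch_end instead of on_epoch_begin. A hedged sketch of how it is typically wired up; model and train_data are placeholder objects, not part of this diff.

from torch.optim import SGD
from torch.optim.lr_scheduler import StepLR
from fastNLP import Trainer
from fastNLP.core.callback import LRScheduler

optimizer = SGD(model.parameters(), lr=0.1)                       # `model` is a placeholder nn.Module
scheduler_cb = LRScheduler(StepLR(optimizer, step_size=3, gamma=0.5))
trainer = Trainer(train_data, model, optimizer=optimizer,         # `train_data` is a placeholder DataSet
                  callbacks=[scheduler_cb])
trainer.train()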




fastNLP/core/dataset.py (+5 / -3)

@@ -801,17 +801,19 @@ class DataSet(object):
else:
return DataSet()
def split(self, ratio):
def split(self, ratio, shuffle=True):
"""
将DataSet按照ratio的比例拆分,返回两个DataSet

:param float ratio: 0<ratio<1, 返回的第一个DataSet拥有 `ratio` 这么多数据,第二个DataSet拥有 `(1-ratio)` 这么多数据
:param float ratio: 0<ratio<1, 返回的第一个DataSet拥有 `(1-ratio)` 这么多数据,第二个DataSet拥有`ratio`这么多数据
:param bool shuffle: 在split前是否shuffle一下
:return: [DataSet, DataSet]
"""
assert isinstance(ratio, float)
assert 0 < ratio < 1
all_indices = [_ for _ in range(len(self))]
np.random.shuffle(all_indices)
if shuffle:
np.random.shuffle(all_indices)
split = int(ratio * len(self))
dev_indices = all_indices[:split]
train_indices = all_indices[split:]
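
A short usage sketch matching the corrected docstring: the first returned DataSet keeps roughly (1-ratio) of the data and the second gets ratio. `ds` is a hypothetical DataSet.

train_ds, dev_ds = ds.split(0.1)                    # ~90% / ~10%, shuffled by default
train_ds2, dev_ds2 = ds.split(0.1, shuffle=False)   # keep the original instance order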


fastNLP/core/field.py (+42 / -31)

@@ -176,7 +176,10 @@ class FieldArray:
if self.padder is None or pad is False:
return np.array(contents)
else:
return self.padder(contents, field_name=self.name, field_ele_dtype=self.dtype, dim=self._cell_ndim)
return self.pad(contents)

def pad(self, contents):
return self.padder(contents, field_name=self.name, field_ele_dtype=self.dtype, dim=self._cell_ndim)

def set_padder(self, padder):
"""
@@ -239,7 +242,7 @@ class FieldArray:
new_contents.append(cell.split(sep))
except Exception as e:
print(f"Exception happens when process value in index {index}.")
print(e)
raise e
return self._after_process(new_contents, inplace=inplace)

def int(self, inplace:bool=True):
@@ -279,7 +282,7 @@ class FieldArray:
new_contents.append(float(cell))
except Exception as e:
print(f"Exception happens when process value in index {index}.")
print(e)
raise e
return self._after_process(new_contents, inplace=inplace)

def bool(self, inplace=True):
@@ -299,7 +302,7 @@ class FieldArray:
new_contents.append(bool(cell))
except Exception as e:
print(f"Exception happens when process value in index {index}.")
print(e)
raise e

return self._after_process(new_contents, inplace=inplace)

@@ -320,7 +323,7 @@ class FieldArray:
new_contents.append(cell.lower())
except Exception as e:
print(f"Exception happens when process value in index {index}.")
print(e)
raise e
return self._after_process(new_contents, inplace=inplace)

def upper(self, inplace=True):
@@ -340,7 +343,7 @@ class FieldArray:
new_contents.append(cell.upper())
except Exception as e:
print(f"Exception happens when process value in index {index}.")
print(e)
raise e
return self._after_process(new_contents, inplace=inplace)

def value_count(self):
@@ -350,8 +353,15 @@ class FieldArray:
:return: Counter, key是label,value是出现次数
"""
count = Counter()

def cum(cell):
if _is_iterable(cell) and not isinstance(cell, str):
for cell_ in cell:
cum(cell_)
else:
count[cell] += 1
for cell in self.content:
count[cell] += 1
cum(cell)
return count

def _after_process(self, new_contents, inplace):
@@ -385,6 +395,8 @@ def _get_ele_type_and_dim(cell:Any, dim=0):
:return:
"""
if isinstance(cell, (str, Number, np.bool_)):
if hasattr(cell, 'dtype'):
return cell.dtype.type, dim
return type(cell), dim
elif isinstance(cell, list):
dim += 1
@@ -402,7 +414,7 @@ def _get_ele_type_and_dim(cell:Any, dim=0):
return cell.dtype, cell.dim() + dim # 如果是torch.mean的结果是0
elif isinstance(cell, np.ndarray):
if cell.dtype != np.dtype('O'): # 如果不是object的话说明是well-formatted的了
return cell.dtype.type, cell.ndim + dim
return cell.dtype.type, cell.ndim + dim # dtype.type返回的会是np.int32, np.float等
# 否则需要继续往下iterate
dim += 1
res = [_get_ele_type_and_dim(cell_i, dim) for cell_i in cell]
@@ -527,31 +539,30 @@ class AutoPadder(Padder):
if field_ele_dtype:
if dim>3:
return np.array(contents)
if isinstance(field_ele_dtype, np.dtype) or field_ele_dtype in (float, int, bool, str):
if isinstance(field_ele_dtype, np.number) or field_ele_dtype in (float, int, bool):
if dim==0:
if isinstance(field_ele_dtype, type) and \
(issubclass(field_ele_dtype, np.number) or issubclass(field_ele_dtype, Number)):
if dim==0:
array = np.array(contents, dtype=field_ele_dtype)
elif dim==1:
max_len = max(map(len, contents))
array = np.full((len(contents), max_len), self.pad_val, dtype=field_ele_dtype)
for i, content_i in enumerate(contents):
array[i, :len(content_i)] = content_i
elif dim==2:
max_len = max(map(len, contents))
max_word_len = max([max([len(content_ii) for content_ii in content_i]) for
content_i in contents])
array = np.full((len(contents), max_len, max_word_len), self.pad_val, dtype=field_ele_dtype)
for i, content_i in enumerate(contents):
for j, content_ii in enumerate(content_i):
array[i, j, :len(content_ii)] = content_ii
else:
shape = np.shape(contents)
if len(shape)==4: # 说明各dimension是相同的大小
array = np.array(contents, dtype=field_ele_dtype)
elif dim==1:
max_len = max(map(len, contents))
array = np.full((len(contents), max_len), self.pad_val, dtype=field_ele_dtype)
for i, content_i in enumerate(contents):
array[i, :len(content_i)] = content_i
elif dim==2:
max_len = max(map(len, contents))
max_word_len = max([max([len(content_ii) for content_ii in content_i]) for
content_i in contents])
array = np.full((len(contents), max_len, max_word_len), self.pad_val, dtype=field_ele_dtype)
for i, content_i in enumerate(contents):
for j, content_ii in enumerate(content_i):
array[i, j, :len(content_ii)] = content_ii
else:
shape = np.shape(contents)
if len(shape)==4: # 说明各dimension是相同的大小
array = np.array(contents, dtype=field_ele_dtype)
else:
raise RuntimeError(f"Field:{field_name} has 3 dimensions, every sample should have the same shape.")
return array
return np.array(contents)
raise RuntimeError(f"Field:{field_name} has 3 dimensions, every sample should have the same shape.")
return array
elif str(field_ele_dtype).startswith('torch'):
if dim==0:
tensor = torch.tensor(contents).to(field_ele_dtype)
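
One effect of the new recursive cum() helper is that value_count() also works for fields whose cells are lists. A small sketch with toy data, assuming DataSet.get_field returns the underlying FieldArray.

from fastNLP import DataSet

ds = DataSet({'words': [['a', 'b'], ['b', 'c', 'c']]})
counter = ds.get_field('words').value_count()
print(counter)   # Counter({'b': 2, 'c': 2, 'a': 1}), nested cells are flattened by cum()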


fastNLP/core/losses.py (+50 / -42)

@@ -26,7 +26,7 @@ from .utils import _build_args
from .utils import _check_arg_dict_list
from .utils import _check_function_or_method
from .utils import _get_func_signature
from .utils import seq_len_to_mask

class LossBase(object):
"""
@@ -34,14 +34,23 @@ class LossBase(object):
"""
def __init__(self):
self.param_map = {}
self._param_map = {} # key是fun的参数,value是以该值从传入的dict取出value
self._checked = False

@property
def param_map(self):
if len(self._param_map) == 0: # 如果为空说明还没有初始化
func_spect = inspect.getfullargspec(self.get_loss)
func_args = [arg for arg in func_spect.args if arg != 'self']
for arg in func_args:
self._param_map[arg] = arg
return self._param_map

def get_loss(self, *args, **kwargs):
raise NotImplementedError
def _init_param_map(self, key_map=None, **kwargs):
"""检查key_map和其他参数map,并将这些映射关系添加到self.param_map
"""检查key_map和其他参数map,并将这些映射关系添加到self._param_map

:param dict key_map: 表示key的映射关系
:param kwargs: key word args里面的每一个的键-值对都会被构造成映射关系
@@ -53,30 +62,30 @@ class LossBase(object):
raise TypeError("key_map must be `dict`, got {}.".format(type(key_map)))
for key, value in key_map.items():
if value is None:
self.param_map[key] = key
self._param_map[key] = key
continue
if not isinstance(key, str):
raise TypeError(f"key in key_map must be `str`, not `{type(key)}`.")
if not isinstance(value, str):
raise TypeError(f"value in key_map must be `str`, not `{type(value)}`.")
self.param_map[key] = value
self._param_map[key] = value
value_counter[value].add(key)
for key, value in kwargs.items():
if value is None:
self.param_map[key] = key
self._param_map[key] = key
continue
if not isinstance(value, str):
raise TypeError(f"in {key}={value}, value must be `str`, not `{type(value)}`.")
self.param_map[key] = value
self._param_map[key] = value
value_counter[value].add(key)
for value, key_set in value_counter.items():
if len(key_set) > 1:
raise ValueError(f"Several parameters:{key_set} are provided with one output {value}.")
# check consistence between signature and param_map
# check consistence between signature and _param_map
func_spect = inspect.getfullargspec(self.get_loss)
func_args = [arg for arg in func_spect.args if arg != 'self']
for func_param, input_param in self.param_map.items():
for func_param, input_param in self._param_map.items():
if func_param not in func_args:
raise NameError(
f"Parameter `{func_param}` is not in {_get_func_signature(self.get_loss)}. Please check the "
@@ -96,7 +105,7 @@ class LossBase(object):
:return: dict, if dict is not {}, pass it to self.evaluate. Otherwise do mapping.
"""
fast_param = {}
if len(self.param_map) == 2 and len(pred_dict) == 1 and len(target_dict) == 1:
if len(self._param_map) == 2 and len(pred_dict) == 1 and len(target_dict) == 1:
fast_param['pred'] = list(pred_dict.values())[0]
fast_param['target'] = list(target_dict.values())[0]
return fast_param
@@ -115,49 +124,41 @@ class LossBase(object):
return loss
if not self._checked:
# 1. check consistence between signature and param_map
# 1. check consistence between signature and _param_map
func_spect = inspect.getfullargspec(self.get_loss)
func_args = set([arg for arg in func_spect.args if arg != 'self'])
for func_arg, input_arg in self.param_map.items():
for func_arg, input_arg in self._param_map.items():
if func_arg not in func_args:
raise NameError(f"`{func_arg}` not in {_get_func_signature(self.get_loss)}.")
# 2. only part of the param_map are passed, left are not
# 2. only part of the _param_map are passed, left are not
for arg in func_args:
if arg not in self.param_map:
self.param_map[arg] = arg # This param does not need mapping.
if arg not in self._param_map:
self._param_map[arg] = arg # This param does not need mapping.
self._evaluate_args = func_args
self._reverse_param_map = {input_arg: func_arg for func_arg, input_arg in self.param_map.items()}
# need to wrap inputs in dict.
self._reverse_param_map = {input_arg: func_arg for func_arg, input_arg in self._param_map.items()}

mapped_pred_dict = {}
mapped_target_dict = {}
duplicated = []
for input_arg in set(list(pred_dict.keys()) + list(target_dict.keys())):
not_duplicate_flag = 0
if input_arg in self._reverse_param_map:
mapped_arg = self._reverse_param_map[input_arg]
not_duplicate_flag += 1
else:
mapped_arg = input_arg
for input_arg, mapped_arg in self._reverse_param_map.items():
if input_arg in pred_dict:
mapped_pred_dict[mapped_arg] = pred_dict[input_arg]
not_duplicate_flag += 1
if input_arg in target_dict:
mapped_target_dict[mapped_arg] = target_dict[input_arg]
not_duplicate_flag += 1
if not_duplicate_flag == 3:
duplicated.append(input_arg)
# missing
if not self._checked:
duplicated = []
for input_arg, mapped_arg in self._reverse_param_map.items():
if input_arg in pred_dict and input_arg in target_dict:
duplicated.append(input_arg)
check_res = _check_arg_dict_list(self.get_loss, [mapped_pred_dict, mapped_target_dict])
# replace missing.
missing = check_res.missing
replaced_missing = list(missing)
for idx, func_arg in enumerate(missing):
# Don't delete `` in this information, nor add ``
replaced_missing[idx] = f"{self.param_map[func_arg]}" + f"(assign to `{func_arg}` " \
replaced_missing[idx] = f"{self._param_map[func_arg]}" + f"(assign to `{func_arg}` " \
f"in `{self.__class__.__name__}`)"
check_res = _CheckRes(missing=replaced_missing,
@@ -170,6 +171,8 @@ class LossBase(object):
if check_res.missing or check_res.duplicated:
raise _CheckError(check_res=check_res,
func_signature=_get_func_signature(self.get_loss))
self._checked = True

refined_args = _build_args(self.get_loss, **mapped_pred_dict, **mapped_target_dict)
loss = self.get_loss(**refined_args)
@@ -204,15 +207,12 @@ class LossFunc(LossBase):
super(LossFunc, self).__init__()
_check_function_or_method(func)
self.get_loss = func
if key_map is not None:
if not isinstance(key_map, dict):
raise RuntimeError(f"Loss error: key_map except a {type({})} but got a {type(key_map)}")
self.param_map = key_map
if len(kwargs) > 0:
for key, val in kwargs.items():
self.param_map.update({key: val})
self._init_param_map(key_map, **kwargs)
self.get_loss = func


class CrossEntropyLoss(LossBase):
@@ -223,7 +223,9 @@ class CrossEntropyLoss(LossBase):
:param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred`
:param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` -> `target`
:param padding_idx: padding的index,在计算loss时将忽略target中标号为padding_idx的内容
:param seq_len: 句子的长度, 长度之外的token不会计算loss。。
:param padding_idx: padding的index,在计算loss时将忽略target中标号为padding_idx的内容, 可以通过该值代替
传入seq_len.

Example::

@@ -231,13 +233,19 @@ class CrossEntropyLoss(LossBase):
"""
def __init__(self, pred=None, target=None, padding_idx=-100):
# TODO 需要做一些检查,F.cross_entropy在计算时,如果pred是(16, 10 ,4), target的形状按道理应该是(16, 10), 但实际需要(16,4)
def __init__(self, pred=None, target=None, seq_len=None, padding_idx=-100):
super(CrossEntropyLoss, self).__init__()
self._init_param_map(pred=pred, target=target)
self._init_param_map(pred=pred, target=target, seq_len=seq_len)
self.padding_idx = padding_idx
def get_loss(self, pred, target):
def get_loss(self, pred, target, seq_len=None):
if pred.dim()>2:
pred = pred.view(-1, pred.size(-1))
target = target.view(-1)
if seq_len is not None:
mask = seq_len_to_mask(seq_len).view(-1).eq(0)
target = target.masked_fill(mask, self.padding_idx)

return F.cross_entropy(input=pred, target=target,
ignore_index=self.padding_idx)
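
With the new seq_len argument, positions beyond each sample's length are rewritten to padding_idx and therefore ignored by F.cross_entropy. A hedged configuration sketch; the key names are the defaults shown in the diff.

from fastNLP.core.losses import CrossEntropyLoss

loss = CrossEntropyLoss(pred='pred', target='target', seq_len='seq_len', padding_idx=-100)
# the Trainer fetches `pred` from the model output and `target`/`seq_len` from the batch,
# masks positions past seq_len with padding_idx, then calls F.cross_entropy(..., ignore_index=-100)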



fastNLP/core/metrics.py (+65 / -30)

@@ -115,9 +115,18 @@ class MetricBase(object):
"""
def __init__(self):
self.param_map = {} # key is param in function, value is input param.
self._param_map = {} # key is param in function, value is input param.
self._checked = False

@property
def param_map(self):
if len(self._param_map) == 0: # 如果为空说明还没有初始化
func_spect = inspect.getfullargspec(self.evaluate)
func_args = [arg for arg in func_spect.args if arg != 'self']
for arg in func_args:
self._param_map[arg] = arg
return self._param_map

@abstractmethod
def evaluate(self, *args, **kwargs):
raise NotImplementedError
@@ -127,7 +136,7 @@ class MetricBase(object):
raise NotImplemented
def _init_param_map(self, key_map=None, **kwargs):
"""检查key_map和其他参数map,并将这些映射关系添加到self.param_map
"""检查key_map和其他参数map,并将这些映射关系添加到self._param_map

:param dict key_map: 表示key的映射关系
:param kwargs: key word args里面的每一个的键-值对都会被构造成映射关系
@@ -139,30 +148,30 @@ class MetricBase(object):
raise TypeError("key_map must be `dict`, got {}.".format(type(key_map)))
for key, value in key_map.items():
if value is None:
self.param_map[key] = key
self._param_map[key] = key
continue
if not isinstance(key, str):
raise TypeError(f"key in key_map must be `str`, not `{type(key)}`.")
if not isinstance(value, str):
raise TypeError(f"value in key_map must be `str`, not `{type(value)}`.")
self.param_map[key] = value
self._param_map[key] = value
value_counter[value].add(key)
for key, value in kwargs.items():
if value is None:
self.param_map[key] = key
self._param_map[key] = key
continue
if not isinstance(value, str):
raise TypeError(f"in {key}={value}, value must be `str`, not `{type(value)}`.")
self.param_map[key] = value
self._param_map[key] = value
value_counter[value].add(key)
for value, key_set in value_counter.items():
if len(key_set) > 1:
raise ValueError(f"Several parameters:{key_set} are provided with one output {value}.")
# check consistence between signature and param_map
# check consistence between signature and _param_map
func_spect = inspect.getfullargspec(self.evaluate)
func_args = [arg for arg in func_spect.args if arg != 'self']
for func_param, input_param in self.param_map.items():
for func_param, input_param in self._param_map.items():
if func_param not in func_args:
raise NameError(
f"Parameter `{func_param}` is not in {_get_func_signature(self.evaluate)}. Please check the "
@@ -177,7 +186,7 @@ class MetricBase(object):
:return: dict, if dict is not {}, pass it to self.evaluate. Otherwise do mapping.
"""
fast_param = {}
if len(self.param_map) == 2 and len(pred_dict) == 1 and len(target_dict) == 1:
if len(self._param_map) == 2 and len(pred_dict) == 1 and len(target_dict) == 1:
fast_param['pred'] = list(pred_dict.values())[0]
fast_param['target'] = list(target_dict.values())[0]
return fast_param
@@ -206,42 +215,35 @@ class MetricBase(object):
if not self._checked:
if not callable(self.evaluate):
raise TypeError(f"{self.__class__.__name__}.evaluate has to be callable, not {type(self.evaluate)}.")
# 1. check consistence between signature and param_map
# 1. check consistence between signature and _param_map
func_spect = inspect.getfullargspec(self.evaluate)
func_args = set([arg for arg in func_spect.args if arg != 'self'])
for func_arg, input_arg in self.param_map.items():
for func_arg, input_arg in self._param_map.items():
if func_arg not in func_args:
raise NameError(f"`{func_arg}` not in {_get_func_signature(self.evaluate)}.")
# 2. only part of the param_map are passed, left are not
# 2. only part of the _param_map are passed, left are not
for arg in func_args:
if arg not in self.param_map:
self.param_map[arg] = arg # This param does not need mapping.
if arg not in self._param_map:
self._param_map[arg] = arg # This param does not need mapping.
self._evaluate_args = func_args
self._reverse_param_map = {input_arg: func_arg for func_arg, input_arg in self.param_map.items()}
self._reverse_param_map = {input_arg: func_arg for func_arg, input_arg in self._param_map.items()}
# need to wrap inputs in dict.
mapped_pred_dict = {}
mapped_target_dict = {}
duplicated = []
for input_arg in set(list(pred_dict.keys()) + list(target_dict.keys())):
not_duplicate_flag = 0
if input_arg in self._reverse_param_map:
mapped_arg = self._reverse_param_map[input_arg]
not_duplicate_flag += 1
else:
mapped_arg = input_arg
for input_arg, mapped_arg in self._reverse_param_map.items():
if input_arg in pred_dict:
mapped_pred_dict[mapped_arg] = pred_dict[input_arg]
not_duplicate_flag += 1
if input_arg in target_dict:
mapped_target_dict[mapped_arg] = target_dict[input_arg]
not_duplicate_flag += 1
if not_duplicate_flag == 3:
duplicated.append(input_arg)
# missing
if not self._checked:
duplicated = []
for input_arg, mapped_arg in self._reverse_param_map.items():
if input_arg in pred_dict and input_arg in target_dict:
duplicated.append(input_arg)
check_res = _check_arg_dict_list(self.evaluate, [mapped_pred_dict, mapped_target_dict])
# only check missing.
# replace missing.
@@ -249,7 +251,7 @@ class MetricBase(object):
replaced_missing = list(missing)
for idx, func_arg in enumerate(missing):
# Don't delete `` in this information, nor add ``
replaced_missing[idx] = f"{self.param_map[func_arg]}" + f"(assign to `{func_arg}` " \
replaced_missing[idx] = f"{self._param_map[func_arg]}" + f"(assign to `{func_arg}` " \
f"in `{self.__class__.__name__}`)"
check_res = _CheckRes(missing=replaced_missing,
@@ -262,10 +264,10 @@ class MetricBase(object):
if check_res.missing or check_res.duplicated:
raise _CheckError(check_res=check_res,
func_signature=_get_func_signature(self.evaluate))
self._checked = True
refined_args = _build_args(self.evaluate, **mapped_pred_dict, **mapped_target_dict)
self.evaluate(**refined_args)
self._checked = True
return

@@ -411,6 +413,37 @@ def _bmeso_tag_to_spans(tags, ignore_labels=None):
]


def _bioes_tag_to_spans(tags, ignore_labels=None):
"""
给定一个tags的lis,比如['O', 'B-singer', 'I-singer', 'E-singer', 'O', 'O']。
返回[('singer', (1, 4))] (左闭右开区间)

:param tags: List[str],
:param ignore_labels: List[str], 在该list中的label将被忽略
:return: List[Tuple[str, List[int, int]]]. [(label,[start, end])]
"""
ignore_labels = set(ignore_labels) if ignore_labels else set()

spans = []
prev_bioes_tag = None
for idx, tag in enumerate(tags):
tag = tag.lower()
bioes_tag, label = tag[:1], tag[2:]
if bioes_tag in ('b', 's'):
spans.append((label, [idx, idx]))
elif bioes_tag in ('i', 'e') and prev_bioes_tag in ('b', 'i') and label == spans[-1][0]:
spans[-1][1][1] = idx
elif bioes_tag == 'o':
pass
else:
spans.append((label, [idx, idx]))
prev_bioes_tag = bioes_tag
return [(span[0], (span[1][0], span[1][1] + 1))
for span in spans
if span[0] not in ignore_labels
]


def _bio_tag_to_spans(tags, ignore_labels=None):
"""
给定一个tags的lis,比如['O', 'B-singer', 'I-singer', 'I-singer', 'O', 'O']。
@@ -471,7 +504,7 @@ class SpanFPreRecMetric(MetricBase):
:param str pred: 用该key在evaluate()时从传入dict中取出prediction数据。 为None,则使用'pred'取数据
:param str target: 用该key在evaluate()时从传入dict中取出target数据。 为None,则使用'target'取数据
:param str seq_len: 用该key在evaluate()时从传入dict中取出sequence length数据。为None,则使用'seq_len'取数据。
:param str encoding_type: 目前支持bio, bmes
:param str encoding_type: 目前支持bio, bmes, bmeso, bioes
:param list ignore_labels: str 组成的list. 这个list中的class不会被用于计算。例如在POS tagging时传入['NN'],则不会计算'NN'这
个label
:param bool only_gross: 是否只计算总的f1, precision, recall的值;如果为False,不仅返回总的f1, pre, rec, 还会返回每个
@@ -499,6 +532,8 @@ class SpanFPreRecMetric(MetricBase):
self.tag_to_span_func = _bio_tag_to_spans
elif self.encoding_type == 'bmeso':
self.tag_to_span_func = _bmeso_tag_to_spans
elif self.encoding_type == 'bioes':
self.tag_to_span_func = _bioes_tag_to_spans
else:
raise ValueError("Only support 'bio', 'bmes', 'bmeso' type.")
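
With the new _bioes_tag_to_spans helper, SpanFPreRecMetric also accepts encoding_type='bioes'. A hedged sketch; tag_vocab is a hypothetical Vocabulary over the BIOES tag set.

from fastNLP.core.metrics import SpanFPreRecMetric

metric = SpanFPreRecMetric(tag_vocab=tag_vocab, encoding_type='bioes')
# e.g. the tag sequence ['O', 'B-singer', 'I-singer', 'E-singer', 'O', 'O']
# decodes to the single span ('singer', (1, 4)), as in the docstring above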


fastNLP/core/optimizer.py (+17 / -0)

@@ -36,6 +36,23 @@ class Optimizer(object):
"""
return [param for param in params if param.requires_grad]

class NullOptimizer(Optimizer):
"""
当不希望Trainer更新optimizer时,传入本optimizer,但请确保通过callback的方式对参数进行了更新。

"""
def __init__(self):
super().__init__(None)

def construct_from_pytorch(self, model_params):
pass

def __getattr__(self, item):
def pass_func(*args, **kwargs):
pass

return pass_func


class SGD(Optimizer):
"""


fastNLP/core/predictor.py (+2 / -3)

@@ -6,7 +6,7 @@ from collections import defaultdict

import torch

from . import Batch
from . import DataSetIter
from . import DataSet
from . import SequentialSampler
from .utils import _build_args
@@ -44,8 +44,7 @@ class Predictor(object):

self.network.eval()
batch_output = defaultdict(list)
data_iterator = Batch(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False,
prefetch=False)
data_iterator = DataSetIter(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False)

if hasattr(self.network, "predict"):
predict_func = self.network.predict


fastNLP/core/tester.py (+15 / -4)

@@ -37,7 +37,7 @@ import warnings
import torch
import torch.nn as nn

from .batch import Batch
from .batch import BatchIter, DataSetIter
from .dataset import DataSet
from .metrics import _prepare_metrics
from .sampler import SequentialSampler
@@ -82,7 +82,7 @@ class Tester(object):
:param int verbose: 如果为0不输出任何信息; 如果为1,打印出验证结果。
"""
def __init__(self, data, model, metrics, batch_size=16, device=None, verbose=1):
def __init__(self, data, model, metrics, batch_size=16, num_workers=0, device=None, verbose=1):
super(Tester, self).__init__()
if not isinstance(data, DataSet):
@@ -96,6 +96,14 @@ class Tester(object):
self._model = _move_model_to_device(model, device=device)
self.batch_size = batch_size
self.verbose = verbose

if isinstance(data, DataSet):
self.data_iterator = DataSetIter(
dataset=data, batch_size=batch_size, num_workers=num_workers, sampler=SequentialSampler())
elif isinstance(data, BatchIter):
self.data_iterator = data
else:
raise TypeError("data type {} not support".format(type(data)))
# 如果是DataParallel将没有办法使用predict方法
if isinstance(self._model, nn.DataParallel):
@@ -112,7 +120,10 @@ class Tester(object):
raise TypeError(f"`{_model_name}.predict` must be callable to be used "
f"for evaluation, not `{type(self._predict_func)}`.")
else:
self._predict_func = self._model.forward
if isinstance(model, nn.DataParallel):
self._predict_func = self._model.module.forward
else:
self._predict_func = self._model.forward
def test(self):
"""开始进行验证,并返回验证结果。
@@ -124,7 +135,7 @@ class Tester(object):
self._model_device = _get_model_device(self._model)
network = self._model
self._mode(network, is_test=True)
data_iterator = Batch(self.data, self.batch_size, sampler=SequentialSampler(), as_numpy=False)
data_iterator = self.data_iterator
eval_results = {}
try:
with torch.no_grad():
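
The Tester now builds its DataSetIter once in __init__ and accepts num_workers. A hedged usage sketch; dev_data and model are placeholder objects.

from fastNLP import Tester
from fastNLP.core.metrics import AccuracyMetric

tester = Tester(data=dev_data, model=model, metrics=AccuracyMetric(),
                batch_size=32, num_workers=2, device='cuda:0', verbose=1)
eval_results = tester.test()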


fastNLP/core/trainer.py (+80 / -56)

@@ -311,8 +311,9 @@ try:
from tqdm.auto import tqdm
except:
from .utils import _pseudo_tqdm as tqdm
import warnings

from .batch import Batch
from .batch import DataSetIter, BatchIter
from .callback import CallbackManager, CallbackException
from .dataset import DataSet
from .losses import _prepare_losser
@@ -320,7 +321,6 @@ from .metrics import _prepare_metrics
from .optimizer import Optimizer
from .sampler import Sampler
from .sampler import RandomSampler
from .sampler import SequentialSampler
from .tester import Tester
from .utils import _CheckError
from .utils import _build_args
@@ -351,6 +351,8 @@ class Trainer(object):
:param int batch_size: 训练和验证的时候的batch大小。
:param loss: 使用的 :class:`~fastNLP.core.losses.LossBase` 对象。当为None时,默认使用 :class:`~fastNLP.LossInForward`
:param sampler: Batch数据生成的顺序, :class:`~fastNLP.Sampler` 类型。如果为None,默认使用 :class:`~fastNLP.RandomSampler`
:param drop_last: 如果最后一个batch没有正好为batch_size这么多数据,就扔掉最后一个batch
:param num_workers: int, 有多少个线程来进行数据pad处理。
:param update_every: int, 多少步更新一次梯度。用于希望累计梯度的场景,比如需要128的batch_size, 但是直接设为128
会导致内存不足,通过设置batch_size=32, update_every=4达到目的。当optimizer为None时,该参数无效。
:param int n_epochs: 需要优化迭代多少次。
@@ -367,7 +369,6 @@ class Trainer(object):
:param int validate_every: 多少个step在验证集上验证一次; 如果为-1,则每个epoch结束验证一次。仅在传入dev_data时有效。
:param str,None save_path: 将模型保存路径。如果为None,则不保存模型。如果dev_data为None,则保存最后一次迭代的模型。
保存的时候不仅保存了参数,还保存了模型结构。即便使用DataParallel,这里也只保存模型。
:param prefetch: bool, 是否使用额外的进程对产生batch数据。理论上会使得Batch迭代更快。
:param bool use_tqdm: 是否使用tqdm来显示训练进度; 如果为False,则将loss打印在终端中。
:param str,int,torch.device,list(int) device: 将模型load到哪个设备。默认为None,即Trainer不对模型
的计算位置进行管理。支持以下的输入:
@@ -394,16 +395,17 @@ class Trainer(object):
"""
def __init__(self, train_data, model, optimizer=None, loss=None,
batch_size=32, sampler=None, update_every=1,
n_epochs=10, print_every=5,
batch_size=32, sampler=None, drop_last=False, update_every=1,
num_workers=0, n_epochs=10, print_every=5,
dev_data=None, metrics=None, metric_key=None,
validate_every=-1, save_path=None,
prefetch=False, use_tqdm=True, device=None,
callbacks=None,
check_code_level=0):
validate_every=-1, save_path=None, use_tqdm=True, device=None, prefetch=False,
callbacks=None, check_code_level=0):
if prefetch and num_workers==0:
num_workers = 1
if prefetch:
warnings.warn("prefetch is deprecated, will be removed in version 0.5.0, please use num_workers instead.")

super(Trainer, self).__init__()
if not isinstance(train_data, DataSet):
raise TypeError(f"The type of train_data must be fastNLP.DataSet, got {type(train_data)}.")
if not isinstance(model, nn.Module):
raise TypeError(f"The type of model must be torch.nn.Module, got {type(model)}.")
@@ -430,25 +432,35 @@ class Trainer(object):
if metric_key is not None:
self.increase_better = False if metric_key[0] == "-" else True
self.metric_key = metric_key[1:] if metric_key[0] == "+" or metric_key[0] == "-" else metric_key
elif len(metrics) > 0:
self.metric_key = metrics[0].__class__.__name__.lower().strip('metric')
else:
self.metric_key = None
# prepare loss
losser = _prepare_losser(loss)
# sampler check
if sampler is not None and not isinstance(sampler, Sampler):
raise ValueError("The type of sampler should be fastNLP.BaseSampler, got {}.".format(type(sampler)))
if check_code_level > -1:

if sampler is None:
sampler = RandomSampler()

if isinstance(train_data, DataSet):
self.data_iterator = DataSetIter(
dataset=train_data, batch_size=batch_size, num_workers=num_workers, sampler=sampler, drop_last=drop_last)
elif isinstance(train_data, BatchIter):
self.data_iterator = train_data
else:
raise TypeError("train_data type {} not support".format(type(train_data)))

if check_code_level > -1 and isinstance(self.data_iterator, DataSetIter):
_check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data,
metric_key=metric_key, check_level=check_code_level,
batch_size=min(batch_size, DEFAULT_CHECK_BATCH_SIZE))
# _check_code 是 fastNLP 帮助你检查代码是否正确的方法 。如果你在错误栈中看到这行注释,请认真检查你的代码
self.model = _move_model_to_device(model, device=device)

self.train_data = train_data
self.dev_data = dev_data # If None, No validation.
self.model = model
self.losser = losser
self.metrics = metrics
self.n_epochs = int(n_epochs)
@@ -460,26 +472,22 @@ class Trainer(object):
self.best_dev_epoch = None
self.best_dev_step = None
self.best_dev_perf = None
self.sampler = sampler if sampler is not None else RandomSampler()
self.prefetch = prefetch
self.n_steps = (len(self.train_data) // self.batch_size + int(
len(self.train_data) % self.batch_size != 0)) * self.n_epochs
self.model = _move_model_to_device(self.model, device=device)

if isinstance(optimizer, torch.optim.Optimizer):
self.optimizer = optimizer
elif isinstance(optimizer, Optimizer):
self.optimizer = optimizer.construct_from_pytorch(model.parameters())
self.optimizer = optimizer.construct_from_pytorch(self.model.parameters())
elif optimizer is None:
self.optimizer = torch.optim.Adam(model.parameters(), lr=4e-3)
self.optimizer = torch.optim.Adam(self.model.parameters(), lr=4e-3)
else:
raise TypeError("optimizer can only be torch.optim.Optimizer type, not {}.".format(type(optimizer)))
self.use_tqdm = use_tqdm
self.pbar = None
self.print_every = abs(self.print_every)
if self.dev_data is not None:
self.tester = Tester(model=self.model,
data=self.dev_data,
@@ -493,7 +501,7 @@ class Trainer(object):
self.callback_manager = CallbackManager(env={"trainer": self},
callbacks=callbacks)
def train(self, load_best_model=True, on_exception='auto'):
"""
使用该函数使Trainer开始训练。
@@ -568,12 +576,14 @@ class Trainer(object):
self.step = 0
self.epoch = 0
start = time.time()
if isinstance(self.model, nn.DataParallel):
self._forward_func = self.model.module.forward
else:
self._forward_func = self.model.forward
with inner_tqdm(total=self.n_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar:
self.pbar = pbar
avg_loss = 0
data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False,
prefetch=self.prefetch)
data_iterator = self.data_iterator
self.batch_per_epoch = data_iterator.num_batches
for epoch in range(1, self.n_epochs + 1):
self.epoch = epoch
@@ -605,7 +615,7 @@ class Trainer(object):
if self.step % self.print_every == 0:
avg_loss = float(avg_loss) / self.print_every
if self.use_tqdm:
print_output = "loss:{0:<6.5f}".format(avg_loss)
print_output = "loss:{:<6.5f}".format(avg_loss)
pbar.update(self.print_every)
else:
end = time.time()
@@ -669,15 +679,15 @@ class Trainer(object):
"""Perform weight update on a model.

"""
if self.optimizer is not None and (self.step + 1) % self.update_every == 0:
if self.step % self.update_every == 0:
self.optimizer.step()
def _data_forward(self, network, x):
x = _build_args(network.forward, **x)
x = _build_args(self._forward_func, **x)
y = network(**x)
if not isinstance(y, dict):
raise TypeError(
f"The return value of {_get_func_signature(network.forward)} should be dict, got {type(y)}.")
f"The return value of {_get_func_signature(self._forward_func)} should be dict, got {type(y)}.")
return y
def _grad_backward(self, loss):
@@ -687,7 +697,7 @@ class Trainer(object):

For PyTorch, just do "loss.backward()"
"""
if self.step % self.update_every == 0:
if (self.step-1) % self.update_every == 0:
self.model.zero_grad()
loss.backward()
@@ -746,7 +756,9 @@ class Trainer(object):

:return bool value: True means current results on dev set is the best.
"""
indicator_val = _check_eval_results(metrics, self.metric_key, self.metrics)
indicator, indicator_val = _check_eval_results(metrics, self.metric_key, self.metrics)
if self.metric_key is None:
self.metric_key = indicator
is_better = True
if self.best_metric_indicator is None:
# first-time validation
@@ -785,15 +797,34 @@ def _get_value_info(_dict):
strs.append(_str)
return strs


from numbers import Number
from .batch import _to_tensor
def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_SIZE,
dev_data=None, metric_key=None,
check_level=0):
# check get_loss 方法
model_devcie = model.parameters().__next__().device
model_devcie = _get_model_device(model=model)
batch = Batch(dataset=dataset, batch_size=batch_size, sampler=SequentialSampler())
for batch_count, (batch_x, batch_y) in enumerate(batch):
def _iter():
start_idx = 0
while start_idx<len(dataset):
batch_x = {}
batch_y = {}
for field_name, field in dataset.get_all_fields().items():
indices = list(range(start_idx, min(start_idx+batch_size, len(dataset))))
if field.is_target or field.is_input:
batch = field.get(indices)
if field.dtype is not None and \
issubclass(field.dtype, Number) and not isinstance(batch, torch.Tensor):
batch, _ = _to_tensor(batch, field.dtype)
if field.is_target:
batch_y[field_name] = batch
if field.is_input:
batch_x[field_name] = batch
yield (batch_x, batch_y)
start_idx += batch_size

for batch_count, (batch_x, batch_y) in enumerate(_iter()):
_move_dict_value_to_device(batch_x, batch_y, device=model_devcie)
# forward check
if batch_count == 0:
@@ -815,8 +846,11 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_
print(info_str)
_check_forward_error(forward_func=model.forward, dataset=dataset,
batch_x=batch_x, check_level=check_level)
refined_batch_x = _build_args(model.forward, **batch_x)
if isinstance(model, nn.DataParallel):
forward_func = model.module.forward
else:
forward_func = model.forward
refined_batch_x = _build_args(forward_func, **batch_x)
pred_dict = model(**refined_batch_x)
func_signature = _get_func_signature(model.forward)
if not isinstance(pred_dict, dict):
@@ -861,26 +895,16 @@ def _check_eval_results(metrics, metric_key, metric_list):
loss, metrics = metrics
if isinstance(metrics, dict):
if len(metrics) == 1:
# only single metric, just use it
metric_dict = list(metrics.values())[0]
metrics_name = list(metrics.keys())[0]
else:
metrics_name = metric_list[0].__class__.__name__
if metrics_name not in metrics:
raise RuntimeError(f"{metrics_name} is chosen to do validation, but got {metrics}")
metric_dict = metrics[metrics_name]
metric_dict = list(metrics.values())[0] # 取第一个metric
if len(metric_dict) == 1:
if metric_key is None:
indicator_val, indicator = list(metric_dict.values())[0], list(metric_dict.keys())[0]
elif len(metric_dict) > 1 and metric_key is None:
raise RuntimeError(
f"Got multiple metric keys: {metric_dict}, but metric_key is not set. Which one to use?")
else:
# metric_key is set
if metric_key not in metric_dict:
raise RuntimeError(f"metric key {metric_key} not found in {metric_dict}")
indicator_val = metric_dict[metric_key]
indicator = metric_key
else:
raise RuntimeError("Invalid metrics type. Expect {}, got {}".format((tuple, dict), type(metrics)))
return indicator_val
return indicator, indicator_val
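
prefetch is deprecated in favour of num_workers, and drop_last/num_workers are forwarded to the DataSetIter. A hedged sketch of the updated constructor; all data, model and metric objects are placeholders.

from fastNLP import Trainer
from fastNLP.core.losses import CrossEntropyLoss
from fastNLP.core.metrics import AccuracyMetric

trainer = Trainer(train_data, model,
                  loss=CrossEntropyLoss(),
                  batch_size=32, num_workers=2, drop_last=False,
                  update_every=4,                 # accumulate gradients over 4 steps
                  n_epochs=10, dev_data=dev_data, metrics=AccuracyMetric(),
                  device='cuda:0')
trainer.train()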

fastNLP/core/utils.py (+9 / -3)

@@ -643,7 +643,7 @@ def _check_forward_error(forward_func, batch_x, dataset, check_level):
warnings.warn(message=_unused_warn)


def seq_len_to_mask(seq_len):
def seq_len_to_mask(seq_len, max_len=None):
"""

将一个表示sequence length的一维数组转换为二维的mask,不包含的位置为0。
@@ -659,20 +659,26 @@ def seq_len_to_mask(seq_len):
>>> mask = seq_len_to_mask(seq_len)
>>> print(mask.shape)
(14, 15)
>>> seq_len = torch.arange(2, 16)
>>> mask = seq_len_to_mask(seq_len, max_len=100)
>>> print(mask.size())
torch.Size([14, 100])

:param np.ndarray,torch.LongTensor seq_len: shape将是(B,)
:param int max_len: 将长度pad到这个长度。默认(None)使用的是seq_len中最长的长度。但在nn.DataParallel的场景下可能不同卡的seq_len会有
区别,所以需要传入一个max_len使得mask的长度是pad到该长度。
:return: np.ndarray or torch.Tensor, shape将是(B, max_length)。 元素类似为bool或torch.uint8
"""
if isinstance(seq_len, np.ndarray):
assert len(np.shape(seq_len)) == 1, f"seq_len can only have one dimension, got {len(np.shape(seq_len))}."
max_len = int(seq_len.max())
max_len = int(max_len) if max_len else int(seq_len.max())
broad_cast_seq_len = np.tile(np.arange(max_len), (len(seq_len), 1))
mask = broad_cast_seq_len < seq_len.reshape(-1, 1)
elif isinstance(seq_len, torch.Tensor):
assert seq_len.dim() == 1, f"seq_len can only have one dimension, got {seq_len.dim() == 1}."
batch_size = seq_len.size(0)
max_len = seq_len.max().long()
max_len = int(max_len) if max_len else seq_len.max().long()
broad_cast_seq_len = torch.arange(max_len).expand(batch_size, -1).to(seq_len)
mask = broad_cast_seq_len.lt(seq_len.unsqueeze(1))
else:
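
A small runnable sketch of the new max_len argument, which keeps mask widths consistent across nn.DataParallel replicas; the import path is assumed from the file shown above.

import torch
from fastNLP.core.utils import seq_len_to_mask

seq_len = torch.tensor([3, 5, 2])
words = torch.zeros(3, 7, dtype=torch.long)                 # a padded batch of width 7
mask = seq_len_to_mask(seq_len, max_len=words.size(1))      # pad the mask to the same width
print(mask.shape)                                           # torch.Size([3, 7])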


fastNLP/core/vocabulary.py (+56 / -14)

@@ -4,10 +4,11 @@ __all__ = [
]

from functools import wraps
from collections import Counter
from collections import Counter, defaultdict
from .dataset import DataSet
from .utils import Option

from functools import partial
import numpy as np

class VocabularyOption(Option):
def __init__(self,
@@ -89,7 +90,9 @@ class Vocabulary(object):
self.word2idx = None
self.idx2word = None
self.rebuild = True
# 用于承载不需要单独创建entry的词语,具体见from_dataset()方法
self._no_create_word = defaultdict(int)

@_check_build_status
def update(self, word_lst):
"""依次增加序列中词在词典中的出现频率
@@ -240,8 +243,12 @@ class Vocabulary(object):
raise e
else:
raise RuntimeError("Only DataSet type is allowed.")
def from_dataset(self, *datasets, field_name):

@property
def _no_create_word_length(self):
return len(self._no_create_word)

def from_dataset(self, *datasets, field_name, no_create_entry_dataset=None):
"""
使用dataset的对应field中词构建词典::

@@ -253,6 +260,13 @@ class Vocabulary(object):
构建词典所使用的 field(s), 支持一个或多个field
若有多个 DataSet, 每个DataSet都必须有这些field.
目前仅支持的field结构: ``str`` , ``list(str)`` , ``list(list(str))``
:param no_create_entry_dataset: 可以传入DataSet, List[DataSet]或者None(默认),该选项用在接下来的模型会使用pretrain
的embedding(包括glove, word2vec, elmo与bert)且会finetune的情况。如果仅使用来自于train的数据建立vocabulary,会导致test与dev
中的数据无法充分利用到来自于预训练embedding的信息,所以在建立词表的时候将test与dev考虑进来会使得最终的结果更好。
如果一个词出现在了train中,但是没在预训练模型中,embedding会为它用unk初始化,但它是单独的一个vector,如果
finetune embedding的话,这个词在更新之后可能会有更好的表示; 而如果这个词仅出现在了dev或test中,那么就不能为它们单独建立vector,
而应该让它指向unk这个vector的值。所以只位于no_create_entry_dataset中的token,将首先从预训练的词表中寻找它的表示,
如果找到了,就使用该表示; 如果没有找到,则认为该词的表示应该为unk的表示。
:return self:
"""
if isinstance(field_name, str):
@@ -260,19 +274,28 @@ class Vocabulary(object):
elif not isinstance(field_name, list):
raise TypeError('invalid argument field_name: {}'.format(field_name))
def construct_vocab(ins):
def construct_vocab(ins, no_create_entry=False):
for fn in field_name:
field = ins[fn]
if isinstance(field, str):
if no_create_entry and field not in self.word_count:
self._no_create_word[field] += 1
self.add_word(field)
elif isinstance(field, list):
if not isinstance(field[0], list):
self.add_word_lst(field)
elif isinstance(field, (list, np.ndarray)):
if not isinstance(field[0], (list, np.ndarray)):
for word in field:
if no_create_entry and word not in self.word_count:
self._no_create_word[word] += 1
self.add_word(word)
else:
if isinstance(field[0][0], list):
if isinstance(field[0][0], (list, np.ndarray)):
raise RuntimeError("Only support field with 2 dimensions.")
[self.add_word_lst(w) for w in field]
for words in field:
for word in words:
if no_create_entry and word not in self.word_count:
self._no_create_word[word] += 1
self.add_word(word)

for idx, dataset in enumerate(datasets):
if isinstance(dataset, DataSet):
try:
@@ -281,9 +304,27 @@ class Vocabulary(object):
print("When processing the `{}` dataset, the following error occurred.".format(idx))
raise e
else:
raise RuntimeError("Only DataSet type is allowed.")
raise TypeError("Only DataSet type is allowed.")

if no_create_entry_dataset is not None:
partial_construct_vocab = partial(construct_vocab, no_create_entry=True)
if isinstance(no_create_entry_dataset, DataSet):
no_create_entry_dataset.apply(partial_construct_vocab)
elif isinstance(no_create_entry_dataset, list):
for dataset in no_create_entry_dataset:
if not isinstance(dataset, DataSet):
raise TypeError("Only DataSet type is allowed.")
dataset.apply(partial_construct_vocab)
return self

def _is_word_no_create_entry(self, word):
"""
判断当前的word是否是不需要创建entry的,具体参见from_dataset的说明
:param word: str
:return: bool
"""
return word in self._no_create_word

def to_index(self, w):
"""
将词转为数字. 若词不再词典中被记录, 将视为 unknown, 若 ``unknown=None`` , 将抛出
@@ -338,6 +379,7 @@ class Vocabulary(object):
self.word2idx = None
self.idx2word = None
self.rebuild = True
self._no_create_word.clear()
def __getstate__(self):
"""Use to prepare data for pickle.


fastNLP/io/__init__.py (+2 / -2)

@@ -26,6 +26,6 @@ __all__ = [
]

from .embed_loader import EmbedLoader
from .dataset_loader import DataSetLoader, CSVLoader, JsonLoader, ConllLoader, SNLILoader, SSTLoader, \
PeopleDailyCorpusLoader, Conll2003Loader
from .dataset_loader import DataSetLoader, CSVLoader, JsonLoader, ConllLoader, \
SNLILoader, SSTLoader, PeopleDailyCorpusLoader, Conll2003Loader
from .model_io import ModelLoader, ModelSaver

fastNLP/io/base_loader.py (+8 / -0)

@@ -124,6 +124,14 @@ class DataInfo:
self.embeddings = embeddings or {}
self.datasets = datasets or {}

def __repr__(self):
_str = 'In total {} datasets:\n'.format(len(self.datasets))
for name, dataset in self.datasets.items():
_str += '\t{} has {} instances.\n'.format(name, len(dataset))
_str += 'In total {} vocabs:\n'.format(len(self.vocabs))
for name, vocab in self.vocabs.items():
_str += '\t{} has {} entries.\n'.format(name, len(vocab))
return _str

class DataSetLoader:
"""


fastNLP/io/dataset_loader.py (+7 / -2)

@@ -22,13 +22,17 @@ __all__ = [
'Conll2003Loader',
]

import os
from nltk import Tree
from typing import Union, Dict
from ..core.vocabulary import Vocabulary
from ..core.dataset import DataSet
from ..core.instance import Instance
from .file_reader import _read_csv, _read_json, _read_conll
from .base_loader import DataSetLoader
from .base_loader import DataSetLoader, DataInfo
from .data_loader.sst import SSTLoader
from ..core.const import Const
from ..modules.encoder._bert import BertTokenizer


class PeopleDailyCorpusLoader(DataSetLoader):
@@ -115,7 +119,8 @@ class ConllLoader(DataSetLoader):
"""
别名::class:`fastNLP.io.ConllLoader` :class:`fastNLP.io.dataset_loader.ConllLoader`

读取Conll格式的数据. 数据格式详见 http://conll.cemantix.org/2012/data.html
读取Conll格式的数据. 数据格式详见 http://conll.cemantix.org/2012/data.html. 数据中以"-DOCSTART-"开头的行将被忽略,因为
该符号在conll 2003中被用为文档分割符。

列号从0开始, 每列对应内容为::



fastNLP/io/embed_loader.py (+25 / -18)

@@ -38,7 +38,8 @@ class EmbedLoader(BaseLoader):
super(EmbedLoader, self).__init__()

@staticmethod
def load_with_vocab(embed_filepath, vocab, dtype=np.float32, padding='<pad>', unknown='<unk>', normalize=True, error='ignore'):
def load_with_vocab(embed_filepath, vocab, dtype=np.float32, padding='<pad>', unknown='<unk>', normalize=True,
error='ignore', init_method=None):
"""
从embed_filepath这个预训练的词向量中抽取出vocab这个词表的词的embedding。EmbedLoader将自动判断embed_filepath是
word2vec(第一行只有两个元素)还是glove格式的数据。
@@ -52,6 +53,7 @@ class EmbedLoader(BaseLoader):
:param bool normalize: 是否将每个vector归一化到norm为1
:param str error: `ignore` , `strict` ; 如果 `ignore` ,错误将自动跳过; 如果 `strict` , 错误将抛出。
这里主要可能出错的地方在于词表有空行或者词表出现了维度不一致。
:param callable init_method: 传入numpy.ndarray, 返回numpy.ndarray, 用以初始化embedding
:return numpy.ndarray: shape为 [len(vocab), dimension], dimension由pretrain的embedding决定。
"""
assert isinstance(vocab, Vocabulary), "Only fastNLP.Vocabulary is supported."
@@ -69,10 +71,13 @@ class EmbedLoader(BaseLoader):
dim = len(parts) - 1
f.seek(0)
matrix = np.random.randn(len(vocab), dim).astype(dtype)
if init_method:
matrix = init_method(matrix)
for idx, line in enumerate(f, start_idx):
try:
parts = line.strip().split()
word = parts[0]
word = ''.join(parts[:-dim])
nums = parts[-dim:]
# 对齐unk与pad
if word==padding and vocab.padding is not None:
word = vocab.padding
@@ -80,7 +85,7 @@ class EmbedLoader(BaseLoader):
word = vocab.unknown
if word in vocab:
index = vocab.to_index(word)
matrix[index] = np.fromstring(' '.join(parts[1:]), sep=' ', dtype=dtype, count=dim)
matrix[index] = np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim)
hit_flags[index] = True
except Exception as e:
if error == 'ignore':
@@ -90,14 +95,15 @@ class EmbedLoader(BaseLoader):
raise e
total_hits = sum(hit_flags)
print("Found {} out of {} words in the pre-training embedding.".format(total_hits, len(vocab)))
found_vectors = matrix[hit_flags]
if len(found_vectors) != 0:
mean = np.mean(found_vectors, axis=0, keepdims=True)
std = np.std(found_vectors, axis=0, keepdims=True)
unfound_vec_num = len(vocab) - total_hits
r_vecs = np.random.randn(unfound_vec_num, dim).astype(dtype) * std + mean
matrix[hit_flags == False] = r_vecs
if init_method is None:
found_vectors = matrix[hit_flags]
if len(found_vectors) != 0:
mean = np.mean(found_vectors, axis=0, keepdims=True)
std = np.std(found_vectors, axis=0, keepdims=True)
unfound_vec_num = len(vocab) - total_hits
r_vecs = np.random.randn(unfound_vec_num, dim).astype(dtype) * std + mean
matrix[hit_flags == False] = r_vecs

if normalize:
matrix /= np.linalg.norm(matrix, axis=1, keepdims=True)
@@ -135,10 +141,11 @@ class EmbedLoader(BaseLoader):
for idx, line in enumerate(f, start=start):
try:
parts = line.strip().split()
word = parts[0]
if dim == -1:
dim = len(parts) - 1
vec = np.fromstring(' '.join(parts[1:]), sep=' ', dtype=dtype, count=dim)
word = ''.join(parts[:-dim])
nums = parts[-dim:]
vec = np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim)
vec_dict[word] = vec
vocab.add_word(word)
if unknown is not None and unknown == word:
@@ -155,13 +162,17 @@ class EmbedLoader(BaseLoader):
if dim == -1:
raise RuntimeError("{} is an empty file.".format(embed_filepath))
matrix = np.random.randn(len(vocab), dim).astype(dtype)
for key, vec in vec_dict.items():
index = vocab.to_index(key)
matrix[index] = vec

if (unknown is not None and not found_unknown) or (padding is not None and not found_pad):
start_idx = 0
if padding is not None:
start_idx += 1
if unknown is not None:
start_idx += 1
mean = np.mean(matrix[start_idx:], axis=0, keepdims=True)
std = np.std(matrix[start_idx:], axis=0, keepdims=True)
if (unknown is not None and not found_unknown):
@@ -169,10 +180,6 @@ class EmbedLoader(BaseLoader):
if (padding is not None and not found_pad):
matrix[0] = np.random.randn(1, dim).astype(dtype) * std + mean
for key, vec in vec_dict.items():
index = vocab.to_index(key)
matrix[index] = vec
if normalize:
matrix /= np.linalg.norm(matrix, axis=1, keepdims=True)
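
A minimal sketch of the new ``init_method`` hook (the toy vocabulary and the GloVe path are placeholders): the callable receives the randomly initialized numpy matrix and returns the matrix used for all rows before the found pretrained vectors are written in; when it is supplied, the usual mean/std resampling of missing rows is skipped::

    import numpy as np
    from fastNLP import Vocabulary
    from fastNLP.io import EmbedLoader

    vocab = Vocabulary()
    vocab.add_word_lst(['the', 'cat', 'sat'])

    def uniform_init(matrix):
        # initialize every row uniformly; rows of found words are overwritten afterwards
        return np.random.uniform(-0.1, 0.1, matrix.shape).astype(matrix.dtype)

    embedding = EmbedLoader.load_with_vocab('path/to/glove.6B.50d.txt', vocab,
                                            init_method=uniform_init)
    print(embedding.shape)  # (len(vocab), 50)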


+ 8
- 5
fastNLP/io/file_reader.py View File

@@ -90,11 +90,12 @@ def _read_conll(path, encoding='utf-8', indexes=None, dropna=True):
return sample
with open(path, 'r', encoding=encoding) as f:
sample = []
start = next(f)
if '-DOCSTART-' not in start:
start = next(f).strip()
if '-DOCSTART-' not in start and start!='':
sample.append(start.split())
for line_idx, line in enumerate(f, 1):
if line.startswith('\n'):
line = line.strip()
if line=='':
if len(sample):
try:
res = parse_conll(sample)
@@ -107,7 +108,8 @@ def _read_conll(path, encoding='utf-8', indexes=None, dropna=True):
elif line.startswith('#'):
continue
else:
sample.append(line.split())
if not line.startswith('-DOCSTART-'):
sample.append(line.split())
if len(sample) > 0:
try:
res = parse_conll(sample)
@@ -115,4 +117,5 @@ def _read_conll(path, encoding='utf-8', indexes=None, dropna=True):
except Exception as e:
if dropna:
return
raise ValueError('invalid instance at line: {}'.format(line_idx))
print('invalid instance at line: {}'.format(line_idx))
raise e
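
For reference, a typical CoNLL 2003-style fragment looks like the following; with this change the "-DOCSTART-" row and the blank separator lines only delimit documents and sentences and are never emitted as token rows::

    -DOCSTART- -X- -X- O

    EU NNP B-NP B-ORG
    rejects VBZ B-VP O
    German JJ B-NP B-MISC
    call NN I-NP O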

+ 33
- 2
fastNLP/io/file_utils.py View File

@@ -10,10 +10,41 @@ import shutil
import hashlib


PRETRAINED_BERT_MODEL_DIR = {
'en': 'bert-base-cased-f89bfe08.zip',
'en-base-uncased': 'bert-base-uncased-3413b23c.zip',
'en-base-cased': 'bert-base-cased-f89bfe08.zip',
'en-large-uncased': 'bert-large-uncased-20939f45.zip',
'en-large-cased': 'bert-large-cased-e0cf90fc.zip',

'cn': 'bert-base-chinese-29d0a84a.zip',
'cn-base': 'bert-base-chinese-29d0a84a.zip',

'multilingual': 'bert-base-multilingual-cased-1bd364ee.zip',
'multilingual-base-uncased': 'bert-base-multilingual-uncased-f8730fe4.zip',
'multilingual-base-cased': 'bert-base-multilingual-cased-1bd364ee.zip',
}

PRETRAINED_ELMO_MODEL_DIR = {
'en': 'elmo_en-d39843fe.tar.gz',
'cn': 'elmo_cn-5e9b34e2.tar.gz'
}

PRETRAIN_STATIC_FILES = {
'en': 'glove.840B.300d-cc1ad5e1.tar.gz',
'en-glove-840b-300': 'glove.840B.300d-cc1ad5e1.tar.gz',
'en-glove-6b-50': "glove.6B.50d-a6028c70.tar.gz",
'en-word2vec-300': "GoogleNews-vectors-negative300-be166d9d.tar.gz",
'en-fasttext': "cc.en.300.vec-d53187b2.gz",
'cn': "tencent_cn-dab24577.tar.gz",
'cn-fasttext': "cc.zh.300.vec-d68a9bcf.gz",
}


def cached_path(url_or_filename: str, cache_dir: Path=None) -> Path:
"""
Given a url or a file name (it may be a concrete file name or a file), first check whether the file exists under cache_dir; if not, download it and
place the file into
place the file into cache_dir
"""
if cache_dir is None:
dataset_cache = Path(get_defalt_path())
@@ -88,7 +119,7 @@ def split_filename_suffix(filepath):
def get_from_cache(url: str, cache_dir: Path = None) -> Path:
"""
Try to find the resource identified by url in cache_dir; if it is not found, download it from url and put the result under cache_dir. The cache name is inferred from the url.
If the resource downloaded from url contains multiple files after extraction, the path of the directory is returned; if there is only one resource, its concrete path is returned
If the resource downloaded from url contains multiple files after extraction, the path of the directory is returned; if there is only one resource, its concrete path is returned

"""
cache_dir.mkdir(parents=True, exist_ok=True)
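
These tables map short model names to downloadable archives. A hedged sketch of how a name is resolved to a local path (it assumes network access to the fastNLP file server behind ``_get_base_url``)::

    from fastNLP.io.file_utils import PRETRAINED_BERT_MODEL_DIR, _get_base_url, cached_path

    model_name = PRETRAINED_BERT_MODEL_DIR['en-base-uncased']  # 'bert-base-uncased-3413b23c.zip'
    model_url = _get_base_url('bert') + model_name
    model_dir = cached_path(model_url)  # downloads on first use, afterwards returns the cached path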


+ 15
- 5
fastNLP/modules/decoder/crf.py View File

@@ -9,7 +9,7 @@ from torch import nn
from ..utils import initial_parameter


def allowed_transitions(id2target, encoding_type='bio', include_start_end=True):
def allowed_transitions(id2target, encoding_type='bio', include_start_end=False):
"""
别名::class:`fastNLP.modules.allowed_transitions` :class:`fastNLP.modules.decoder.crf.allowed_transitions`

@@ -17,7 +17,7 @@ def allowed_transitions(id2target, encoding_type='bio', include_start_end=True):

:param dict id2target: keys are label indices, values are tags or tag-labels of type str. A value may be a plain tag such as "B" or "M", or
a tag-label such as "B-NN" or "M-NN"; tag and label must be separated by "-". id2label can usually be obtained from Vocabulary.idx2word.
:param str encoding_type: supports "bio", "bmes", "bmeso".
:param str encoding_type: supports "bio", "bmes", "bmeso", "bioes"
:param bool include_start_end: whether to include transitions to and from the start and end states. For example, in bio, b/o may start a sequence but i may not;
if True, the result includes (start_idx, b_idx) and (start_idx, o_idx) but not (start_idx, i_idx);
start_idx=len(id2label), end_idx=len(id2label)+1. If False, the result contains nothing related to start and end
@@ -58,7 +58,7 @@ def allowed_transitions(id2target, encoding_type='bio', include_start_end=True):
def _is_transition_allowed(encoding_type, from_tag, from_label, to_tag, to_label):
"""

:param str encoding_type: supports "BIO", "BMES", "BEMSO".
:param str encoding_type: supports "BIO", "BMES", "BEMSO", 'bioes'
:param str from_tag: a tag such as "B" or "M"; also covers the two special tags start and end
:param str from_label: a label such as "PER" or "LOC"
:param str to_tag: a tag such as "B" or "M"; also covers the two special tags start and end
@@ -134,9 +134,19 @@ def _is_transition_allowed(encoding_type, from_tag, from_label, to_tag, to_label
return to_tag in ['b', 's', 'end', 'o']
else:
raise ValueError("Unexpect tag type {}. Expect only 'B', 'M', 'E', 'S', 'O'.".format(from_tag))
elif encoding_type == 'bioes':
if from_tag == 'start':
return to_tag in ['b', 's', 'o']
elif from_tag == 'b':
return to_tag in ['i', 'e'] and from_label == to_label
elif from_tag == 'i':
return to_tag in ['i', 'e'] and from_label == to_label
elif from_tag in ['e', 's', 'o']:
return to_tag in ['b', 's', 'end', 'o']
else:
raise ValueError("Unexpect tag type {}. Expect only 'B', 'I', 'E', 'S', 'O'.".format(from_tag))
else:
raise ValueError("Only support BIO, BMES, BMESO encoding type, got {}.".format(encoding_type))
raise ValueError("Only support BIO, BMES, BMESO, BIOES encoding type, got {}.".format(encoding_type))


class ConditionalRandomField(nn.Module):
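
A small illustration of the new 'bioes' support (the label set below is invented): ``allowed_transitions`` turns an id-to-tag mapping into a list of permitted transition index pairs that a CRF can consume::

    from fastNLP.modules.decoder.crf import allowed_transitions

    id2target = {0: 'B-PER', 1: 'I-PER', 2: 'E-PER', 3: 'S-LOC', 4: 'O'}
    trans = allowed_transitions(id2target, encoding_type='bioes', include_start_end=True)
    # a list of allowed (from_index, to_index) pairs, e.g. (0, 1) for B-PER -> I-PER;
    # O -> I-PER is absent because an inside tag may not directly follow O
    print(trans)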


+ 10
- 2
fastNLP/modules/encoder/__init__.py View File

@@ -7,6 +7,12 @@ __all__ = [
"ConvMaxpool",
"Embedding",
"StaticEmbedding",
"ElmoEmbedding",
"BertEmbedding",
"StackEmbedding",
"LSTMCharEmbedding",
"CNNCharEmbedding",
"LSTM",
@@ -18,10 +24,12 @@ __all__ = [
"VarLSTM",
"VarGRU"
]
from .bert import BertModel
from ._bert import BertModel
from .bert import BertWordPieceEncoder
from .char_encoder import ConvolutionCharEncoder, LSTMCharEncoder
from .conv_maxpool import ConvMaxpool
from .embedding import Embedding
from .embedding import Embedding, StaticEmbedding, ElmoEmbedding, BertEmbedding, \
StackEmbedding, LSTMCharEmbedding, CNNCharEmbedding
from .lstm import LSTM
from .star_transformer import StarTransformer
from .transformer import TransformerEncoder


+ 455
- 119
fastNLP/modules/encoder/_bert.py View File

@@ -6,18 +6,395 @@
"""


import torch
from torch import nn

from ... import Vocabulary
from ...core.vocabulary import Vocabulary
import collections

import os
import unicodedata
from ...io.file_utils import _get_base_url, cached_path
from .bert import BertModel
import numpy as np
from itertools import chain
import copy
import json
import math
import os

import torch
from torch import nn
import glob

CONFIG_FILE = 'bert_config.json'
MODEL_WEIGHTS = 'pytorch_model.bin'


def gelu(x):
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def swish(x):
return x * torch.sigmoid(x)


ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}


class BertLayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-12):
super(BertLayerNorm, self).__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.bias = nn.Parameter(torch.zeros(hidden_size))
self.variance_epsilon = eps

def forward(self, x):
u = x.mean(-1, keepdim=True)
s = (x - u).pow(2).mean(-1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
return self.weight * x + self.bias


class BertEmbeddings(nn.Module):
def __init__(self, vocab_size, hidden_size, max_position_embeddings, type_vocab_size, hidden_dropout_prob):
super(BertEmbeddings, self).__init__()
self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)

# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12)
self.dropout = nn.Dropout(hidden_dropout_prob)

def forward(self, input_ids, token_type_ids=None):
seq_length = input_ids.size(1)
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)

words_embeddings = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)

embeddings = words_embeddings + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings


class BertSelfAttention(nn.Module):
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob):
super(BertSelfAttention, self).__init__()
if hidden_size % num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (hidden_size, num_attention_heads))
self.num_attention_heads = num_attention_heads
self.attention_head_size = int(hidden_size / num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size

self.query = nn.Linear(hidden_size, self.all_head_size)
self.key = nn.Linear(hidden_size, self.all_head_size)
self.value = nn.Linear(hidden_size, self.all_head_size)

self.dropout = nn.Dropout(attention_probs_dropout_prob)

def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)

def forward(self, hidden_states, attention_mask):
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)

query_layer = self.transpose_for_scores(mixed_query_layer)
key_layer = self.transpose_for_scores(mixed_key_layer)
value_layer = self.transpose_for_scores(mixed_value_layer)

# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
attention_scores = attention_scores + attention_mask

# Normalize the attention scores to probabilities.
attention_probs = nn.Softmax(dim=-1)(attention_scores)

# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)

context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
return context_layer


class BertSelfOutput(nn.Module):
def __init__(self, hidden_size, hidden_dropout_prob):
super(BertSelfOutput, self).__init__()
self.dense = nn.Linear(hidden_size, hidden_size)
self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12)
self.dropout = nn.Dropout(hidden_dropout_prob)

def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states


class BertAttention(nn.Module):
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob):
super(BertAttention, self).__init__()
self.self = BertSelfAttention(hidden_size, num_attention_heads, attention_probs_dropout_prob)
self.output = BertSelfOutput(hidden_size, hidden_dropout_prob)

def forward(self, input_tensor, attention_mask):
self_output = self.self(input_tensor, attention_mask)
attention_output = self.output(self_output, input_tensor)
return attention_output


class BertIntermediate(nn.Module):
def __init__(self, hidden_size, intermediate_size, hidden_act):
super(BertIntermediate, self).__init__()
self.dense = nn.Linear(hidden_size, intermediate_size)
self.intermediate_act_fn = ACT2FN[hidden_act] \
if isinstance(hidden_act, str) else hidden_act

def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states


class BertOutput(nn.Module):
def __init__(self, hidden_size, intermediate_size, hidden_dropout_prob):
super(BertOutput, self).__init__()
self.dense = nn.Linear(intermediate_size, hidden_size)
self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12)
self.dropout = nn.Dropout(hidden_dropout_prob)

def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states


class BertLayer(nn.Module):
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob,
intermediate_size, hidden_act):
super(BertLayer, self).__init__()
self.attention = BertAttention(hidden_size, num_attention_heads, attention_probs_dropout_prob,
hidden_dropout_prob)
self.intermediate = BertIntermediate(hidden_size, intermediate_size, hidden_act)
self.output = BertOutput(hidden_size, intermediate_size, hidden_dropout_prob)

def forward(self, hidden_states, attention_mask):
attention_output = self.attention(hidden_states, attention_mask)
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output


class BertEncoder(nn.Module):
def __init__(self, num_hidden_layers, hidden_size, num_attention_heads, attention_probs_dropout_prob,
hidden_dropout_prob,
intermediate_size, hidden_act):
super(BertEncoder, self).__init__()
layer = BertLayer(hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob,
intermediate_size, hidden_act)
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(num_hidden_layers)])

def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
all_encoder_layers = []
for layer_module in self.layer:
hidden_states = layer_module(hidden_states, attention_mask)
if output_all_encoded_layers:
all_encoder_layers.append(hidden_states)
if not output_all_encoded_layers:
all_encoder_layers.append(hidden_states)
return all_encoder_layers


class BertPooler(nn.Module):
def __init__(self, hidden_size):
super(BertPooler, self).__init__()
self.dense = nn.Linear(hidden_size, hidden_size)
self.activation = nn.Tanh()

def forward(self, hidden_states):
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output


class BertModel(nn.Module):
"""BERT(Bidirectional Embedding Representations from Transformers).

If you want to use pretrained weights, download them from the following URLs.
sources::

'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz",
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz",


Build a BERT model from pretrained weights::

model = BertModel.from_pretrained("path/to/weights/directory")

Build a BERT model with randomly initialized weights::

model = BertModel()

:param int vocab_size: vocabulary size, default 30522, which is the vocabulary size of the English uncased BERT
:param int hidden_size: hidden size, default 768 (the BERT base configuration)
:param int num_hidden_layers: number of hidden layers, default 12 (BERT base)
:param int num_attention_heads: number of attention heads, default 12 (BERT base)
:param int intermediate_size: hidden size of the FFN, default 3072 (BERT base)
:param str hidden_act: activation function of the FFN, default ``gelu``
:param float hidden_dropout_prob: dropout of the FFN hidden layer, default 0.1
:param float attention_probs_dropout_prob: dropout of the attention layer, default 0.1
:param int max_position_embeddings: maximum sequence length, default 512
:param int type_vocab_size: maximum number of segments, default 2
:param float initializer_range: standard deviation used to initialize the weights, default 0.02
"""

def __init__(self, vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02):
super(BertModel, self).__init__()
self.hidden_size = hidden_size
self.embeddings = BertEmbeddings(vocab_size, hidden_size, max_position_embeddings,
type_vocab_size, hidden_dropout_prob)
self.encoder = BertEncoder(num_hidden_layers, hidden_size, num_attention_heads,
attention_probs_dropout_prob, hidden_dropout_prob, intermediate_size,
hidden_act)
self.pooler = BertPooler(hidden_size)
self.initializer_range = initializer_range

self.apply(self.init_bert_weights)

def init_bert_weights(self, module):
if isinstance(module, (nn.Linear, nn.Embedding)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.initializer_range)
elif isinstance(module, BertLayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()

def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True):
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)

# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

embedding_output = self.embeddings(input_ids, token_type_ids)
encoded_layers = self.encoder(embedding_output,
extended_attention_mask,
output_all_encoded_layers=output_all_encoded_layers)
sequence_output = encoded_layers[-1]
pooled_output = self.pooler(sequence_output)
if not output_all_encoded_layers:
encoded_layers = encoded_layers[-1]
return encoded_layers, pooled_output

@classmethod
def from_pretrained(cls, pretrained_model_dir, state_dict=None, *inputs, **kwargs):
# Load config
config_file = os.path.join(pretrained_model_dir, CONFIG_FILE)
config = json.load(open(config_file, "r"))
# config = BertConfig.from_json_file(config_file)
# logger.info("Model config {}".format(config))
# Instantiate model.
model = cls(*inputs, **config, **kwargs)
if state_dict is None:
files = glob.glob(os.path.join(pretrained_model_dir, '*.bin'))
if len(files)==0:
raise FileNotFoundError(f"There is no *.bin file in {pretrained_model_dir}")
elif len(files)>1:
raise FileExistsError(f"There are multiple *.bin files in {pretrained_model_dir}")
weights_path = files[0]
state_dict = torch.load(weights_path)

old_keys = []
new_keys = []
for key in state_dict.keys():
new_key = None
if 'gamma' in key:
new_key = key.replace('gamma', 'weight')
if 'beta' in key:
new_key = key.replace('beta', 'bias')
if new_key:
old_keys.append(key)
new_keys.append(new_key)
for old_key, new_key in zip(old_keys, new_keys):
state_dict[new_key] = state_dict.pop(old_key)

missing_keys = []
unexpected_keys = []
error_msgs = []
# copy state_dict so _load_from_state_dict can modify it
metadata = getattr(state_dict, '_metadata', None)
state_dict = state_dict.copy()
if metadata is not None:
state_dict._metadata = metadata

def load(module, prefix=''):
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
module._load_from_state_dict(
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
for name, child in module._modules.items():
if child is not None:
load(child, prefix + name + '.')

load(model, prefix='' if hasattr(model, 'bert') else 'bert.')
if len(missing_keys) > 0:
print("Weights of {} not initialized from pretrained model: {}".format(
model.__class__.__name__, missing_keys))
if len(unexpected_keys) > 0:
print("Weights from pretrained model not used in {}: {}".format(
model.__class__.__name__, unexpected_keys))
return model
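
A minimal forward pass through this ``BertModel`` (the model directory and the tensor contents are placeholders; per ``from_pretrained`` above, the directory is expected to contain ``bert_config.json`` and exactly one ``*.bin`` weight file)::

    import torch
    from fastNLP.modules.encoder import BertModel

    model = BertModel.from_pretrained('/path/to/bert-base-uncased')  # placeholder directory
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    attention_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask,
                                           output_all_encoded_layers=False)
    # sequence_output: 2 x 3 x 768, pooled_output: 2 x 768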


def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
@@ -290,6 +667,16 @@ class BertTokenizer(object):
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
self.max_len = max_len if max_len is not None else int(1e12)

def _reinit_on_new_vocab(self, vocab):
"""
After the BERT weights are loaded, the vocab may be re-ordered. After re-ordering, call this function to re-initialize the attributes that depend on the vocab.

:param vocab:
:return:
"""
self.vocab = vocab
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

def tokenize(self, text):
split_tokens = []
if self.do_basic_tokenize:
@@ -325,6 +712,8 @@ class BertTokenizer(object):
index = 0
if os.path.isdir(vocab_path):
vocab_file = os.path.join(vocab_path, VOCAB_NAME)
else:
vocab_file = vocab_path
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
if index != token_index:
@@ -370,11 +759,44 @@ class _WordBertModel(nn.Module):

assert pool_method in ('avg', 'max', 'first', 'last')
self.pool_method = pool_method

self.include_cls_sep = include_cls_sep

# compute the word pieces of every word in the vocab; [CLS] and [SEP] need extra handling
print("Start to generate word pieces for words.")
# step 1: collect the required word pieces, then build the new embed and word_piece_vocab and fill in the values
word_piece_dict = {'[CLS]':1, '[SEP]':1} # word pieces that are used, plus newly added ones
found_count = 0
for word, index in vocab:
if index == vocab.padding_idx: # pad is a special symbol
word = '[PAD]'
elif index == vocab.unknown_idx:
word = '[UNK]'
word_pieces = self.tokenzier.wordpiece_tokenizer.tokenize(word)
if len(word_pieces)==1:
if not vocab._is_word_no_create_entry(word): # the word appears in train but was not found
if index!=vocab.unknown_idx and word_pieces[0]=='[UNK]': # this word is not in the original word-piece vocab
word_piece_dict[word] = 1 # add a new entry
continue
for word_piece in word_pieces:
word_piece_dict[word_piece] = 1
found_count += 1
original_embed = self.encoder.embeddings.word_embeddings.weight.data
# special tokens need special handling
embed = nn.Embedding(len(word_piece_dict), original_embed.size(1)) # the new embed
new_word_piece_vocab = collections.OrderedDict()
for index, token in enumerate(['[PAD]', '[UNK]']):
word_piece_dict.pop(token, None)
embed.weight.data[index] = original_embed[self.tokenzier.vocab[token]]
new_word_piece_vocab[token] = index
for token in word_piece_dict.keys():
if token in self.tokenzier.vocab:
embed.weight.data[len(new_word_piece_vocab)] = original_embed[self.tokenzier.vocab[token]]
else:
embed.weight.data[len(new_word_piece_vocab)] = original_embed[self.tokenzier.vocab['[UNK]']]
new_word_piece_vocab[token] = len(new_word_piece_vocab)
self.tokenzier._reinit_on_new_vocab(new_word_piece_vocab)
self.encoder.embeddings.word_embeddings = embed

word_to_wordpieces = []
word_pieces_lengths = []
for word, index in vocab:
@@ -386,12 +808,11 @@ class _WordBertModel(nn.Module):
word_pieces = self.tokenzier.convert_tokens_to_ids(word_pieces)
word_to_wordpieces.append(word_pieces)
word_pieces_lengths.append(len(word_pieces))
self._cls_index = len(vocab)
self._sep_index = len(vocab) + 1
print("Found(Or seg into word pieces) {} words out of {}.".format(found_count, len(vocab)))
self._cls_index = self.tokenzier.vocab['[CLS]']
self._sep_index = self.tokenzier.vocab['[SEP]']
self._pad_index = vocab.padding_idx
self._wordpiece_pad_index = self.tokenzier.convert_tokens_to_ids(['[PAD]'])[0] # needed when generating word_pieces
word_to_wordpieces.append(self.tokenzier.convert_tokens_to_ids(['[CLS]']))
word_to_wordpieces.append(self.tokenzier.convert_tokens_to_ids(['[SEP]']))
self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]'] # needed when generating word_pieces
self.word_to_wordpieces = np.array(word_to_wordpieces)
self.word_pieces_lengths = nn.Parameter(torch.LongTensor(word_pieces_lengths), requires_grad=False)
print("Successfully generated word pieces.")
@@ -410,7 +831,8 @@ class _WordBertModel(nn.Module):
# +2 because [CLS] and [SEP] need to be added
word_pieces = words.new_full((batch_size, max_word_piece_length+2), fill_value=self._wordpiece_pad_index)
word_pieces[:, 0].fill_(self._cls_index)
word_pieces[:, word_pieces_lengths+1] = self._sep_index
batch_indexes = torch.arange(batch_size).to(words)
word_pieces[batch_indexes, word_pieces_lengths+1] = self._sep_index
attn_masks = torch.zeros_like(word_pieces)
# 1. get the word_piece ids for words and the corresponding span ranges
word_indexes = words.tolist()
@@ -458,8 +880,8 @@ class _WordBertModel(nn.Module):
start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1]
outputs[l_index, i, j+s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2)
if self.include_cls_sep:
outputs[:, :, 0] = output_layer[:, 0]
outputs[:, :, seq_len+s_shift] = output_layer[:, seq_len+s_shift]
outputs[l_index, :, 0] = output_layer[:, 0]
outputs[l_index, batch_indexes, seq_len+s_shift] = output_layer[batch_indexes, seq_len+s_shift]
# 3. the final embedding result
return outputs

@@ -469,7 +891,7 @@ class _WordPieceBertModel(nn.Module):
This module computes results directly at the word_piece level.

"""
def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1'):
def __init__(self, model_dir:str, layers:str='-1'):
super().__init__()

self.tokenzier = BertTokenizer.from_pretrained(model_dir)
@@ -485,44 +907,34 @@ class _WordPieceBertModel(nn.Module):
assert layer<encoder_layer_number, f"The layer index:{layer} is out of scope for " \
f"a bert model with {encoder_layer_number} layers."

# compute the word pieces of every word in the vocab; [CLS] and [SEP] need extra handling
print("Start to generate word pieces for words.")
self.word_to_wordpieces = []
self.word_pieces_length = []
for word, index in vocab:
if index == vocab.padding_idx: # pad is a special symbol
word = '[PAD]'
elif index == vocab.unknown_idx:
word = '[UNK]'
word_pieces = self.tokenzier.wordpiece_tokenizer.tokenize(word)
word_pieces = self.tokenzier.convert_tokens_to_ids(word_pieces)
self.word_to_wordpieces.append(word_pieces)
self.word_pieces_length.append(len(word_pieces))
self._cls_index = len(vocab)
self._sep_index = len(vocab) + 1
self._pad_index = vocab.padding_idx
self._wordpiece_pad_index = self.tokenzier.convert_tokens_to_ids(['[PAD]'])[0] # needed when generating word_pieces
self.word_to_wordpieces.append(self.tokenzier.convert_tokens_to_ids(['[CLS]']))
self.word_to_wordpieces.append(self.tokenzier.convert_tokens_to_ids(['[SEP]']))
self.word_to_wordpieces = np.array(self.word_to_wordpieces, dtype=int)
print("Successfully generated word pieces.")
self._cls_index = self.tokenzier.vocab['[CLS]']
self._sep_index = self.tokenzier.vocab['[SEP]']
self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]'] # needed when generating word_pieces

def index_dataset(self, *datasets):
def index_dataset(self, *datasets, field_name):
"""
Use BERT's tokenizer to add the word_pieces and word_pieces_seq_len columns to the datasets and set them as input. The added word_pieces
already contain [CLS] and [SEP], and the pad value of the word_pieces column is set to BERT's pad value.
Use BERT's tokenizer to generate a new word_pieces column, add it to the datasets and set it as input. If the first and last tokens are not
[CLS] and [SEP], they are added at the beginning and the end, and the pad value of the word_pieces column is set to BERT's pad value.

:param datasets: DataSet objects
:param field_name: which column to index on
:return:
"""
def convert_words_to_word_pieces(words):
word_pieces = list(chain(*self.word_to_wordpieces[words].tolist()))
word_pieces = [self._cls_index] + word_pieces + [self._sep_index]
word_pieces = []
for word in words:
tokens = self.tokenzier.wordpiece_tokenizer.tokenize(word)
word_piece_ids = self.tokenzier.convert_tokens_to_ids(tokens)
word_pieces.extend(word_piece_ids)
if word_pieces[0]!=self._cls_index:
word_pieces.insert(0, self._cls_index)
if word_pieces[-1]!=self._sep_index:
word_pieces.insert(-1, self._sep_index)
return word_pieces

for index, dataset in enumerate(datasets):
try:
dataset.apply_field(convert_words_to_word_pieces, field_name='words', new_field_name='word_pieces',
dataset.apply_field(convert_words_to_word_pieces, field_name=field_name, new_field_name='word_pieces',
is_input=True)
dataset.set_pad_val('word_pieces', self._wordpiece_pad_index)
except Exception as e:
@@ -538,7 +950,7 @@ class _WordPieceBertModel(nn.Module):
"""
batch_size, max_len = word_pieces.size()

attn_masks = word_pieces.ne(self._pad_index)
attn_masks = word_pieces.ne(self._wordpiece_pad_index)
bert_outputs, _ = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks,
output_all_encoded_layers=True)
# output_layers = [self.layers] # len(self.layers) x batch_size x max_word_piece_length x hidden_size
@@ -547,79 +959,3 @@ class _WordPieceBertModel(nn.Module):
outputs[l_index] = bert_outputs[l]
return outputs

class BertWordPieceEncoder(nn.Module):
"""
A BERT encoder that works with a given vocabulary. Pass in the vocab, then call the index_datasets method to generate word-piece representations for the vocabulary.

:param vocab: Vocabulary.
:param model_dir_or_name:
:param layers:
:param requires_grad:
"""
def __init__(self, vocab:Vocabulary, model_dir_or_name:str='en-base', layers:str='-1',
requires_grad:bool=False):
super().__init__()
PRETRAIN_URL = _get_base_url('bert')
# TODO: change this
PRETRAINED_BERT_MODEL_DIR = {'en-base': 'bert_en-80f95ea7.tar.gz',
'cn': 'elmo_cn.zip'}

if model_dir_or_name in PRETRAINED_BERT_MODEL_DIR:
model_name = PRETRAINED_BERT_MODEL_DIR[model_dir_or_name]
model_url = PRETRAIN_URL + model_name
model_dir = cached_path(model_url)
# check whether it exists locally
elif os.path.isdir(model_dir_or_name):
model_dir = model_dir_or_name
else:
raise ValueError(f"Cannot recognize {model_dir_or_name}.")

self.model = _WordPieceBertModel(model_dir=model_dir, vocab=vocab, layers=layers)
self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size
self.requires_grad = requires_grad

@property
def requires_grad(self):
"""
Whether the parameters of the Embedding may be optimized. True: all parameters are optimized; False: no parameters are optimized; None: some are and some are not
:return:
"""
requires_grads = set([param.requires_grad for name, param in self.named_parameters()])
if len(requires_grads)==1:
return requires_grads.pop()
else:
return None

@requires_grad.setter
def requires_grad(self, value):
for name, param in self.named_parameters():
param.requires_grad = value

@property
def embed_size(self):
return self._embed_size

def index_datasets(self, *datasets):
"""
Index the datasets into word pieces.

Example::

:param datasets:
:return:
"""
self.model.index_dataset(*datasets)

def forward(self, words, token_type_ids=None):
"""
Compute the BERT embedding of words. Before computation, [CLS] is added at the beginning and [SEP] at the end of each sentence, and include_cls_sep decides whether
these two representations are removed afterwards.

:param words: batch_size x max_len
:param token_type_ids: batch_size x max_len, used to distinguish the first sentence from the second
:return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers))
"""
outputs = self.model(words, token_type_ids)
outputs = torch.cat([*outputs], dim=-1)

return outputs

+ 16
- 2
fastNLP/modules/encoder/_elmo.py View File

@@ -16,6 +16,7 @@ import json

from ..utils import get_dropout_mask
import codecs
from torch import autograd

class LstmCellWithProjection(torch.nn.Module):
"""
@@ -429,6 +430,8 @@ class LstmTokenEmbedder(nn.Module):
def forward(self, words, chars):
embs = []
if self.word_emb_layer is not None:
if hasattr(self, 'words_to_words'):
words = self.words_to_words[words]
word_emb = self.word_emb_layer(words)
embs.append(word_emb)

@@ -486,6 +489,8 @@ class ConvTokenEmbedder(nn.Module):
def forward(self, words, chars):
embs = []
if self.word_emb_layer is not None:
if hasattr(self, 'words_to_words'):
words = self.words_to_words[words]
word_emb = self.word_emb_layer(words)
embs.append(word_emb)

@@ -703,7 +708,12 @@ class _ElmoModel(nn.Module):
self.token_embedder = LstmTokenEmbedder(
config, word_emb_layer, char_emb_layer)
self.token_embedder.load_state_dict(token_embedder_states, strict=False)

if config['token_embedder']['word_dim'] > 0 and vocab._no_create_word_length > 0: # a mapping is needed so that indices that only appear in dev/test point to unk
words_to_words = nn.Parameter(torch.arange(len(vocab)+2).long(), requires_grad=False)
for word, idx in vocab:
if vocab._is_word_no_create_entry(word):
words_to_words[idx] = vocab.unknown_idx
setattr(self.token_embedder, 'words_to_words', words_to_words)
self.output_dim = config['encoder']['projection_dim']

if config['encoder']['name'].lower() == 'elmo':
@@ -760,7 +770,11 @@ class _ElmoModel(nn.Module):
token_embedding = self.token_embedder(expanded_words, chars)
if self.config['encoder']['name'] == 'elmo':
encoder_output = self.encoder(token_embedding, seq_len)
sz = encoder_output.size()
if encoder_output.size(2) < max_len+2:
dummy_tensor = encoder_output.new_zeros(encoder_output.size(0), batch_size,
max_len + 2 - encoder_output.size(2), encoder_output.size(-1))
encoder_output = torch.cat([encoder_output, dummy_tensor], 2)
sz = encoder_output.size() # 2, batch_size, max_len, hidden_size
token_embedding = torch.cat([token_embedding, token_embedding], dim=2).view(1, sz[1], sz[2], sz[3])
encoder_output = torch.cat([token_embedding, encoder_output], dim=0)
elif self.config['encoder']['name'] == 'lstm':


+ 86
- 372
fastNLP/modules/encoder/bert.py View File

@@ -1,378 +1,92 @@
"""
bert.py is modified from huggingface/pytorch-pretrained-BERT, which is licensed under the Apache License 2.0.

"""
import copy
import json
import math
import os

import torch
from torch import nn
import torch
from ...io.file_utils import _get_base_url, cached_path
from ._bert import _WordPieceBertModel, BertModel

CONFIG_FILE = 'bert_config.json'
MODEL_WEIGHTS = 'pytorch_model.bin'


def gelu(x):
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def swish(x):
return x * torch.sigmoid(x)


ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}


class BertLayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-12):
super(BertLayerNorm, self).__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.bias = nn.Parameter(torch.zeros(hidden_size))
self.variance_epsilon = eps

def forward(self, x):
u = x.mean(-1, keepdim=True)
s = (x - u).pow(2).mean(-1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
return self.weight * x + self.bias


class BertEmbeddings(nn.Module):
def __init__(self, vocab_size, hidden_size, max_position_embeddings, type_vocab_size, hidden_dropout_prob):
super(BertEmbeddings, self).__init__()
self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)

# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12)
self.dropout = nn.Dropout(hidden_dropout_prob)

def forward(self, input_ids, token_type_ids=None):
seq_length = input_ids.size(1)
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)

words_embeddings = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)

embeddings = words_embeddings + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings


class BertSelfAttention(nn.Module):
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob):
super(BertSelfAttention, self).__init__()
if hidden_size % num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (hidden_size, num_attention_heads))
self.num_attention_heads = num_attention_heads
self.attention_head_size = int(hidden_size / num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size

self.query = nn.Linear(hidden_size, self.all_head_size)
self.key = nn.Linear(hidden_size, self.all_head_size)
self.value = nn.Linear(hidden_size, self.all_head_size)

self.dropout = nn.Dropout(attention_probs_dropout_prob)

def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)

def forward(self, hidden_states, attention_mask):
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)

query_layer = self.transpose_for_scores(mixed_query_layer)
key_layer = self.transpose_for_scores(mixed_key_layer)
value_layer = self.transpose_for_scores(mixed_value_layer)

# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
attention_scores = attention_scores + attention_mask

# Normalize the attention scores to probabilities.
attention_probs = nn.Softmax(dim=-1)(attention_scores)

# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)

context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
return context_layer


class BertSelfOutput(nn.Module):
def __init__(self, hidden_size, hidden_dropout_prob):
super(BertSelfOutput, self).__init__()
self.dense = nn.Linear(hidden_size, hidden_size)
self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12)
self.dropout = nn.Dropout(hidden_dropout_prob)

def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states


class BertAttention(nn.Module):
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob):
super(BertAttention, self).__init__()
self.self = BertSelfAttention(hidden_size, num_attention_heads, attention_probs_dropout_prob)
self.output = BertSelfOutput(hidden_size, hidden_dropout_prob)

def forward(self, input_tensor, attention_mask):
self_output = self.self(input_tensor, attention_mask)
attention_output = self.output(self_output, input_tensor)
return attention_output


class BertIntermediate(nn.Module):
def __init__(self, hidden_size, intermediate_size, hidden_act):
super(BertIntermediate, self).__init__()
self.dense = nn.Linear(hidden_size, intermediate_size)
self.intermediate_act_fn = ACT2FN[hidden_act] \
if isinstance(hidden_act, str) else hidden_act

def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states


class BertOutput(nn.Module):
def __init__(self, hidden_size, intermediate_size, hidden_dropout_prob):
super(BertOutput, self).__init__()
self.dense = nn.Linear(intermediate_size, hidden_size)
self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12)
self.dropout = nn.Dropout(hidden_dropout_prob)

def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states


class BertLayer(nn.Module):
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob,
intermediate_size, hidden_act):
super(BertLayer, self).__init__()
self.attention = BertAttention(hidden_size, num_attention_heads, attention_probs_dropout_prob,
hidden_dropout_prob)
self.intermediate = BertIntermediate(hidden_size, intermediate_size, hidden_act)
self.output = BertOutput(hidden_size, intermediate_size, hidden_dropout_prob)

def forward(self, hidden_states, attention_mask):
attention_output = self.attention(hidden_states, attention_mask)
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output


class BertEncoder(nn.Module):
def __init__(self, num_hidden_layers, hidden_size, num_attention_heads, attention_probs_dropout_prob,
hidden_dropout_prob,
intermediate_size, hidden_act):
super(BertEncoder, self).__init__()
layer = BertLayer(hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob,
intermediate_size, hidden_act)
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(num_hidden_layers)])

def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
all_encoder_layers = []
for layer_module in self.layer:
hidden_states = layer_module(hidden_states, attention_mask)
if output_all_encoded_layers:
all_encoder_layers.append(hidden_states)
if not output_all_encoded_layers:
all_encoder_layers.append(hidden_states)
return all_encoder_layers


class BertPooler(nn.Module):
def __init__(self, hidden_size):
super(BertPooler, self).__init__()
self.dense = nn.Linear(hidden_size, hidden_size)
self.activation = nn.Tanh()

def forward(self, hidden_states):
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output


class BertModel(nn.Module):
"""BERT(Bidirectional Embedding Representations from Transformers).

If you want to use pretrained weights, download them from the following URLs.
sources::

'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz",
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz",


Build a BERT model from pretrained weights::

model = BertModel.from_pretrained("path/to/weights/directory")

Build a BERT model with randomly initialized weights::

model = BertModel()

:param int vocab_size: vocabulary size, default 30522, which is the vocabulary size of the English uncased BERT
:param int hidden_size: hidden size, default 768 (the BERT base configuration)
:param int num_hidden_layers: number of hidden layers, default 12 (BERT base)
:param int num_attention_heads: number of attention heads, default 12 (BERT base)
:param int intermediate_size: hidden size of the FFN, default 3072 (BERT base)
:param str hidden_act: activation function of the FFN, default ``gelu``
:param float hidden_dropout_prob: dropout of the FFN hidden layer, default 0.1
:param float attention_probs_dropout_prob: dropout of the attention layer, default 0.1
:param int max_position_embeddings: maximum sequence length, default 512
:param int type_vocab_size: maximum number of segments, default 2
:param float initializer_range: standard deviation used to initialize the weights, default 0.02
class BertWordPieceEncoder(nn.Module):
"""
Load a BERT model; after loading, call the index_datasets method to generate the word_pieces column in the dataset.

def __init__(self, vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02):
super(BertModel, self).__init__()
self.hidden_size = hidden_size
self.embeddings = BertEmbeddings(vocab_size, hidden_size, max_position_embeddings,
type_vocab_size, hidden_dropout_prob)
self.encoder = BertEncoder(num_hidden_layers, hidden_size, num_attention_heads,
attention_probs_dropout_prob, hidden_dropout_prob, intermediate_size,
hidden_act)
self.pooler = BertPooler(hidden_size)
self.initializer_range = initializer_range

self.apply(self.init_bert_weights)

def init_bert_weights(self, module):
if isinstance(module, (nn.Linear, nn.Embedding)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.initializer_range)
elif isinstance(module, BertLayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()

def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True):
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)

# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

embedding_output = self.embeddings(input_ids, token_type_ids)
encoded_layers = self.encoder(embedding_output,
extended_attention_mask,
output_all_encoded_layers=output_all_encoded_layers)
sequence_output = encoded_layers[-1]
pooled_output = self.pooler(sequence_output)
if not output_all_encoded_layers:
encoded_layers = encoded_layers[-1]
return encoded_layers, pooled_output

@classmethod
def from_pretrained(cls, pretrained_model_dir, state_dict=None, *inputs, **kwargs):
# Load config
config_file = os.path.join(pretrained_model_dir, CONFIG_FILE)
config = json.load(open(config_file, "r"))
# config = BertConfig.from_json_file(config_file)
# logger.info("Model config {}".format(config))
# Instantiate model.
model = cls(*inputs, **config, **kwargs)
if state_dict is None:
weights_path = os.path.join(pretrained_model_dir, MODEL_WEIGHTS)
state_dict = torch.load(weights_path)

old_keys = []
new_keys = []
for key in state_dict.keys():
new_key = None
if 'gamma' in key:
new_key = key.replace('gamma', 'weight')
if 'beta' in key:
new_key = key.replace('beta', 'bias')
if new_key:
old_keys.append(key)
new_keys.append(new_key)
for old_key, new_key in zip(old_keys, new_keys):
state_dict[new_key] = state_dict.pop(old_key)

missing_keys = []
unexpected_keys = []
error_msgs = []
# copy state_dict so _load_from_state_dict can modify it
metadata = getattr(state_dict, '_metadata', None)
state_dict = state_dict.copy()
if metadata is not None:
state_dict._metadata = metadata

def load(module, prefix=''):
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
module._load_from_state_dict(
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
for name, child in module._modules.items():
if child is not None:
load(child, prefix + name + '.')

load(model, prefix='' if hasattr(model, 'bert') else 'bert.')
if len(missing_keys) > 0:
print("Weights of {} not initialized from pretrained model: {}".format(
model.__class__.__name__, missing_keys))
if len(unexpected_keys) > 0:
print("Weights from pretrained model not used in {}: {}".format(
model.__class__.__name__, unexpected_keys))
return model
:param fastNLP.Vocabulary vocab: the vocabulary
:param str model_dir_or_name: directory containing the model, or the model name. Default is ``en-base-uncased``
:param str layers: which layers form the final representation. Layer indices separated by ',', negative indices count from the end
:param bool requires_grad: whether gradients are required.
"""
def __init__(self, model_dir_or_name:str='en-base-uncased', layers:str='-1',
requires_grad:bool=False):
super().__init__()
PRETRAIN_URL = _get_base_url('bert')
PRETRAINED_BERT_MODEL_DIR = {'en': 'bert-base-cased-f89bfe08.zip',
'en-base-uncased': 'bert-base-uncased-3413b23c.zip',
'en-base-cased': 'bert-base-cased-f89bfe08.zip',
'en-large-uncased': 'bert-large-uncased-20939f45.zip',
'en-large-cased': 'bert-large-cased-e0cf90fc.zip',

'cn': 'bert-base-chinese-29d0a84a.zip',
'cn-base': 'bert-base-chinese-29d0a84a.zip',

'multilingual': 'bert-base-multilingual-cased-1bd364ee.zip',
'multilingual-base-uncased': 'bert-base-multilingual-uncased-f8730fe4.zip',
'multilingual-base-cased': 'bert-base-multilingual-cased-1bd364ee.zip',
}

if model_dir_or_name in PRETRAINED_BERT_MODEL_DIR:
model_name = PRETRAINED_BERT_MODEL_DIR[model_dir_or_name]
model_url = PRETRAIN_URL + model_name
model_dir = cached_path(model_url)
# check whether it exists locally
elif os.path.isdir(model_dir_or_name):
model_dir = model_dir_or_name
else:
raise ValueError(f"Cannot recognize {model_dir_or_name}.")

self.model = _WordPieceBertModel(model_dir=model_dir, layers=layers)
self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size
self.requires_grad = requires_grad

@property
def requires_grad(self):
"""
Whether the parameters of the Embedding may be optimized. True: all parameters are optimized; False: no parameters are optimized; None: some are and some are not
:return:
"""
requires_grads = set([param.requires_grad for name, param in self.named_parameters()])
if len(requires_grads)==1:
return requires_grads.pop()
else:
return None

@requires_grad.setter
def requires_grad(self, value):
for name, param in self.named_parameters():
param.requires_grad = value

@property
def embed_size(self):
return self._embed_size

def index_datasets(self, *datasets, field_name):
"""
Use BERT's tokenizer to generate a new word_pieces column, add it to the datasets and set it as input. If the first and last tokens are not
[CLS] and [SEP], they are added at the beginning and the end, and the pad value of the word_pieces column is set to BERT's pad value.

:param datasets: DataSet objects
:param field_name: which column is used to generate the word_pieces column. Each entry in that column should be a List[str].
:return:
"""
self.model.index_dataset(*datasets, field_name=field_name)

def forward(self, word_pieces, token_type_ids=None):
"""
Compute the BERT embedding of words. The words passed in should already contain the [CLS] and [SEP] tags.

:param words: batch_size x max_len
:param token_type_ids: batch_size x max_len, used to distinguish the first sentence from the second
:return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers))
"""
outputs = self.model(word_pieces, token_type_ids)
outputs = torch.cat([*outputs], dim=-1)

return outputs
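
Putting the refactored encoder together, a hedged usage sketch (the toy DataSet and its 'words' column are invented, and constructing the encoder by name assumes the pretrained archive can be downloaded)::

    from fastNLP import DataSet
    from fastNLP.modules.encoder.bert import BertWordPieceEncoder

    ds = DataSet({'words': [['this', 'is', 'fine'], ['another', 'sentence', 'here']]})
    encoder = BertWordPieceEncoder(model_dir_or_name='en-base-uncased', layers='-1')
    encoder.index_datasets(ds, field_name='words')  # adds an input field 'word_pieces' with [CLS]/[SEP]
    # inside a model's forward pass:
    # reps = encoder(word_pieces)  # batch_size x max_len x (768 * number of selected layers)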

+ 205
- 86
fastNLP/modules/encoder/embedding.py View File

@@ -1,13 +1,19 @@
__all__ = [
"Embedding"
"Embedding",
"StaticEmbedding",
"ElmoEmbedding",
"BertEmbedding",
"StackEmbedding",
"LSTMCharEmbedding",
"CNNCharEmbedding",
]
import torch.nn as nn
from ..utils import get_embeddings
from .lstm import LSTM
from ... import Vocabulary
from ...core.vocabulary import Vocabulary
from abc import abstractmethod
import torch
from ...io import EmbedLoader
import numpy as np
import torch.nn.functional as F
import os
from ._elmo import _ElmoModel
@@ -15,8 +21,12 @@ from ...io.file_utils import cached_path, _get_base_url
from ._bert import _WordBertModel
from typing import List

from ... import DataSet, Batch, SequentialSampler
import warnings
from ...core.dataset import DataSet
from ...core.batch import DataSetIter
from ...core.sampler import SequentialSampler
from ...core.utils import _move_model_to_device, _get_model_device
from ...io.file_utils import PRETRAINED_BERT_MODEL_DIR, PRETRAINED_ELMO_MODEL_DIR, PRETRAIN_STATIC_FILES


class Embedding(nn.Module):
@@ -25,13 +35,15 @@ class Embedding(nn.Module):

Embedding component. The vocabulary size is available as self.num_embeddings and the embedding dimension as self.embedding_dim"""
def __init__(self, init_embed, dropout=0.0):
def __init__(self, init_embed, dropout=0.0, dropout_word=0, unk_index=None):
"""

:param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: the size of the Embedding (pass a tuple(int, int),
where the first int is vocab_size and the second int is embed_dim); if a Tensor, Embedding or ndarray is given, it is used directly to initialize the Embedding;
a TokenEmbedding object may also be passed in
:param float dropout: dropout applied to the output of the Embedding.
:param float dropout_word: randomly replace words with the unk index at this rate, so that the unk token receives enough training
:param int unk_index: the index substituted when a word is dropped; not required if init_embed is a TokenEmbedding.
"""
super(Embedding, self).__init__()

@@ -40,17 +52,36 @@ class Embedding(nn.Module):
self.dropout = nn.Dropout(dropout)
if not isinstance(self.embed, TokenEmbedding):
self._embed_size = self.embed.weight.size(1)
if dropout_word>0 and not isinstance(unk_index, int):
raise ValueError("When drop word is set, you need to pass in the unk_index.")
else:
self._embed_size = self.embed.embed_size
unk_index = self.embed.get_word_vocab().unknown_idx
self.unk_index = unk_index
self.dropout_word = dropout_word

def forward(self, x):
"""
:param torch.LongTensor x: [batch, seq_len]
:return: torch.Tensor : [batch, seq_len, embed_dim]
"""
if self.dropout_word>0 and self.training:
mask = torch.ones_like(x).float() * self.dropout_word
mask = torch.bernoulli(mask).byte() # the larger dropout_word is, the more positions become 1
x = x.masked_fill(mask, self.unk_index)
x = self.embed(x)
return self.dropout(x)

@property
def num_embedding(self)->int:
if isinstance(self.embed, nn.Embedding):
return self.embed.weight.size(0)
else:
return self.embed.num_embedding

def __len__(self):
return len(self.embed)

@property
def embed_size(self) -> int:
return self._embed_size
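
The new word-level dropout can be exercised directly; a small sketch with arbitrary sizes (a 100-word vocabulary, 50-dimensional vectors, index 1 standing in for the unknown token)::

    import torch
    from fastNLP.modules.encoder.embedding import Embedding

    embed = Embedding((100, 50), dropout=0.1, dropout_word=0.05, unk_index=1)
    x = torch.randint(0, 100, (4, 7)).long()  # batch_size x seq_len
    out = embed(x)  # 4 x 7 x 50; in training mode roughly 5% of the ids are replaced by unk_index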
@@ -80,7 +111,7 @@ class Embedding(nn.Module):
@property
def size(self):
if isinstance(self.embed, TokenEmbedding):
return torch.Size(self.embed._word_vocab, self.embed.embed_size)
return self.embed.size
else:
return self.embed.weight.size()

@@ -109,14 +140,17 @@ class TokenEmbedding(nn.Module):
for param in self.parameters():
param.requires_grad = value

@abstractmethod
def get_original_vocab(self):
pass
def __len__(self):
return len(self._word_vocab)

@property
def embed_size(self) -> int:
return self._embed_size

@property
def num_embedding(self) -> int:
return len(self._word_vocab)

def get_word_vocab(self):
"""
Return the vocabulary of the embedding.
@@ -127,7 +161,7 @@ class TokenEmbedding(nn.Module):

@property
def size(self):
return torch.Size(self.embed._word_vocab, self._embed_size)
return torch.Size(self.num_embedding, self._embed_size)


class StaticEmbedding(TokenEmbedding):
@@ -138,51 +172,142 @@ class StaticEmbedding(TokenEmbedding):

Example::

>>> embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50')


:param vocab: Vocabulary. If None, all embeddings are loaded.
:param model_dir_or_name: a pretrained static embedding can be specified in two ways: either pass the path of the embedding
file, or pass the embedding name. Currently supported names include {`en` or `en-glove-840b-300` : glove.840B.300d, `en-glove-6b-50` : glove.6B.50d,
`en-word2vec-300` : GoogleNews-vectors-negative300}. In the second case the cache is checked first and the model is downloaded automatically if missing.
:param requires_grad: whether gradients are required

:param requires_grad: whether gradients are required. Default is True
:param init_method: how to initialize values that are not found. Any method from torch.nn.init.* can be used; it is called with a tensor object.
:param normalize: whether to normalize each vector so that its norm is 1.
"""
def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', requires_grad: bool=False):
def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', requires_grad: bool=True, init_method=None,
normalize=False):
super(StaticEmbedding, self).__init__(vocab)

# 优先定义需要下载的static embedding有哪些。这里估计需要自己搞一个server,
PRETRAIN_URL = _get_base_url('static')
PRETRAIN_STATIC_FILES = {
'en': 'glove.840B.300d-cc1ad5e1.tar.gz',
'en-glove-840b-300': 'glove.840B.300d-cc1ad5e1.tar.gz',
'en-glove-6b-50': "glove.6B.50d-a6028c70.tar.gz",
'en-word2vec-300': "GoogleNews-vectors-negative300-be166d9d.tar.gz",
'en-fasttext': "cc.en.300.vec-d53187b2.gz",
'cn': "tencent_cn-dab24577.tar.gz",
'cn-fasttext': "cc.zh.300.vec-d68a9bcf.gz",
}

# 得到cache_path
if model_dir_or_name.lower() in PRETRAIN_STATIC_FILES:
PRETRAIN_URL = _get_base_url('static')
model_name = PRETRAIN_STATIC_FILES[model_dir_or_name]
model_url = PRETRAIN_URL + model_name
model_path = cached_path(model_url)
# 检查是否存在
elif os.path.isfile(model_dir_or_name):
elif os.path.isfile(os.path.expanduser(os.path.abspath(model_dir_or_name))):
model_path = model_dir_or_name
else:
raise ValueError(f"Cannot recognize {model_dir_or_name}.")

# 读取embedding
embedding = EmbedLoader.load_with_vocab(model_path, vocab=vocab)
embedding = torch.tensor(embedding)
embedding, hit_flags = self._load_with_vocab(model_path, vocab=vocab, init_method=init_method,
normalize=normalize)
self.embedding = nn.Embedding(num_embeddings=embedding.shape[0], embedding_dim=embedding.shape[1],
padding_idx=vocab.padding_idx,
max_norm=None, norm_type=2, scale_grad_by_freq=False,
sparse=False, _weight=embedding)
if vocab._no_create_word_length > 0: # 需要映射,使得来自于dev, test的idx指向unk
words_to_words = nn.Parameter(torch.arange(len(vocab)).long(), requires_grad=False)
for word, idx in vocab:
if vocab._is_word_no_create_entry(word) and not hit_flags[idx]:
words_to_words[idx] = vocab.unknown_idx
self.words_to_words = words_to_words
self._embed_size = self.embedding.weight.size(1)
self.requires_grad = requires_grad

@property
def requires_grad(self):
"""
Whether the parameters of this Embedding may be optimized. True: all parameters are optimized; False: no parameter is optimized; None: some are and some are not.
:return:
"""
requires_grads = set([param.requires_grad for name, param in self.named_parameters()
if 'words_to_words' not in name])
if len(requires_grads) == 1:
return requires_grads.pop()
else:
return None

@requires_grad.setter
def requires_grad(self, value):
for name, param in self.named_parameters():
if 'words_to_words' in name:
continue
param.requires_grad = value

def _load_with_vocab(self, embed_filepath, vocab, dtype=np.float32, padding='<pad>', unknown='<unk>',
normalize=True, error='ignore', init_method=None):
"""
Extract from the pretrained vectors in embed_filepath the embeddings of the words in vocab. Whether embed_filepath is in
word2vec format (the first line has only two elements) or glove format is detected automatically.

:param str embed_filepath: path of the pretrained embedding file.
:param vocab: the vocabulary, of type :class:`~fastNLP.Vocabulary`; only embeddings of words that appear in vocab are read.
Words not found in the file are sampled from a normal distribution fitted to the found embeddings, so the whole Embedding shares one distribution.
:param dtype: dtype of the loaded embeddings
:param str padding: the padding token of the vocabulary
:param str unknown: the unknown token of the vocabulary
:param bool normalize: whether to normalize each vector to norm 1
:param str error: `ignore` or `strict`; with `ignore` errors are skipped, with `strict` they are raised.
Typical errors are empty lines in the file or vectors with inconsistent dimensions.
:param init_method: how to initialize words not found in the file. Any method from torch.nn.init.* can be used. Defaults to torch.nn.init.zeros_
:return torch.tensor: shape [len(vocab), dimension], where dimension is determined by the pretrained embedding.
"""
assert isinstance(vocab, Vocabulary), "Only fastNLP.Vocabulary is supported."
if not os.path.exists(embed_filepath):
raise FileNotFoundError("`{}` does not exist.".format(embed_filepath))
with open(embed_filepath, 'r', encoding='utf-8') as f:
line = f.readline().strip()
parts = line.split()
start_idx = 0
if len(parts) == 2:
dim = int(parts[1])
start_idx += 1
else:
dim = len(parts) - 1
f.seek(0)
matrix = torch.zeros(len(vocab), dim)
if init_method is not None:
init_method(matrix)
hit_flags = np.zeros(len(vocab), dtype=bool)
for idx, line in enumerate(f, start_idx):
try:
parts = line.strip().split()
word = ''.join(parts[:-dim])
nums = parts[-dim:]
# 对齐unk与pad
if word == padding and vocab.padding is not None:
word = vocab.padding
elif word == unknown and vocab.unknown is not None:
word = vocab.unknown
if word in vocab:
index = vocab.to_index(word)
matrix[index] = torch.from_numpy(np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim))
hit_flags[index] = True
except Exception as e:
if error == 'ignore':
warnings.warn("Error occurred at line {}.".format(idx))
else:
print("Error occurred at line {}.".format(idx))
raise e
found_count = sum(hit_flags)
print("Found {} out of {} words in the pre-trained embedding.".format(found_count, len(vocab)))
if init_method is None:
if len(vocab)-found_count>0 and found_count>0: # 有的没找到
found_vecs = matrix[torch.LongTensor(hit_flags.astype(int)).byte()]
mean = found_vecs.mean(dim=0, keepdim=True)
std = found_vecs.std(dim=0, keepdim=True)
unfound_vec_num = np.sum(hit_flags==False)
unfound_vecs = torch.randn(unfound_vec_num, dim)*std + mean
matrix[torch.LongTensor(hit_flags.astype(int)).eq(0)] = unfound_vecs

if normalize:
matrix /= (torch.norm(matrix, dim=1, keepdim=True) + 1e-12)

return matrix, hit_flags
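A usage sketch for the loading logic above ('my_vectors.txt' is a hypothetical glove- or word2vec-format file): words missing from the file are either filled by init_method or sampled from the mean and std of the found vectors, and no-create-entry words that were not found are redirected to unk at lookup time through words_to_words.

import torch
from fastNLP import Vocabulary
from fastNLP.modules.encoder.embedding import StaticEmbedding

vocab = Vocabulary()
vocab.add_word_lst("the quick brown fox".split())
embed = StaticEmbedding(vocab, model_dir_or_name='my_vectors.txt',   # hypothetical local file
                        requires_grad=True, init_method=torch.nn.init.zeros_, normalize=True)
words = torch.LongTensor([[vocab.to_index(w) for w in "the brown fox".split()]])
vectors = embed(words)   # [1, 3, dim]; with normalize=True every row has (approximately) unit norm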

def forward(self, words):
"""
传入words的index
@@ -190,6 +315,8 @@ class StaticEmbedding(TokenEmbedding):
:param words: torch.LongTensor, [batch_size, max_len]
:return: torch.FloatTensor, [batch_size, max_len, embed_size]
"""
if hasattr(self, 'words_to_words'):
words = self.words_to_words[words]
return self.embedding(words)


@@ -201,11 +328,6 @@ class ContextualEmbedding(TokenEmbedding):
"""
由于动态embedding生成比较耗时,所以可以把每句话embedding缓存下来,这样就不需要每次都运行生成过程。

Example::

>>>


:param datasets: DataSet对象
:param batch_size: int, 生成cache的sentence表示时使用的batch的大小
:param device: 参考 :class::fastNLP.Trainer 的device
@@ -228,14 +350,14 @@ class ContextualEmbedding(TokenEmbedding):
with torch.no_grad():
for index, dataset in enumerate(datasets):
try:
batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler(), prefetch=False)
batch = DataSetIter(dataset, batch_size=batch_size, sampler=SequentialSampler())
for batch_x, batch_y in batch:
words = batch_x['words'].to(device)
words_list = words.tolist()
seq_len = words.ne(pad_index).sum(dim=-1)
max_len = words.size(1)
# 因为有些情况可能包含CLS, SEP, 从后面往前计算比较安全。
seq_len_from_behind =(max_len - seq_len).tolist()
seq_len_from_behind = (max_len - seq_len).tolist()
word_embeds = self(words).detach().cpu().numpy()
for b in range(words.size(0)):
length = seq_len_from_behind[b]
@@ -297,8 +419,7 @@ class ElmoEmbedding(ContextualEmbedding):

Example::

>>>
>>>
>>> embedding = ElmoEmbedding(vocab, model_dir_or_name='en', layers='2', requires_grad=True)

:param vocab: 词表
:param model_dir_or_name: 可以有两种方式调用预训练好的ELMo embedding:第一种是传入ELMo权重的文件名,第二种是传入ELMo版本的名称,
@@ -319,17 +440,13 @@ class ElmoEmbedding(ContextualEmbedding):
self.layers = layers

# 根据model_dir_or_name检查是否存在并下载
PRETRAIN_URL = _get_base_url('elmo')
# TODO 把baidu云上的加上去
PRETRAINED_ELMO_MODEL_DIR = {'en': 'elmo_en-d39843fe.tar.gz',
'cn': 'elmo_cn-5e9b34e2.tar.gz'}

if model_dir_or_name.lower() in PRETRAINED_ELMO_MODEL_DIR:
PRETRAIN_URL = _get_base_url('elmo')
model_name = PRETRAINED_ELMO_MODEL_DIR[model_dir_or_name]
model_url = PRETRAIN_URL + model_name
model_dir = cached_path(model_url)
# 检查是否存在
elif os.path.isdir(model_dir_or_name):
elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))):
model_dir = model_dir_or_name
else:
raise ValueError(f"Cannot recognize {model_dir_or_name}.")
@@ -368,7 +485,7 @@ class ElmoEmbedding(ContextualEmbedding):
:return:
"""
requires_grads = set([param.requires_grad for name, param in self.named_parameters()
if 'words_to_chars_embedding' not in name])
if 'words_to_chars_embedding' not in name and 'words_to_words' not in name])
if len(requires_grads) == 1:
return requires_grads.pop()
else:
@@ -377,8 +494,8 @@ class ElmoEmbedding(ContextualEmbedding):
@requires_grad.setter
def requires_grad(self, value):
for name, param in self.named_parameters():
if 'words_to_chars_embedding' in name: # 这个不能加入到requires_grad中
pass
if 'words_to_chars_embedding' in name or 'words_to_words' in name: # 这个不能加入到requires_grad中
continue
param.requires_grad = value


@@ -386,18 +503,19 @@ class BertEmbedding(ContextualEmbedding):
"""
别名::class:`fastNLP.modules.BertEmbedding` :class:`fastNLP.modules.encoder.embedding.BertEmbedding`

An Embedding that encodes words with BERT.
An Embedding that encodes words with BERT. It is recommended to limit the input word sequence to 450 rather than 512, because
the pretrained BERT model is limited to 512 tokens, and since the input words have not yet been split into word pieces, the length may exceed that limit after word-piece tokenization.

Example::

>>>
>>> embedding = BertEmbedding(vocab, model_dir_or_name='en-base-uncased', requires_grad=False, layers='4,-2,-1')


:param fastNLP.Vocabulary vocab: 词表
:param str model_dir_or_name: 模型所在目录或者模型的名称。默认值为``en-base-uncased``
:param str model_dir_or_name: 模型所在目录或者模型的名称。默认值为 ``en-base-uncased``.
:param str layers:最终结果中的表示。以','隔开层数,可以以负数去索引倒数几层
:param str pool_method: 因为在bert中,每个word会被表示为多个word pieces, 当获取一个word的表示的时候,怎样从它的word pieces
中计算得到他对应的表示。支持``last``, ``first``, ``avg``, ``max``.
中计算得到它对应的表示。支持``last``, ``first``, ``avg``, ``max``。
:param bool include_cls_sep: bool,在bert计算句子的表示的时候,需要在前面加上[CLS]和[SEP], 是否在结果中保留这两个内容。 这样
会使得word embedding的结果比输入的结果长两个token。在使用 :class::StackEmbedding 可能会遇到问题。
:param bool requires_grad: 是否需要gradient。
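A small standalone illustration of the pool_method options described above (this is not the library's internal code, just a sketch of the idea): given the word-piece vectors of a single word, 'first' and 'last' pick one piece while 'avg' and 'max' reduce over all of them.

import torch

def pool_word_pieces(piece_vectors, pool_method='first'):
    # piece_vectors: [num_pieces, hidden] for one word
    if pool_method == 'first':
        return piece_vectors[0]
    elif pool_method == 'last':
        return piece_vectors[-1]
    elif pool_method == 'avg':
        return piece_vectors.mean(dim=0)
    elif pool_method == 'max':
        return piece_vectors.max(dim=0)[0]
    raise ValueError(pool_method)

pieces = torch.randn(3, 768)                 # e.g. one word split into 3 word pieces
word_vec = pool_word_pieces(pieces, 'avg')   # [768]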
@@ -405,28 +523,15 @@ class BertEmbedding(ContextualEmbedding):
def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en-base-uncased', layers: str='-1',
pool_method: str='first', include_cls_sep: bool=False, requires_grad: bool=False):
super(BertEmbedding, self).__init__(vocab)
# 根据model_dir_or_name检查是否存在并下载
PRETRAIN_URL = _get_base_url('bert')
PRETRAINED_BERT_MODEL_DIR = {'en': 'bert-base-cased-f89bfe08.zip',
'en-base-uncased': 'bert-base-uncased-3413b23c.zip',
'en-base-cased': 'bert-base-cased-f89bfe08.zip',
'en-large-uncased': 'bert-large-uncased-20939f45.zip',
'en-large-cased': 'bert-large-cased-e0cf90fc.zip',

'cn': 'bert-base-chinese-29d0a84a.zip',
'cn-base': 'bert-base-chinese-29d0a84a.zip',

'multilingual': 'bert-base-multilingual-cased-1bd364ee.zip',
'multilingual-base-uncased': 'bert-base-multilingual-uncased-f8730fe4.zip',
'multilingual-base-cased': 'bert-base-multilingual-cased-1bd364ee.zip',
}

# 根据model_dir_or_name检查是否存在并下载
if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR:
PRETRAIN_URL = _get_base_url('bert')
model_name = PRETRAINED_BERT_MODEL_DIR[model_dir_or_name]
model_url = PRETRAIN_URL + model_name
model_dir = cached_path(model_url)
# 检查是否存在
elif os.path.isdir(model_dir_or_name):
elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))):
model_dir = model_dir_or_name
else:
raise ValueError(f"Cannot recognize {model_dir_or_name}.")
@@ -445,7 +550,7 @@ class BertEmbedding(ContextualEmbedding):
计算words的bert embedding表示。计算之前会在每句话的开始增加[CLS]在结束增加[SEP], 并根据include_cls_sep判断要不要
删除这两个token的表示。

:param words: batch_size x max_len
:param torch.LongTensor words: [batch_size, max_len]
:return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers))
"""
outputs = self._get_sent_reprs(words)
@@ -473,7 +578,7 @@ class BertEmbedding(ContextualEmbedding):
def requires_grad(self, value):
for name, param in self.named_parameters():
if 'word_pieces_lengths' in name: # 这个不能加入到requires_grad中
pass
continue
param.requires_grad = value


@@ -487,7 +592,8 @@ def _construct_char_vocab_from_vocab(vocab:Vocabulary, min_freq:int=1):
"""
char_vocab = Vocabulary(min_freq=min_freq)
for word, index in vocab:
char_vocab.add_word_lst(list(word))
if not vocab._is_word_no_create_entry(word):
char_vocab.add_word_lst(list(word))
return char_vocab


@@ -495,24 +601,25 @@ class CNNCharEmbedding(TokenEmbedding):
"""
别名::class:`fastNLP.modules.CNNCharEmbedding` :class:`fastNLP.modules.encoder.embedding.CNNCharEmbedding`

Generates character embeddings with a CNN. The pipeline is CNN(x) -> activation(x) -> pool -> fc. The outputs of filters
with different kernel sizes are concatenated.
Generates character embeddings with a CNN. The pipeline is embed(x) -> Dropout(x) -> CNN(x) -> activation(x) -> pool
-> fc. The outputs of filters with different kernel sizes are concatenated.

Example::

>>>
>>> cnn_char_embed = CNNCharEmbedding(vocab)


:param vocab: 词表
:param embed_size: 该word embedding的大小,默认值为50.
:param char_emb_size: character的embed的大小。character是从vocab中生成的。默认值为50.
:param dropout: 以多大的概率drop
:param filter_nums: filter的数量. 长度需要和kernels一致。默认值为[40, 30, 20].
:param kernels: kernel的大小. 默认值为[5, 3, 1].
:param kernel_sizes: kernel的大小. 默认值为[5, 3, 1].
:param pool_method: character的表示在合成一个表示时所使用的pool方法,支持'avg', 'max'.
:param activation: CNN之后使用的激活方法,支持'relu', 'sigmoid', 'tanh' 或者自定义函数.
:param min_char_freq: character的最少出现次数。默认值为2.
"""
def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50,
def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, dropout:float=0.5,
filter_nums: List[int]=(40, 30, 20), kernel_sizes: List[int]=(5, 3, 1), pool_method: str='max',
activation='relu', min_char_freq: int=2):
super(CNNCharEmbedding, self).__init__(vocab)
@@ -521,6 +628,7 @@ class CNNCharEmbedding(TokenEmbedding):
assert kernel % 2 == 1, "Only odd kernel is allowed."

assert pool_method in ('max', 'avg')
self.dropout = nn.Dropout(dropout, inplace=True)
self.pool_method = pool_method
# activation function
if isinstance(activation, str):
@@ -544,13 +652,13 @@ class CNNCharEmbedding(TokenEmbedding):
self.char_pad_index = self.char_vocab.padding_idx
print(f"In total, there are {len(self.char_vocab)} distinct characters.")
# 对vocab进行index
self.max_word_len = max(map(lambda x: len(x[0]), vocab))
self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), self.max_word_len),
max_word_len = max(map(lambda x: len(x[0]), vocab))
self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), max_word_len),
fill_value=self.char_pad_index, dtype=torch.long),
requires_grad=False)
self.word_lengths = nn.Parameter(torch.zeros(len(vocab)).long(), requires_grad=False)
for word, index in vocab:
# if index!=vocab.padding_idx: # 如果是pad的话,直接就为pad_value了。 修改为不区分pad, 这样所有的<pad>也是同一个embed
# if index!=vocab.padding_idx: # 如果是pad的话,直接就为pad_value了。修改为不区分pad, 这样所有的<pad>也是同一个embed
self.words_to_chars_embedding[index, :len(word)] = \
torch.LongTensor([self.char_vocab.to_index(c) for c in word])
self.word_lengths[index] = len(word)
@@ -561,6 +669,7 @@ class CNNCharEmbedding(TokenEmbedding):
for i in range(len(kernel_sizes))])
self._embed_size = embed_size
self.fc = nn.Linear(sum(filter_nums), embed_size)
self.init_param()

def forward(self, words):
"""
@@ -577,7 +686,7 @@ class CNNCharEmbedding(TokenEmbedding):
# 为1的地方为mask
chars_masks = chars.eq(self.char_pad_index) # batch_size x max_len x max_word_len 如果为0, 说明是padding的位置了
chars = self.char_embedding(chars) # batch_size x max_len x max_word_len x embed_size
self.dropout(chars)
reshaped_chars = chars.reshape(batch_size*max_len, max_word_len, -1)
reshaped_chars = reshaped_chars.transpose(1, 2) # B' x E x M
conv_chars = [conv(reshaped_chars).transpose(1, 2).reshape(batch_size, max_len, max_word_len, -1)
@@ -613,30 +722,39 @@ class CNNCharEmbedding(TokenEmbedding):
def requires_grad(self, value):
for name, param in self.named_parameters():
if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能加入到requires_grad中
pass
continue
param.requires_grad = value

def init_param(self):
for name, param in self.named_parameters():
if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能reset
continue
if param.data.dim()>1:
nn.init.xavier_uniform_(param, 1)
else:
nn.init.uniform_(param, -1, 1)

class LSTMCharEmbedding(TokenEmbedding):
"""
别名::class:`fastNLP.modules.LSTMCharEmbedding` :class:`fastNLP.modules.encoder.embedding.LSTMCharEmbedding`

Encodes characters with an LSTM.
Encodes characters with an LSTM: embed(x) -> Dropout(x) -> LSTM(x) -> activation(x) -> pool

Example::

>>>
>>> lstm_char_embed = LSTMCharEmbedding(vocab)

:param vocab: 词表
:param embed_size: embedding的大小。默认值为50.
:param char_emb_size: character的embedding的大小。默认值为50.
:param dropout: 以多大概率drop
:param hidden_size: LSTM的中间hidden的大小,如果为bidirectional的,hidden会除二,默认为50.
:param pool_method: 支持'max', 'avg'
:param activation: 激活函数,支持'relu', 'sigmoid', 'tanh', 或者自定义函数.
:param min_char_freq: character的最小出现次数。默认值为2.
:param bidirectional: 是否使用双向的LSTM进行encode。默认值为True。
"""
def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, hidden_size=50,
def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, dropout:float=0.5, hidden_size=50,
pool_method: str='max', activation='relu', min_char_freq: int=2, bidirectional=True):
super(LSTMCharEmbedding, self).__init__(vocab)

@@ -644,7 +762,7 @@ class LSTMCharEmbedding(TokenEmbedding):

assert pool_method in ('max', 'avg')
self.pool_method = pool_method
self.dropout = nn.Dropout(dropout, inplace=True)
# activation function
if isinstance(activation, str):
if activation.lower() == 'relu':
@@ -701,7 +819,7 @@ class LSTMCharEmbedding(TokenEmbedding):
# 为mask的地方为1
chars_masks = chars.eq(self.char_pad_index) # batch_size x max_len x max_word_len 如果为0, 说明是padding的位置了
chars = self.char_embedding(chars) # batch_size x max_len x max_word_len x embed_size
chars = self.dropout(chars)
reshaped_chars = chars.reshape(batch_size * max_len, max_word_len, -1)
char_seq_len = chars_masks.eq(0).sum(dim=-1).reshape(batch_size * max_len)
lstm_chars = self.lstm(reshaped_chars, char_seq_len)[0].reshape(batch_size, max_len, max_word_len, -1)
@@ -739,7 +857,7 @@ class LSTMCharEmbedding(TokenEmbedding):
def requires_grad(self, value):
for name, param in self.named_parameters():
if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能加入到requires_grad中
pass
continue
param.requires_grad = value


@@ -751,7 +869,8 @@ class StackEmbedding(TokenEmbedding):

Example::

>>>
>>> embed_1 = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True)
>>> embed_2 = StaticEmbedding(vocab, model_dir_or_name='en-word2vec-300', requires_grad=True)


:param embeds: 一个由若干个TokenEmbedding组成的list,要求每一个TokenEmbedding的词表都保持一致


+ 21
- 7
fastNLP/modules/encoder/lstm.py View File

@@ -11,13 +11,15 @@ import torch.nn as nn
import torch.nn.utils.rnn as rnn

from ..utils import initial_parameter
from torch import autograd


class LSTM(nn.Module):
"""
别名::class:`fastNLP.modules.LSTM` :class:`fastNLP.modules.encoder.lstm.LSTM`

LSTM module, a light wrapper around the PyTorch LSTM
LSTM module, a light wrapper around the PyTorch LSTM. When seq_len is provided, pack_padded_sequence is used automatically; the forget-gate bias
is initialized to 1 by default; and it avoids the issues of using an LSTM inside DataParallel.

:param input_size: 输入 `x` 的特征维度
:param hidden_size: 隐状态 `h` 的特征维度.
@@ -30,23 +32,35 @@ class LSTM(nn.Module):
"""
def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0.0, batch_first=True,
bidirectional=False, bias=True, initial_method=None):
bidirectional=False, bias=True):
super(LSTM, self).__init__()
self.batch_first = batch_first
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=bias, batch_first=batch_first,
dropout=dropout, bidirectional=bidirectional)
initial_parameter(self, initial_method)
self.init_param()

def init_param(self):
for name, param in self.named_parameters():
if 'bias' in name:
# based on https://github.com/pytorch/pytorch/issues/750#issuecomment-280671871
param.data.fill_(0)
n = param.size(0)
start, end = n // 4, n // 2
param.data[start:end].fill_(1)
else:
nn.init.xavier_uniform_(param)
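The slice used above relies on how PyTorch lays out LSTM gate parameters: every bias vector is the concatenation [b_i | b_f | b_g | b_o], each block of length hidden_size, so indices n//4 to n//2 cover the forget gate. A quick check of that assumption:

import torch.nn as nn

lstm = nn.LSTM(input_size=8, hidden_size=16)
n = lstm.bias_ih_l0.size(0)                  # 4 * hidden_size = 64
forget_bias = lstm.bias_ih_l0[n // 4: n // 2]
print(forget_bias.shape)                     # torch.Size([16]) -> the forget-gate block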

def forward(self, x, seq_len=None, h0=None, c0=None):
"""

:param x: [batch, seq_len, input_size] 输入序列
:param seq_len: [batch, ] 序列长度, 若为 ``None``, 所有输入看做一样长. Default: ``None``
:param h0: [batch, hidden_size] 初始隐状态, 若为 ``None`` , 设为全1向量. Default: ``None``
:param c0: [batch, hidden_size] 初始Cell状态, 若为 ``None`` , 设为全1向量. Default: ``None``
:param h0: [batch, hidden_size] 初始隐状态, 若为 ``None`` , 设为全0向量. Default: ``None``
:param c0: [batch, hidden_size] 初始Cell状态, 若为 ``None`` , 设为全0向量. Default: ``None``
:return (output, ht) 或 output: 若 ``get_hidden=True`` [batch, seq_len, hidden_size*num_direction] 输出序列
和 [batch, hidden_size*num_direction] 最后时刻隐状态.
"""
batch_size, max_len, _ = x.size()
if h0 is not None and c0 is not None:
hx = (h0, c0)
else:
@@ -59,7 +73,7 @@ class LSTM(nn.Module):
x = x[:, sort_idx]
x = rnn.pack_padded_sequence(x, sort_lens, batch_first=self.batch_first)
output, hx = self.lstm(x, hx) # -> [N,L,C]
output, _ = rnn.pad_packed_sequence(output, batch_first=self.batch_first)
output, _ = rnn.pad_packed_sequence(output, batch_first=self.batch_first, total_length=max_len)
_, unsort_idx = torch.sort(sort_idx, dim=0, descending=False)
if self.batch_first:
output = output[unsort_idx]


+ 2
- 0
fastNLP/modules/utils.py View File

@@ -82,6 +82,8 @@ def get_embeddings(init_embed):
if isinstance(init_embed, tuple):
res = nn.Embedding(
num_embeddings=init_embed[0], embedding_dim=init_embed[1])
nn.init.uniform_(res.weight.data, a=-np.sqrt(3/res.weight.data.size(1)),
b=np.sqrt(3/res.weight.data.size(1)))
elif isinstance(init_embed, nn.Module):
res = init_embed
elif isinstance(init_embed, torch.Tensor):
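A quick numeric sanity check of the bound chosen above (a sketch; the dimension is arbitrary): Uniform(-a, a) has variance a**2 / 3, so a = sqrt(3 / dim) gives a per-component variance of 1 / dim.

import math
import torch

dim = 300
a = math.sqrt(3 / dim)
w = torch.empty(100000, dim).uniform_(-a, a)
print(w.var().item(), 1 / dim)   # both roughly 0.0033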


+ 2
- 5
reproduction/Biaffine_parser/run.py View File

@@ -184,11 +184,8 @@ def train(path):
m.weight.requires_grad = True

# Trainer
trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,
loss=ParserLoss(), metrics=ParserMetric(), metric_key='UAS',
**train_args.data,
optimizer=fastNLP.Adam(**optim_args.data),
save_path=path,
trainer = Trainer(train_data=train_data, model=model, optimizer=fastNLP.Adam(**optim_args.data), loss=ParserLoss(),
dev_data=dev_data, metrics=ParserMetric(), metric_key='UAS', save_path=path,
callbacks=[MyCallback()])

# Start training


+ 5
- 5
reproduction/POS_tagging/train_pos_tag.py View File

@@ -89,11 +89,11 @@ def train(train_data_path, dev_data_path, checkpoint=None, save=None):
model = torch.load(checkpoint)

# call trainer to train
trainer = Trainer(dataset, model, loss=None, metrics=SpanFPreRecMetric(tag_proc.vocab, pred="predict",
target="truth",
seq_lens="word_seq_origin_len"),
dev_data=dev_data, metric_key="f",
use_tqdm=True, use_cuda=True, print_every=10, n_epochs=20, save_path=save)
trainer = Trainer(dataset, model, loss=None, n_epochs=20, print_every=10, dev_data=dev_data,
metrics=SpanFPreRecMetric(tag_proc.vocab, pred="predict",
target="truth",
seq_lens="word_seq_origin_len"), metric_key="f", save_path=save,
use_tqdm=True)
trainer.train(load_best_model=True)

# save model & pipeline


+ 4
- 8
reproduction/Star_transformer/train.py View File

@@ -149,14 +149,10 @@ def train():
) if x.requires_grad and x.size(0) != len(word_v)]
optim_cfg = [{'params': model.enc.embedding.parameters(), 'lr': g_args.lr*0.1},
{'params': ex_param, 'lr': g_args.lr, 'weight_decay': g_args.w_decay}, ]
trainer = FN.Trainer(model=model, train_data=train_data, dev_data=dev_data,
loss=loss, metrics=metric, metric_key=metric_key,
optimizer=torch.optim.Adam(optim_cfg),
n_epochs=g_args.ep, batch_size=g_args.bsz, print_every=10, validate_every=3000,
device=device,
use_tqdm=False, prefetch=False,
save_path=g_args.log,
callbacks=[MyCallback()])
trainer = FN.Trainer(train_data=train_data, model=model, optimizer=torch.optim.Adam(optim_cfg), loss=loss,
batch_size=g_args.bsz, n_epochs=g_args.ep, print_every=10, dev_data=dev_data, metrics=metric,
metric_key=metric_key, validate_every=3000, save_path=g_args.log, use_tqdm=False,
device=device, callbacks=[MyCallback()])

trainer.train()
tester = FN.Tester(data=test_data, model=model, metrics=metric,


+ 326
- 0
reproduction/matching/data/MatchingDataLoader.py View File

@@ -0,0 +1,326 @@

import os

from typing import Union, Dict

from fastNLP.core.const import Const
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.io.base_loader import DataInfo
from fastNLP.io.dataset_loader import JsonLoader, DataSetLoader, CSVLoader
from fastNLP.io.file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR
from fastNLP.modules.encoder._bert import BertTokenizer


class MatchingLoader(DataSetLoader):
"""
别名::class:`fastNLP.io.MatchingLoader` :class:`fastNLP.io.dataset_loader.MatchingLoader`

读取Matching任务的数据集
"""

def __init__(self, paths: dict=None):
"""
:param dict paths: key是数据集名称(如train、dev、test),value是对应的文件名
"""
self.paths = paths

def _load(self, path):
"""
:param str path: 待读取数据集的路径名
:return: fastNLP.DataSet ds: 返回一个DataSet对象,里面必须包含3个field:其中两个分别为两个句子
的原始字符串文本,第三个为标签
"""
raise NotImplementedError

def process(self, paths: Union[str, Dict[str, str]], dataset_name: str=None,
to_lower=False, seq_len_type: str=None, bert_tokenizer: str=None,
cut_text: int = None, get_index=True, set_input: Union[list, str, bool]=True,
set_target: Union[list, str, bool] = True, concat: Union[str, list, bool]=None, ) -> DataInfo:
"""
:param paths: str或者Dict[str, str]。如果是str,则为数据集所在的文件夹或者是全路径文件名:如果是文件夹,
则会从self.paths里面找对应的数据集名称与文件名。如果是Dict,则为数据集名称(如train、dev、test)和
对应的全路径文件名。
:param str dataset_name: 如果在paths里传入的是一个数据集的全路径文件名,那么可以用dataset_name来定义
这个数据集的名字,如果不定义则默认为train。
:param bool to_lower: 是否将文本自动转为小写。默认值为False。
:param str seq_len_type: 提供的seq_len类型,支持 ``seq_len`` :提供一个数字作为句子长度; ``mask`` :
提供一个0/1的mask矩阵作为句子长度; ``bert`` :提供segment_type_id(第一个句子为0,第二个句子为1)和
attention mask矩阵(0/1的mask矩阵)。默认值为None,即不提供seq_len
:param str bert_tokenizer: bert tokenizer所使用的词表所在的文件夹路径
:param int cut_text: 将长于cut_text的内容截掉。默认为None,即不截。
:param bool get_index: 是否需要根据词表将文本转为index
:param set_input: 如果为True,则会自动将相关的field(名字里含有Const.INPUT的)设置为input,如果为False
则不会将任何field设置为input。如果传入str或者List[str],则会根据传入的内容将相对应的field设置为input,
与此同时其他field不会被设置为input。默认值为True。
:param set_target: set_target将控制哪些field可以被设置为target,用法与set_input一致。默认值为True。
:param concat: 是否需要将两个句子拼接起来。如果为False则不会拼接。如果为True则会在两个句子之间插入一个<sep>。
如果传入一个长度为4的list,则分别表示插在第一句开始前、第一句结束后、第二句开始前、第二句结束后的标识符。如果
传入字符串 ``bert`` ,则会采用bert的拼接方式,等价于['[CLS]', '[SEP]', '', '[SEP]'].
:return:
"""
if isinstance(set_input, str):
set_input = [set_input]
if isinstance(set_target, str):
set_target = [set_target]
if isinstance(set_input, bool):
auto_set_input = set_input
else:
auto_set_input = False
if isinstance(set_target, bool):
auto_set_target = set_target
else:
auto_set_target = False
if isinstance(paths, str):
if os.path.isdir(paths):
path = {n: os.path.join(paths, self.paths[n]) for n in self.paths.keys()}
else:
path = {dataset_name if dataset_name is not None else 'train': paths}
else:
path = paths

data_info = DataInfo()
for data_name in path.keys():
data_info.datasets[data_name] = self._load(path[data_name])

for data_name, data_set in data_info.datasets.items():
if auto_set_input:
data_set.set_input(Const.INPUTS(0), Const.INPUTS(1))
if auto_set_target:
data_set.set_target(Const.TARGET)

if to_lower:
for data_name, data_set in data_info.datasets.items():
data_set.apply(lambda x: [w.lower() for w in x[Const.INPUTS(0)]], new_field_name=Const.INPUTS(0),
is_input=auto_set_input)
data_set.apply(lambda x: [w.lower() for w in x[Const.INPUTS(1)]], new_field_name=Const.INPUTS(1),
is_input=auto_set_input)

if bert_tokenizer is not None:
if bert_tokenizer.lower() in PRETRAINED_BERT_MODEL_DIR:
PRETRAIN_URL = _get_base_url('bert')
model_name = PRETRAINED_BERT_MODEL_DIR[bert_tokenizer]
model_url = PRETRAIN_URL + model_name
model_dir = cached_path(model_url)
# 检查是否存在
elif os.path.isdir(bert_tokenizer):
model_dir = bert_tokenizer
else:
raise ValueError(f"Cannot recognize BERT tokenizer from {bert_tokenizer}.")

tokenizer = BertTokenizer.from_pretrained(model_dir)

for data_name, data_set in data_info.datasets.items():
for fields in data_set.get_field_names():
if Const.INPUT in fields:
data_set.apply(lambda x: tokenizer.tokenize(' '.join(x[fields])), new_field_name=fields,
is_input=auto_set_input)

if isinstance(concat, bool):
concat = 'default' if concat else None
if concat is not None:
if isinstance(concat, str):
CONCAT_MAP = {'bert': ['[CLS]', '[SEP]', '', '[SEP]'],
'default': ['', '<sep>', '', '']}
if concat.lower() in CONCAT_MAP:
concat = CONCAT_MAP[concat]
else:
concat = 4 * [concat]
assert len(concat) == 4, \
f'Please provide a list of 4 symbols: one before the first sentence, one after the first sentence, ' \
f'one before the second sentence, and one after the second sentence. ' \
f'Your input is {concat}'

for data_name, data_set in data_info.datasets.items():
data_set.apply(lambda x: [concat[0]] + x[Const.INPUTS(0)] + [concat[1]] + [concat[2]] +
x[Const.INPUTS(1)] + [concat[3]], new_field_name=Const.INPUT)
data_set.apply(lambda x: [w for w in x[Const.INPUT] if len(w) > 0], new_field_name=Const.INPUT,
is_input=auto_set_input)

if seq_len_type is not None:
if seq_len_type == 'seq_len': #
for data_name, data_set in data_info.datasets.items():
for fields in data_set.get_field_names():
if Const.INPUT in fields:
data_set.apply(lambda x: len(x[fields]),
new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN),
is_input=auto_set_input)
elif seq_len_type == 'mask':
for data_name, data_set in data_info.datasets.items():
for fields in data_set.get_field_names():
if Const.INPUT in fields:
data_set.apply(lambda x: [1] * len(x[fields]),
new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN),
is_input=auto_set_input)
elif seq_len_type == 'bert':
for data_name, data_set in data_info.datasets.items():
if Const.INPUT not in data_set.get_field_names():
raise KeyError(f'Field ``{Const.INPUT}`` not in {data_name} data set: '
f'got {data_set.get_field_names()}')
data_set.apply(lambda x: [0] * (len(x[Const.INPUTS(0)]) + 2) + [1] * (len(x[Const.INPUTS(1)]) + 1),
new_field_name=Const.INPUT_LENS(0), is_input=auto_set_input)
data_set.apply(lambda x: [1] * len(x[Const.INPUT_LENS(0)]),
new_field_name=Const.INPUT_LENS(1), is_input=auto_set_input)

if cut_text is not None:
for data_name, data_set in data_info.datasets.items():
for fields in data_set.get_field_names():
if (Const.INPUT in fields) or ((Const.INPUT_LEN in fields) and (seq_len_type != 'seq_len')):
data_set.apply(lambda x: x[fields][: cut_text], new_field_name=fields,
is_input=auto_set_input)

data_set_list = [d for n, d in data_info.datasets.items()]
assert len(data_set_list) > 0, f'There are NO data sets in data info!'

if bert_tokenizer is not None:
words_vocab = Vocabulary(padding='[PAD]', unknown='[UNK]')
with open(os.path.join(model_dir, 'vocab.txt'), 'r', encoding='utf-8') as f:
lines = f.readlines()
lines = [line.strip() for line in lines]
words_vocab.add_word_lst(lines)
words_vocab.build_vocab()
else:
words_vocab = Vocabulary()
words_vocab = words_vocab.from_dataset(*[d for n, d in data_info.datasets.items() if 'train' in n],
field_name=[n for n in data_set_list[0].get_field_names()
if (Const.INPUT in n)],
no_create_entry_dataset=[d for n, d in data_info.datasets.items()
if 'train' not in n])
target_vocab = Vocabulary(padding=None, unknown=None)
target_vocab = target_vocab.from_dataset(*data_set_list, field_name=Const.TARGET)
data_info.vocabs = {Const.INPUT: words_vocab, Const.TARGET: target_vocab}

if get_index:
for data_name, data_set in data_info.datasets.items():
for fields in data_set.get_field_names():
if Const.INPUT in fields:
data_set.apply(lambda x: [words_vocab.to_index(w) for w in x[fields]], new_field_name=fields,
is_input=auto_set_input)

data_set.apply(lambda x: target_vocab.to_index(x[Const.TARGET]), new_field_name=Const.TARGET,
is_input=auto_set_input, is_target=auto_set_target)

for data_name, data_set in data_info.datasets.items():
if isinstance(set_input, list):
data_set.set_input(*set_input)
if isinstance(set_target, list):
data_set.set_target(*set_target)

return data_info


class SNLILoader(MatchingLoader, JsonLoader):
"""
别名::class:`fastNLP.io.SNLILoader` :class:`fastNLP.io.dataset_loader.SNLILoader`

读取SNLI数据集,读取的DataSet包含fields::

words1: list(str),第一句文本, premise
words2: list(str), 第二句文本, hypothesis
target: str, 真实标签

数据来源: https://nlp.stanford.edu/projects/snli/snli_1.0.zip
"""

def __init__(self, paths: dict=None):
fields = {
'sentence1_binary_parse': Const.INPUTS(0),
'sentence2_binary_parse': Const.INPUTS(1),
'gold_label': Const.TARGET,
}
paths = paths if paths is not None else {
'train': 'snli_1.0_train.jsonl',
'dev': 'snli_1.0_dev.jsonl',
'test': 'snli_1.0_test.jsonl'}
MatchingLoader.__init__(self, paths=paths)
JsonLoader.__init__(self, fields=fields)

def _load(self, path):
ds = JsonLoader._load(self, path)

parentheses_table = str.maketrans({'(': None, ')': None})

ds.apply(lambda ins: ins[Const.INPUTS(0)].translate(parentheses_table).strip().split(),
new_field_name=Const.INPUTS(0))
ds.apply(lambda ins: ins[Const.INPUTS(1)].translate(parentheses_table).strip().split(),
new_field_name=Const.INPUTS(1))
ds.drop(lambda x: x[Const.TARGET] == '-')
return ds
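For reference, a tiny standalone example of what the translate/strip/split step above does to an SNLI binary parse (the sentence is made up):

parentheses_table = str.maketrans({'(': None, ')': None})
parse = "( ( A dog ) ( is ( running outside ) ) )"
tokens = parse.translate(parentheses_table).strip().split()
print(tokens)   # ['A', 'dog', 'is', 'running', 'outside']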


class RTELoader(MatchingLoader, CSVLoader):
"""
别名::class:`fastNLP.io.RTELoader` :class:`fastNLP.io.dataset_loader.RTELoader`

读取RTE数据集,读取的DataSet包含fields::

words1: list(str),第一句文本, premise
words2: list(str), 第二句文本, hypothesis
target: str, 真实标签

数据来源:
"""

def __init__(self, paths: dict=None):
paths = paths if paths is not None else {
'train': 'train.tsv',
'dev': 'dev.tsv',
# 'test': 'test.tsv'  # test set has no labels
}
MatchingLoader.__init__(self, paths=paths)
self.fields = {
'sentence1': Const.INPUTS(0),
'sentence2': Const.INPUTS(1),
'label': Const.TARGET,
}
CSVLoader.__init__(self, sep='\t')

def _load(self, path):
ds = CSVLoader._load(self, path)

for k, v in self.fields.items():
ds.rename_field(k, v)
for fields in ds.get_all_fields():
if Const.INPUT in fields:
ds.apply(lambda x: x[fields].strip().split(), new_field_name=fields)

return ds


class QNLILoader(MatchingLoader, CSVLoader):
"""
别名::class:`fastNLP.io.QNLILoader` :class:`fastNLP.io.dataset_loader.QNLILoader`

读取QNLI数据集,读取的DataSet包含fields::

words1: list(str),第一句文本, premise
words2: list(str), 第二句文本, hypothesis
target: str, 真实标签

数据来源:
"""

def __init__(self, paths: dict=None):
paths = paths if paths is not None else {
'train': 'train.tsv',
'dev': 'dev.tsv',
# 'test': 'test.tsv'  # test set has no labels
}
MatchingLoader.__init__(self, paths=paths)
self.fields = {
'question': Const.INPUTS(0),
'sentence': Const.INPUTS(1),
'label': Const.TARGET,
}
CSVLoader.__init__(self, sep='\t')

def _load(self, path):
ds = CSVLoader._load(self, path)

for k, v in self.fields.items():
ds.rename_field(k, v)
for fields in ds.get_all_fields():
if Const.INPUT in fields:
ds.apply(lambda x: x[fields].strip().split(), new_field_name=fields)

return ds


+ 0
- 6
reproduction/matching/data/SNLIDataLoader.py View File

@@ -1,6 +0,0 @@

from fastNLP.io.dataset_loader import SNLILoader

# TODO: still in progress



+ 65
- 0
reproduction/matching/matching_esim.py View File

@@ -0,0 +1,65 @@

import argparse
import torch

from fastNLP.core import Trainer, Tester, Adam, AccuracyMetric, Const
from fastNLP.modules.encoder.embedding import ElmoEmbedding, StaticEmbedding

from reproduction.matching.data.MatchingDataLoader import SNLILoader
from reproduction.matching.model.esim import ESIMModel

argument = argparse.ArgumentParser()
argument.add_argument('--embedding', choices=['glove', 'elmo'], default='glove')
argument.add_argument('--batch-size-per-gpu', type=int, default=128)
argument.add_argument('--n-epochs', type=int, default=100)
argument.add_argument('--lr', type=float, default=1e-4)
argument.add_argument('--seq-len-type', choices=['mask', 'seq_len'], default='seq_len')
argument.add_argument('--save-dir', type=str, default=None)
arg = argument.parse_args()

bert_dirs = 'path/to/bert/dir'

# load data set
data_info = SNLILoader().process(
paths='path/to/snli/data/dir', to_lower=True, seq_len_type=arg.seq_len_type, bert_tokenizer=None,
get_index=True, concat=False,
)

# load embedding
if arg.embedding == 'elmo':
embedding = ElmoEmbedding(data_info.vocabs[Const.INPUT], requires_grad=True)
elif arg.embedding == 'glove':
embedding = StaticEmbedding(data_info.vocabs[Const.INPUT], requires_grad=True)
else:
raise ValueError('only ELMo or GloVe embeddings are supported for the ESIM model!')

# define model
model = ESIMModel(embedding)

# define trainer
trainer = Trainer(train_data=data_info.datasets['train'], model=model,
optimizer=Adam(lr=arg.lr, model_params=model.parameters()),
batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu,
n_epochs=arg.n_epochs, print_every=-1,
dev_data=data_info.datasets['dev'],
metrics=AccuracyMetric(), metric_key='acc',
device=[i for i in range(torch.cuda.device_count())],
check_code_level=-1,
save_path=arg.save_dir)

# train model
trainer.train(load_best_model=True)

# define tester
tester = Tester(
data=data_info.datasets['test'],
model=model,
metrics=AccuracyMetric(),
batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu,
device=[i for i in range(torch.cuda.device_count())],
)

# test model
tester.test()



+ 197
- 0
reproduction/matching/model/esim.py View File

@@ -0,0 +1,197 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.nn import CrossEntropyLoss

from fastNLP.models import BaseModel
from fastNLP.modules.encoder.embedding import TokenEmbedding
from fastNLP.modules.encoder.lstm import LSTM
from fastNLP.core.const import Const
from fastNLP.core.utils import seq_len_to_mask


class ESIMModel(BaseModel):
def __init__(self, init_embedding: TokenEmbedding, hidden_size=None, num_labels=3, dropout_rate=0.3,
dropout_embed=0.1):
super(ESIMModel, self).__init__()

self.embedding = init_embedding
self.dropout_embed = EmbedDropout(p=dropout_embed)
if hidden_size is None:
hidden_size = self.embedding.embed_size
self.rnn = BiRNN(self.embedding.embed_size, hidden_size, dropout_rate=dropout_rate)
# self.rnn = LSTM(self.embedding.embed_size, hidden_size, dropout=dropout_rate, bidirectional=True)

self.interfere = nn.Sequential(nn.Dropout(p=dropout_rate),
nn.Linear(8 * hidden_size, hidden_size),
nn.ReLU())
nn.init.xavier_uniform_(self.interfere[1].weight.data)
self.bi_attention = SoftmaxAttention()

self.rnn_high = BiRNN(self.embedding.embed_size, hidden_size, dropout_rate=dropout_rate)
# self.rnn_high = LSTM(hidden_size, hidden_size, dropout=dropout_rate, bidirectional=True,)

self.classifier = nn.Sequential(nn.Dropout(p=dropout_rate),
nn.Linear(8 * hidden_size, hidden_size),
nn.Tanh(),
nn.Dropout(p=dropout_rate),
nn.Linear(hidden_size, num_labels))

self.dropout_rnn = nn.Dropout(p=dropout_rate)

nn.init.xavier_uniform_(self.classifier[1].weight.data)
nn.init.xavier_uniform_(self.classifier[4].weight.data)

def forward(self, words1, words2, seq_len1, seq_len2, target=None):
"""
:param words1: [batch, seq_len]
:param words2: [batch, seq_len]
:param seq_len1: [batch]
:param seq_len2: [batch]
:param target:
:return:
"""
mask1 = seq_len_to_mask(seq_len1, words1.size(1))
mask2 = seq_len_to_mask(seq_len2, words2.size(1))
a0 = self.embedding(words1) # B * len * emb_dim
b0 = self.embedding(words2)
a0, b0 = self.dropout_embed(a0), self.dropout_embed(b0)
a = self.rnn(a0, mask1.byte()) # a: [B, PL, 2 * H]
b = self.rnn(b0, mask2.byte())
# a = self.dropout_rnn(self.rnn(a0, seq_len1)[0]) # a: [B, PL, 2 * H]
# b = self.dropout_rnn(self.rnn(b0, seq_len2)[0])

ai, bi = self.bi_attention(a, mask1, b, mask2)

a_ = torch.cat((a, ai, a - ai, a * ai), dim=2) # ma: [B, PL, 8 * H]
b_ = torch.cat((b, bi, b - bi, b * bi), dim=2)
a_f = self.interfere(a_)
b_f = self.interfere(b_)

a_h = self.rnn_high(a_f, mask1.byte()) # ma: [B, PL, 2 * H]
b_h = self.rnn_high(b_f, mask2.byte())
# a_h = self.dropout_rnn(self.rnn_high(a_f, seq_len1)[0]) # ma: [B, PL, 2 * H]
# b_h = self.dropout_rnn(self.rnn_high(b_f, seq_len2)[0])

a_avg = self.mean_pooling(a_h, mask1, dim=1)
a_max, _ = self.max_pooling(a_h, mask1, dim=1)
b_avg = self.mean_pooling(b_h, mask2, dim=1)
b_max, _ = self.max_pooling(b_h, mask2, dim=1)

out = torch.cat((a_avg, a_max, b_avg, b_max), dim=1) # v: [B, 8 * H]
logits = torch.tanh(self.classifier(out))

if target is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits, target)

return {Const.LOSS: loss, Const.OUTPUT: logits}
else:
return {Const.OUTPUT: logits}

def predict(self, **kwargs):
return self.forward(**kwargs)

# input [batch_size, len , hidden]
# mask [batch_size, len] (111...00)
@staticmethod
def mean_pooling(input, mask, dim=1):
masks = mask.view(mask.size(0), mask.size(1), -1).float()
return torch.sum(input * masks, dim=dim) / torch.sum(masks, dim=1)

@staticmethod
def max_pooling(input, mask, dim=1):
my_inf = 10e12
masks = mask.view(mask.size(0), mask.size(1), -1)
masks = masks.expand(-1, -1, input.size(2)).float()
return torch.max(input + masks.le(0.5).float() * -my_inf, dim=dim)


class EmbedDropout(nn.Dropout):

def forward(self, sequences_batch):
ones = sequences_batch.data.new_ones(sequences_batch.shape[0], sequences_batch.shape[-1])
dropout_mask = nn.functional.dropout(ones, self.p, self.training, inplace=False)
return dropout_mask.unsqueeze(1) * sequences_batch


class BiRNN(nn.Module):
def __init__(self, input_size, hidden_size, dropout_rate=0.3):
super(BiRNN, self).__init__()
self.dropout_rate = dropout_rate
self.rnn = nn.LSTM(input_size, hidden_size,
num_layers=1,
bidirectional=True,
batch_first=True)

def forward(self, x, x_mask):
# Sort x
lengths = x_mask.data.eq(1).long().sum(1).squeeze()
_, idx_sort = torch.sort(lengths, dim=0, descending=True)
_, idx_unsort = torch.sort(idx_sort, dim=0)
lengths = list(lengths[idx_sort])

x = x.index_select(0, idx_sort)
# Pack it up
rnn_input = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)
# Apply dropout to input
if self.dropout_rate > 0:
dropout_input = F.dropout(rnn_input.data, p=self.dropout_rate, training=self.training)
rnn_input = nn.utils.rnn.PackedSequence(dropout_input, rnn_input.batch_sizes)
output = self.rnn(rnn_input)[0]
# Unpack everything
output = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)[0]
output = output.index_select(0, idx_unsort)
if output.size(1) != x_mask.size(1):
padding = torch.zeros(output.size(0),
x_mask.size(1) - output.size(1),
output.size(2)).type(output.data.type())
output = torch.cat([output, padding], 1)
return output


def masked_softmax(tensor, mask):
tensor_shape = tensor.size()
reshaped_tensor = tensor.view(-1, tensor_shape[-1])

# Reshape the mask so it matches the size of the input tensor.
while mask.dim() < tensor.dim():
mask = mask.unsqueeze(1)
mask = mask.expand_as(tensor).contiguous().float()
reshaped_mask = mask.view(-1, mask.size()[-1])
result = F.softmax(reshaped_tensor * reshaped_mask, dim=-1)
result = result * reshaped_mask
# 1e-13 is added to avoid divisions by zero.
result = result / (result.sum(dim=-1, keepdim=True) + 1e-13)
return result.view(*tensor_shape)


def weighted_sum(tensor, weights, mask):
w_sum = weights.bmm(tensor)
while mask.dim() < w_sum.dim():
mask = mask.unsqueeze(1)
mask = mask.transpose(-1, -2)
mask = mask.expand_as(w_sum).contiguous().float()
return w_sum * mask


class SoftmaxAttention(nn.Module):

def forward(self, premise_batch, premise_mask, hypothesis_batch, hypothesis_mask):
similarity_matrix = premise_batch.bmm(hypothesis_batch.transpose(2, 1)
.contiguous())

prem_hyp_attn = masked_softmax(similarity_matrix, hypothesis_mask)
hyp_prem_attn = masked_softmax(similarity_matrix.transpose(1, 2)
.contiguous(),
premise_mask)

attended_premises = weighted_sum(hypothesis_batch,
prem_hyp_attn,
premise_mask)
attended_hypotheses = weighted_sum(premise_batch,
hyp_prem_attn,
hypothesis_mask)

return attended_premises, attended_hypotheses

+ 0
- 97
reproduction/matching/snli.py View File

@@ -1,97 +0,0 @@
import os

import torch

from fastNLP.core import Vocabulary, DataSet, Trainer, Tester, Const, Adam, AccuracyMetric

from reproduction.matching.data.SNLIDataLoader import SNLILoader
from legacy.component.bert_tokenizer import BertTokenizer
from reproduction.matching.model.bert import BertForNLI


def preprocess_data(data: DataSet, bert_dir):
"""
preprocess data set to bert-need data set.
:param data:
:param bert_dir:
:return:
"""
tokenizer = BertTokenizer.from_pretrained(os.path.join(bert_dir, 'vocab.txt'))

vocab = Vocabulary(padding=None, unknown=None)
with open(os.path.join(bert_dir, 'vocab.txt')) as f:
lines = f.readlines()
vocab_list = []
for line in lines:
vocab_list.append(line.strip())
vocab.add_word_lst(vocab_list)
vocab.build_vocab()
vocab.padding = '[PAD]'
vocab.unknown = '[UNK]'

for i in range(2):
data.apply(lambda x: tokenizer.tokenize(" ".join(x[Const.INPUTS(i)])),
new_field_name=Const.INPUTS(i))
data.apply(lambda x: ['[CLS]'] + x[Const.INPUTS(0)] + ['[SEP]'] + x[Const.INPUTS(1)] + ['[SEP]'],
new_field_name=Const.INPUT)
data.apply(lambda x: [0] * (len(x[Const.INPUTS(0)]) + 2) + [1] * (len(x[Const.INPUTS(1)]) + 1),
new_field_name=Const.INPUT_LENS(0))
data.apply(lambda x: [1] * len(x[Const.INPUT_LENS(0)]), new_field_name=Const.INPUT_LENS(1))

max_len = 512
data.apply(lambda x: x[Const.INPUT][: max_len], new_field_name=Const.INPUT)
data.apply(lambda x: [vocab.to_index(w) for w in x[Const.INPUT]], new_field_name=Const.INPUT)
data.apply(lambda x: x[Const.INPUT_LENS(0)][: max_len], new_field_name=Const.INPUT_LENS(0))
data.apply(lambda x: x[Const.INPUT_LENS(1)][: max_len], new_field_name=Const.INPUT_LENS(1))

target_vocab = Vocabulary(padding=None, unknown=None)
target_vocab.add_word_lst(['neutral', 'contradiction', 'entailment'])
target_vocab.build_vocab()
data.apply(lambda x: target_vocab.to_index(x[Const.TARGET]), new_field_name=Const.TARGET)

data.set_input(Const.INPUT, Const.INPUT_LENS(0), Const.INPUT_LENS(1), Const.TARGET)
data.set_target(Const.TARGET)

return data


bert_dirs = 'path/to/bert/dir'

# load raw data set
train_data = SNLILoader().load('./data/snli/snli_1.0_train.jsonl')
dev_data = SNLILoader().load('./data/snli/snli_1.0_dev.jsonl')
test_data = SNLILoader().load('./data/snli/snli_1.0_test.jsonl')

print('successfully load data sets!')

train_data = preprocess_data(train_data, bert_dirs)
dev_data = preprocess_data(dev_data, bert_dirs)
test_data = preprocess_data(test_data, bert_dirs)

model = BertForNLI(bert_dir=bert_dirs)

trainer = Trainer(
train_data=train_data,
model=model,
optimizer=Adam(lr=2e-5, model_params=model.parameters()),
batch_size=torch.cuda.device_count() * 12,
n_epochs=4,
print_every=-1,
dev_data=dev_data,
metrics=AccuracyMetric(),
metric_key='acc',
device=[i for i in range(torch.cuda.device_count())],
check_code_level=-1
)
trainer.train(load_best_model=True)

tester = Tester(
data=test_data,
model=model,
metrics=AccuracyMetric(),
batch_size=torch.cuda.device_count() * 12,
device=[i for i in range(torch.cuda.device_count())],
)
tester.test()



+ 2
- 2
reproduction/matching/test/test_snlidataloader.py View File

@@ -1,10 +1,10 @@
import unittest
from ..data import SNLIDataLoader
from ..data import MatchingDataLoader
from fastNLP.core.vocabulary import Vocabulary


class TestCWSDataLoader(unittest.TestCase):
def test_case1(self):
snli_loader = SNLIDataLoader()
snli_loader = MatchingDataLoader()
# TODO: still in progress


+ 3
- 7
reproduction/seqence_labelling/cws/train_shift_relay.py View File

@@ -57,12 +57,8 @@ callbacks = [clipper]
# if pretrain:
# fixer = FixEmbedding([model.char_embedding, model.bigram_embedding], fix_until=fix_until)
# callbacks.append(fixer)
trainer = Trainer(data.datasets['train'], model, optimizer=optimizer, loss=None,
batch_size=32, sampler=sampler, update_every=5,
n_epochs=3, print_every=5,
dev_data=data.datasets['dev'], metrics=RelayMetric(), metric_key='f',
validate_every=-1, save_path=None,
prefetch=True, use_tqdm=True, device=device,
callbacks=callbacks,
trainer = Trainer(data.datasets['train'], model, optimizer=optimizer, loss=None, batch_size=32, sampler=sampler,
update_every=5, n_epochs=3, print_every=5, dev_data=data.datasets['dev'], metrics=RelayMetric(),
metric_key='f', validate_every=-1, save_path=None, use_tqdm=True, device=device, callbacks=callbacks,
check_code_level=0)
trainer.train()

+ 0
- 0
reproduction/seqence_labelling/ner/__init__.py View File


+ 93
- 0
reproduction/seqence_labelling/ner/data/Conll2003Loader.py View File

@@ -0,0 +1,93 @@

from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io.base_loader import DataSetLoader, DataInfo
from typing import Union, Dict
from fastNLP import Vocabulary
from fastNLP import Const
from reproduction.utils import check_dataloader_paths

from fastNLP.io.dataset_loader import ConllLoader
from reproduction.seqence_labelling.ner.data.utils import iob2bioes, iob2


class Conll2003DataLoader(DataSetLoader):
def __init__(self, task:str='ner', encoding_type:str='bioes'):
"""
Loads English corpora in the Conll2003 format; information about the dataset can be found at https://www.clips.uantwerpen.be/conll2003/ner/.
When task is pos, the target of the returned DataSet comes from column 2; for chunk it comes from column 3; for ner it comes
from column 4. All "-DOCSTART- -X- O O" lines are ignored, which makes the data count smaller than the numbers reported in
many papers, but since "-DOCSTART- -X- O O" is only a document separator and should not be predicted, lines starting with -DOCSTART- are dropped.
For the ner and chunk tasks the targets are converted to encoding_type; for pos the target is simply the pos column.

:param task: the labelling task, one of ner, pos, chunk
"""
assert task in ('ner', 'pos', 'chunk')
index = {'ner':3, 'pos':1, 'chunk':2}[task]
self._loader = ConllLoader(headers=['raw_words', 'target'], indexes=[0, index])
self._tag_converters = None
if task in ('ner', 'chunk'):
self._tag_converters = [iob2]
if encoding_type == 'bioes':
self._tag_converters.append(iob2bioes)

def load(self, path: str):
dataset = self._loader.load(path)
def convert_tag_schema(tags):
for converter in self._tag_converters:
tags = converter(tags)
return tags
if self._tag_converters:
dataset.apply_field(convert_tag_schema, field_name=Const.TARGET, new_field_name=Const.TARGET)
return dataset

def process(self, paths: Union[str, Dict[str, str]], word_vocab_opt:VocabularyOption=None, lower:bool=True):
"""
读取并处理数据。数据中的'-DOCSTART-'开头的行会被忽略

:param paths:
:param word_vocab_opt: vocabulary的初始化值
:param lower: 是否将所有字母转为小写
:return:
"""
# 读取数据
paths = check_dataloader_paths(paths)
data = DataInfo()
input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
target_fields = [Const.TARGET, Const.INPUT_LEN]
for name, path in paths.items():
dataset = self.load(path)
dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT)
if lower:
dataset.words.lower()
data.datasets[name] = dataset

# 对construct vocab
word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt)
word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT,
no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train'])
word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT)
data.vocabs[Const.INPUT] = word_vocab

# cap words
cap_word_vocab = Vocabulary()
cap_word_vocab.from_dataset(data.datasets['train'], field_name='raw_words',
no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train'])
cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words')
input_fields.append('cap_words')
data.vocabs['cap_words'] = cap_word_vocab

# 对target建vocab
target_vocab = Vocabulary(unknown=None, padding=None)
target_vocab.from_dataset(*data.datasets.values(), field_name=Const.TARGET)
target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET)
data.vocabs[Const.TARGET] = target_vocab

for name, dataset in data.datasets.items():
dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN)
dataset.set_input(*input_fields)
dataset.set_target(*target_fields)

return data

if __name__ == '__main__':
pass
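A usage sketch for this loader (the directory path is hypothetical; it should contain the CoNLL-2003 train/dev/test files):

# a sketch; '/path/to/conll2003' is a hypothetical directory
loader = Conll2003DataLoader(task='ner', encoding_type='bioes')
data = loader.process('/path/to/conll2003', lower=True)
print(data.datasets['train'][:2])
print(len(data.vocabs[Const.INPUT]), len(data.vocabs[Const.TARGET]))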

+ 152
- 0
reproduction/seqence_labelling/ner/data/OntoNoteLoader.py View File

@@ -0,0 +1,152 @@
from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io.base_loader import DataSetLoader, DataInfo
from typing import Union, Dict
from fastNLP import DataSet
from fastNLP import Vocabulary
from fastNLP import Const
from reproduction.utils import check_dataloader_paths

from fastNLP.io.dataset_loader import ConllLoader
from reproduction.seqence_labelling.ner.data.utils import iob2bioes, iob2

class OntoNoteNERDataLoader(DataSetLoader):
"""
用于读取处理为Conll格式后的OntoNote数据。将OntoNote数据处理为conll格式的过程可以参考https://github.com/yhcc/OntoNotes-5.0-NER。

"""
def __init__(self, encoding_type:str='bioes'):
assert encoding_type in ('bioes', 'bio')
self.encoding_type = encoding_type
if encoding_type=='bioes':
self.encoding_method = iob2bioes
else:
self.encoding_method = iob2

def load(self, path:str)->DataSet:
"""
给定一个文件路径,读取数据。返回的DataSet包含以下的field
raw_words: List[str]
target: List[str]

:param path:
:return:
"""
dataset = ConllLoader(headers=['raw_words', 'target'], indexes=[3, 10]).load(path)
def convert_to_bio(tags):
bio_tags = []
flag = None
for tag in tags:
label = tag.strip("()*")
if '(' in tag:
bio_label = 'B-' + label
flag = label
elif flag:
bio_label = 'I-' + flag
else:
bio_label = 'O'
if ')' in tag:
flag = None
bio_tags.append(bio_label)
return self.encoding_method(bio_tags)

def convert_word(words):
converted_words = []
for word in words:
word = word.replace('/.', '.') # 有些结尾的.是/.形式的
if not word.startswith('-'):
converted_words.append(word)
continue
# 以下是由于这些符号被转义了,再转回来
tfrs = {'-LRB-':'(',
'-RRB-': ')',
'-LSB-': '[',
'-RSB-': ']',
'-LCB-': '{',
'-RCB-': '}'
}
if word in tfrs:
converted_words.append(tfrs[word])
else:
converted_words.append(word)
return converted_words

dataset.apply_field(convert_word, field_name='raw_words', new_field_name='raw_words')
dataset.apply_field(convert_to_bio, field_name='target', new_field_name='target')

return dataset

def process(self, paths: Union[str, Dict[str, str]], word_vocab_opt:VocabularyOption=None,
lower:bool=True)->DataInfo:
"""
读取并处理数据。返回的DataInfo包含以下的内容
vocabs:
word: Vocabulary
target: Vocabulary
datasets:
train: DataSet
words: List[int], 被设置为input
target: int. label,被同时设置为input和target
seq_len: int. 句子的长度,被同时设置为input和target
raw_words: List[str]
xxx(根据传入的paths可能有所变化)

:param paths:
:param word_vocab_opt: vocabulary的初始化值
:param lower: 是否使用小写
:return:
"""
paths = check_dataloader_paths(paths)
data = DataInfo()
input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
target_fields = [Const.TARGET, Const.INPUT_LEN]
for name, path in paths.items():
dataset = self.load(path)
dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT)
if lower:
dataset.words.lower()
data.datasets[name] = dataset

# 对construct vocab
word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt)
word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT,
no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train'])
word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT)
data.vocabs[Const.INPUT] = word_vocab

# cap words
cap_word_vocab = Vocabulary()
cap_word_vocab.from_dataset(*data.datasets.values(), field_name='raw_words')
cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words')
input_fields.append('cap_words')
data.vocabs['cap_words'] = cap_word_vocab

# 对target建vocab
target_vocab = Vocabulary(unknown=None, padding=None)
target_vocab.from_dataset(*data.datasets.values(), field_name=Const.TARGET)
target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET)
data.vocabs[Const.TARGET] = target_vocab

for name, dataset in data.datasets.items():
dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN)
dataset.set_input(*input_fields)
dataset.set_target(*target_fields)

return data


if __name__ == '__main__':
loader = OntoNoteNERDataLoader()
dataset = loader.load('/hdd/fudanNLP/fastNLP/others/data/v4/english/test.txt')
print(dataset.target.value_count())
print(dataset[:4])


"""
train 115812 2200752
development 15680 304684
test 12217 230111

train 92403 1901772
valid 13606 279180
test 10258 204135
"""

+ 49
- 0
reproduction/seqence_labelling/ner/data/utils.py View File

@@ -0,0 +1,49 @@
from typing import List

def iob2(tags:List[str])->List[str]:
"""
Check that the tags are valid IOB data; IOB1 tags are automatically converted to IOB2.

:param tags: the tags to convert
"""
for i, tag in enumerate(tags):
if tag == "O":
continue
split = tag.split("-")
if len(split) != 2 or split[0] not in ["I", "B"]:
raise TypeError("The encoding schema is not a valid IOB type.")
if split[0] == "B":
continue
elif i == 0 or tags[i - 1] == "O": # conversion IOB1 to IOB2
tags[i] = "B" + tag[1:]
elif tags[i - 1][1:] == tag[1:]:
continue
else: # conversion IOB1 to IOB2
tags[i] = "B" + tag[1:]
return tags

def iob2bioes(tags:List[str])->List[str]:
"""
Convert IOB tags to the BIOES encoding.
:param tags:
:return:
"""
new_tags = []
for i, tag in enumerate(tags):
if tag == 'O':
new_tags.append(tag)
else:
split = tag.split('-')[0]
if split == 'B':
if i+1!=len(tags) and tags[i+1].split('-')[0] == 'I':
new_tags.append(tag)
else:
new_tags.append(tag.replace('B-', 'S-'))
elif split == 'I':
if i + 1<len(tags) and tags[i+1].split('-')[0] == 'I':
new_tags.append(tag)
else:
new_tags.append(tag.replace('I-', 'E-'))
else:
raise TypeError("Invalid IOB format.")
return new_tags
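
A quick demonstration of the two converters above (the tag values mirror the unit test further down; the module path follows this file's location):

from reproduction.seqence_labelling.ner.data.utils import iob2, iob2bioes

tags = ['I-MISC', 'I-MISC', 'O', 'I-PER', 'I-PER', 'O']  # IOB1-style input
tags = iob2(tags)       # -> ['B-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O']
print(iob2bioes(tags))  # -> ['B-MISC', 'E-MISC', 'O', 'B-PER', 'E-PER', 'O']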

+ 56
- 0
reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py

@@ -0,0 +1,56 @@

import torch
from torch import nn
from fastNLP import seq_len_to_mask
from fastNLP.modules import Embedding
from fastNLP.modules import LSTM
from fastNLP.modules import ConditionalRandomField, allowed_transitions
import torch.nn.functional as F
from fastNLP import Const

class CNNBiLSTMCRF(nn.Module):
def __init__(self, embed, char_embed, hidden_size, num_layers, tag_vocab, dropout=0.5, encoding_type='bioes'):
super().__init__()

self.embedding = Embedding(embed, dropout=0.5, dropout_word=0)
self.char_embedding = Embedding(char_embed, dropout=0.5, dropout_word=0.01)
self.lstm = LSTM(input_size=self.embedding.embedding_dim+self.char_embedding.embedding_dim,
hidden_size=hidden_size//2, num_layers=num_layers,
bidirectional=True, batch_first=True)
self.fc = nn.Linear(hidden_size, len(tag_vocab))

transitions = allowed_transitions(tag_vocab.idx2word, encoding_type=encoding_type, include_start_end=True)
self.crf = ConditionalRandomField(len(tag_vocab), include_start_end_trans=True, allowed_transitions=transitions)

self.dropout = nn.Dropout(dropout, inplace=True)

for name, param in self.named_parameters():
if 'fc' in name:
if param.data.dim()>1:
nn.init.xavier_uniform_(param)
else:
nn.init.constant_(param, 0)
if 'crf' in name:
nn.init.zeros_(param)

def _forward(self, words, cap_words, seq_len, target=None):
words = self.embedding(words)
chars = self.char_embedding(cap_words)
words = torch.cat([words, chars], dim=-1)
outputs, _ = self.lstm(words, seq_len)
self.dropout(outputs)  # nn.Dropout was built with inplace=True, so outputs is modified in place

logits = F.log_softmax(self.fc(outputs), dim=-1)

if target is not None:
loss = self.crf(logits, target, seq_len_to_mask(seq_len))
return {Const.LOSS: loss}
else:
pred, _ = self.crf.viterbi_decode(logits, seq_len_to_mask(seq_len))
return {Const.OUTPUT: pred}

def forward(self, words, cap_words, seq_len, target):
return self._forward(words, cap_words, seq_len, target)

def predict(self, words, cap_words, seq_len):
return self._forward(words, cap_words, seq_len, None)
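
As the code above shows, forward() returns {'loss': ...} for training while predict() returns {'pred': ...} for evaluation, and both paths mask padded positions with fastNLP's seq_len_to_mask. A tiny, self-contained illustration of that mask (independent of the model itself):

import torch
from fastNLP import seq_len_to_mask

seq_len = torch.tensor([7, 5])
mask = seq_len_to_mask(seq_len)  # shape (2, 7); the last two positions of the second row are False
print(mask)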

+ 0
- 0
reproduction/seqence_labelling/ner/test/__init__.py


+ 33
- 0
reproduction/seqence_labelling/ner/test/test.py

@@ -0,0 +1,33 @@

from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003DataLoader
from reproduction.seqence_labelling.ner.data.Conll2003Loader import iob2, iob2bioes
import unittest

class TestTagSchemaConverter(unittest.TestCase):
def test_iob2(self):
tags = ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']
golden = ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']
self.assertListEqual(golden, iob2(tags))

tags = ['I-ORG', 'O']
golden = ['B-ORG', 'O']
self.assertListEqual(golden, iob2(tags))

tags = ['I-MISC', 'I-MISC', 'O', 'I-PER', 'I-PER', 'O']
golden = ['B-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O']
self.assertListEqual(golden, iob2(tags))

def test_iob2bemso(self):
tags = ['B-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O']
golden = ['B-MISC', 'E-MISC', 'O', 'B-PER', 'E-PER', 'O']
self.assertListEqual(golden, iob2bioes(tags))


def test_conll2003_loader():
path = '/hdd/fudanNLP/fastNLP/others/data/conll2003/train.txt'
loader = Conll2003DataLoader().load(path)
print(loader[:3])


if __name__ == '__main__':
test_conll2003_loader()

+ 70
- 0
reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py

@@ -0,0 +1,70 @@


from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding, BertEmbedding, ElmoEmbedding, LSTMCharEmbedding
from fastNLP.core.vocabulary import VocabularyOption

from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF
from fastNLP import Trainer
from fastNLP import SpanFPreRecMetric
from fastNLP import BucketSampler
from fastNLP import Const
from torch.optim import SGD, Adam
from fastNLP import GradientClipCallback
from fastNLP.core.callback import FitlogCallback, LRScheduler
from torch.optim.lr_scheduler import LambdaLR
from reproduction.seqence_labelling.ner.model.swats import SWATS

import fitlog
fitlog.debug()

from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003DataLoader

encoding_type = 'bioes'

data = Conll2003DataLoader(encoding_type=encoding_type).process('../../../../others/data/conll2003',
word_vocab_opt=VocabularyOption(min_freq=2),
lower=False)
print(data)
char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30],
kernel_sizes=[3])
# char_embed = LSTMCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30 ,char_emb_size=30)
word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/wiki_en_100_50_case_2.txt',
requires_grad=True)
word_embed.embedding.weight.data = word_embed.embedding.weight.data/word_embed.embedding.weight.data.std()

# import joblib
# raw_data = joblib.load('/hdd/fudanNLP/fastNLP/others/NER-with-LS/data/conll_with_data.joblib')
# def convert_to_ids(raw_words):
# ids = []
# for word in raw_words:
# id = raw_data['word_to_id'][word]
# id = raw_data['id_to_emb_map'][id]
# ids.append(id)
# return ids
# word_embed = raw_data['emb_matrix']
# for name, dataset in data.datasets.items():
# dataset.apply_field(convert_to_ids, field_name='raw_words', new_field_name=Const.INPUT)

# word_embed = ElmoEmbedding(vocab=data.vocabs['cap_words'],
# model_dir_or_name='/hdd/fudanNLP/fastNLP/others/pretrained_models/elmo_en',
# requires_grad=True)

model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET],
encoding_type=encoding_type)

callbacks = [
GradientClipCallback(clip_type='value', clip_value=5)
, FitlogCallback({'test':data.datasets['test']}, verbose=1)
]
# optimizer = Adam(model.parameters(), lr=0.005)
optimizer = SWATS(model.parameters(), verbose=True)
# optimizer = SGD(model.parameters(), lr=0.008, momentum=0.9)
# scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
# callbacks.append(scheduler)

trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(),
device=1, dev_data=data.datasets['dev'], batch_size=10,
metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
callbacks=callbacks, num_workers=1, n_epochs=100)
trainer.train()

+ 65
- 0
reproduction/seqence_labelling/ner/train_ontonote.py

@@ -0,0 +1,65 @@
import sys

sys.path.append('../../..')

from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding

from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF
from fastNLP import Trainer
from fastNLP import SpanFPreRecMetric
from fastNLP import BucketSampler
from fastNLP import Const
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import LambdaLR
from fastNLP import GradientClipCallback
from fastNLP.core.callback import FitlogCallback, LRScheduler
from reproduction.seqence_labelling.ner.model.swats import SWATS

import fitlog
fitlog.debug()

from reproduction.seqence_labelling.ner.data.OntoNoteLoader import OntoNoteNERDataLoader

encoding_type = 'bioes'

data = OntoNoteNERDataLoader(encoding_type=encoding_type).process('/hdd/fudanNLP/fastNLP/others/data/v4/english',
lower=True)

import joblib
raw_data = joblib.load('/hdd/fudanNLP/fastNLP/others/NER-with-LS/data/ontonotes_with_data.joblib')
def convert_to_ids(raw_words):
ids = []
for word in raw_words:
id = raw_data['word_to_id'][word]
id = raw_data['id_to_emb_map'][id]
ids.append(id)
return ids
word_embed = raw_data['emb_matrix']
for name, dataset in data.datasets.items():
dataset.apply_field(convert_to_ids, field_name='raw_words', new_field_name=Const.INPUT)

print(data)
char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30],
kernel_sizes=[3])
# word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
# model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt',
# requires_grad=True)

model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=1200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET],
encoding_type=encoding_type)

callbacks = [GradientClipCallback(clip_value=5, clip_type='value'),
FitlogCallback(data.datasets['test'], verbose=1)]

optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
callbacks.append(scheduler)
# optimizer = SWATS(model.parameters(), verbose=True)
# optimizer = Adam(model.parameters(), lr=0.005)


trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(num_buckets=100),
device=0, dev_data=data.datasets['dev'], batch_size=10,
metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
callbacks=callbacks, num_workers=1, n_epochs=100)
trainer.train()

+ 3
- 2
reproduction/utils.py

@@ -13,7 +13,8 @@ def check_dataloader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]:
}
If paths is invalid, the corresponding error is raised directly.

:param paths: the path
:param paths: the path(s). A single file path is treated as the train file; a directory is searched for train.txt,
test.txt and dev.txt; a dict maps user-defined split names to the corresponding file paths.
:return:
"""
if isinstance(paths, str):
@@ -24,7 +25,7 @@ def check_dataloader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]:
if not os.path.isfile(train_fp):
raise FileNotFoundError(f"train.txt is not found in folder {paths}.")
files = {'train': train_fp}
for filename in ['test.txt', 'dev.txt']:
for filename in ['dev.txt', 'test.txt']:
fp = os.path.join(paths, filename)
if os.path.isfile(fp):
files[filename.split('.')[0]] = fp
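
To make the accepted forms of paths concrete, here is a hedged sketch (temporary files are created only for the demo; the single-file and dict forms are shown as comments, assuming the module path reproduction.utils from this diff):

import os
import tempfile
from reproduction.utils import check_dataloader_paths

tmp = tempfile.mkdtemp()
for name in ('train.txt', 'dev.txt'):  # no test.txt: missing optional splits are simply skipped
    open(os.path.join(tmp, name), 'w').close()
print(check_dataloader_paths(tmp))     # directory form -> {'train': ..., 'dev': ...}
# single-file form (treated as the train split): check_dataloader_paths('.../train.txt')
# dict form (user-defined split names to paths): check_dataloader_paths({'train': ..., 'test': ...})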


+ 1
- 1
setup.py

@@ -13,7 +13,7 @@ with open('requirements.txt', encoding='utf-8') as f:

setup(
name='FastNLP',
version='0.4.0',
version='dev0.5.0',
description='fastNLP: Deep Learning Toolkit for NLP, developed by Fudan FastNLP Team',
long_description=readme,
long_description_content_type='text/markdown',


+ 10
- 10
test/core/test_batch.py

@@ -3,7 +3,7 @@ import unittest
import numpy as np
import torch

from fastNLP import Batch
from fastNLP import DataSetIter
from fastNLP import DataSet
from fastNLP import Instance
from fastNLP import SequentialSampler
@@ -57,7 +57,7 @@ class TestCase1(unittest.TestCase):
dataset = construct_dataset(
[["FastNLP", "is", "the", "most", "beautiful", "tool", "in", "the", "world"] for _ in range(40)])
dataset.set_target()
batch = Batch(dataset, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
batch = DataSetIter(dataset, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
cnt = 0
for _, _ in batch:
@@ -68,7 +68,7 @@ class TestCase1(unittest.TestCase):
ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
for x, y in iter:
self.assertTrue(isinstance(x["x"], np.ndarray) and isinstance(y["y"], np.ndarray))
self.assertEqual(len(x["x"]), 4)
@@ -81,7 +81,7 @@ class TestCase1(unittest.TestCase):
"y": [[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10})
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
for x, y in iter:
self.assertEqual(x["x"].shape, (4, 4))
self.assertEqual(y["y"].shape, (4, 4))
@@ -91,7 +91,7 @@ class TestCase1(unittest.TestCase):
"y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)})
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
for x, y in iter:
self.assertEqual(x["x"].shape, (4, 4))
self.assertEqual(y["y"].shape, (4, 4))
@@ -101,7 +101,7 @@ class TestCase1(unittest.TestCase):
"y": [[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10})
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
for x, y in iter:
self.assertTrue(isinstance(x["x"], torch.Tensor))
self.assertEqual(tuple(x["x"].shape), (4, 4))
@@ -113,7 +113,7 @@ class TestCase1(unittest.TestCase):
"y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)})
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
for x, y in iter:
self.assertTrue(isinstance(x["x"], torch.Tensor))
self.assertEqual(tuple(x["x"].shape), (4, 4))
@@ -125,7 +125,7 @@ class TestCase1(unittest.TestCase):
[Instance(x=[1, 2, 3, 4], y=[3, 4, 5, 6]) for _ in range(2)])
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
for x, y in iter:
self.assertTrue(isinstance(x["x"], torch.Tensor))
self.assertEqual(tuple(x["x"].shape), (4, 4))
@@ -137,7 +137,7 @@ class TestCase1(unittest.TestCase):
[Instance(x=np.array([1, 2, 3, 4]), y=np.array([3, 4, 5, 6])) for _ in range(2)])
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
for x, y in iter:
print(x, y)
@@ -146,7 +146,7 @@ class TestCase1(unittest.TestCase):
num_samples = 1000
dataset = generate_fake_dataset(num_samples)
batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler())
batch = DataSetIter(dataset, batch_size=batch_size, sampler=SequentialSampler())
for batch_x, batch_y in batch:
pass


+ 27
- 73
test/core/test_callbacks.py

@@ -40,89 +40,50 @@ class TestCallback(unittest.TestCase):
def test_gradient_clip(self):
data_set, model = prepare_env()
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=20,
batch_size=32,
print_every=50,
optimizer=SGD(lr=0.1),
check_code_level=2,
use_tqdm=False,
dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"),
callbacks=[GradientClipCallback(model.parameters(), clip_value=2)])
trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"),
batch_size=32, n_epochs=20, print_every=50, dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=False,
callbacks=[GradientClipCallback(model.parameters(), clip_value=2)], check_code_level=2)
trainer.train()
def test_early_stop(self):
data_set, model = prepare_env()
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=20,
batch_size=32,
print_every=50,
optimizer=SGD(lr=0.01),
check_code_level=2,
use_tqdm=False,
dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"),
callbacks=[EarlyStopCallback(5)])
trainer = Trainer(data_set, model, optimizer=SGD(lr=0.01), loss=BCELoss(pred="predict", target="y"),
batch_size=32, n_epochs=20, print_every=50, dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=False,
callbacks=[EarlyStopCallback(5)], check_code_level=2)
trainer.train()
def test_lr_scheduler(self):
data_set, model = prepare_env()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=5,
batch_size=32,
print_every=50,
optimizer=optimizer,
check_code_level=2,
use_tqdm=False,
dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"),
callbacks=[LRScheduler(torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1))])
trainer = Trainer(data_set, model, optimizer=optimizer, loss=BCELoss(pred="predict", target="y"), batch_size=32,
n_epochs=5, print_every=50, dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=False,
callbacks=[LRScheduler(torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1))],
check_code_level=2)
trainer.train()
def test_KeyBoardInterrupt(self):
data_set, model = prepare_env()
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=5,
batch_size=32,
print_every=50,
optimizer=SGD(lr=0.1),
check_code_level=2,
use_tqdm=False,
callbacks=[ControlC(False)])
trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"),
batch_size=32, n_epochs=5, print_every=50, use_tqdm=False, callbacks=[ControlC(False)],
check_code_level=2)
trainer.train()
def test_LRFinder(self):
data_set, model = prepare_env()
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=5,
batch_size=32,
print_every=50,
optimizer=SGD(lr=0.1),
check_code_level=2,
use_tqdm=False,
callbacks=[LRFinder(len(data_set) // 32)])
trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"),
batch_size=32, n_epochs=5, print_every=50, use_tqdm=False,
callbacks=[LRFinder(len(data_set) // 32)], check_code_level=2)
trainer.train()
def test_TensorboardCallback(self):
data_set, model = prepare_env()
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=5,
batch_size=32,
print_every=50,
optimizer=SGD(lr=0.1),
check_code_level=2,
use_tqdm=False,
dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"),
callbacks=[TensorboardCallback("loss", "metric")])
trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"),
batch_size=32, n_epochs=5, print_every=50, dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=False,
callbacks=[TensorboardCallback("loss", "metric")], check_code_level=2)
trainer.train()
def test_readonly_property(self):
@@ -141,16 +102,9 @@ class TestCallback(unittest.TestCase):
print(self.optimizer)
data_set, model = prepare_env()
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=total_epochs,
batch_size=32,
print_every=50,
optimizer=SGD(lr=0.1),
check_code_level=2,
use_tqdm=False,
dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"),
callbacks=[MyCallback()])
trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"),
batch_size=32, n_epochs=total_epochs, print_every=50, dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=False, callbacks=[MyCallback()],
check_code_level=2)
trainer.train()
assert passed_epochs == list(range(1, total_epochs + 1))

+ 9
- 1
test/core/test_metrics.py

@@ -161,7 +161,15 @@ class TestAccuracyMetric(unittest.TestCase):
print(e)
return
self.assertTrue(True, False), "No exception catches."

def test_duplicate(self):
# potential bug in 0.4.1: duplicated argument names must not break the metric
metric = AccuracyMetric(pred='predictions', target='targets')
pred_dict = {"predictions": torch.zeros(4, 3, 2), "seq_len": torch.ones(4) * 3, 'pred':0}
target_dict = {'targets':torch.zeros(4, 3), 'target': 0}
metric(pred_dict=pred_dict, target_dict=target_dict)


def test_seq_len(self):
N = 256
seq_len = torch.zeros(N).long()


+ 11
- 48
test/core/test_trainer.py

@@ -46,18 +46,10 @@ class TrainerTestGround(unittest.TestCase):
model = NaiveClassifier(2, 1)
trainer = Trainer(train_set, model,
loss=BCELoss(pred="predict", target="y"),
metrics=AccuracyMetric(pred="predict", target="y"),
n_epochs=10,
batch_size=32,
print_every=50,
validate_every=-1,
dev_data=dev_set,
optimizer=SGD(lr=0.1),
check_code_level=2,
use_tqdm=True,
save_path=None)
trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"),
batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set,
metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None,
use_tqdm=True, check_code_level=2)
trainer.train()
"""
# should run correctly
@@ -83,10 +75,7 @@ class TrainerTestGround(unittest.TestCase):
model = Model()
with self.assertRaises(RuntimeError):
trainer = Trainer(
train_data=dataset,
model=model
)
trainer = Trainer(train_data=dataset, model=model)
"""
# the expected error message
NameError:
@@ -116,12 +105,7 @@ class TrainerTestGround(unittest.TestCase):
return {'loss': loss}
model = Model()
trainer = Trainer(
train_data=dataset,
model=model,
use_tqdm=False,
print_every=2
)
trainer = Trainer(train_data=dataset, model=model, print_every=2, use_tqdm=False)
trainer.train()
"""
# should run correctly
@@ -147,12 +131,7 @@ class TrainerTestGround(unittest.TestCase):
model = Model()
with self.assertRaises(NameError):
trainer = Trainer(
train_data=dataset,
model=model,
use_tqdm=False,
print_every=2
)
trainer = Trainer(train_data=dataset, model=model, print_every=2, use_tqdm=False)
trainer.train()
def test_trainer_suggestion4(self):
@@ -175,12 +154,7 @@ class TrainerTestGround(unittest.TestCase):
model = Model()
with self.assertRaises(NameError):
trainer = Trainer(
train_data=dataset,
model=model,
use_tqdm=False,
print_every=2
)
trainer = Trainer(train_data=dataset, model=model, print_every=2, use_tqdm=False)
def test_trainer_suggestion5(self):
# check that the error message correctly informs the user
@@ -203,12 +177,7 @@ class TrainerTestGround(unittest.TestCase):
return {'loss': loss}
model = Model()
trainer = Trainer(
train_data=dataset,
model=model,
use_tqdm=False,
print_every=2
)
trainer = Trainer(train_data=dataset, model=model, print_every=2, use_tqdm=False)
def test_trainer_suggestion6(self):
# check that the error message correctly informs the user
@@ -233,14 +202,8 @@ class TrainerTestGround(unittest.TestCase):
model = Model()
with self.assertRaises(NameError):
trainer = Trainer(
train_data=dataset,
model=model,
dev_data=dataset,
loss=CrossEntropyLoss(),
metrics=AccuracyMetric(),
use_tqdm=False,
print_every=2)
trainer = Trainer(train_data=dataset, model=model, loss=CrossEntropyLoss(), print_every=2, dev_data=dataset,
metrics=AccuracyMetric(), use_tqdm=False)
"""
def test_trainer_multiprocess(self):


+ 9
- 0
test/core/test_utils.py

@@ -237,6 +237,10 @@ class TestSeqLenToMask(unittest.TestCase):
with self.assertRaises(AssertionError):
mask = seq_len_to_mask(seq_len)

# 3. pad to a specified length
seq_len = np.random.randint(1, 10, size=(10,))
mask = seq_len_to_mask(seq_len, 100)
self.assertEqual(100, mask.shape[1])

def test_pytorch_seq_len(self):
# 1. random test
@@ -250,3 +254,8 @@ class TestSeqLenToMask(unittest.TestCase):
seq_len = torch.randn(3, 4)
with self.assertRaises(AssertionError):
mask = seq_len_to_mask(seq_len)

# 3. pad to a specified length
seq_len = torch.randint(1, 10, size=(10, ))
mask = seq_len_to_mask(seq_len, 100)
self.assertEqual(100, mask.size(1))

+ 18
- 0
test/core/test_vocabulary.py

@@ -70,6 +70,24 @@ class TestAdd(unittest.TestCase):
self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
vocab.index_dataset(dataset, field_name='char')

def test_from_dataset_no_entry(self):
# check that no_create_entry is set correctly
dataset = DataSet()
start_char = 65
num_samples = 10
test_dataset = DataSet()
for i in range(num_samples):
char = [chr(start_char + i)] * 6
ins = Instance(char=char)
dataset.append(ins)
ins = Instance(char=[c+c for c in char])
test_dataset.append(ins)
vocab = Vocabulary()
vocab.from_dataset(dataset, field_name='char', no_create_entry_dataset=test_dataset)
vocab.index_dataset(dataset, field_name='char')
for i in range(num_samples):
self.assertEqual(True, vocab._is_word_no_create_entry(chr(start_char + i)+chr(start_char + i)))


class TestIndexing(unittest.TestCase):
def test_len(self):


+ 2
- 5
test/models/model_runner.py

@@ -130,11 +130,8 @@ class ModelRunner():
tester = Tester(data=data, model=model, metrics=metrics,
batch_size=BATCH_SIZE, verbose=0)
before_train = tester.test()
trainer = Trainer(model=model, train_data=data, dev_data=None,
n_epochs=N_EPOCHS, batch_size=BATCH_SIZE,
loss=loss,
save_path=None,
use_tqdm=False)
trainer = Trainer(train_data=data, model=model, loss=loss, batch_size=BATCH_SIZE, n_epochs=N_EPOCHS,
dev_data=None, save_path=None, use_tqdm=False)
trainer.train(load_best_model=False)
after_train = tester.test()
for metric_name, v1 in before_train.items():


+ 0
- 1
test/models/test_biaffine_parser.py

@@ -1,6 +1,5 @@
import unittest

import fastNLP
from fastNLP.models.biaffine_parser import BiaffineParser, ParserLoss, ParserMetric
from .model_runner import *



+ 5
- 5
test/modules/decoder/test_CRF.py

@@ -10,14 +10,14 @@ class TestCRF(unittest.TestCase):
id2label = {0: 'B', 1: 'I', 2:'O'}
expected_res = {(0, 0), (0, 1), (0, 2), (0, 4), (1, 0), (1, 1), (1, 2), (1, 4), (2, 0), (2, 2),
(2, 4), (3, 0), (3, 2)}
self.assertSetEqual(expected_res, set(allowed_transitions(id2label)))
self.assertSetEqual(expected_res, set(allowed_transitions(id2label, include_start_end=True)))

id2label = {0: 'B', 1:'M', 2:'E', 3:'S'}
expected_res = {(0, 1), (0, 2), (1, 1), (1, 2), (2, 0), (2, 3), (2, 5), (3, 0), (3, 3), (3, 5), (4, 0), (4, 3)}
self.assertSetEqual(expected_res, set(allowed_transitions(id2label, encoding_type='BMES')))
self.assertSetEqual(expected_res, set(allowed_transitions(id2label, encoding_type='BMES', include_start_end=True)))

id2label = {0: 'B', 1: 'I', 2:'O', 3: '<pad>', 4:"<unk>"}
allowed_transitions(id2label)
allowed_transitions(id2label, include_start_end=True)

labels = ['O']
for label in ['X', 'Y']:
@@ -27,7 +27,7 @@ class TestCRF(unittest.TestCase):
expected_res = {(0, 0), (0, 1), (0, 3), (0, 6), (1, 0), (1, 1), (1, 2), (1, 3), (1, 6), (2, 0), (2, 1),
(2, 2), (2, 3), (2, 6), (3, 0), (3, 1), (3, 3), (3, 4), (3, 6), (4, 0), (4, 1), (4, 3),
(4, 4), (4, 6), (5, 0), (5, 1), (5, 3)}
self.assertSetEqual(expected_res, set(allowed_transitions(id2label)))
self.assertSetEqual(expected_res, set(allowed_transitions(id2label, include_start_end=True)))

labels = []
for label in ['X', 'Y']:
@@ -37,7 +37,7 @@ class TestCRF(unittest.TestCase):
expected_res = {(0, 1), (0, 2), (1, 1), (1, 2), (2, 0), (2, 3), (2, 4), (2, 7), (2, 9), (3, 0), (3, 3), (3, 4),
(3, 7), (3, 9), (4, 5), (4, 6), (5, 5), (5, 6), (6, 0), (6, 3), (6, 4), (6, 7), (6, 9), (7, 0),
(7, 3), (7, 4), (7, 7), (7, 9), (8, 0), (8, 3), (8, 4), (8, 7)}
self.assertSetEqual(expected_res, set(allowed_transitions(id2label, encoding_type='BMES')))
self.assertSetEqual(expected_res, set(allowed_transitions(id2label, encoding_type='BMES', include_start_end=True)))

def test_case2(self):
# check that the CRF avoids decoding illegal transitions; verified against allennlp.


+ 9
- 18
test/test_tutorials.py

@@ -60,10 +60,10 @@ class TestTutorial(unittest.TestCase):
print(test_data[0])

# these data-preprocessing utilities can also be used for projects such as reinforcement learning or GANs
from fastNLP.core.batch import Batch
from fastNLP.core.batch import DataSetIter
from fastNLP.core.sampler import RandomSampler

batch_iterator = Batch(dataset=train_data, batch_size=2, sampler=RandomSampler())
batch_iterator = DataSetIter(dataset=train_data, batch_size=2, sampler=RandomSampler())
for batch_x, batch_y in batch_iterator:
print("batch_x has: ", batch_x)
print("batch_y has: ", batch_y)
@@ -80,23 +80,19 @@ class TestTutorial(unittest.TestCase):
test_data.rename_field('label', 'label_seq')

loss = CrossEntropyLoss(pred="output", target="label_seq")
metric = AccuracyMetric(pred="predict", target="label_seq")
metric = AccuracyMetric(target="label_seq")

# instantiate a Trainer with the model and data, then train
# first overfit on test_data (to make sure the model implementation is correct)
copy_model = deepcopy(model)
overfit_trainer = Trainer(model=copy_model, train_data=test_data, dev_data=test_data,
loss=loss,
metrics=metric,
save_path=None,
batch_size=32,
n_epochs=5)
overfit_trainer = Trainer(train_data=test_data, model=copy_model, loss=loss, batch_size=32, n_epochs=5,
dev_data=test_data, metrics=metric, save_path=None)
overfit_trainer.train()

# train on train_data, validate on test_data
trainer = Trainer(model=model, train_data=train_data, dev_data=test_data,
loss=CrossEntropyLoss(pred="output", target="label_seq"),
metrics=AccuracyMetric(pred="predict", target="label_seq"),
metrics=AccuracyMetric(target="label_seq"),
save_path=None,
batch_size=32,
n_epochs=5)
@@ -106,7 +102,7 @@ class TestTutorial(unittest.TestCase):
# use Tester to evaluate on test_data
from fastNLP import Tester

tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(pred="predict", target="label_seq"),
tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(target="label_seq"),
batch_size=4)
acc = tester.test()
print(acc)
@@ -147,13 +143,8 @@ class TestTutorial(unittest.TestCase):

from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam

trainer = Trainer(model=model,
train_data=train_data,
dev_data=dev_data,
loss=CrossEntropyLoss(),
optimizer= Adam(),
metrics=AccuracyMetric(target='target')
)
trainer = Trainer(train_data=train_data, model=model, optimizer=Adam(), loss=CrossEntropyLoss(),
dev_data=dev_data, metrics=AccuracyMetric(target='target'))
trainer.train()
print('Train finished!')


