From 32fdb48754b87d7ecae02f3f5bf74af45775e151 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Thu, 16 May 2019 19:45:41 +0800 Subject: [PATCH 1/8] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86=20core=20?= =?UTF-8?q?=E9=83=A8=E5=88=86=20import=20=E7=9A=84=E9=A1=BA=E5=BA=8F?= =?UTF-8?q?=EF=BC=8C=5F=5Fall=5F=5F=20=E6=9A=B4=E9=9C=B2=E7=9A=84=E5=86=85?= =?UTF-8?q?=E5=AE=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/batch.py | 17 +-- fastNLP/core/callback.py | 23 ++-- fastNLP/core/dataset.py | 8 +- fastNLP/core/field.py | 100 ++++++++++-------- fastNLP/core/instance.py | 4 +- fastNLP/core/losses.py | 17 ++- fastNLP/core/metrics.py | 208 +++++++++++++++++++------------------ fastNLP/core/optimizer.py | 20 ++-- fastNLP/core/predictor.py | 11 +- fastNLP/core/sampler.py | 12 ++- fastNLP/core/tester.py | 29 +++--- fastNLP/core/trainer.py | 13 ++- fastNLP/core/utils.py | 132 ++++++++++++----------- fastNLP/core/vocabulary.py | 5 + 14 files changed, 336 insertions(+), 263 deletions(-) diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py index 235a9a3a..90f0fc8c 100644 --- a/fastNLP/core/batch.py +++ b/fastNLP/core/batch.py @@ -2,15 +2,19 @@ batch 模块实现了 fastNLP 所需的 Batch 类。 """ -__all__ = ["Batch"] +import atexit import numpy as np import torch -import atexit - -from .sampler import RandomSampler, Sampler import torch.multiprocessing as mp + from queue import Empty, Full +from .sampler import RandomSampler + +__all__ = [ + "Batch" +] + _python_is_exit = False @@ -120,7 +124,7 @@ class Batch(object): :return list(int) indexes: 下标序列 """ return self.cur_batch_indices - + @staticmethod def _run_fetch(batch, q): try: @@ -145,7 +149,7 @@ class Batch(object): q.put(e) finally: q.join() - + @staticmethod def _run_batch_iter(batch): q = mp.JoinableQueue(maxsize=10) @@ -182,4 +186,3 @@ def _to_tensor(batch, dtype): except: pass return batch - diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 9dce426b..0a5ddc52 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -49,6 +49,18 @@ callback模块实现了 fastNLP 中的许多 callback 类,用于增强 :class: trainer.train() """ +import os +import torch + +try: + from tensorboardX import SummaryWriter + + tensorboardX_flag = True +except: + tensorboardX_flag = False + +from ..io.model_io import ModelSaver, ModelLoader + __all__ = [ "Callback", "GradientClipCallback", @@ -60,15 +72,6 @@ __all__ = [ "CallbackException", "EarlyStopError" ] -import os -import torch -from ..io.model_io import ModelSaver, ModelLoader - -try: - from tensorboardX import SummaryWriter - tensorboardX_flag = True -except: - tensorboardX_flag = False class Callback(object): @@ -587,7 +590,7 @@ class TensorboardCallback(Callback): self._summary_writer = SummaryWriter(path) else: self._summary_writer = None - + def on_batch_begin(self, batch_x, batch_y, indices): if "model" in self.options and self.graph_added is False: # tesorboardX 这里有大bug,暂时没法画模型图 diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index b506dfae..63f66019 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -272,9 +272,7 @@ """ -__all__ = ["DataSet"] import _pickle as pickle - import numpy as np import warnings @@ -283,6 +281,10 @@ from .field import FieldArray from .instance import Instance from .utils import _get_func_signature +__all__ = [ + "DataSet" +] + class DataSet(object): """ @@ -854,4 +856,4 @@ class DataSet(object): with open(path, 'rb') as f: d = pickle.load(f) assert isinstance(d, DataSet), "The object is not 
DataSet, but {}.".format(type(d)) - return d \ No newline at end of file + return d diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index a355c4d2..4029a4ca 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -3,11 +3,17 @@ field模块实现了 FieldArray 和若干 Padder。 FieldArray 是 :class:`~fas 原理部分请参考 :doc:`fastNLP.core.dataset` """ - - import numpy as np + from copy import deepcopy +__all__ = [ + "FieldArray", + "Padder", + "AutoPadder", + "EngChar2DPadder" +] + class FieldArray(object): """ @@ -24,6 +30,7 @@ class FieldArray(object): :param bool ignore_type: 是否忽略该field的type,一般如果这个field不需要转为torch.FloatTensor或torch.LongTensor, 就可以设置为True。具体意义请参考 :class:`~fastNLP.DataSet` 。 """ + def __init__(self, name, content, is_target=None, is_input=None, padder=None, ignore_type=False): self.name = name if isinstance(content, list): @@ -41,7 +48,7 @@ class FieldArray(object): raise TypeError("content in FieldArray can only be list or numpy.ndarray, got {}.".format(type(content))) if len(content) == 0: raise RuntimeError("Cannot initialize FieldArray with empty list.") - + self.content = content # 1维 或 2维 或 3维 list, 形状可能不对齐 self.content_dim = None # 表示content是多少维的list if padder is None: @@ -51,27 +58,27 @@ class FieldArray(object): padder = deepcopy(padder) self.set_padder(padder) self.ignore_type = ignore_type - + self.BASIC_TYPES = (int, float, str) # content中可接受的Python基本类型,这里没有np.array - + self.pytype = None self.dtype = None self._is_input = None self._is_target = None - + if is_input is not None or is_target is not None: self.is_input = is_input self.is_target = is_target - + def _set_dtype(self): if self.ignore_type is False: self.pytype = self._type_detection(self.content) self.dtype = self._map_to_np_type(self.pytype) - + @property def is_input(self): return self._is_input - + @is_input.setter def is_input(self, value): """ @@ -80,11 +87,11 @@ class FieldArray(object): if value is True: self._set_dtype() self._is_input = value - + @property def is_target(self): return self._is_target - + @is_target.setter def is_target(self, value): """ @@ -93,7 +100,7 @@ class FieldArray(object): if value is True: self._set_dtype() self._is_target = value - + def _type_detection(self, content): """ 当该field被设置为is_input或者is_target时被调用 @@ -101,9 +108,9 @@ class FieldArray(object): """ if len(content) == 0: raise RuntimeError("Empty list in Field {}.".format(self.name)) - + type_set = set([type(item) for item in content]) - + if list in type_set: if len(type_set) > 1: # list 跟 非list 混在一起 @@ -139,7 +146,7 @@ class FieldArray(object): self.name, self.BASIC_TYPES, content_type)) self.content_dim = 1 return self._basic_type_detection(type_set) - + def _basic_type_detection(self, type_set): """ :param type_set: a set of Python types @@ -158,7 +165,7 @@ class FieldArray(object): else: # str, int, float混在一起 raise RuntimeError("Mixed data types in Field {}: {}".format(self.name, list(type_set))) - + def _1d_list_check(self, val): """如果不是1D list就报错 """ @@ -168,7 +175,7 @@ class FieldArray(object): self._basic_type_detection(type_set) # otherwise: _basic_type_detection will raise error return True - + def _2d_list_check(self, val): """如果不是2D list 就报错 """ @@ -181,15 +188,15 @@ class FieldArray(object): inner_type_set.add(type(obj)) self._basic_type_detection(inner_type_set) return True - + @staticmethod def _map_to_np_type(basic_type): type_mapping = {int: np.int64, float: np.float64, str: np.str, np.ndarray: np.ndarray} return type_mapping[basic_type] - + def __repr__(self): return "FieldArray {}: 
{}".format(self.name, self.content.__repr__()) - + def append(self, val): """将val append到这个field的尾部。如果这个field已经被设置为input或者target,则在append之前会检查该类型是否与已有 的内容是匹配的。 @@ -208,7 +215,7 @@ class FieldArray(object): else: raise RuntimeError( "Unexpected data type {}. Should be list, np.array, or {}".format(type(val), self.BASIC_TYPES)) - + if self.is_input is True or self.is_target is True: if type(val) == list: if len(val) == 0: @@ -231,14 +238,14 @@ class FieldArray(object): raise RuntimeError( "Unexpected data type {}. Should be list, np.array, or {}".format(type(val), self.BASIC_TYPES)) self.content.append(val) - + def __getitem__(self, indices): return self.get(indices, pad=False) - + def __setitem__(self, idx, val): assert isinstance(idx, int) self.content[idx] = val - + def get(self, indices, pad=True): """ 根据给定的indices返回内容 @@ -251,13 +258,13 @@ class FieldArray(object): return self.content[indices] if self.is_input is False and self.is_target is False: raise RuntimeError("Please specify either is_input or is_target is True for {}".format(self.name)) - + contents = [self.content[i] for i in indices] if self.padder is None or pad is False: return np.array(contents) else: return self.padder(contents, field_name=self.name, field_ele_dtype=self.dtype) - + def set_padder(self, padder): """ 设置padder,在这个field进行pad的时候用这个padder进行pad,如果为None则不进行pad。 @@ -269,7 +276,7 @@ class FieldArray(object): self.padder = deepcopy(padder) else: self.padder = None - + def set_pad_val(self, pad_val): """ 修改padder的pad_val. @@ -279,8 +286,7 @@ class FieldArray(object): if self.padder is not None: self.padder.set_pad_val(pad_val) return self - - + def __len__(self): """ Returns the size of FieldArray. @@ -288,7 +294,7 @@ class FieldArray(object): :return int length: """ return len(self.content) - + def to(self, other): """ 将other的属性复制给本FieldArray(other必须为FieldArray类型). @@ -298,14 +304,15 @@ class FieldArray(object): :return: :class:`~fastNLP.FieldArray` """ assert isinstance(other, FieldArray), "Only support FieldArray type, not {}.".format(type(other)) - + self.is_input = other.is_input self.is_target = other.is_target self.padder = other.padder self.ignore_type = other.ignore_type - + return self + def _is_iterable(content): try: _ = (e for e in content) @@ -331,13 +338,13 @@ class Padder: :return: np.array([padded_element]) """ - + def __init__(self, pad_val=0, **kwargs): self.pad_val = pad_val - + def set_pad_val(self, pad_val): self.pad_val = pad_val - + def __call__(self, contents, field_name, field_ele_dtype): """ 传入的是List内容。假设有以下的DataSet。 @@ -396,13 +403,13 @@ class AutoPadder(Padder): 2.2 如果该field的内容为List, 那么会将Batch中的List pad为一样长。若该List下还有里层的List需要padding,请使用其它padder。 即如果Instance中field形如[1, 2, 3, ...],则可以pad;若为[[1,2], [3,4, ...]]则不能进行pad """ - + def __init__(self, pad_val=0): """ :param pad_val: int, padding的位置使用该index """ super().__init__(pad_val=pad_val) - + def _is_two_dimension(self, contents): """ 判断contents是不是只有两个维度。[[1,2], [3]]是两个维度. [[[1,2], [3, 4, 5]], [[4,5]]]有三个维度 @@ -416,7 +423,7 @@ class AutoPadder(Padder): return False return True return False - + def __call__(self, contents, field_name, field_ele_dtype): if not _is_iterable(contents[0]): @@ -458,6 +465,7 @@ class EngChar2DPadder(Padder): dataset.set_padder('chars', padder) # chars这个field的设置为了EnChar2DPadder """ + def __init__(self, pad_val=0, pad_length=0): """ :param pad_val: int, pad的位置使用该index @@ -465,9 +473,9 @@ class EngChar2DPadder(Padder): 都pad或截取到该长度. 
""" super().__init__(pad_val=pad_val) - + self.pad_length = pad_length - + def _exactly_three_dims(self, contents, field_name): """ 检查传入的contents是否刚好是3维,如果不是3维就报错。理论上,第一个维度是batch,第二个维度是word,第三个维度是character @@ -486,10 +494,10 @@ class EngChar2DPadder(Padder): value = value[0] except: raise ValueError("Field:{} only has two dimensions.".format(field_name)) - + if _is_iterable(value): raise ValueError("Field:{} has more than 3 dimension.".format(field_name)) - + def __call__(self, contents, field_name, field_ele_dtype): """ 期望输入类似于 @@ -516,12 +524,12 @@ class EngChar2DPadder(Padder): max_sent_length = max(len(word_lst) for word_lst in contents) batch_size = len(contents) dtype = type(contents[0][0][0]) - + padded_array = np.full((batch_size, max_sent_length, max_char_length), fill_value=self.pad_val, - dtype=dtype) + dtype=dtype) for b_idx, word_lst in enumerate(contents): for c_idx, char_lst in enumerate(word_lst): chars = char_lst[:max_char_length] padded_array[b_idx, c_idx, :len(chars)] = chars - - return padded_array \ No newline at end of file + + return padded_array diff --git a/fastNLP/core/instance.py b/fastNLP/core/instance.py index 2303c510..07ae6495 100644 --- a/fastNLP/core/instance.py +++ b/fastNLP/core/instance.py @@ -3,7 +3,9 @@ instance 模块实现了Instance 类在fastNLP中对应sample。一个sample可 便于理解的例子可以参考文档 :doc:`fastNLP.core.dataset` 中的表格 """ -__all__ = ["Instance"] +__all__ = [ + "Instance" +] class Instance(object): diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 7a5fdf9d..b98c5ac7 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -2,13 +2,12 @@ losses 模块定义了 fastNLP 中所需的各种损失函数,一般做为 :class:`~fastNLP.Trainer` 的参数使用。 """ -__all__ = ["LossBase", "L1Loss", "LossFunc", "LossInForward", "BCELoss", "CrossEntropyLoss", "NLLLoss"] import inspect -from collections import defaultdict - import torch import torch.nn.functional as F +from collections import defaultdict + from .utils import _CheckError from .utils import _CheckRes from .utils import _build_args @@ -16,6 +15,18 @@ from .utils import _check_arg_dict_list from .utils import _check_function_or_method from .utils import _get_func_signature +__all__ = [ + "LossBase", + + "LossFunc", + "LossInForward", + + "CrossEntropyLoss", + "BCELoss", + "L1Loss", + "NLLLoss" +] + class LossBase(object): """ diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 7a96020b..df85a318 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -3,11 +3,11 @@ metrics 模块实现了 fastNLP 所需的各种常用衡量指标,一般做为 """ import inspect -from collections import defaultdict - import numpy as np import torch +from collections import defaultdict + from .utils import _CheckError from .utils import _CheckRes from .utils import _build_args @@ -16,6 +16,13 @@ from .utils import _get_func_signature from .utils import seq_len_to_mask from .vocabulary import Vocabulary +__all__ = [ + "MetricBase", + "AccuracyMetric", + "SpanFPreRecMetric", + "SQuADMetric" +] + class MetricBase(object): """ @@ -106,16 +113,17 @@ class MetricBase(object): self.get_metric将统计当前的评价指标并返回评价结果, 返回值需要是一个dict, key是指标名称,value是指标的值 """ + def __init__(self): self.param_map = {} # key is param in function, value is input param. 
self._checked = False - + def evaluate(self, *args, **kwargs): raise NotImplementedError - + def get_metric(self, reset=True): raise NotImplemented - + def _init_param_map(self, key_map=None, **kwargs): """检查key_map和其他参数map,并将这些映射关系添加到self.param_map @@ -148,7 +156,7 @@ class MetricBase(object): for value, key_set in value_counter.items(): if len(key_set) > 1: raise ValueError(f"Several parameters:{key_set} are provided with one output {value}.") - + # check consistence between signature and param_map func_spect = inspect.getfullargspec(self.evaluate) func_args = [arg for arg in func_spect.args if arg != 'self'] @@ -157,7 +165,7 @@ class MetricBase(object): raise NameError( f"Parameter `{func_param}` is not in {_get_func_signature(self.evaluate)}. Please check the " f"initialization parameters, or change its signature.") - + def _fast_param_map(self, pred_dict, target_dict): """Only used as inner function. When the pred_dict, target is unequivocal. Don't need users to pass key_map. such as pred_dict has one element, target_dict has one element @@ -172,7 +180,7 @@ class MetricBase(object): fast_param['target'] = list(target_dict.values())[0] return fast_param return fast_param - + def __call__(self, pred_dict, target_dict): """ 这个方法会调用self.evaluate 方法. @@ -187,12 +195,12 @@ class MetricBase(object): :param target_dict: DataSet.batch_y里的键-值对所组成的dict(即is_target=True的fields的内容) :return: """ - + fast_param = self._fast_param_map(pred_dict, target_dict) if fast_param: self.evaluate(**fast_param) return - + if not self._checked: if not callable(self.evaluate): raise TypeError(f"{self.__class__.__name__}.evaluate has to be callable, not {type(self.evaluate)}.") @@ -202,14 +210,14 @@ class MetricBase(object): for func_arg, input_arg in self.param_map.items(): if func_arg not in func_args: raise NameError(f"`{func_arg}` not in {_get_func_signature(self.evaluate)}.") - + # 2. only part of the param_map are passed, left are not for arg in func_args: if arg not in self.param_map: self.param_map[arg] = arg # This param does not need mapping. self._evaluate_args = func_args self._reverse_param_map = {input_arg: func_arg for func_arg, input_arg in self.param_map.items()} - + # need to wrap inputs in dict. 
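        # e.g. with param_map == {'pred': 'output', 'target': 'target'}, a
        # pred_dict {'output': tensor} is renamed to {'pred': tensor} below;
        # keys that appear in no mapping are carried over under their own name.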
mapped_pred_dict = {} mapped_target_dict = {} @@ -229,7 +237,7 @@ class MetricBase(object): not_duplicate_flag += 1 if not_duplicate_flag == 3: duplicated.append(input_arg) - + # missing if not self._checked: check_res = _check_arg_dict_list(self.evaluate, [mapped_pred_dict, mapped_target_dict]) @@ -240,23 +248,23 @@ class MetricBase(object): for idx, func_arg in enumerate(missing): # Don't delete `` in this information, nor add `` replaced_missing[idx] = f"{self.param_map[func_arg]}" + f"(assign to `{func_arg}` " \ - f"in `{self.__class__.__name__}`)" - + f"in `{self.__class__.__name__}`)" + check_res = _CheckRes(missing=replaced_missing, unused=check_res.unused, duplicated=duplicated, required=check_res.required, all_needed=check_res.all_needed, varargs=check_res.varargs) - + if check_res.missing or check_res.duplicated: raise _CheckError(check_res=check_res, func_signature=_get_func_signature(self.evaluate)) refined_args = _build_args(self.evaluate, **mapped_pred_dict, **mapped_target_dict) - + self.evaluate(**refined_args) self._checked = True - + return @@ -271,15 +279,16 @@ class AccuracyMetric(MetricBase): :param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` -> `target` :param seq_len: 参数映射表中 `seq_len` 的映射关系,None表示映射关系为 `seq_len` -> `seq_len` """ + def __init__(self, pred=None, target=None, seq_len=None): super().__init__() - + self._init_param_map(pred=pred, target=target, seq_len=seq_len) - + self.total = 0 self.acc_count = 0 - + def evaluate(self, pred, target, seq_len=None): """ evaluate函数将针对一个批次的预测结果做评价指标的累计 @@ -299,16 +308,16 @@ class AccuracyMetric(MetricBase): if not isinstance(target, torch.Tensor): raise TypeError(f"`target` in {_get_func_signature(self.evaluate)} must be torch.Tensor," f"got {type(target)}.") - + if seq_len is not None and not isinstance(seq_len, torch.Tensor): raise TypeError(f"`seq_lens` in {_get_func_signature(self.evaluate)} must be torch.Tensor," f"got {type(seq_len)}.") - + if seq_len is not None: masks = seq_len_to_mask(seq_len=seq_len) else: masks = None - + if pred.size() == target.size(): pass elif len(pred.size()) == len(target.size()) + 1: @@ -317,7 +326,7 @@ class AccuracyMetric(MetricBase): raise RuntimeError(f"In {_get_func_signature(self.evaluate)}, when pred have " f"size:{pred.size()}, target should have size: {pred.size()} or " f"{pred.size()[:-1]}, got {target.size()}.") - + target = target.to(pred) if masks is not None: self.acc_count += torch.sum(torch.eq(pred, target).masked_fill(masks.eq(0), 0)).item() @@ -325,7 +334,7 @@ class AccuracyMetric(MetricBase): else: self.acc_count += torch.sum(torch.eq(pred, target)).item() self.total += np.prod(list(pred.size())) - + def get_metric(self, reset=True): """ get_metric函数将根据evaluate函数累计的评价指标统计量来计算最终的评价结果. @@ -350,7 +359,7 @@ def _bmes_tag_to_spans(tags, ignore_labels=None): :return: List[Tuple[str, List[int, int]]]. 
[(label,[start, end])] """ ignore_labels = set(ignore_labels) if ignore_labels else set() - + spans = [] prev_bmes_tag = None for idx, tag in enumerate(tags): @@ -358,14 +367,14 @@ def _bmes_tag_to_spans(tags, ignore_labels=None): bmes_tag, label = tag[:1], tag[2:] if bmes_tag in ('b', 's'): spans.append((label, [idx, idx])) - elif bmes_tag in ('m', 'e') and prev_bmes_tag in ('b', 'm') and label==spans[-1][0]: + elif bmes_tag in ('m', 'e') and prev_bmes_tag in ('b', 'm') and label == spans[-1][0]: spans[-1][1][1] = idx else: spans.append((label, [idx, idx])) prev_bmes_tag = bmes_tag - return [(span[0], (span[1][0], span[1][1]+1)) - for span in spans - if span[0] not in ignore_labels + return [(span[0], (span[1][0], span[1][1] + 1)) + for span in spans + if span[0] not in ignore_labels ] @@ -379,7 +388,7 @@ def _bmeso_tag_to_spans(tags, ignore_labels=None): :return: List[Tuple[str, List[int, int]]]. [(label,[start, end])] """ ignore_labels = set(ignore_labels) if ignore_labels else set() - + spans = [] prev_bmes_tag = None for idx, tag in enumerate(tags): @@ -387,16 +396,16 @@ def _bmeso_tag_to_spans(tags, ignore_labels=None): bmes_tag, label = tag[:1], tag[2:] if bmes_tag in ('b', 's'): spans.append((label, [idx, idx])) - elif bmes_tag in ('m', 'e') and prev_bmes_tag in ('b', 'm') and label==spans[-1][0]: + elif bmes_tag in ('m', 'e') and prev_bmes_tag in ('b', 'm') and label == spans[-1][0]: spans[-1][1][1] = idx elif bmes_tag == 'o': pass else: spans.append((label, [idx, idx])) prev_bmes_tag = bmes_tag - return [(span[0], (span[1][0], span[1][1]+1)) - for span in spans - if span[0] not in ignore_labels + return [(span[0], (span[1][0], span[1][1] + 1)) + for span in spans + if span[0] not in ignore_labels ] @@ -410,7 +419,7 @@ def _bio_tag_to_spans(tags, ignore_labels=None): :return: List[Tuple[str, List[int, int]]]. [(label,[start, end])] """ ignore_labels = set(ignore_labels) if ignore_labels else set() - + spans = [] prev_bio_tag = None for idx, tag in enumerate(tags): @@ -418,14 +427,14 @@ def _bio_tag_to_spans(tags, ignore_labels=None): bio_tag, label = tag[:1], tag[2:] if bio_tag == 'b': spans.append((label, [idx, idx])) - elif bio_tag == 'i' and prev_bio_tag in ('b', 'i') and label==spans[-1][0]: + elif bio_tag == 'i' and prev_bio_tag in ('b', 'i') and label == spans[-1][0]: spans[-1][1][1] = idx - elif bio_tag == 'o': # o tag does not count + elif bio_tag == 'o': # o tag does not count pass else: spans.append((label, [idx, idx])) prev_bio_tag = bio_tag - return [(span[0], (span[1][0], span[1][1]+1)) for span in spans if span[0] not in ignore_labels] + return [(span[0], (span[1][0], span[1][1] + 1)) for span in spans if span[0] not in ignore_labels] class SpanFPreRecMetric(MetricBase): @@ -470,16 +479,17 @@ class SpanFPreRecMetric(MetricBase): :param float beta: f_beta分数,f_beta = (1 + beta^2)*(pre*rec)/(beta^2*pre + rec). 常用为beta=0.5, 1, 2. 
若为0.5 则精确率的权重高于召回率;若为1,则两者平等;若为2,则召回率权重高于精确率。 """ + def __init__(self, tag_vocab, pred=None, target=None, seq_len=None, encoding_type='bio', ignore_labels=None, - only_gross=True, f_type='micro', beta=1): + only_gross=True, f_type='micro', beta=1): encoding_type = encoding_type.lower() - + if not isinstance(tag_vocab, Vocabulary): raise TypeError("tag_vocab can only be fastNLP.Vocabulary, not {}.".format(type(tag_vocab))) if f_type not in ('micro', 'macro'): raise ValueError("f_type only supports `micro` or `macro`', got {}.".format(f_type)) - + self.encoding_type = encoding_type if self.encoding_type == 'bmes': self.tag_to_span_func = _bmes_tag_to_spans @@ -489,22 +499,22 @@ class SpanFPreRecMetric(MetricBase): self.tag_to_span_func = _bmeso_tag_to_spans else: raise ValueError("Only support 'bio', 'bmes', 'bmeso' type.") - + self.ignore_labels = ignore_labels self.f_type = f_type self.beta = beta - self.beta_square = self.beta**2 + self.beta_square = self.beta ** 2 self.only_gross = only_gross - + super().__init__() self._init_param_map(pred=pred, target=target, seq_len=seq_len) - + self.tag_vocab = tag_vocab - + self._true_positives = defaultdict(int) self._false_positives = defaultdict(int) self._false_negatives = defaultdict(int) - + def evaluate(self, pred, target, seq_len): """evaluate函数将针对一个批次的预测结果做评价指标的累计 @@ -519,11 +529,11 @@ class SpanFPreRecMetric(MetricBase): if not isinstance(target, torch.Tensor): raise TypeError(f"`target` in {_get_func_signature(self.evaluate)} must be torch.Tensor," f"got {type(target)}.") - + if not isinstance(seq_len, torch.Tensor): raise TypeError(f"`seq_lens` in {_get_func_signature(self.evaluate)} must be torch.Tensor," f"got {type(seq_len)}.") - + if pred.size() == target.size() and len(target.size()) == 2: pass elif len(pred.size()) == len(target.size()) + 1 and len(target.size()) == 2: @@ -536,20 +546,20 @@ class SpanFPreRecMetric(MetricBase): raise RuntimeError(f"In {_get_func_signature(self.evaluate)}, when pred have " f"size:{pred.size()}, target should have size: {pred.size()} or " f"{pred.size()[:-1]}, got {target.size()}.") - + batch_size = pred.size(0) pred = pred.tolist() target = target.tolist() for i in range(batch_size): pred_tags = pred[i][:int(seq_len[i])] gold_tags = target[i][:int(seq_len[i])] - + pred_str_tags = [self.tag_vocab.to_word(tag) for tag in pred_tags] gold_str_tags = [self.tag_vocab.to_word(tag) for tag in gold_tags] - + pred_spans = self.tag_to_span_func(pred_str_tags, ignore_labels=self.ignore_labels) gold_spans = self.tag_to_span_func(gold_str_tags, ignore_labels=self.ignore_labels) - + for span in pred_spans: if span in gold_spans: self._true_positives[span[0]] += 1 @@ -558,7 +568,7 @@ class SpanFPreRecMetric(MetricBase): self._false_positives[span[0]] += 1 for span in gold_spans: self._false_negatives[span[0]] += 1 - + def get_metric(self, reset=True): """get_metric函数将根据evaluate函数累计的评价指标统计量来计算最终的评价结果.""" evaluate_result = {} @@ -577,19 +587,19 @@ class SpanFPreRecMetric(MetricBase): f_sum += f pre_sum += pre rec_sum + rec - if not self.only_gross and tag!='': # tag!=''防止无tag的情况 + if not self.only_gross and tag != '': # tag!=''防止无tag的情况 f_key = 'f-{}'.format(tag) pre_key = 'pre-{}'.format(tag) rec_key = 'rec-{}'.format(tag) evaluate_result[f_key] = f evaluate_result[pre_key] = pre evaluate_result[rec_key] = rec - + if self.f_type == 'macro': - evaluate_result['f'] = f_sum/len(tags) - evaluate_result['pre'] = pre_sum/len(tags) - evaluate_result['rec'] = rec_sum/len(tags) - + evaluate_result['f'] = f_sum / len(tags) + 
evaluate_result['pre'] = pre_sum / len(tags) + evaluate_result['rec'] = rec_sum / len(tags) + if self.f_type == 'micro': f, pre, rec = self._compute_f_pre_rec(sum(self._true_positives.values()), sum(self._false_negatives.values()), @@ -597,17 +607,17 @@ class SpanFPreRecMetric(MetricBase): evaluate_result['f'] = f evaluate_result['pre'] = pre evaluate_result['rec'] = rec - + if reset: self._true_positives = defaultdict(int) self._false_positives = defaultdict(int) self._false_negatives = defaultdict(int) - + for key, value in evaluate_result.items(): evaluate_result[key] = round(value, 6) - + return evaluate_result - + def _compute_f_pre_rec(self, tp, fn, fp): """ @@ -619,11 +629,10 @@ class SpanFPreRecMetric(MetricBase): pre = tp / (fp + tp + 1e-13) rec = tp / (fn + tp + 1e-13) f = (1 + self.beta_square) * pre * rec / (self.beta_square * pre + rec + 1e-13) - + return f, pre, rec - def _prepare_metrics(metrics): """ @@ -705,33 +714,33 @@ class SQuADMetric(MetricBase): :param bool print_predict_stat: True则输出预测答案是否为空与正确答案是否为空的统计信息, False则不输出 """ - + def __init__(self, pred1=None, pred2=None, target1=None, target2=None, beta=1, right_open=True, print_predict_stat=False): super(SQuADMetric, self).__init__() - + self._init_param_map(pred1=pred1, pred2=pred2, target1=target1, target2=target2) - + self.print_predict_stat = print_predict_stat - + self.no_ans_correct = 0 self.no_ans_wrong = 0 - + self.has_ans_correct = 0 self.has_ans_wrong = 0 - + self.has_ans_f = 0. - + self.no2no = 0 self.no2yes = 0 self.yes2no = 0 self.yes2yes = 0 - + self.f_beta = beta - + self.right_open = right_open - + def evaluate(self, pred1, pred2, target1, target2): """evaluate函数将针对一个批次的预测结果做评价指标的累计 @@ -745,7 +754,7 @@ class SQuADMetric(MetricBase): pred_end = pred2 target_start = target1 target_end = target2 - + if len(pred_start.size()) == 2: start_inference = pred_start.max(dim=-1)[1].cpu().tolist() else: @@ -754,12 +763,12 @@ class SQuADMetric(MetricBase): end_inference = pred_end.max(dim=-1)[1].cpu().tolist() else: end_inference = pred_end.cpu().tolist() - + start, end = [], [] max_len = pred_start.size(1) t_start = target_start.cpu().tolist() t_end = target_end.cpu().tolist() - + for s, e in zip(start_inference, end_inference): start.append(min(s, e)) end.append(max(s, e)) @@ -779,7 +788,7 @@ class SQuADMetric(MetricBase): self.yes2no += 1 else: self.yes2yes += 1 - + if s == ts and e == te: self.has_ans_correct += 1 else: @@ -787,29 +796,29 @@ class SQuADMetric(MetricBase): a = [0] * s + [1] * (e - s) + [0] * (max_len - e) b = [0] * ts + [1] * (te - ts) + [0] * (max_len - te) a, b = torch.tensor(a), torch.tensor(b) - + TP = int(torch.sum(a * b)) pre = TP / int(torch.sum(a)) if int(torch.sum(a)) > 0 else 0 rec = TP / int(torch.sum(b)) if int(torch.sum(b)) > 0 else 0 - + if pre + rec > 0: - f = (1 + (self.f_beta**2)) * pre * rec / ((self.f_beta**2) * pre + rec) + f = (1 + (self.f_beta ** 2)) * pre * rec / ((self.f_beta ** 2) * pre + rec) else: f = 0 self.has_ans_f += f - + def get_metric(self, reset=True): """get_metric函数将根据evaluate函数累计的评价指标统计量来计算最终的评价结果.""" evaluate_result = {} - + if self.no_ans_correct + self.no_ans_wrong + self.has_ans_correct + self.no_ans_wrong <= 0: return evaluate_result - + evaluate_result['EM'] = 0 evaluate_result[f'f_{self.f_beta}'] = 0 - + flag = 0 - + if self.no_ans_correct + self.no_ans_wrong > 0: evaluate_result[f'noAns-f_{self.f_beta}'] = \ round(100 * self.no_ans_correct / (self.no_ans_correct + self.no_ans_wrong), 3) @@ -818,7 +827,7 @@ class SQuADMetric(MetricBase): 
            evaluate_result['noAns-EM'] = \
                round(100 * self.no_ans_correct / (self.no_ans_correct + self.no_ans_wrong), 3)
evaluate_result[f'f_{self.f_beta}'] += evaluate_result[f'noAns-f_{self.f_beta}'] evaluate_result['EM'] += evaluate_result['noAns-EM'] flag += 1 - + if self.has_ans_correct + self.has_ans_wrong > 0: evaluate_result[f'hasAns-f_{self.f_beta}'] = \ round(100 * self.has_ans_f / (self.has_ans_correct + self.has_ans_wrong), 3) @@ -827,32 +836,31 @@ class SQuADMetric(MetricBase): evaluate_result[f'f_{self.f_beta}'] += evaluate_result[f'hasAns-f_{self.f_beta}'] evaluate_result['EM'] += evaluate_result['hasAns-EM'] flag += 1 - + if self.print_predict_stat: evaluate_result['no2no'] = self.no2no evaluate_result['no2yes'] = self.no2yes evaluate_result['yes2no'] = self.yes2no evaluate_result['yes2yes'] = self.yes2yes - + if flag <= 0: return evaluate_result - + evaluate_result[f'f_{self.f_beta}'] = round(evaluate_result[f'f_{self.f_beta}'] / flag, 3) evaluate_result['EM'] = round(evaluate_result['EM'] / flag, 3) - + if reset: self.no_ans_correct = 0 self.no_ans_wrong = 0 - + self.has_ans_correct = 0 self.has_ans_wrong = 0 - + self.has_ans_f = 0. - + self.no2no = 0 self.no2yes = 0 self.yes2no = 0 self.yes2yes = 0 - + return evaluate_result - diff --git a/fastNLP/core/optimizer.py b/fastNLP/core/optimizer.py index ea4905eb..28f618f9 100644 --- a/fastNLP/core/optimizer.py +++ b/fastNLP/core/optimizer.py @@ -4,6 +4,12 @@ optimizer 模块定义了 fastNLP 中所需的各种优化器,一般做为 :cl """ import torch +__all__ = [ + "Optimizer", + "SGD", + "Adam" +] + class Optimizer(object): """ @@ -12,15 +18,16 @@ class Optimizer(object): :param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models. :param kwargs: additional parameters. """ + def __init__(self, model_params, **kwargs): if model_params is not None and not hasattr(model_params, "__next__"): raise RuntimeError("model parameters should be a generator, rather than {}.".format(type(model_params))) self.model_params = model_params self.settings = kwargs - + def construct_from_pytorch(self, model_params): raise NotImplementedError - + def _get_require_grads_param(self, params): """ 将params中不需要gradient的删除 @@ -29,6 +36,7 @@ class Optimizer(object): """ return [param for param in params if param.requires_grad] + class SGD(Optimizer): """ 别名::class:`fastNLP.SGD` :class:`fastNLP.core.optimizer.SGD` @@ -37,12 +45,12 @@ class SGD(Optimizer): :param float momentum: momentum. Default: 0 :param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models. """ - + def __init__(self, lr=0.001, momentum=0, model_params=None): if not isinstance(lr, float): raise TypeError("learning rate has to be float.") super(SGD, self).__init__(model_params, lr=lr, momentum=momentum) - + def construct_from_pytorch(self, model_params): if self.model_params is None: # careful! generator cannot be assigned. @@ -59,13 +67,13 @@ class Adam(Optimizer): :param float weight_decay: :param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models. """ - + def __init__(self, lr=0.001, weight_decay=0, betas=(0.9, 0.999), eps=1e-8, amsgrad=False, model_params=None): if not isinstance(lr, float): raise TypeError("learning rate has to be float.") super(Adam, self).__init__(model_params, lr=lr, betas=betas, eps=eps, amsgrad=amsgrad, weight_decay=weight_decay) - + def construct_from_pytorch(self, model_params): if self.model_params is None: # careful! generator cannot be assigned. 
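A minimal usage sketch for the optimizer wrappers above (illustrative; in normal
use Trainer calls construct_from_pytorch() itself)::

    import torch.nn as nn
    from fastNLP import SGD

    model = nn.Linear(10, 2)
    opt = SGD(lr=0.01, momentum=0.9)   # lr must be a float
    torch_opt = opt.construct_from_pytorch(model.parameters())
    # torch_opt is a torch.optim.SGD built from the stored settings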
diff --git a/fastNLP/core/predictor.py b/fastNLP/core/predictor.py index 34784b7c..a9ef7924 100644 --- a/fastNLP/core/predictor.py +++ b/fastNLP/core/predictor.py @@ -1,7 +1,11 @@ -from collections import defaultdict - +""" + ..todo:: + 检查这个类是否需要 +""" import torch +from collections import defaultdict + from . import Batch from . import DataSet from . import SequentialSampler @@ -9,7 +13,8 @@ from .utils import _build_args class Predictor(object): - """An interface for predicting outputs based on trained models. + """ + An interface for predicting outputs based on trained models. It does not care about evaluations of the model, which is different from Tester. This is a high-level model wrapper to be called by FastNLP. diff --git a/fastNLP/core/sampler.py b/fastNLP/core/sampler.py index e270dac1..0900e733 100644 --- a/fastNLP/core/sampler.py +++ b/fastNLP/core/sampler.py @@ -1,12 +1,16 @@ """ sampler 子类实现了 fastNLP 所需的各种采样器。 - - """ -__all__ = ["Sampler", "BucketSampler", "SequentialSampler", "RandomSampler"] +import numpy as np + from itertools import chain -import numpy as np +__all__ = [ + "Sampler", + "BucketSampler", + "SequentialSampler", + "RandomSampler" +] class Sampler(object): diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index 7b6fdda5..47aef46e 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -33,9 +33,8 @@ Tester在验证进行之前会调用model.eval()提示当前进入了evaluation """ import warnings - import torch -from torch import nn +import torch.nn as nn from .batch import Batch from .dataset import DataSet @@ -49,6 +48,10 @@ from .utils import _get_func_signature from .utils import _get_model_device from .utils import _move_model_to_device +__all__ = [ + "Tester" +] + class Tester(object): """ @@ -77,29 +80,29 @@ class Tester(object): 如果模型是通过predict()进行预测的话,那么将不能使用多卡(DataParallel)进行验证,只会使用第一张卡上的模型。 :param int verbose: 如果为0不输出任何信息; 如果为1,打印出验证结果。 """ - + def __init__(self, data, model, metrics, batch_size=16, device=None, verbose=1): super(Tester, self).__init__() - + if not isinstance(data, DataSet): raise TypeError(f"The type of data must be `fastNLP.DataSet`, got `{type(data)}`.") if not isinstance(model, nn.Module): raise TypeError(f"The type of model must be `torch.nn.Module`, got `{type(model)}`.") - + self.metrics = _prepare_metrics(metrics) - + self.data = data self._model = _move_model_to_device(model, device=device) self.batch_size = batch_size self.verbose = verbose - + # 如果是DataParallel将没有办法使用predict方法 if isinstance(self._model, nn.DataParallel): if hasattr(self._model.module, 'predict') and not hasattr(self._model, 'predict'): warnings.warn("Cannot use DataParallel to test your model, because your model offer predict() function," " while DataParallel has no predict() function.") self._model = self._model.module - + # check predict if hasattr(self._model, 'predict'): self._predict_func = self._model.predict @@ -109,7 +112,7 @@ class Tester(object): f"for evaluation, not `{type(self._predict_func)}`.") else: self._predict_func = self._model.forward - + def test(self): """开始进行验证,并返回验证结果。 @@ -144,12 +147,12 @@ class Tester(object): _check_loss_evaluate(prev_func_signature=prev_func_signature, func_signature=e.func_signature, check_res=e.check_res, pred_dict=pred_dict, target_dict=batch_y, dataset=self.data, check_level=0) - + if self.verbose >= 1: print("[tester] \n{}".format(self._format_eval_results(eval_results))) self._mode(network, is_test=False) return eval_results - + def _mode(self, model, is_test=False): """Train mode or Test mode. 
This is for PyTorch currently. @@ -161,13 +164,13 @@ class Tester(object): model.eval() else: model.train() - + def _data_forward(self, func, x): """A forward pass of the model. """ x = _build_args(func, **x) y = func(**x) return y - + def _format_eval_results(self, results): """Override this method to support more print formats. diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 9b56d834..87d57f12 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -297,13 +297,12 @@ Example2.3 """ import os -import time -from datetime import datetime -from datetime import timedelta - import numpy as np +import time import torch -from torch import nn +import torch.nn as nn + +from datetime import datetime, timedelta try: from tqdm.auto import tqdm @@ -315,6 +314,7 @@ from .callback import CallbackManager, CallbackException from .dataset import DataSet from .losses import _prepare_losser from .metrics import _prepare_metrics +from .optimizer import Optimizer from .sampler import Sampler from .sampler import RandomSampler from .sampler import SequentialSampler @@ -326,7 +326,6 @@ from .utils import _check_loss_evaluate from .utils import _move_dict_value_to_device from .utils import _get_func_signature from .utils import _get_model_device -from .optimizer import Optimizer from .utils import _move_model_to_device @@ -464,7 +463,7 @@ class Trainer(object): len(self.train_data) % self.batch_size != 0)) * self.n_epochs self.model = _move_model_to_device(self.model, device=device) - + if isinstance(optimizer, torch.optim.Optimizer): self.optimizer = optimizer elif isinstance(optimizer, Optimizer): diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index 2c386bbe..a7ad3326 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -1,20 +1,25 @@ """ utils模块实现了 fastNLP 内部和外部所需的很多工具。其中用户可以使用的是 :func:`cache_results` 修饰器。 """ -__all__ = ["cache_results", "seq_len_to_mask"] import _pickle import inspect +import numpy as np import os +import torch +import torch.nn as nn import warnings + from collections import Counter from collections import namedtuple -import numpy as np -import torch -from torch import nn +__all__ = [ + "cache_results", + "seq_len_to_mask" +] _CheckRes = namedtuple('_CheckRes', ['missing', 'unused', 'duplicated', 'required', 'all_needed', - 'varargs']) + 'varargs']) + def _prepare_cache_filepath(filepath): """ @@ -83,11 +88,13 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1): :param int _verbose: 是否打印cache的信息。 :return: """ + def wrapper_(func): signature = inspect.signature(func) for key, _ in signature.parameters.items(): if key in ('_cache_fp', '_refresh', '_verbose'): raise RuntimeError("The function decorated by cache_results cannot have keyword `{}`.".format(key)) + def wrapper(*args, **kwargs): if '_cache_fp' in kwargs: cache_filepath = kwargs.pop('_cache_fp') @@ -95,7 +102,7 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1): else: cache_filepath = _cache_fp if '_refresh' in kwargs: - refresh = kwargs.pop('_refresh') + refresh = kwargs.pop('_refresh') assert isinstance(refresh, bool), "_refresh can only be bool." 
else: refresh = _refresh @@ -105,16 +112,16 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1): else: verbose = _verbose refresh_flag = True - + if cache_filepath is not None and refresh is False: # load data if os.path.exists(cache_filepath): with open(cache_filepath, 'rb') as f: results = _pickle.load(f) - if verbose==1: + if verbose == 1: print("Read cache from {}.".format(cache_filepath)) refresh_flag = False - + if refresh_flag: results = func(*args, **kwargs) if cache_filepath is not None: @@ -124,11 +131,14 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1): with open(cache_filepath, 'wb') as f: _pickle.dump(results, f) print("Save cache to {}.".format(cache_filepath)) - + return results + return wrapper + return wrapper_ + # def save_pickle(obj, pickle_path, file_name): # """Save an object into a pickle file. # @@ -196,7 +206,7 @@ def _move_model_to_device(model, device): """ if isinstance(model, torch.nn.parallel.DistributedDataParallel): raise RuntimeError("model of `torch.nn.parallel.DistributedDataParallel` is not supported right now.") - + if device is None: if isinstance(model, torch.nn.DataParallel): model.cuda() @@ -205,34 +215,35 @@ def _move_model_to_device(model, device): if not torch.cuda.is_available() and ( device != 'cpu' or (isinstance(device, torch.device) and device.type != 'cpu')): raise ValueError("There is no usable gpu. set `device` as `cpu` or `None`.") - + if isinstance(model, torch.nn.DataParallel): raise RuntimeError("When model is `torch.nn.DataParallel`, the device has to be `None`.") - + if isinstance(device, int): - assert device>-1, "device can only be non-negative integer" - assert torch.cuda.device_count()>device, "Only has {} gpus, cannot use device {}.".format(torch.cuda.device_count(), - device) + assert device > -1, "device can only be non-negative integer" + assert torch.cuda.device_count() > device, "Only has {} gpus, cannot use device {}.".format( + torch.cuda.device_count(), + device) device = torch.device('cuda:{}'.format(device)) elif isinstance(device, str): device = torch.device(device) if device.type == 'cuda' and device.index is not None: - assert device.index-1, "Only non-negative device id allowed." - if len(device)>1: + assert d > -1, "Only non-negative device id allowed." + if len(device) > 1: output_device = device[0] model = nn.DataParallel(model, device_ids=device, output_device=output_device) device = torch.device(device[0]) @@ -250,9 +261,9 @@ def _get_model_device(model): :return: torch.device,None 如果返回值为None,说明这个模型没有任何参数。 """ assert isinstance(model, nn.Module) - + parameters = list(model.parameters()) - if len(parameters)==0: + if len(parameters) == 0: return None else: return parameters[0].device @@ -407,7 +418,7 @@ def _move_dict_value_to_device(*args, device: torch.device, non_blocking=False): if not isinstance(device, torch.device): raise TypeError(f"device must be `torch.device`, got `{type(device)}`") - + for arg in args: if isinstance(arg, dict): for key, value in arg.items(): @@ -422,10 +433,10 @@ class _CheckError(Exception): _CheckError. Used in losses.LossBase, metrics.MetricBase. 
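    Raised when the mapped arguments of a loss/metric are missing or duplicated;
    Trainer and Tester catch it and pass its check_res to _check_loss_evaluate()
    to print suggestions.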
""" - + def __init__(self, check_res: _CheckRes, func_signature: str): errs = [f'Problems occurred when calling `{func_signature}`'] - + if check_res.varargs: errs.append(f"\tvarargs: {check_res.varargs}(Does not support pass positional arguments, please delete it)") if check_res.missing: @@ -434,9 +445,9 @@ class _CheckError(Exception): errs.append(f"\tduplicated param: {check_res.duplicated}") if check_res.unused: errs.append(f"\tunused param: {check_res.unused}") - + Exception.__init__(self, '\n'.join(errs)) - + self.check_res = check_res self.func_signature = func_signature @@ -456,7 +467,7 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re # if check_res.varargs: # errs.append(f"\tvarargs: *{check_res.varargs}") # suggestions.append(f"Does not support pass positional arguments, please delete *{check_res.varargs}.") - + if check_res.unused: for _unused in check_res.unused: if _unused in target_dict: @@ -466,8 +477,8 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re if _unused_field: unuseds.append(f"\tunused field: {_unused_field}") if _unused_param: - unuseds.append(f"\tunused param: {_unused_param}") # output from predict or forward - + unuseds.append(f"\tunused param: {_unused_param}") # output from predict or forward + module_name = func_signature.split('.')[0] if check_res.missing: errs.append(f"\tmissing param: {check_res.missing}") @@ -488,14 +499,14 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re mapped_missing.append(_miss) else: unmapped_missing.append(_miss) - + for _miss in mapped_missing + unmapped_missing: if _miss in dataset: suggestions.append(f"Set `{_miss}` as target.") else: _tmp = '' if check_res.unused: - _tmp = f"Check key assignment for `{input_func_map.get(_miss, _miss)}` when initialize {module_name}." + _tmp = f"Check key assignment for `{input_func_map.get(_miss,_miss)}` when initialize {module_name}." if _tmp: _tmp += f' Or provide `{_miss}` in DataSet or output of {prev_func_signature}.' else: @@ -513,25 +524,25 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re # else: # _tmp = f'Provide `{_miss}` in output of {prev_func_signature} or DataSet.' # suggestions.append(_tmp) - + if check_res.duplicated: errs.append(f"\tduplicated param: {check_res.duplicated}.") suggestions.append(f"Delete {check_res.duplicated} in the output of " f"{prev_func_signature} or do not set {check_res.duplicated} as targets. ") - - if len(errs)>0: + + if len(errs) > 0: errs.extend(unuseds) elif check_level == STRICT_CHECK_LEVEL: errs.extend(unuseds) - + if len(errs) > 0: errs.insert(0, f'Problems occurred when calling {func_signature}') sugg_str = "" if len(suggestions) > 1: for idx, sugg in enumerate(suggestions): - if idx>0: + if idx > 0: sugg_str += '\t\t\t' - sugg_str += f'({idx+1}). {sugg}\n' + sugg_str += f'({idx + 1}). {sugg}\n' sugg_str = sugg_str[:-1] else: sugg_str += suggestions[0] @@ -546,14 +557,15 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re _unused_warn = f'{check_res.unused} is not used by {module_name}.' 
warnings.warn(message=_unused_warn) + def _check_forward_error(forward_func, batch_x, dataset, check_level): check_res = _check_arg_dict_list(forward_func, batch_x) func_signature = _get_func_signature(forward_func) - + errs = [] suggestions = [] _unused = [] - + # if check_res.varargs: # errs.append(f"\tvarargs: {check_res.varargs}") # suggestions.append(f"Does not support pass positional arguments, please delete *{check_res.varargs}.") @@ -574,20 +586,20 @@ def _check_forward_error(forward_func, batch_x, dataset, check_level): # _tmp += f"Or you might find it in `unused field:`, you can use DataSet.rename_field() to " \ # f"rename the field in `unused field:`." suggestions.append(_tmp) - + if check_res.unused: _unused = [f"\tunused field: {check_res.unused}"] - if len(errs)>0: + if len(errs) > 0: errs.extend(_unused) elif check_level == STRICT_CHECK_LEVEL: errs.extend(_unused) - + if len(errs) > 0: errs.insert(0, f'Problems occurred when calling {func_signature}') sugg_str = "" if len(suggestions) > 1: for idx, sugg in enumerate(suggestions): - sugg_str += f'({idx+1}). {sugg}' + sugg_str += f'({idx + 1}). {sugg}' else: sugg_str += suggestions[0] err_str = '\n' + '\n'.join(errs) + '\n\tSuggestion: ' + sugg_str @@ -622,8 +634,8 @@ def seq_len_to_mask(seq_len): assert len(np.shape(seq_len)) == 1, f"seq_len can only have one dimension, got {len(np.shape(seq_len))}." max_len = int(seq_len.max()) broad_cast_seq_len = np.tile(np.arange(max_len), (len(seq_len), 1)) - mask = broad_cast_seq_len Date: Thu, 16 May 2019 20:32:10 +0800 Subject: [PATCH 2/8] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86=20models=20?= =?UTF-8?q?=E9=83=A8=E5=88=86=20import=20=E7=9A=84=E9=A1=BA=E5=BA=8F?= =?UTF-8?q?=EF=BC=8C=5F=5Fall=5F=5F=20=E6=9A=B4=E9=9C=B2=E7=9A=84=E5=86=85?= =?UTF-8?q?=E5=AE=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/source/fastNLP.models.base_model.rst | 7 - docs/source/fastNLP.models.bert.rst | 7 - .../source/fastNLP.models.enas_controller.rst | 7 - docs/source/fastNLP.models.enas_model.rst | 7 - docs/source/fastNLP.models.enas_trainer.rst | 7 - docs/source/fastNLP.models.enas_utils.rst | 7 - docs/source/fastNLP.models.rst | 6 - fastNLP/core/batch.py | 4 +- fastNLP/core/callback.py | 1 + fastNLP/core/dataset.py | 3 +- fastNLP/core/field.py | 4 +- fastNLP/core/losses.py | 4 +- fastNLP/core/metrics.py | 4 +- fastNLP/core/predictor.py | 4 +- fastNLP/core/sampler.py | 4 +- fastNLP/core/tester.py | 1 + fastNLP/core/trainer.py | 6 +- fastNLP/core/utils.py | 9 +- fastNLP/io/__init__.py | 9 +- fastNLP/io/base_loader.py | 19 ++- fastNLP/io/config_io.py | 64 ++++--- fastNLP/io/dataset_loader.py | 11 +- fastNLP/io/embed_loader.py | 56 +++--- fastNLP/io/model_io.py | 15 +- fastNLP/models/__init__.py | 20 ++- fastNLP/models/base_model.py | 10 +- fastNLP/models/biaffine_parser.py | 159 ++++++++++-------- fastNLP/models/cnn_text_classification.py | 17 +- fastNLP/models/enas_controller.py | 1 + fastNLP/models/enas_model.py | 139 +++++++-------- fastNLP/models/enas_trainer.py | 141 ++++++++-------- fastNLP/models/enas_utils.py | 2 - fastNLP/models/sequence_labeling.py | 10 +- fastNLP/models/snli.py | 61 +++---- fastNLP/models/star_transformer.py | 67 +++++--- 35 files changed, 465 insertions(+), 428 deletions(-) delete mode 100644 docs/source/fastNLP.models.base_model.rst delete mode 100644 docs/source/fastNLP.models.bert.rst delete mode 100644 docs/source/fastNLP.models.enas_controller.rst delete mode 100644 docs/source/fastNLP.models.enas_model.rst delete mode 
100644 docs/source/fastNLP.models.enas_trainer.rst delete mode 100644 docs/source/fastNLP.models.enas_utils.rst diff --git a/docs/source/fastNLP.models.base_model.rst b/docs/source/fastNLP.models.base_model.rst deleted file mode 100644 index e1d4d64f..00000000 --- a/docs/source/fastNLP.models.base_model.rst +++ /dev/null @@ -1,7 +0,0 @@ -fastNLP.models.base\_model -========================== - -.. automodule:: fastNLP.models.base_model - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.models.bert.rst b/docs/source/fastNLP.models.bert.rst deleted file mode 100644 index bba323df..00000000 --- a/docs/source/fastNLP.models.bert.rst +++ /dev/null @@ -1,7 +0,0 @@ -fastNLP.models.bert -=================== - -.. automodule:: fastNLP.models.bert - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.models.enas_controller.rst b/docs/source/fastNLP.models.enas_controller.rst deleted file mode 100644 index 28655bd7..00000000 --- a/docs/source/fastNLP.models.enas_controller.rst +++ /dev/null @@ -1,7 +0,0 @@ -fastNLP.models.enas\_controller -=============================== - -.. automodule:: fastNLP.models.enas_controller - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.models.enas_model.rst b/docs/source/fastNLP.models.enas_model.rst deleted file mode 100644 index 35fbe495..00000000 --- a/docs/source/fastNLP.models.enas_model.rst +++ /dev/null @@ -1,7 +0,0 @@ -fastNLP.models.enas\_model -========================== - -.. automodule:: fastNLP.models.enas_model - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.models.enas_trainer.rst b/docs/source/fastNLP.models.enas_trainer.rst deleted file mode 100644 index 7e0ef462..00000000 --- a/docs/source/fastNLP.models.enas_trainer.rst +++ /dev/null @@ -1,7 +0,0 @@ -fastNLP.models.enas\_trainer -============================ - -.. automodule:: fastNLP.models.enas_trainer - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.models.enas_utils.rst b/docs/source/fastNLP.models.enas_utils.rst deleted file mode 100644 index 0a049706..00000000 --- a/docs/source/fastNLP.models.enas_utils.rst +++ /dev/null @@ -1,7 +0,0 @@ -fastNLP.models.enas\_utils -========================== - -.. automodule:: fastNLP.models.enas_utils - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.models.rst b/docs/source/fastNLP.models.rst index 57592bf4..5858ebcd 100644 --- a/docs/source/fastNLP.models.rst +++ b/docs/source/fastNLP.models.rst @@ -12,14 +12,8 @@ fastNLP.models .. 
toctree:: :titlesonly: - fastNLP.models.base_model - fastNLP.models.bert fastNLP.models.biaffine_parser fastNLP.models.cnn_text_classification - fastNLP.models.enas_controller - fastNLP.models.enas_model - fastNLP.models.enas_trainer - fastNLP.models.enas_utils fastNLP.models.sequence_labeling fastNLP.models.snli fastNLP.models.star_transformer diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py index 90f0fc8c..b031d051 100644 --- a/fastNLP/core/batch.py +++ b/fastNLP/core/batch.py @@ -3,12 +3,12 @@ batch 模块实现了 fastNLP 所需的 Batch 类。 """ import atexit +from queue import Empty, Full + import numpy as np import torch import torch.multiprocessing as mp -from queue import Empty, Full - from .sampler import RandomSampler __all__ = [ diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 0a5ddc52..51495f23 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -50,6 +50,7 @@ callback模块实现了 fastNLP 中的许多 callback 类,用于增强 :class: """ import os + import torch try: diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 63f66019..f20dd1f8 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -273,9 +273,10 @@ """ import _pickle as pickle -import numpy as np import warnings +import numpy as np + from .field import AutoPadder from .field import FieldArray from .instance import Instance diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index 4029a4ca..14e2538d 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -3,10 +3,10 @@ field模块实现了 FieldArray 和若干 Padder。 FieldArray 是 :class:`~fas 原理部分请参考 :doc:`fastNLP.core.dataset` """ -import numpy as np - from copy import deepcopy +import numpy as np + __all__ = [ "FieldArray", "Padder", diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index b98c5ac7..797b557d 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -3,11 +3,11 @@ losses 模块定义了 fastNLP 中所需的各种损失函数,一般做为 :cl """ import inspect +from collections import defaultdict + import torch import torch.nn.functional as F -from collections import defaultdict - from .utils import _CheckError from .utils import _CheckRes from .utils import _build_args diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index df85a318..5ea2a5f1 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -3,11 +3,11 @@ metrics 模块实现了 fastNLP 所需的各种常用衡量指标,一般做为 """ import inspect +from collections import defaultdict + import numpy as np import torch -from collections import defaultdict - from .utils import _CheckError from .utils import _CheckRes from .utils import _build_args diff --git a/fastNLP/core/predictor.py b/fastNLP/core/predictor.py index a9ef7924..4f37e105 100644 --- a/fastNLP/core/predictor.py +++ b/fastNLP/core/predictor.py @@ -2,10 +2,10 @@ ..todo:: 检查这个类是否需要 """ -import torch - from collections import defaultdict +import torch + from . import Batch from . import DataSet from . 
import SequentialSampler diff --git a/fastNLP/core/sampler.py b/fastNLP/core/sampler.py index 0900e733..c8577722 100644 --- a/fastNLP/core/sampler.py +++ b/fastNLP/core/sampler.py @@ -1,10 +1,10 @@ """ sampler 子类实现了 fastNLP 所需的各种采样器。 """ -import numpy as np - from itertools import chain +import numpy as np + __all__ = [ "Sampler", "BucketSampler", diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index 47aef46e..883e0d01 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -33,6 +33,7 @@ Tester在验证进行之前会调用model.eval()提示当前进入了evaluation """ import warnings + import torch import torch.nn as nn diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 87d57f12..7efa5d28 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -297,13 +297,13 @@ Example2.3 """ import os -import numpy as np import time +from datetime import datetime, timedelta + +import numpy as np import torch import torch.nn as nn -from datetime import datetime, timedelta - try: from tqdm.auto import tqdm except: diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index a7ad3326..6e2f99ff 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -3,14 +3,13 @@ utils模块实现了 fastNLP 内部和外部所需的很多工具。其中用户 """ import _pickle import inspect -import numpy as np import os -import torch -import torch.nn as nn import warnings +from collections import Counter, namedtuple -from collections import Counter -from collections import namedtuple +import numpy as np +import torch +import torch.nn as nn __all__ = [ "cache_results", diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index 3baf878c..6ce7ebc3 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -9,6 +9,11 @@ 这些类的使用方法如下: """ +from .embed_loader import EmbedLoader +from .dataset_loader import DataSetLoader, CSVLoader, JsonLoader, ConllLoader, SNLILoader, SSTLoader, \ + PeopleDailyCorpusLoader, Conll2003Loader +from .model_io import ModelLoader, ModelSaver + __all__ = [ 'EmbedLoader', @@ -24,7 +29,3 @@ __all__ = [ 'ModelLoader', 'ModelSaver', ] -from .embed_loader import EmbedLoader -from .dataset_loader import DataSetLoader, CSVLoader, JsonLoader, ConllLoader, SNLILoader, SSTLoader, \ - PeopleDailyCorpusLoader, Conll2003Loader -from .model_io import ModelLoader as ModelLoader, ModelSaver as ModelSaver \ No newline at end of file diff --git a/fastNLP/io/base_loader.py b/fastNLP/io/base_loader.py index 051de281..33f59fe5 100644 --- a/fastNLP/io/base_loader.py +++ b/fastNLP/io/base_loader.py @@ -1,15 +1,20 @@ import _pickle as pickle import os +__all__ = [ + "BaseLoader" +] + class BaseLoader(object): """ 各个 Loader 的基类,提供了 API 的参考。 """ + def __init__(self): super(BaseLoader, self).__init__() - + @staticmethod def load_lines(data_path): """ @@ -20,7 +25,7 @@ class BaseLoader(object): with open(data_path, "r", encoding="utf=8") as f: text = f.readlines() return [line.strip() for line in text] - + @classmethod def load(cls, data_path): """ @@ -31,7 +36,7 @@ class BaseLoader(object): with open(data_path, "r", encoding="utf-8") as f: text = f.readlines() return [[word for word in sent.strip()] for sent in text] - + @classmethod def load_with_cache(cls, data_path, cache_path): """缓存版的load @@ -48,16 +53,18 @@ class BaseLoader(object): class DataLoaderRegister: _readers = {} - + @classmethod def set_reader(cls, reader_cls, read_fn_name): # def wrapper(reader_cls): if read_fn_name in cls._readers: - raise KeyError('duplicate reader: {} and {} for read_func: {}'.format(cls._readers[read_fn_name], reader_cls, 
read_fn_name)) + raise KeyError( + 'duplicate reader: {} and {} for read_func: {}'.format(cls._readers[read_fn_name], reader_cls, + read_fn_name)) if hasattr(reader_cls, 'load'): cls._readers[read_fn_name] = reader_cls().load return reader_cls - + @classmethod def get_reader(cls, read_fn_name): if read_fn_name in cls._readers: diff --git a/fastNLP/io/config_io.py b/fastNLP/io/config_io.py index 8fa30dd4..e67511ee 100644 --- a/fastNLP/io/config_io.py +++ b/fastNLP/io/config_io.py @@ -1,14 +1,20 @@ """ - 用于读入和处理和保存 config 文件 + .. todo:: + 这个模块中的类可能被抛弃? """ -__all__ = ["ConfigLoader","ConfigSection","ConfigSaver"] import configparser import json import os from .base_loader import BaseLoader +__all__ = [ + "ConfigLoader", + "ConfigSection", + "ConfigSaver" +] + class ConfigLoader(BaseLoader): """ @@ -19,15 +25,16 @@ class ConfigLoader(BaseLoader): :param str data_path: 配置文件的路径 """ + def __init__(self, data_path=None): super(ConfigLoader, self).__init__() if data_path is not None: self.config = self.parse(super(ConfigLoader, self).load(data_path)) - + @staticmethod def parse(string): raise NotImplementedError - + @staticmethod def load_config(file_path, sections): """ @@ -81,10 +88,10 @@ class ConfigSection(object): ConfigSection是一个存储了一个section中所有键值对的数据结构,推荐使用此类的实例来配合 :meth:`ConfigLoader.load_config` 使用 """ - + def __init__(self): super(ConfigSection, self).__init__() - + def __getitem__(self, key): """ :param key: str, the name of the attribute @@ -97,7 +104,7 @@ class ConfigSection(object): if key in self.__dict__.keys(): return getattr(self, key) raise AttributeError("do NOT have attribute %s" % key) - + def __setitem__(self, key, value): """ :param key: str, the name of the attribute @@ -112,14 +119,14 @@ class ConfigSection(object): raise AttributeError("attr %s except %s but got %s" % (key, str(type(getattr(self, key))), str(type(value)))) setattr(self, key, value) - + def __contains__(self, item): """ :param item: The key of item. :return: True if the key in self.__dict__.keys() else False. """ return item in self.__dict__.keys() - + def __eq__(self, other): """Overwrite the == operator @@ -131,15 +138,15 @@ class ConfigSection(object): return False if getattr(self, k) != getattr(self, k): return False - + for k in other.__dict__.keys(): if k not in self.__dict__.keys(): return False if getattr(self, k) != getattr(self, k): return False - + return True - + def __ne__(self, other): """Overwrite the != operator @@ -147,7 +154,7 @@ class ConfigSection(object): :return: """ return not self.__eq__(other) - + @property def data(self): return self.__dict__ @@ -162,11 +169,12 @@ class ConfigSaver(object): :param str file_path: 配置文件的路径 """ + def __init__(self, file_path): self.file_path = file_path if not os.path.exists(self.file_path): raise FileNotFoundError("file {} NOT found!".__format__(self.file_path)) - + def _get_section(self, sect_name): """ This is the function to get the section with the section name. @@ -177,7 +185,7 @@ class ConfigSaver(object): sect = ConfigSection() ConfigLoader().load_config(self.file_path, {sect_name: sect}) return sect - + def _read_section(self): """ This is the function to read sections from the config file. @@ -187,16 +195,16 @@ class ConfigSaver(object): sect_key_list: A list of names in sect_list. 
""" sect_name = None - + sect_list = {} sect_key_list = [] - + single_section = {} single_section_key = [] - + with open(self.file_path, 'r') as f: lines = f.readlines() - + for line in lines: if line.startswith('[') and line.endswith(']\n'): if sect_name is None: @@ -208,29 +216,29 @@ class ConfigSaver(object): sect_key_list.append(sect_name) sect_name = line[1: -2] continue - + if line.startswith('#'): single_section[line] = '#' single_section_key.append(line) continue - + if line.startswith('\n'): single_section_key.append('\n') continue - + if '=' not in line: raise RuntimeError("can NOT load config file {}".__format__(self.file_path)) - + key = line.split('=', maxsplit=1)[0].strip() value = line.split('=', maxsplit=1)[1].strip() + '\n' single_section[key] = value single_section_key.append(key) - + if sect_name is not None: sect_list[sect_name] = single_section, single_section_key sect_key_list.append(sect_name) return sect_list, sect_key_list - + def _write_section(self, sect_list, sect_key_list): """ This is the function to write config file with section list and name list. @@ -252,7 +260,7 @@ class ConfigSaver(object): continue f.write(key + ' = ' + single_section[key]) f.write('\n') - + def save_config_file(self, section_name, section): """ 这个方法可以用来修改并保存配置文件中单独的一个 section @@ -284,11 +292,11 @@ class ConfigSaver(object): break if not change_file: return - + sect_list, sect_key_list = self._read_section() if section_name not in sect_key_list: raise AttributeError() - + sect, sect_key = sect_list[section_name] for k in section.__dict__.keys(): if k not in sect_key: diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index 3cd475a5..a4b233ad 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -10,6 +10,12 @@ dataset_loader模块实现了许多 DataSetLoader, 用于读取不同格式的 # ... 
do stuff """ +from nltk.tree import Tree + +from ..core.dataset import DataSet +from ..core.instance import Instance +from .file_reader import _read_csv, _read_json, _read_conll + __all__ = [ 'DataSetLoader', 'CSVLoader', @@ -20,11 +26,6 @@ __all__ = [ 'PeopleDailyCorpusLoader', 'Conll2003Loader', ] -from nltk.tree import Tree - -from ..core.dataset import DataSet -from ..core.instance import Instance -from .file_reader import _read_csv, _read_json, _read_conll def _download_from_url(url, path): diff --git a/fastNLP/io/embed_loader.py b/fastNLP/io/embed_loader.py index 9f3a73dd..7a845366 100644 --- a/fastNLP/io/embed_loader.py +++ b/fastNLP/io/embed_loader.py @@ -1,11 +1,15 @@ import os +import warnings import numpy as np from ..core.vocabulary import Vocabulary from .base_loader import BaseLoader -import warnings +__all__ = [ + "EmbedLoader" +] + class EmbedLoader(BaseLoader): """ @@ -13,10 +17,10 @@ class EmbedLoader(BaseLoader): 用于读取预训练的embedding, 读取结果可直接载入为模型参数。 """ - + def __init__(self): super(EmbedLoader, self).__init__() - + @staticmethod def load_with_vocab(embed_filepath, vocab, dtype=np.float32, normalize=True, error='ignore'): """ @@ -40,11 +44,11 @@ class EmbedLoader(BaseLoader): line = f.readline().strip() parts = line.split() start_idx = 0 - if len(parts)==2: + if len(parts) == 2: dim = int(parts[1]) start_idx += 1 else: - dim = len(parts)-1 + dim = len(parts) - 1 f.seek(0) matrix = np.random.randn(len(vocab), dim).astype(dtype) for idx, line in enumerate(f, start_idx): @@ -63,21 +67,21 @@ class EmbedLoader(BaseLoader): total_hits = sum(hit_flags) print("Found {} out of {} words in the pre-training embedding.".format(total_hits, len(vocab))) found_vectors = matrix[hit_flags] - if len(found_vectors)!=0: + if len(found_vectors) != 0: mean = np.mean(found_vectors, axis=0, keepdims=True) std = np.std(found_vectors, axis=0, keepdims=True) unfound_vec_num = len(vocab) - total_hits - r_vecs = np.random.randn(unfound_vec_num, dim).astype(dtype)*std + mean - matrix[hit_flags==False] = r_vecs - + r_vecs = np.random.randn(unfound_vec_num, dim).astype(dtype) * std + mean + matrix[hit_flags == False] = r_vecs + if normalize: matrix /= np.linalg.norm(matrix, axis=1, keepdims=True) - + return matrix - + @staticmethod def load_without_vocab(embed_filepath, dtype=np.float32, padding='', unknown='', normalize=True, - error='ignore'): + error='ignore'): """ 从embed_filepath中读取预训练的word vector。根据预训练的词表读取embedding并生成一个对应的Vocabulary。 @@ -96,35 +100,35 @@ class EmbedLoader(BaseLoader): vec_dict = {} found_unknown = False found_pad = False - + with open(embed_filepath, 'r', encoding='utf-8') as f: line = f.readline() start = 1 dim = -1 - if len(line.strip().split())!=2: + if len(line.strip().split()) != 2: f.seek(0) start = 0 for idx, line in enumerate(f, start=start): try: parts = line.strip().split() word = parts[0] - if dim==-1: - dim = len(parts)-1 + if dim == -1: + dim = len(parts) - 1 vec = np.fromstring(' '.join(parts[1:]), sep=' ', dtype=dtype, count=dim) vec_dict[word] = vec vocab.add_word(word) - if unknown is not None and unknown==word: + if unknown is not None and unknown == word: found_unknown = True - if found_pad is not None and padding==word: + if found_pad is not None and padding == word: found_pad = True except Exception as e: - if error=='ignore': + if error == 'ignore': warnings.warn("Error occurred at the {} line.".format(idx)) pass else: print("Error occurred at the {} line.".format(idx)) raise e - if dim==-1: + if dim == -1: raise RuntimeError("{} is an empty 
file.".format(embed_filepath)) matrix = np.random.randn(len(vocab), dim).astype(dtype) if (unknown is not None and not found_unknown) or (padding is not None and not found_pad): @@ -133,19 +137,19 @@ class EmbedLoader(BaseLoader): start_idx += 1 if unknown is not None: start_idx += 1 - + mean = np.mean(matrix[start_idx:], axis=0, keepdims=True) std = np.std(matrix[start_idx:], axis=0, keepdims=True) if (unknown is not None and not found_unknown): - matrix[start_idx-1] = np.random.randn(1, dim).astype(dtype)*std + mean + matrix[start_idx - 1] = np.random.randn(1, dim).astype(dtype) * std + mean if (padding is not None and not found_pad): - matrix[0] = np.random.randn(1, dim).astype(dtype)*std + mean - + matrix[0] = np.random.randn(1, dim).astype(dtype) * std + mean + for key, vec in vec_dict.items(): index = vocab.to_index(key) matrix[index] = vec - + if normalize: matrix /= np.linalg.norm(matrix, axis=1, keepdims=True) - + return matrix, vocab diff --git a/fastNLP/io/model_io.py b/fastNLP/io/model_io.py index 48e53ab3..36393cd4 100644 --- a/fastNLP/io/model_io.py +++ b/fastNLP/io/model_io.py @@ -5,6 +5,11 @@ import torch from .base_loader import BaseLoader +__all__ = [ + "ModelLoader", + "ModelSaver" +] + class ModelLoader(BaseLoader): """ @@ -12,10 +17,10 @@ class ModelLoader(BaseLoader): 用于读取模型 """ - + def __init__(self): super(ModelLoader, self).__init__() - + @staticmethod def load_pytorch(empty_model, model_path): """ @@ -25,7 +30,7 @@ class ModelLoader(BaseLoader): :param str model_path: 模型保存的路径 """ empty_model.load_state_dict(torch.load(model_path)) - + @staticmethod def load_pytorch_model(model_path): """ @@ -48,14 +53,14 @@ class ModelSaver(object): saver.save_pytorch(model) """ - + def __init__(self, save_path): """ :param save_path: 模型保存的路径 """ self.save_path = save_path - + def save_pytorch(self, model, param_only=True): """ 把 PyTorch 模型存入 ".pkl" 文件 diff --git a/fastNLP/models/__init__.py b/fastNLP/models/__init__.py index 66af3a46..f9ade153 100644 --- a/fastNLP/models/__init__.py +++ b/fastNLP/models/__init__.py @@ -7,7 +7,6 @@ fastNLP 在 :mod:`~fastNLP.models` 模块中内置了如 :class:`~fastNLP.models """ -__all__ = ["CNNText", "SeqLabeling", "ESIM", "STSeqLabel", "AdvSeqLabel", "STNLICls", "STSeqCls"] from .base_model import BaseModel from .bert import BertForMultipleChoice, BertForQuestionAnswering, BertForSequenceClassification, \ BertForTokenClassification @@ -15,4 +14,21 @@ from .biaffine_parser import BiaffineParser, GraphParser from .cnn_text_classification import CNNText from .sequence_labeling import SeqLabeling, AdvSeqLabel from .snli import ESIM -from .star_transformer import STSeqCls, STNLICls, STSeqLabel +from .star_transformer import StarTransEnc, STSeqCls, STNLICls, STSeqLabel + +__all__ = [ + "CNNText", + + "SeqLabeling", + "AdvSeqLabel", + + "ESIM", + + "StarTransEnc", + "STSeqLabel", + "STNLICls", + "STSeqCls", + + "BiaffineParser", + "GraphParser" +] diff --git a/fastNLP/models/base_model.py b/fastNLP/models/base_model.py index 39ac99a0..d27f1d21 100644 --- a/fastNLP/models/base_model.py +++ b/fastNLP/models/base_model.py @@ -6,13 +6,13 @@ from ..modules.decoder.MLP import MLP class BaseModel(torch.nn.Module): """Base PyTorch model for all models. 
""" - + def __init__(self): super(BaseModel, self).__init__() - + def fit(self, train_data, dev_data=None, **train_args): pass - + def predict(self, *args, **kwargs): raise NotImplementedError @@ -21,9 +21,9 @@ class NaiveClassifier(BaseModel): def __init__(self, in_feature_dim, out_feature_dim): super(NaiveClassifier, self).__init__() self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim]) - + def forward(self, x): return {"predict": torch.sigmoid(self.mlp(x))} - + def predict(self, x): return {"predict": torch.sigmoid(self.mlp(x)) > 0.5} diff --git a/fastNLP/models/biaffine_parser.py b/fastNLP/models/biaffine_parser.py index 100bfb72..7f16202d 100644 --- a/fastNLP/models/biaffine_parser.py +++ b/fastNLP/models/biaffine_parser.py @@ -1,11 +1,12 @@ -"""Biaffine Dependency Parser 的 Pytorch 实现. """ -from collections import defaultdict - +Biaffine Dependency Parser 的 Pytorch 实现. +""" import numpy as np import torch -from torch import nn -from torch.nn import functional as F +import torch.nn as nn +import torch.nn.functional as F + +from collections import defaultdict from ..core.const import Const as C from ..core.losses import LossFunc @@ -18,6 +19,12 @@ from ..modules.utils import get_embeddings from .base_model import BaseModel from ..core.utils import seq_len_to_mask +__all__ = [ + "BiaffineParser", + "GraphParser" +] + + def _mst(scores): """ with some modification to support parser output for MST decoding @@ -44,7 +51,7 @@ def _mst(scores): scores[roots, new_heads] / root_scores)] heads[roots] = new_heads heads[new_root] = 0 - + edges = defaultdict(set) vertices = set((0,)) for dep, head in enumerate(heads[tokens]): @@ -73,7 +80,7 @@ def _mst(scores): heads[changed_cycle] = new_head edges[new_head].add(changed_cycle) edges[old_head].remove(changed_cycle) - + return heads @@ -88,7 +95,7 @@ def _find_cycle(vertices, edges): _lowlinks = {} _onstack = defaultdict(lambda: False) _SCCs = [] - + def _strongconnect(v): nonlocal _index _indices[v] = _index @@ -96,28 +103,28 @@ def _find_cycle(vertices, edges): _index += 1 _stack.append(v) _onstack[v] = True - + for w in edges[v]: if w not in _indices: _strongconnect(w) _lowlinks[v] = min(_lowlinks[v], _lowlinks[w]) elif _onstack[w]: _lowlinks[v] = min(_lowlinks[v], _indices[w]) - + if _lowlinks[v] == _indices[v]: SCC = set() while True: w = _stack.pop() _onstack[w] = False SCC.add(w) - if not(w != v): + if not (w != v): break _SCCs.append(SCC) - + for v in vertices: if v not in _indices: _strongconnect(v) - + return [SCC for SCC in _SCCs if len(SCC) > 1] @@ -125,9 +132,10 @@ class GraphParser(BaseModel): """ 基于图的parser base class, 支持贪婪解码和最大生成树解码 """ + def __init__(self): super(GraphParser, self).__init__() - + @staticmethod def greedy_decoder(arc_matrix, mask=None): """ @@ -146,7 +154,7 @@ class GraphParser(BaseModel): if mask is not None: heads *= mask.long() return heads - + @staticmethod def mst_decoder(arc_matrix, mask=None): """ @@ -176,6 +184,7 @@ class ArcBiaffine(nn.Module): :param hidden_size: 输入的特征维度 :param bias: 是否使用bias. Default: ``True`` """ + def __init__(self, hidden_size, bias=True): super(ArcBiaffine, self).__init__() self.U = nn.Parameter(torch.Tensor(hidden_size, hidden_size), requires_grad=True) @@ -185,7 +194,7 @@ class ArcBiaffine(nn.Module): else: self.register_parameter("bias", None) initial_parameter(self) - + def forward(self, head, dep): """ @@ -209,11 +218,12 @@ class LabelBilinear(nn.Module): :param num_label: 边类别的个数 :param bias: 是否使用bias. 
Default: ``True`` """ + def __init__(self, in1_features, in2_features, num_label, bias=True): super(LabelBilinear, self).__init__() self.bilinear = nn.Bilinear(in1_features, in2_features, num_label, bias=bias) self.lin = nn.Linear(in1_features + in2_features, num_label, bias=False) - + def forward(self, x1, x2): """ @@ -225,13 +235,13 @@ class LabelBilinear(nn.Module): output += self.lin(torch.cat([x1, x2], dim=2)) return output + class BiaffineParser(GraphParser): """ 别名::class:`fastNLP.models.BiaffineParser` :class:`fastNLP.models.baffine_parser.BiaffineParser` Biaffine Dependency Parser 实现. - 论文参考 ` Deep Biaffine Attention for Neural Dependency Parsing (Dozat and Manning, 2016) - `_ . + 论文参考 `Deep Biaffine Attention for Neural Dependency Parsing (Dozat and Manning, 2016) `_ . :param init_embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 embedding的大小和每个词的维度. 也可以传入 nn.Embedding 对象, @@ -248,18 +258,19 @@ class BiaffineParser(GraphParser): :param use_greedy_infer: 是否在inference时使用贪心算法. 若 ``False`` , 使用更加精确但相对缓慢的MST算法. Default: ``False`` """ + def __init__(self, - init_embed, - pos_vocab_size, - pos_emb_dim, - num_label, - rnn_layers=1, - rnn_hidden_size=200, - arc_mlp_size=100, - label_mlp_size=100, - dropout=0.3, - encoder='lstm', - use_greedy_infer=False): + init_embed, + pos_vocab_size, + pos_emb_dim, + num_label, + rnn_layers=1, + rnn_hidden_size=200, + arc_mlp_size=100, + label_mlp_size=100, + dropout=0.3, + encoder='lstm', + use_greedy_infer=False): super(BiaffineParser, self).__init__() rnn_out_size = 2 * rnn_hidden_size word_hid_dim = pos_hid_dim = rnn_hidden_size @@ -295,20 +306,20 @@ class BiaffineParser(GraphParser): if (d_k * n_head) != rnn_out_size: raise ValueError('unsupported rnn_out_size: {} for transformer'.format(rnn_out_size)) self.position_emb = nn.Embedding(num_embeddings=self.max_len, - embedding_dim=rnn_out_size,) + embedding_dim=rnn_out_size, ) self.encoder = TransformerEncoder(num_layers=rnn_layers, model_size=rnn_out_size, inner_size=1024, key_size=d_k, value_size=d_v, num_head=n_head, - dropout=dropout,) + dropout=dropout, ) else: raise ValueError('unsupported encoder type: {}'.format(encoder)) - + self.mlp = nn.Sequential(nn.Linear(rnn_out_size, arc_mlp_size * 2 + label_mlp_size * 2), - nn.ELU(), - TimestepDropout(p=dropout),) + nn.ELU(), + TimestepDropout(p=dropout), ) self.arc_mlp_size = arc_mlp_size self.label_mlp_size = label_mlp_size self.arc_predictor = ArcBiaffine(arc_mlp_size, bias=True) @@ -316,7 +327,7 @@ class BiaffineParser(GraphParser): self.use_greedy_infer = use_greedy_infer self.reset_parameters() self.dropout = dropout - + def reset_parameters(self): for m in self.modules(): if isinstance(m, nn.Embedding): @@ -327,7 +338,7 @@ class BiaffineParser(GraphParser): else: for p in m.parameters(): nn.init.normal_(p, 0, 0.1) - + def forward(self, words1, words2, seq_len, target1=None): """模型forward阶段 @@ -337,50 +348,52 @@ class BiaffineParser(GraphParser): :param target1: [batch_size, seq_len] 输入真实标注的heads, 仅在训练阶段有效, 用于训练label分类器. 
若为 ``None`` , 使用预测的heads输入到label分类器 Default: ``None`` - :return dict: parsing结果:: + :return dict: parsing + 结果:: + + pred1: [batch_size, seq_len, seq_len] 边预测logits + pred2: [batch_size, seq_len, num_label] label预测logits + pred3: [batch_size, seq_len] heads的预测结果, 在 ``target1=None`` 时预测 - pred1: [batch_size, seq_len, seq_len] 边预测logits - pred2: [batch_size, seq_len, num_label] label预测logits - pred3: [batch_size, seq_len] heads的预测结果, 在 ``target1=None`` 时预测 """ # prepare embeddings batch_size, length = words1.shape # print('forward {} {}'.format(batch_size, seq_len)) - + # get sequence mask mask = seq_len_to_mask(seq_len).long() - - word = self.word_embedding(words1) # [N,L] -> [N,L,C_0] - pos = self.pos_embedding(words2) # [N,L] -> [N,L,C_1] - + + word = self.word_embedding(words1) # [N,L] -> [N,L,C_0] + pos = self.pos_embedding(words2) # [N,L] -> [N,L,C_1] + word, pos = self.word_fc(word), self.pos_fc(pos) word, pos = self.word_norm(word), self.pos_norm(pos) - x = torch.cat([word, pos], dim=2) # -> [N,L,C] - + x = torch.cat([word, pos], dim=2) # -> [N,L,C] + # encoder, extract features if self.encoder_name.endswith('lstm'): sort_lens, sort_idx = torch.sort(seq_len, dim=0, descending=True) x = x[sort_idx] x = nn.utils.rnn.pack_padded_sequence(x, sort_lens, batch_first=True) - feat, _ = self.encoder(x) # -> [N,L,C] + feat, _ = self.encoder(x) # -> [N,L,C] feat, _ = nn.utils.rnn.pad_packed_sequence(feat, batch_first=True) _, unsort_idx = torch.sort(sort_idx, dim=0, descending=False) feat = feat[unsort_idx] else: - seq_range = torch.arange(length, dtype=torch.long, device=x.device)[None,:] + seq_range = torch.arange(length, dtype=torch.long, device=x.device)[None, :] x = x + self.position_emb(seq_range) feat = self.encoder(x, mask.float()) - + # for arc biaffine # mlp, reduce dim feat = self.mlp(feat) arc_sz, label_sz = self.arc_mlp_size, self.label_mlp_size - arc_dep, arc_head = feat[:,:,:arc_sz], feat[:,:,arc_sz:2*arc_sz] - label_dep, label_head = feat[:,:,2*arc_sz:2*arc_sz+label_sz], feat[:,:,2*arc_sz+label_sz:] - + arc_dep, arc_head = feat[:, :, :arc_sz], feat[:, :, arc_sz:2 * arc_sz] + label_dep, label_head = feat[:, :, 2 * arc_sz:2 * arc_sz + label_sz], feat[:, :, 2 * arc_sz + label_sz:] + # biaffine arc classifier - arc_pred = self.arc_predictor(arc_head, arc_dep) # [N, L, L] - + arc_pred = self.arc_predictor(arc_head, arc_dep) # [N, L, L] + # use gold or predicted arc to predict label if target1 is None or not self.training: # use greedy decoding in training @@ -390,22 +403,22 @@ class BiaffineParser(GraphParser): heads = self.mst_decoder(arc_pred, mask) head_pred = heads else: - assert self.training # must be training mode + assert self.training # must be training mode if target1 is None: heads = self.greedy_decoder(arc_pred, mask) head_pred = heads else: head_pred = None heads = target1 - + batch_range = torch.arange(start=0, end=batch_size, dtype=torch.long, device=words1.device).unsqueeze(1) label_head = label_head[batch_range, heads].contiguous() - label_pred = self.label_predictor(label_head, label_dep) # [N, L, num_label] + label_pred = self.label_predictor(label_head, label_dep) # [N, L, num_label] res_dict = {C.OUTPUTS(0): arc_pred, C.OUTPUTS(1): label_pred} if head_pred is not None: res_dict[C.OUTPUTS(2)] = head_pred return res_dict - + @staticmethod def loss(pred1, pred2, target1, target2, seq_len): """ @@ -418,7 +431,7 @@ class BiaffineParser(GraphParser): :param seq_len: [batch_size, seq_len] 真实目标的长度 :return loss: scalar """ - + batch_size, length, _ = pred1.shape mask = 
seq_len_to_mask(seq_len) flip_mask = (mask == 0) @@ -430,24 +443,26 @@ class BiaffineParser(GraphParser): child_index = torch.arange(length, device=arc_logits.device, dtype=torch.long).unsqueeze(0) arc_loss = arc_logits[batch_index, child_index, target1] label_loss = label_logits[batch_index, child_index, target2] - + byte_mask = flip_mask.byte() arc_loss.masked_fill_(byte_mask, 0) label_loss.masked_fill_(byte_mask, 0) arc_nll = -arc_loss.mean() label_nll = -label_loss.mean() return arc_nll + label_nll - + def predict(self, words1, words2, seq_len): """模型预测API :param words1: [batch_size, seq_len] 输入word序列 :param words2: [batch_size, seq_len] 输入pos序列 :param seq_len: [batch_size, seq_len] 输入序列长度 - :return dict: parsing结果:: + :return dict: parsing + 结果:: + + pred1: [batch_size, seq_len] heads的预测结果 + pred2: [batch_size, seq_len, num_label] label预测logits - pred1: [batch_size, seq_len] heads的预测结果 - pred2: [batch_size, seq_len, num_label] label预测logits """ res = self(words1, words2, seq_len) output = {} @@ -470,6 +485,7 @@ class ParserLoss(LossFunc): :param seq_len: [batch_size, seq_len] 真实目标的长度 :return loss: scalar """ + def __init__(self, pred1=None, pred2=None, target1=None, target2=None, seq_len=None): @@ -497,9 +513,10 @@ class ParserMetric(MetricBase): UAS: 不带label时, 边预测的准确率 LAS: 同时预测边和label的准确率 """ + def __init__(self, pred1=None, pred2=None, target1=None, target2=None, seq_len=None): - + super().__init__() self._init_param_map(pred1=pred1, pred2=pred2, target1=target1, target2=target2, @@ -507,13 +524,13 @@ class ParserMetric(MetricBase): self.num_arc = 0 self.num_label = 0 self.num_sample = 0 - + def get_metric(self, reset=True): - res = {'UAS': self.num_arc*1.0 / self.num_sample, 'LAS': self.num_label*1.0 / self.num_sample} + res = {'UAS': self.num_arc * 1.0 / self.num_sample, 'LAS': self.num_label * 1.0 / self.num_sample} if reset: self.num_sample = self.num_label = self.num_arc = 0 return res - + def evaluate(self, pred1, pred2, target1, target2, seq_len=None): """Evaluate the performance of prediction. 
""" @@ -522,7 +539,7 @@ class ParserMetric(MetricBase): else: seq_mask = seq_len_to_mask(seq_len.long()).long() # mask out tag - seq_mask[:,0] = 0 + seq_mask[:, 0] = 0 head_pred_correct = (pred1 == target1).long() * seq_mask label_pred_correct = (pred2 == target2).long() * head_pred_correct self.num_arc += head_pred_correct.sum().item() diff --git a/fastNLP/models/cnn_text_classification.py b/fastNLP/models/cnn_text_classification.py index 01b03b9f..a9ccc568 100644 --- a/fastNLP/models/cnn_text_classification.py +++ b/fastNLP/models/cnn_text_classification.py @@ -1,12 +1,13 @@ -# python: 3.6 -# encoding: utf-8 - import torch import torch.nn as nn -from ..core.const import Const as C +from ..core.const import Const as C from ..modules import encoder +__all__ = [ + "CNNText" +] + class CNNText(torch.nn.Module): """ @@ -23,7 +24,7 @@ class CNNText(torch.nn.Module): :param int padding: 对句子前后的pad的大小, 用0填充。 :param float dropout: Dropout的大小 """ - + def __init__(self, init_embed, num_classes, kernel_nums=(3, 4, 5), @@ -31,7 +32,7 @@ class CNNText(torch.nn.Module): padding=0, dropout=0.5): super(CNNText, self).__init__() - + # no support for pre-trained embedding currently self.embed = encoder.Embedding(init_embed) self.conv_pool = encoder.ConvMaxpool( @@ -41,7 +42,7 @@ class CNNText(torch.nn.Module): padding=padding) self.dropout = nn.Dropout(dropout) self.fc = nn.Linear(sum(kernel_nums), num_classes) - + def forward(self, words, seq_len=None): """ @@ -54,7 +55,7 @@ class CNNText(torch.nn.Module): x = self.dropout(x) x = self.fc(x) # [N,C] -> [N, N_class] return {C.OUTPUT: x} - + def predict(self, words, seq_len=None): """ :param torch.LongTensor words: [batch_size, seq_len],句子中word的index diff --git a/fastNLP/models/enas_controller.py b/fastNLP/models/enas_controller.py index 16b970e6..e83c6b51 100644 --- a/fastNLP/models/enas_controller.py +++ b/fastNLP/models/enas_controller.py @@ -5,6 +5,7 @@ import os import torch import torch.nn.functional as F + from . import enas_utils as utils from .enas_utils import Node diff --git a/fastNLP/models/enas_model.py b/fastNLP/models/enas_model.py index 5c667927..b6b683c0 100644 --- a/fastNLP/models/enas_model.py +++ b/fastNLP/models/enas_model.py @@ -1,17 +1,19 @@ -# Code Modified from https://github.com/carpedm20/ENAS-pytorch - -"""Module containing the shared RNN model.""" -import numpy as np +""" +Module containing the shared RNN model. +Code Modified from https://github.com/carpedm20/ENAS-pytorch +""" import collections +import numpy as np import torch -from torch import nn +import torch.nn as nn import torch.nn.functional as F from torch.autograd import Variable from . import enas_utils as utils from .base_model import BaseModel + def _get_dropped_weights(w_raw, dropout_p, is_training): """Drops out weights to implement DropConnect. @@ -35,12 +37,13 @@ def _get_dropped_weights(w_raw, dropout_p, is_training): The above TODO is the reason for the hacky check for `torch.nn.Parameter`. """ dropped_w = F.dropout(w_raw, p=dropout_p, training=is_training) - + if isinstance(dropped_w, torch.nn.Parameter): dropped_w = dropped_w.clone() - + return dropped_w + class EmbeddingDropout(torch.nn.Embedding): """Class for dropping out embeddings by zero'ing out parameters in the embedding matrix. @@ -53,6 +56,7 @@ class EmbeddingDropout(torch.nn.Embedding): See 'A Theoretically Grounded Application of Dropout in Recurrent Neural Networks', (Gal and Ghahramani, 2016). 
""" + def __init__(self, num_embeddings, embedding_dim, @@ -83,14 +87,14 @@ class EmbeddingDropout(torch.nn.Embedding): assert (dropout >= 0.0) and (dropout < 1.0), ('Dropout must be >= 0.0 ' 'and < 1.0') self.scale = scale - + def forward(self, inputs): # pylint:disable=arguments-differ """Embeds `inputs` with the dropped out embedding weight matrix.""" if self.training: dropout = self.dropout else: dropout = 0 - + if dropout: mask = self.weight.data.new(self.weight.size(0), 1) mask.bernoulli_(1 - dropout) @@ -101,7 +105,7 @@ class EmbeddingDropout(torch.nn.Embedding): masked_weight = self.weight if self.scale and self.scale != 1: masked_weight = masked_weight * self.scale - + return F.embedding(inputs, masked_weight, max_norm=self.max_norm, @@ -114,7 +118,7 @@ class LockedDropout(nn.Module): # code from https://github.com/salesforce/awd-lstm-lm/blob/master/locked_dropout.py def __init__(self): super().__init__() - + def forward(self, x, dropout=0.5): if not self.training or not dropout: return x @@ -126,11 +130,12 @@ class LockedDropout(nn.Module): class ENASModel(BaseModel): """Shared RNN model.""" + def __init__(self, embed_num, num_classes, num_blocks=4, cuda=False, shared_hid=1000, shared_embed=1000): super(ENASModel, self).__init__() - + self.use_cuda = cuda - + self.shared_hid = shared_hid self.num_blocks = num_blocks self.decoder = nn.Linear(self.shared_hid, num_classes) @@ -139,16 +144,16 @@ class ENASModel(BaseModel): dropout=0.1) self.lockdrop = LockedDropout() self.dag = None - + # Tie weights # self.decoder.weight = self.encoder.weight - + # Since W^{x, c} and W^{h, c} are always summed, there # is no point duplicating their bias offset parameter. Likewise for # W^{x, h} and W^{h, h}. self.w_xc = nn.Linear(shared_embed, self.shared_hid) self.w_xh = nn.Linear(shared_embed, self.shared_hid) - + # The raw weights are stored here because the hidden-to-hidden weights # are weight dropped on the forward pass. 
self.w_hc_raw = torch.nn.Parameter( @@ -157,10 +162,10 @@ class ENASModel(BaseModel): torch.Tensor(self.shared_hid, self.shared_hid)) self.w_hc = None self.w_hh = None - + self.w_h = collections.defaultdict(dict) self.w_c = collections.defaultdict(dict) - + for idx in range(self.num_blocks): for jdx in range(idx + 1, self.num_blocks): self.w_h[idx][jdx] = nn.Linear(self.shared_hid, @@ -169,48 +174,47 @@ class ENASModel(BaseModel): self.w_c[idx][jdx] = nn.Linear(self.shared_hid, self.shared_hid, bias=False) - + self._w_h = nn.ModuleList([self.w_h[idx][jdx] for idx in self.w_h for jdx in self.w_h[idx]]) self._w_c = nn.ModuleList([self.w_c[idx][jdx] for idx in self.w_c for jdx in self.w_c[idx]]) - + self.batch_norm = None # if args.mode == 'train': # self.batch_norm = nn.BatchNorm1d(self.shared_hid) # else: # self.batch_norm = None - + self.reset_parameters() self.static_init_hidden = utils.keydefaultdict(self.init_hidden) - + def setDAG(self, dag): if self.dag is None: self.dag = dag - + def forward(self, word_seq, hidden=None): inputs = torch.transpose(word_seq, 0, 1) - + time_steps = inputs.size(0) batch_size = inputs.size(1) - - + self.w_hh = _get_dropped_weights(self.w_hh_raw, 0.5, self.training) self.w_hc = _get_dropped_weights(self.w_hc_raw, 0.5, self.training) - + # hidden = self.static_init_hidden[batch_size] if hidden is None else hidden hidden = self.static_init_hidden[batch_size] - + embed = self.encoder(inputs) - + embed = self.lockdrop(embed, 0.65 if self.training else 0) - + # The norm of hidden states are clipped here because # otherwise ENAS is especially prone to exploding activations on the # forward pass. This could probably be fixed in a more elegant way, but @@ -226,7 +230,7 @@ class ENASModel(BaseModel): for step in range(time_steps): x_t = embed[step] logit, hidden = self.cell(x_t, hidden, self.dag) - + hidden_norms = hidden.norm(dim=-1) max_norm = 25.0 if hidden_norms.data.max() > max_norm: @@ -237,60 +241,60 @@ class ENASModel(BaseModel): # because the PyTorch slicing and slice assignment is too # flaky. 
hidden_norms = hidden_norms.data.cpu().numpy() - + clipped_num += 1 if hidden_norms.max() > max_clipped_norm: max_clipped_norm = hidden_norms.max() - + clip_select = hidden_norms > max_norm clip_norms = hidden_norms[clip_select] - + mask = np.ones(hidden.size()) - normalizer = max_norm/clip_norms + normalizer = max_norm / clip_norms normalizer = normalizer[:, np.newaxis] - + mask[clip_select] = normalizer - + if self.use_cuda: hidden *= torch.autograd.Variable( torch.FloatTensor(mask).cuda(), requires_grad=False) else: hidden *= torch.autograd.Variable( - torch.FloatTensor(mask), requires_grad=False) + torch.FloatTensor(mask), requires_grad=False) logits.append(logit) h1tohT.append(hidden) - + h1tohT = torch.stack(h1tohT) output = torch.stack(logits) raw_output = output - + output = self.lockdrop(output, 0.4 if self.training else 0) - - #Pooling + + # Pooling output = torch.mean(output, 0) - + decoded = self.decoder(output) - + extra_out = {'dropped': decoded, 'hiddens': h1tohT, 'raw': raw_output} return {'pred': decoded, 'hidden': hidden, 'extra_out': extra_out} - + def cell(self, x, h_prev, dag): """Computes a single pass through the discovered RNN cell.""" c = {} h = {} f = {} - + f[0] = self.get_f(dag[-1][0].name) c[0] = torch.sigmoid(self.w_xc(x) + F.linear(h_prev, self.w_hc, None)) - h[0] = (c[0]*f[0](self.w_xh(x) + F.linear(h_prev, self.w_hh, None)) + - (1 - c[0])*h_prev) - + h[0] = (c[0] * f[0](self.w_xh(x) + F.linear(h_prev, self.w_hh, None)) + + (1 - c[0]) * h_prev) + leaf_node_ids = [] q = collections.deque() q.append(0) - + # Computes connections from the parent nodes `node_id` # to their child nodes `next_id` recursively, skipping leaf nodes. A # leaf node is a node whose id == `self.num_blocks`. @@ -306,10 +310,10 @@ class ENASModel(BaseModel): while True: if len(q) == 0: break - + node_id = q.popleft() nodes = dag[node_id] - + for next_node in nodes: next_id = next_node.id if next_id == self.num_blocks: @@ -317,38 +321,38 @@ class ENASModel(BaseModel): assert len(nodes) == 1, ('parent of leaf node should have ' 'only one child') continue - + w_h = self.w_h[node_id][next_id] w_c = self.w_c[node_id][next_id] - + f[next_id] = self.get_f(next_node.name) c[next_id] = torch.sigmoid(w_c(h[node_id])) - h[next_id] = (c[next_id]*f[next_id](w_h(h[node_id])) + - (1 - c[next_id])*h[node_id]) - + h[next_id] = (c[next_id] * f[next_id](w_h(h[node_id])) + + (1 - c[next_id]) * h[node_id]) + q.append(next_id) - + # Instead of averaging loose ends, perhaps there should # be a set of separate unshared weights for each "loose" connection # between each node in a cell and the output. # # As it stands, all weights W^h_{ij} are doing double duty by # connecting both from i to j, as well as from i to the output. 
- + # average all the loose ends leaf_nodes = [h[node_id] for node_id in leaf_node_ids] output = torch.mean(torch.stack(leaf_nodes, 2), -1) - + # stabilizing the Updates of omega if self.batch_norm is not None: output = self.batch_norm(output) - + return output, h[self.num_blocks - 1] - + def init_hidden(self, batch_size): zeros = torch.zeros(batch_size, self.shared_hid) return utils.get_variable(zeros, self.use_cuda, requires_grad=False) - + def get_f(self, name): name = name.lower() if name == 'relu': @@ -360,22 +364,21 @@ class ENASModel(BaseModel): elif name == 'sigmoid': f = torch.sigmoid return f - - + @property def num_parameters(self): def size(p): return np.prod(p.size()) + return sum([size(param) for param in self.parameters()]) - - + def reset_parameters(self): init_range = 0.025 # init_range = 0.025 if self.args.mode == 'train' else 0.04 for param in self.parameters(): param.data.uniform_(-init_range, init_range) self.decoder.bias.data.fill_(0) - + def predict(self, word_seq): """ diff --git a/fastNLP/models/enas_trainer.py b/fastNLP/models/enas_trainer.py index 9cd7d8d0..ef596b03 100644 --- a/fastNLP/models/enas_trainer.py +++ b/fastNLP/models/enas_trainer.py @@ -1,12 +1,12 @@ # Code Modified from https://github.com/carpedm20/ENAS-pytorch - -import time -from datetime import datetime -from datetime import timedelta - +import math import numpy as np +import time import torch -import math + +from datetime import datetime, timedelta + +from torch.optim import Adam try: from tqdm.auto import tqdm @@ -21,8 +21,6 @@ from ..core.utils import _move_dict_value_to_device from . import enas_utils as utils from ..core.utils import _build_args -from torch.optim import Adam - def _get_no_grad_ctx_mgr(): """Returns a the `torch.no_grad` context manager for PyTorch version >= @@ -33,6 +31,7 @@ def _get_no_grad_ctx_mgr(): class ENASTrainer(Trainer): """A class to wrap training code.""" + def __init__(self, train_data, model, controller, **kwargs): """Constructor for training algorithm. 
:param DataSet train_data: the training data @@ -45,19 +44,19 @@ class ENASTrainer(Trainer): self.controller_step = 0 self.shared_step = 0 self.max_length = 35 - + self.shared = model self.controller = controller - + self.shared_optim = Adam( self.shared.parameters(), lr=20.0, weight_decay=1e-7) - + self.controller_optim = Adam( self.controller.parameters(), lr=3.5e-4) - + def train(self, load_best_model=True): """ :param bool load_best_model: 该参数只有在初始化提供了dev_data的情况下有效,如果True, trainer将在返回之前重新加载dev表现 @@ -82,21 +81,22 @@ class ENASTrainer(Trainer): self.model = self.model.cuda() self._model_device = self.model.parameters().__next__().device self._mode(self.model, is_test=False) - + self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) start_time = time.time() print("training epochs started " + self.start_time, flush=True) - + try: self.callback_manager.on_train_begin() self._train() self.callback_manager.on_train_end() except (CallbackException, KeyboardInterrupt) as e: self.callback_manager.on_exception(e) - + if self.dev_data is not None: - print("\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) + - self.tester._format_eval_results(self.best_dev_perf),) + print( + "\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) + + self.tester._format_eval_results(self.best_dev_perf), ) results['best_eval'] = self.best_dev_perf results['best_epoch'] = self.best_dev_epoch results['best_step'] = self.best_dev_step @@ -110,9 +110,9 @@ class ENASTrainer(Trainer): finally: pass results['seconds'] = round(time.time() - start_time, 2) - + return results - + def _train(self): if not self.use_tqdm: from fastNLP.core.utils import _pseudo_tqdm as inner_tqdm @@ -126,21 +126,21 @@ class ENASTrainer(Trainer): avg_loss = 0 data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, prefetch=self.prefetch) - for epoch in range(1, self.n_epochs+1): + for epoch in range(1, self.n_epochs + 1): pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs)) last_stage = (epoch > self.n_epochs + 1 - self.final_epochs) if epoch == self.n_epochs + 1 - self.final_epochs: print('Entering the final stage. (Only train the selected structure)') # early stopping self.callback_manager.on_epoch_begin() - + # 1. Training the shared parameters omega of the child models self.train_shared(pbar) - + # 2. Training the controller parameters theta if not last_stage: self.train_controller() - + if ((self.validate_every > 0 and self.step % self.validate_every == 0) or (self.validate_every < 0 and self.step % len(data_iterator) == 0)) \ and self.dev_data is not None: @@ -149,16 +149,15 @@ class ENASTrainer(Trainer): eval_res = self._do_validation(epoch=epoch, step=self.step) eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, total_steps) + \ - self.tester._format_eval_results(eval_res) + self.tester._format_eval_results(eval_res) pbar.write(eval_str) - + # lr decay; early stopping self.callback_manager.on_epoch_end() # =============== epochs end =================== # pbar.close() # ============ tqdm end ============== # - - + def get_loss(self, inputs, targets, hidden, dags): """Computes the loss for the same batch for M models. 
@@ -167,7 +166,7 @@ class ENASTrainer(Trainer): """ if not isinstance(dags, list): dags = [dags] - + loss = 0 for dag in dags: self.shared.setDAG(dag) @@ -175,14 +174,14 @@ class ENASTrainer(Trainer): inputs['hidden'] = hidden result = self.shared(**inputs) output, hidden, extra_out = result['pred'], result['hidden'], result['extra_out'] - + self.callback_manager.on_loss_begin(targets, result) sample_loss = self._compute_loss(result, targets) loss += sample_loss - + assert len(dags) == 1, 'there are multiple `hidden` for multple `dags`' return loss, hidden, extra_out - + def train_shared(self, pbar=None, max_step=None, dag=None): """Train the language model for 400 steps of minibatches of 64 examples. @@ -200,9 +199,9 @@ class ENASTrainer(Trainer): model = self.shared model.train() self.controller.eval() - + hidden = self.shared.init_hidden(self.batch_size) - + abs_max_grad = 0 abs_max_hidden_norm = 0 step = 0 @@ -211,15 +210,15 @@ class ENASTrainer(Trainer): train_idx = 0 avg_loss = 0 data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, - prefetch=self.prefetch) - + prefetch=self.prefetch) + for batch_x, batch_y in data_iterator: _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) indices = data_iterator.get_batch_indices() # negative sampling; replace unknown; re-weight batch_y self.callback_manager.on_batch_begin(batch_x, batch_y, indices) # prediction = self._data_forward(self.model, batch_x) - + dags = self.controller.sample(1) inputs, targets = batch_x, batch_y # self.callback_manager.on_loss_begin(batch_y, prediction) @@ -228,18 +227,18 @@ class ENASTrainer(Trainer): hidden, dags) hidden.detach_() - + avg_loss += loss.item() - + # Is loss NaN or inf? requires_grad = False self.callback_manager.on_backward_begin(loss) self._grad_backward(loss) self.callback_manager.on_backward_end() - + self._update() self.callback_manager.on_step_end() - - if (self.step+1) % self.print_every == 0: + + if (self.step + 1) % self.print_every == 0: if self.use_tqdm: print_output = "loss:{0:<6.5f}".format(avg_loss / self.print_every) pbar.update(self.print_every) @@ -255,30 +254,29 @@ class ENASTrainer(Trainer): self.shared_step += 1 self.callback_manager.on_batch_end() # ================= mini-batch end ==================== # - - + def get_reward(self, dag, entropies, hidden, valid_idx=0): """Computes the perplexity of a single sampled model on a minibatch of validation data. """ if not isinstance(entropies, np.ndarray): entropies = entropies.data.cpu().numpy() - + data_iterator = Batch(self.dev_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, - prefetch=self.prefetch) - + prefetch=self.prefetch) + for inputs, targets in data_iterator: valid_loss, hidden, _ = self.get_loss(inputs, targets, hidden, dag) valid_loss = utils.to_item(valid_loss.data) - + valid_ppl = math.exp(valid_loss) - + R = 80 / valid_ppl - + rewards = R + 1e-4 * entropies - + return rewards, hidden - + def train_controller(self): """Fixes the shared parameters and updates the controller parameters. @@ -296,13 +294,13 @@ class ENASTrainer(Trainer): # Why can't we call shared.eval() here? Leads to loss # being uniformly zero for the controller. 
# self.shared.eval() - + avg_reward_base = None baseline = None adv_history = [] entropy_history = [] reward_history = [] - + hidden = self.shared.init_hidden(self.batch_size) total_loss = 0 valid_idx = 0 @@ -310,7 +308,7 @@ class ENASTrainer(Trainer): # sample models dags, log_probs, entropies = self.controller.sample( with_details=True) - + # calculate reward np_entropies = entropies.data.cpu().numpy() # No gradients should be backpropagated to the @@ -320,40 +318,39 @@ class ENASTrainer(Trainer): np_entropies, hidden, valid_idx) - - + reward_history.extend(rewards) entropy_history.extend(np_entropies) - + # moving average baseline if baseline is None: baseline = rewards else: decay = 0.95 baseline = decay * baseline + (1 - decay) * rewards - + adv = rewards - baseline adv_history.extend(adv) - + # policy loss - loss = -log_probs*utils.get_variable(adv, - 'cuda' in self.device, - requires_grad=False) - + loss = -log_probs * utils.get_variable(adv, + 'cuda' in self.device, + requires_grad=False) + loss = loss.sum() # or loss.mean() - + # update self.controller_optim.zero_grad() loss.backward() - + self.controller_optim.step() - + total_loss += utils.to_item(loss.data) - + if ((step % 50) == 0) and (step > 0): reward_history, adv_history, entropy_history = [], [], [] total_loss = 0 - + self.controller_step += 1 # prev_valid_idx = valid_idx # valid_idx = ((valid_idx + self.max_length) % @@ -362,16 +359,16 @@ class ENASTrainer(Trainer): # # validation data, we reset the hidden states. # if prev_valid_idx > valid_idx: # hidden = self.shared.init_hidden(self.batch_size) - + def derive(self, sample_num=10, valid_idx=0): """We are always deriving based on the very first batch of validation data? This seems wrong... """ hidden = self.shared.init_hidden(self.batch_size) - + dags, _, entropies = self.controller.sample(sample_num, with_details=True) - + max_R = 0 best_dag = None for dag in dags: @@ -379,5 +376,5 @@ class ENASTrainer(Trainer): if R.max() > max_R: max_R = R.max() best_dag = dag - + self.model.setDAG(best_dag) diff --git a/fastNLP/models/enas_utils.py b/fastNLP/models/enas_utils.py index aafcb3a7..68c170ed 100644 --- a/fastNLP/models/enas_utils.py +++ b/fastNLP/models/enas_utils.py @@ -1,12 +1,10 @@ # Code Modified from https://github.com/carpedm20/ENAS-pytorch from __future__ import print_function - from collections import defaultdict import collections import numpy as np - import torch from torch.autograd import Variable diff --git a/fastNLP/models/sequence_labeling.py b/fastNLP/models/sequence_labeling.py index 39f4c3fe..17f02298 100644 --- a/fastNLP/models/sequence_labeling.py +++ b/fastNLP/models/sequence_labeling.py @@ -1,11 +1,19 @@ +""" + 本模块实现了两种序列标注模型 +""" import torch +import torch.nn as nn from .base_model import BaseModel from ..modules import decoder, encoder from ..modules.decoder.CRF import allowed_transitions from ..core.utils import seq_len_to_mask from ..core.const import Const as C -from torch import nn + +__all__ = [ + "SeqLabeling", + "AdvSeqLabel" +] class SeqLabeling(BaseModel): diff --git a/fastNLP/models/snli.py b/fastNLP/models/snli.py index 34b54302..606bcc42 100644 --- a/fastNLP/models/snli.py +++ b/fastNLP/models/snli.py @@ -8,6 +8,9 @@ from ..modules import encoder as Encoder from ..modules import aggregator as Aggregator from ..core.utils import seq_len_to_mask +__all__ = [ + "ESIM" +] my_inf = 10e12 @@ -26,7 +29,7 @@ class ESIM(BaseModel): :param int num_classes: 标签数目,默认为3 :param numpy.array init_embedding: 初始词嵌入矩阵,形状为(vocab_size, 
embed_dim),默认为None,即随机初始化词嵌入矩阵 """ - + def __init__(self, vocab_size, embed_dim, hidden_size, dropout=0.0, num_classes=3, init_embedding=None): super(ESIM, self).__init__() @@ -35,35 +38,36 @@ class ESIM(BaseModel): self.hidden_size = hidden_size self.dropout = dropout self.n_labels = num_classes - + self.drop = nn.Dropout(self.dropout) - + self.embedding = Encoder.Embedding( (self.vocab_size, self.embed_dim), dropout=self.dropout, ) - + self.embedding_layer = nn.Linear(self.embed_dim, self.hidden_size) - + self.encoder = Encoder.LSTM( input_size=self.embed_dim, hidden_size=self.hidden_size, num_layers=1, bias=True, batch_first=True, bidirectional=True ) - + self.bi_attention = Aggregator.BiAttention() self.mean_pooling = Aggregator.AvgPoolWithMask() self.max_pooling = Aggregator.MaxPoolWithMask() - + self.inference_layer = nn.Linear(self.hidden_size * 4, self.hidden_size) - + self.decoder = Encoder.LSTM( input_size=self.hidden_size, hidden_size=self.hidden_size, num_layers=1, bias=True, batch_first=True, bidirectional=True ) - + self.output = Decoder.MLP([4 * self.hidden_size, self.hidden_size, self.n_labels], 'tanh', dropout=self.dropout) - + def forward(self, words1, words2, seq_len1=None, seq_len2=None, target=None): """ Forward function + :param torch.Tensor words1: [batch size(B), premise seq len(PL)] premise的token表示 :param torch.Tensor words2: [B, hypothesis seq len(HL)] hypothesis的token表示 :param torch.LongTensor seq_len1: [B] premise的长度 @@ -71,10 +75,10 @@ class ESIM(BaseModel): :param torch.LongTensor target: [B] 真实目标值 :return: dict prediction: [B, n_labels(N)] 预测结果 """ - + premise0 = self.embedding_layer(self.embedding(words1)) hypothesis0 = self.embedding_layer(self.embedding(words2)) - + if seq_len1 is not None: seq_len1 = seq_len_to_mask(seq_len1) else: @@ -85,55 +89,55 @@ class ESIM(BaseModel): else: seq_len2 = torch.ones(hypothesis0.size(0), hypothesis0.size(1)) seq_len2 = (seq_len2.long()).to(device=hypothesis0.device) - + _BP, _PSL, _HP = premise0.size() _BH, _HSL, _HH = hypothesis0.size() _BPL, _PLL = seq_len1.size() _HPL, _HLL = seq_len2.size() - + assert _BP == _BH and _BPL == _HPL and _BP == _BPL assert _HP == _HH assert _PSL == _PLL and _HSL == _HLL - + B, PL, H = premise0.size() B, HL, H = hypothesis0.size() - + a0 = self.encoder(self.drop(premise0)) # a0: [B, PL, H * 2] b0 = self.encoder(self.drop(hypothesis0)) # b0: [B, HL, H * 2] - + a = torch.mean(a0.view(B, PL, -1, H), dim=2) # a: [B, PL, H] b = torch.mean(b0.view(B, HL, -1, H), dim=2) # b: [B, HL, H] - + ai, bi = self.bi_attention(a, b, seq_len1, seq_len2) - + ma = torch.cat((a, ai, a - ai, a * ai), dim=2) # ma: [B, PL, 4 * H] mb = torch.cat((b, bi, b - bi, b * bi), dim=2) # mb: [B, HL, 4 * H] - + f_ma = self.inference_layer(ma) f_mb = self.inference_layer(mb) - + vat = self.decoder(self.drop(f_ma)) vbt = self.decoder(self.drop(f_mb)) - + va = torch.mean(vat.view(B, PL, -1, H), dim=2) # va: [B, PL, H] vb = torch.mean(vbt.view(B, HL, -1, H), dim=2) # vb: [B, HL, H] - + va_ave = self.mean_pooling(va, seq_len1, dim=1) # va_ave: [B, H] va_max, va_arg_max = self.max_pooling(va, seq_len1, dim=1) # va_max: [B, H] vb_ave = self.mean_pooling(vb, seq_len2, dim=1) # vb_ave: [B, H] vb_max, vb_arg_max = self.max_pooling(vb, seq_len2, dim=1) # vb_max: [B, H] - + v = torch.cat((va_ave, va_max, vb_ave, vb_max), dim=1) # v: [B, 4 * H] - + prediction = torch.tanh(self.output(v)) # prediction: [B, N] - + if target is not None: func = nn.CrossEntropyLoss() loss = func(prediction, target) return {Const.OUTPUT: prediction, 
Const.LOSS: loss} - + return {Const.OUTPUT: prediction} - + def predict(self, words1, words2, seq_len1=None, seq_len2=None, target=None): """ Predict function @@ -146,4 +150,3 @@ class ESIM(BaseModel): """ prediction = self.forward(words1, words2, seq_len1, seq_len2)[Const.OUTPUT] return {Const.OUTPUT: torch.argmax(prediction, dim=-1)} - diff --git a/fastNLP/models/star_transformer.py b/fastNLP/models/star_transformer.py index cdd1f321..2e55f7e4 100644 --- a/fastNLP/models/star_transformer.py +++ b/fastNLP/models/star_transformer.py @@ -1,17 +1,25 @@ -"""Star-Transformer 的 一个 Pytorch 实现. """ +Star-Transformer 的 Pytorch 实现。 +""" +import torch +from torch import nn + from ..modules.encoder.star_transformer import StarTransformer from ..core.utils import seq_len_to_mask from ..modules.utils import get_embeddings from ..core.const import Const -import torch -from torch import nn +__all__ = [ + "StarTransEnc", + "STNLICls", + "STSeqCls", + "STSeqLabel", +] class StarTransEnc(nn.Module): """ - 别名::class:`fastNLP.models.StarTransEnc` :class:`fastNLP.models.start_transformer.StarTransEnc` + 别名::class:`fastNLP.models.StarTransEnc` :class:`fastNLP.models.star_transformer.StarTransEnc` 带word embedding的Star-Transformer Encoder @@ -28,6 +36,7 @@ class StarTransEnc(nn.Module): :param emb_dropout: 词嵌入的dropout概率. :param dropout: 模型除词嵌入外的dropout概率. """ + def __init__(self, init_embed, hidden_size, num_layers, @@ -47,7 +56,7 @@ class StarTransEnc(nn.Module): head_dim=head_dim, dropout=dropout, max_len=max_len) - + def forward(self, x, mask): """ :param FloatTensor data: [batch, length, hidden] 输入的序列 @@ -72,7 +81,7 @@ class _Cls(nn.Module): nn.Dropout(dropout), nn.Linear(hid_dim, num_cls), ) - + def forward(self, x): h = self.fc(x) return h @@ -83,20 +92,21 @@ class _NLICls(nn.Module): super(_NLICls, self).__init__() self.fc = nn.Sequential( nn.Dropout(dropout), - nn.Linear(in_dim*4, hid_dim), #4 + nn.Linear(in_dim * 4, hid_dim), # 4 nn.LeakyReLU(), nn.Dropout(dropout), nn.Linear(hid_dim, num_cls), ) - + def forward(self, x1, x2): - x = torch.cat([x1, x2, torch.abs(x1-x2), x1*x2], 1) + x = torch.cat([x1, x2, torch.abs(x1 - x2), x1 * x2], 1) h = self.fc(x) return h + class STSeqLabel(nn.Module): """ - 别名::class:`fastNLP.models.STSeqLabel` :class:`fastNLP.models.start_transformer.STSeqLabel` + 别名::class:`fastNLP.models.STSeqLabel` :class:`fastNLP.models.star_transformer.STSeqLabel` 用于序列标注的Star-Transformer模型 @@ -112,6 +122,7 @@ class STSeqLabel(nn.Module): :param emb_dropout: 词嵌入的dropout概率. Default: 0.1 :param dropout: 模型除词嵌入外的dropout概率. 
Default: 0.1 """ + def __init__(self, init_embed, num_cls, hidden_size=300, num_layers=4, @@ -120,7 +131,7 @@ class STSeqLabel(nn.Module): max_len=512, cls_hidden_size=600, emb_dropout=0.1, - dropout=0.1,): + dropout=0.1, ): super(STSeqLabel, self).__init__() self.enc = StarTransEnc(init_embed=init_embed, hidden_size=hidden_size, @@ -131,7 +142,7 @@ class STSeqLabel(nn.Module): emb_dropout=emb_dropout, dropout=dropout) self.cls = _Cls(hidden_size, num_cls, cls_hidden_size) - + def forward(self, words, seq_len): """ @@ -142,9 +153,9 @@ class STSeqLabel(nn.Module): mask = seq_len_to_mask(seq_len) nodes, _ = self.enc(words, mask) output = self.cls(nodes) - output = output.transpose(1,2) # make hidden to be dim 1 - return {Const.OUTPUT: output} # [bsz, n_cls, seq_len] - + output = output.transpose(1, 2) # make hidden to be dim 1 + return {Const.OUTPUT: output} # [bsz, n_cls, seq_len] + def predict(self, words, seq_len): """ @@ -159,7 +170,7 @@ class STSeqLabel(nn.Module): class STSeqCls(nn.Module): """ - 别名::class:`fastNLP.models.STSeqCls` :class:`fastNLP.models.start_transformer.STSeqCls` + 别名::class:`fastNLP.models.STSeqCls` :class:`fastNLP.models.star_transformer.STSeqCls` 用于分类任务的Star-Transformer @@ -175,7 +186,7 @@ class STSeqCls(nn.Module): :param emb_dropout: 词嵌入的dropout概率. Default: 0.1 :param dropout: 模型除词嵌入外的dropout概率. Default: 0.1 """ - + def __init__(self, init_embed, num_cls, hidden_size=300, num_layers=4, @@ -184,7 +195,7 @@ class STSeqCls(nn.Module): max_len=512, cls_hidden_size=600, emb_dropout=0.1, - dropout=0.1,): + dropout=0.1, ): super(STSeqCls, self).__init__() self.enc = StarTransEnc(init_embed=init_embed, hidden_size=hidden_size, @@ -195,7 +206,7 @@ class STSeqCls(nn.Module): emb_dropout=emb_dropout, dropout=dropout) self.cls = _Cls(hidden_size, num_cls, cls_hidden_size) - + def forward(self, words, seq_len): """ @@ -206,9 +217,9 @@ class STSeqCls(nn.Module): mask = seq_len_to_mask(seq_len) nodes, relay = self.enc(words, mask) y = 0.5 * (relay + nodes.max(1)[0]) - output = self.cls(y) # [bsz, n_cls] + output = self.cls(y) # [bsz, n_cls] return {Const.OUTPUT: output} - + def predict(self, words, seq_len): """ @@ -223,7 +234,7 @@ class STSeqCls(nn.Module): class STNLICls(nn.Module): """ - 别名::class:`fastNLP.models.STNLICls` :class:`fastNLP.models.start_transformer.STNLICls` + 别名::class:`fastNLP.models.STNLICls` :class:`fastNLP.models.star_transformer.STNLICls` 用于自然语言推断(NLI)的Star-Transformer @@ -239,7 +250,7 @@ class STNLICls(nn.Module): :param emb_dropout: 词嵌入的dropout概率. Default: 0.1 :param dropout: 模型除词嵌入外的dropout概率. 
Default: 0.1
     """
-    
+    
     def __init__(self, init_embed, num_cls,
                  hidden_size=300,
                  num_layers=4,
@@ -248,7 +259,7 @@ class STNLICls(nn.Module):
                  max_len=512,
                  cls_hidden_size=600,
                  emb_dropout=0.1,
-                 dropout=0.1,):
+                 dropout=0.1, ):
         super(STNLICls, self).__init__()
         self.enc = StarTransEnc(init_embed=init_embed,
                                 hidden_size=hidden_size,
@@ -259,7 +270,7 @@ class STNLICls(nn.Module):
                                 emb_dropout=emb_dropout,
                                 dropout=dropout)
         self.cls = _NLICls(hidden_size, num_cls, cls_hidden_size)
-    
+    
     def forward(self, words1, words2, seq_len1, seq_len2):
         """
@@ -271,14 +282,16 @@ class STNLICls(nn.Module):
         """
         mask1 = seq_len_to_mask(seq_len1)
         mask2 = seq_len_to_mask(seq_len2)
+        
         def enc(seq, mask):
             nodes, relay = self.enc(seq, mask)
             return 0.5 * (relay + nodes.max(1)[0])
+        
         y1 = enc(words1, mask1)
         y2 = enc(words2, mask2)
-        output = self.cls(y1, y2) # [bsz, n_cls]
+        output = self.cls(y1, y2)  # [bsz, n_cls]
         return {Const.OUTPUT: output}
-    
+    
     def predict(self, words1, words2, seq_len1, seq_len2):
         """

From fb143ff2add1a7ddd6c31fbc4eb3a68d95d423cb Mon Sep 17 00:00:00 2001
From: ChenXin
Date: Thu, 16 May 2019 21:37:19 +0800
Subject: [PATCH 3/8] Changed the file names
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...odules.decoder.CRF.rst => fastNLP.modules.decoder.crf.rst} | 2 +-
 ...odules.decoder.MLP.rst => fastNLP.modules.decoder.mlp.rst} | 2 +-
 docs/source/fastNLP.modules.decoder.rst                       | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)
 rename docs/source/{fastNLP.modules.decoder.CRF.rst => fastNLP.modules.decoder.crf.rst} (72%)
 rename docs/source/{fastNLP.modules.decoder.MLP.rst => fastNLP.modules.decoder.mlp.rst} (72%)

diff --git a/docs/source/fastNLP.modules.decoder.CRF.rst b/docs/source/fastNLP.modules.decoder.crf.rst
similarity index 72%
rename from docs/source/fastNLP.modules.decoder.CRF.rst
rename to docs/source/fastNLP.modules.decoder.crf.rst
index fc643fef..6d5b0d5b 100644
--- a/docs/source/fastNLP.modules.decoder.CRF.rst
+++ b/docs/source/fastNLP.modules.decoder.crf.rst
@@ -1,7 +1,7 @@
 fastNLP.modules.decoder.CRF
 ===========================
 
-.. automodule:: fastNLP.modules.decoder.CRF
+.. automodule:: fastNLP.modules.decoder.crf
     :members:
     :undoc-members:
     :show-inheritance:
diff --git a/docs/source/fastNLP.modules.decoder.MLP.rst b/docs/source/fastNLP.modules.decoder.mlp.rst
similarity index 72%
rename from docs/source/fastNLP.modules.decoder.MLP.rst
rename to docs/source/fastNLP.modules.decoder.mlp.rst
index feb5c228..7d661ebf 100644
--- a/docs/source/fastNLP.modules.decoder.MLP.rst
+++ b/docs/source/fastNLP.modules.decoder.mlp.rst
@@ -1,7 +1,7 @@
 fastNLP.modules.decoder.MLP
 ===========================
 
-.. automodule:: fastNLP.modules.decoder.MLP
+.. automodule:: fastNLP.modules.decoder.mlp
     :members:
     :undoc-members:
     :show-inheritance:
diff --git a/docs/source/fastNLP.modules.decoder.rst b/docs/source/fastNLP.modules.decoder.rst
index 1c28740b..e42a9f39 100644
--- a/docs/source/fastNLP.modules.decoder.rst
+++ b/docs/source/fastNLP.modules.decoder.rst
@@ -12,7 +12,7 @@ fastNLP.modules.decoder
 .. toctree::
toctree:: :titlesonly: - fastNLP.modules.decoder.CRF - fastNLP.modules.decoder.MLP + fastNLP.modules.decoder.crf + fastNLP.modules.decoder.mlp fastNLP.modules.decoder.utils From ff1d695aa40a5beebea50825c4b5e4d09391ec29 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Thu, 16 May 2019 21:45:17 +0800 Subject: [PATCH 4/8] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86=20modules=20?= =?UTF-8?q?=E6=A8=A1=E5=9D=97=E7=9A=84=5F=5Fall=5F=5F=20=E5=92=8C=20import?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/models/base_model.py | 2 +- fastNLP/models/sequence_labeling.py | 8 +- fastNLP/modules/__init__.py | 18 ++-- fastNLP/modules/aggregator/__init__.py | 4 +- fastNLP/modules/aggregator/attention.py | 62 +++++++----- fastNLP/modules/decoder/__init__.py | 6 +- fastNLP/modules/decoder/{CRF.py => crf.py} | 99 ++++++++++--------- fastNLP/modules/decoder/{MLP.py => mlp.py} | 42 +++----- fastNLP/modules/decoder/utils.py | 24 +++-- fastNLP/modules/encoder/__init__.py | 26 ++++- fastNLP/modules/encoder/char_encoder.py | 32 ++++-- fastNLP/modules/encoder/conv_maxpool.py | 29 +++--- fastNLP/modules/encoder/embedding.py | 19 ++-- fastNLP/modules/encoder/lstm.py | 10 +- fastNLP/modules/encoder/star_transformer.py | 71 +++++++------ fastNLP/modules/encoder/transformer.py | 17 ++-- fastNLP/modules/encoder/variational_rnn.py | 66 ++++++++----- .../models/cws_model.py | 6 +- .../models/cws_transformer.py | 4 +- .../main.py | 2 +- test/modules/decoder/test_CRF.py | 10 +- 21 files changed, 321 insertions(+), 236 deletions(-) rename fastNLP/modules/decoder/{CRF.py => crf.py} (87%) rename fastNLP/modules/decoder/{MLP.py => mlp.py} (77%) diff --git a/fastNLP/models/base_model.py b/fastNLP/models/base_model.py index d27f1d21..2646d580 100644 --- a/fastNLP/models/base_model.py +++ b/fastNLP/models/base_model.py @@ -1,6 +1,6 @@ import torch -from ..modules.decoder.MLP import MLP +from ..modules.decoder.mlp import MLP class BaseModel(torch.nn.Module): diff --git a/fastNLP/models/sequence_labeling.py b/fastNLP/models/sequence_labeling.py index 17f02298..503c79ba 100644 --- a/fastNLP/models/sequence_labeling.py +++ b/fastNLP/models/sequence_labeling.py @@ -6,7 +6,7 @@ import torch.nn as nn from .base_model import BaseModel from ..modules import decoder, encoder -from ..modules.decoder.CRF import allowed_transitions +from ..modules.decoder.crf import allowed_transitions from ..core.utils import seq_len_to_mask from ..core.const import Const as C @@ -35,7 +35,7 @@ class SeqLabeling(BaseModel): self.Embedding = encoder.embedding.Embedding(init_embed) self.Rnn = encoder.lstm.LSTM(self.Embedding.embedding_dim, hidden_size) self.Linear = nn.Linear(hidden_size, num_classes) - self.Crf = decoder.CRF.ConditionalRandomField(num_classes) + self.Crf = decoder.crf.ConditionalRandomField(num_classes) self.mask = None def forward(self, words, seq_len, target): @@ -141,9 +141,9 @@ class AdvSeqLabel(nn.Module): self.Linear2 = nn.Linear(hidden_size * 2 // 3, num_classes) if id2words is None: - self.Crf = decoder.CRF.ConditionalRandomField(num_classes, include_start_end_trans=False) + self.Crf = decoder.crf.ConditionalRandomField(num_classes, include_start_end_trans=False) else: - self.Crf = decoder.CRF.ConditionalRandomField(num_classes, include_start_end_trans=False, + self.Crf = decoder.crf.ConditionalRandomField(num_classes, include_start_end_trans=False, allowed_transitions=allowed_transitions(id2words, encoding_type=encoding_type)) diff --git a/fastNLP/modules/__init__.py 
b/fastNLP/modules/__init__.py index 53d44f47..cd54c8db 100644 --- a/fastNLP/modules/__init__.py +++ b/fastNLP/modules/__init__.py @@ -32,19 +32,25 @@ from .encoder import * from .utils import get_embeddings __all__ = [ - "LSTM", - "Embedding", + # "BertModel", + "ConvolutionCharEncoder", + "LSTMCharEncoder", "ConvMaxpool", - "BertModel", + "Embedding", + "LSTM", + "StarTransformer", + "TransformerEncoder", + "VarRNN", + "VarLSTM", + "VarGRU", "MaxPool", "MaxPoolWithMask", "AvgPool", "MultiHeadAttention", - "BiAttention", - + "MLP", "ConditionalRandomField", "viterbi_decode", "allowed_transitions", -] \ No newline at end of file +] diff --git a/fastNLP/modules/aggregator/__init__.py b/fastNLP/modules/aggregator/__init__.py index 725ccd4b..117dad83 100644 --- a/fastNLP/modules/aggregator/__init__.py +++ b/fastNLP/modules/aggregator/__init__.py @@ -3,12 +3,12 @@ from .pooling import MaxPoolWithMask from .pooling import AvgPool from .pooling import AvgPoolWithMask -from .attention import MultiHeadAttention, BiAttention +from .attention import MultiHeadAttention + __all__ = [ "MaxPool", "MaxPoolWithMask", "AvgPool", "MultiHeadAttention", - "BiAttention" ] diff --git a/fastNLP/modules/aggregator/attention.py b/fastNLP/modules/aggregator/attention.py index cea9c405..a1a7fda8 100644 --- a/fastNLP/modules/aggregator/attention.py +++ b/fastNLP/modules/aggregator/attention.py @@ -1,4 +1,3 @@ -__all__ =["MultiHeadAttention"] import math import torch @@ -9,12 +8,17 @@ from ..dropout import TimestepDropout from ..utils import initial_parameter +__all__ = [ + "MultiHeadAttention" +] + class DotAttention(nn.Module): """ .. todo:: 补上文档 """ + def __init__(self, key_size, value_size, dropout=0): super(DotAttention, self).__init__() self.key_size = key_size @@ -22,7 +26,7 @@ class DotAttention(nn.Module): self.scale = math.sqrt(key_size) self.drop = nn.Dropout(dropout) self.softmax = nn.Softmax(dim=2) - + def forward(self, Q, K, V, mask_out=None): """ @@ -41,6 +45,8 @@ class DotAttention(nn.Module): class MultiHeadAttention(nn.Module): """ + 别名::class:`fastNLP.modules.MultiHeadAttention` :class:`fastNLP.modules.aggregator.attention.MultiHeadAttention` + :param input_size: int, 输入维度的大小。同时也是输出维度的大小。 :param key_size: int, 每个head的维度大小。 @@ -48,13 +54,14 @@ class MultiHeadAttention(nn.Module): :param num_head: int,head的数量。 :param dropout: float。 """ + def __init__(self, input_size, key_size, value_size, num_head, dropout=0.1): super(MultiHeadAttention, self).__init__() self.input_size = input_size self.key_size = key_size self.value_size = value_size self.num_head = num_head - + in_size = key_size * num_head self.q_in = nn.Linear(input_size, in_size) self.k_in = nn.Linear(input_size, in_size) @@ -64,14 +71,14 @@ class MultiHeadAttention(nn.Module): self.out = nn.Linear(value_size * num_head, input_size) self.drop = TimestepDropout(dropout) self.reset_parameters() - + def reset_parameters(self): sqrt = math.sqrt nn.init.normal_(self.q_in.weight, mean=0, std=sqrt(2.0 / (self.input_size + self.key_size))) nn.init.normal_(self.k_in.weight, mean=0, std=sqrt(2.0 / (self.input_size + self.key_size))) nn.init.normal_(self.v_in.weight, mean=0, std=sqrt(2.0 / (self.input_size + self.value_size))) nn.init.xavier_normal_(self.out.weight) - + def forward(self, Q, K, V, atte_mask_out=None): """ @@ -87,7 +94,7 @@ class MultiHeadAttention(nn.Module): q = self.q_in(Q).view(batch, sq, n_head, d_k) k = self.k_in(K).view(batch, sk, n_head, d_k) v = self.v_in(V).view(batch, sk, n_head, d_v) - + # transpose q, k and v to do batch 
attention q = q.permute(2, 0, 1, 3).contiguous().view(-1, sq, d_k) k = k.permute(2, 0, 1, 3).contiguous().view(-1, sk, d_k) @@ -95,7 +102,7 @@ class MultiHeadAttention(nn.Module): if atte_mask_out is not None: atte_mask_out = atte_mask_out.repeat(n_head, 1, 1) atte = self.attention(q, k, v, atte_mask_out).view(n_head, batch, sq, d_v) - + # concat all heads, do output linear atte = atte.permute(1, 2, 0, 3).contiguous().view(batch, sq, -1) output = self.drop(self.out(atte)) @@ -104,6 +111,10 @@ class MultiHeadAttention(nn.Module): class BiAttention(nn.Module): r"""Bi Attention module + + .. todo:: + 这个模块的负责人来继续完善一下 + Calculate Bi Attention matrix `e` .. math:: @@ -115,11 +126,11 @@ class BiAttention(nn.Module): \end{array} """ - + def __init__(self): super(BiAttention, self).__init__() self.inf = 10e12 - + def forward(self, in_x1, in_x2, x1_len, x2_len): """ :param torch.Tensor in_x1: [batch_size, x1_seq_len, hidden_size] 第一句的特征表示 @@ -130,36 +141,36 @@ class BiAttention(nn.Module): torch.Tensor out_x2: [batch_size, x2_seq_len, hidden_size] 第一句attend到的特征表示 """ - + assert in_x1.size()[0] == in_x2.size()[0] assert in_x1.size()[2] == in_x2.size()[2] # The batch size and hidden size must be equal. assert in_x1.size()[1] == x1_len.size()[1] and in_x2.size()[1] == x2_len.size()[1] # The seq len in in_x and x_len must be equal. assert in_x1.size()[0] == x1_len.size()[0] and x1_len.size()[0] == x2_len.size()[0] - + batch_size = in_x1.size()[0] x1_max_len = in_x1.size()[1] x2_max_len = in_x2.size()[1] - + in_x2_t = torch.transpose(in_x2, 1, 2) # [batch_size, hidden_size, x2_seq_len] - + attention_matrix = torch.bmm(in_x1, in_x2_t) # [batch_size, x1_seq_len, x2_seq_len] - + a_mask = x1_len.le(0.5).float() * -self.inf # [batch_size, x1_seq_len] a_mask = a_mask.view(batch_size, x1_max_len, -1) a_mask = a_mask.expand(-1, -1, x2_max_len) # [batch_size, x1_seq_len, x2_seq_len] b_mask = x2_len.le(0.5).float() * -self.inf b_mask = b_mask.view(batch_size, -1, x2_max_len) b_mask = b_mask.expand(-1, x1_max_len, -1) # [batch_size, x1_seq_len, x2_seq_len] - + attention_a = F.softmax(attention_matrix + a_mask, dim=2) # [batch_size, x1_seq_len, x2_seq_len] attention_b = F.softmax(attention_matrix + b_mask, dim=1) # [batch_size, x1_seq_len, x2_seq_len] - + out_x1 = torch.bmm(attention_a, in_x2) # [batch_size, x1_seq_len, hidden_size] attention_b_t = torch.transpose(attention_b, 1, 2) out_x2 = torch.bmm(attention_b_t, in_x1) # [batch_size, x2_seq_len, hidden_size] - + return out_x1, out_x2 @@ -173,10 +184,10 @@ class SelfAttention(nn.Module): :param float drop: dropout概率,默认值为0.5 :param str initial_method: 初始化参数方法 """ - - def __init__(self, input_size, attention_unit=300, attention_hops=10, drop=0.5, initial_method=None,): + + def __init__(self, input_size, attention_unit=300, attention_hops=10, drop=0.5, initial_method=None, ): super(SelfAttention, self).__init__() - + self.attention_hops = attention_hops self.ws1 = nn.Linear(input_size, attention_unit, bias=False) self.ws2 = nn.Linear(attention_unit, attention_hops, bias=False) @@ -185,7 +196,7 @@ class SelfAttention(nn.Module): self.drop = nn.Dropout(drop) self.tanh = nn.Tanh() initial_parameter(self, initial_method) - + def _penalization(self, attention): """ compute the penalization term for attention module @@ -199,7 +210,7 @@ class SelfAttention(nn.Module): mat = torch.bmm(attention, attention_t) - self.I[:attention.size(0)] ret = (torch.sum(torch.sum((mat ** 2), 2), 1).squeeze() + 1e-10) ** 0.5 return torch.sum(ret) / size[0] - + def forward(self, input, 
input_origin):
         """
         :param torch.Tensor input: [baz, senLen, h_dim] 要做attention的矩阵
@@ -209,15 +220,14 @@ class SelfAttention(nn.Module):
         """
         input = input.contiguous()
         size = input.size()  # [bsz, len, nhid]
-
+        
         input_origin = input_origin.expand(self.attention_hops, -1, -1)  # [hops,baz, len]
         input_origin = input_origin.transpose(0, 1).contiguous()  # [baz, hops,len]
-
+        
         y1 = self.tanh(self.ws1(self.drop(input)))  # [baz,len,dim] -->[bsz,len, attention-unit]
         attention = self.ws2(y1).transpose(1, 2).contiguous()
         # [bsz,len, attention-unit]--> [bsz, len, hop]--> [baz,hop,len]
-
+        
         attention = attention + (-999999 * (input_origin == 0).float())  # remove the weight on padding token.
         attention = F.softmax(attention, 2)  # [baz ,hop, len]
         return torch.bmm(attention, input), self._penalization(attention)  # output1 --> [baz ,hop ,nhid]
-
diff --git a/fastNLP/modules/decoder/__init__.py b/fastNLP/modules/decoder/__init__.py
index 516b687a..5df48c43 100644
--- a/fastNLP/modules/decoder/__init__.py
+++ b/fastNLP/modules/decoder/__init__.py
@@ -1,7 +1,7 @@
-from .CRF import ConditionalRandomField
-from .MLP import MLP
+from .crf import ConditionalRandomField
+from .mlp import MLP
 from .utils import viterbi_decode
-from .CRF import allowed_transitions
+from .crf import allowed_transitions
 
 __all__ = [
     "MLP",
diff --git a/fastNLP/modules/decoder/CRF.py b/fastNLP/modules/decoder/crf.py
similarity index 87%
rename from fastNLP/modules/decoder/CRF.py
rename to fastNLP/modules/decoder/crf.py
index 84f374e6..130ed40e 100644
--- a/fastNLP/modules/decoder/CRF.py
+++ b/fastNLP/modules/decoder/crf.py
@@ -3,10 +3,15 @@ from torch import nn
 
 from ..utils import initial_parameter
 
+__all__ = [
+    "ConditionalRandomField",
+    "allowed_transitions"
+]
+
 
 def allowed_transitions(id2target, encoding_type='bio', include_start_end=True):
     """
-    别名::class:`fastNLP.modules.allowed_transitions` :class:`fastNLP.modules.decoder.CRF.allowed_transitions`
+    别名::class:`fastNLP.modules.allowed_transitions` :class:`fastNLP.modules.decoder.crf.allowed_transitions`
 
     给定一个id到label的映射表,返回所有可以跳转的(from_tag_id, to_tag_id)列表。
 
@@ -15,8 +20,7 @@ def allowed_transitions(id2target, encoding_type='bio', include_start_end=True):
     :param str encoding_type: 支持"bio", "bmes", "bmeso"。
     :param bool include_start_end: 是否包含开始与结尾的转换。比如在bio中,b/o可以在开头,但是i不能在开头;
         为True,返回的结果中会包含(start_idx, b_idx), (start_idx, o_idx), 但是不包含(start_idx, i_idx);
-        start_idx=len(id2label), end_idx=len(id2label)+1。
-        为False, 返回的结果中不含与开始结尾相关的内容
+        start_idx=len(id2label), end_idx=len(id2label)+1。为False, 返回的结果中不含与开始结尾相关的内容
     :return: List[Tuple(int, int)]], 内部的Tuple是可以进行跳转的(from_tag_id, to_tag_id)。
     """
     num_tags = len(id2target)
@@ -27,6 +31,7 @@ def allowed_transitions(id2target, encoding_type='bio', include_start_end=True):
     id_label_lst = list(id2target.items())
     if include_start_end:
         id_label_lst += [(start_idx, 'start'), (end_idx, 'end')]
+    
     def split_tag_label(from_label):
         from_label = from_label.lower()
         if from_label in ['start', 'end']:
@@ -36,7 +41,7 @@ def allowed_transitions(id2target, encoding_type='bio', include_start_end=True):
             from_tag = from_label[:1]
             from_label = from_label[2:]
         return from_tag, from_label
-
+    
     for from_id, from_label in id_label_lst:
         if from_label in ['<pad>', '<unk>']:
             continue
@@ -60,7 +65,7 @@ def _is_transition_allowed(encoding_type, from_tag, from_label, to_tag, to_label
     :param str to_label: 比如"PER", "LOC"等label
     :return: bool,能否跃迁
    """
-    if to_tag=='start' or from_tag=='end':
+    if to_tag == 'start' or from_tag == 'end':
         return False
     encoding_type = encoding_type.lower()
     if
encoding_type == 'bio': @@ -83,12 +88,12 @@ def _is_transition_allowed(encoding_type, from_tag, from_label, to_tag, to_label if from_tag == 'start': return to_tag in ('b', 'o') elif from_tag in ['b', 'i']: - return any([to_tag in ['end', 'b', 'o'], to_tag=='i' and from_label==to_label]) + return any([to_tag in ['end', 'b', 'o'], to_tag == 'i' and from_label == to_label]) elif from_tag == 'o': return to_tag in ['end', 'b', 'o'] else: raise ValueError("Unexpect tag {}. Expect only 'B', 'I', 'O'.".format(from_tag)) - + elif encoding_type == 'bmes': """ 第一行是to_tag, 第一列是from_tag,y任意条件下可转,-只有在label相同时可转,n不可转 @@ -111,9 +116,9 @@ def _is_transition_allowed(encoding_type, from_tag, from_label, to_tag, to_label if from_tag == 'start': return to_tag in ['b', 's'] elif from_tag == 'b': - return to_tag in ['m', 'e'] and from_label==to_label + return to_tag in ['m', 'e'] and from_label == to_label elif from_tag == 'm': - return to_tag in ['m', 'e'] and from_label==to_label + return to_tag in ['m', 'e'] and from_label == to_label elif from_tag in ['e', 's']: return to_tag in ['b', 's', 'end'] else: @@ -122,21 +127,21 @@ def _is_transition_allowed(encoding_type, from_tag, from_label, to_tag, to_label if from_tag == 'start': return to_tag in ['b', 's', 'o'] elif from_tag == 'b': - return to_tag in ['m', 'e'] and from_label==to_label + return to_tag in ['m', 'e'] and from_label == to_label elif from_tag == 'm': - return to_tag in ['m', 'e'] and from_label==to_label + return to_tag in ['m', 'e'] and from_label == to_label elif from_tag in ['e', 's', 'o']: return to_tag in ['b', 's', 'end', 'o'] else: raise ValueError("Unexpect tag type {}. Expect only 'B', 'M', 'E', 'S', 'O'.".format(from_tag)) - + else: raise ValueError("Only support BIO, BMES, BMESO encoding type, got {}.".format(encoding_type)) class ConditionalRandomField(nn.Module): """ - 别名::class:`fastNLP.modules.ConditionalRandomField` :class:`fastNLP.modules.decoder.CRF.ConditionalRandomField` + 别名::class:`fastNLP.modules.ConditionalRandomField` :class:`fastNLP.modules.decoder.crf.ConditionalRandomField` 条件随机场。 提供forward()以及viterbi_decode()两个方法,分别用于训练与inference。 @@ -148,30 +153,31 @@ class ConditionalRandomField(nn.Module): allowed_transitions()函数得到;如果为None,则所有跃迁均为合法 :param str initial_method: 初始化方法。见initial_parameter """ + def __init__(self, num_tags, include_start_end_trans=False, allowed_transitions=None, initial_method=None): super(ConditionalRandomField, self).__init__() - + self.include_start_end_trans = include_start_end_trans self.num_tags = num_tags - + # the meaning of entry in this matrix is (from_tag_id, to_tag_id) score self.trans_m = nn.Parameter(torch.randn(num_tags, num_tags)) if self.include_start_end_trans: self.start_scores = nn.Parameter(torch.randn(num_tags)) self.end_scores = nn.Parameter(torch.randn(num_tags)) - + if allowed_transitions is None: constrain = torch.zeros(num_tags + 2, num_tags + 2) else: - constrain = torch.full((num_tags+2, num_tags+2), fill_value=-10000.0, dtype=torch.float) + constrain = torch.full((num_tags + 2, num_tags + 2), fill_value=-10000.0, dtype=torch.float) for from_tag_id, to_tag_id in allowed_transitions: constrain[from_tag_id, to_tag_id] = 0 self._constrain = nn.Parameter(constrain, requires_grad=False) - + initial_parameter(self, initial_method) - + def _normalizer_likelihood(self, logits, mask): """Computes the (batch_size,) denominator term for the log-likelihood, which is the sum of the likelihoods across all possible state sequences. 
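As a quick illustration of how the two renamed entry points fit together, here is a minimal usage sketch (an illustrative aside, not part of the patch; the BIO tag map is invented):

    from fastNLP.modules.decoder.crf import ConditionalRandomField, allowed_transitions

    id2label = {0: 'B', 1: 'I', 2: 'O'}  # hypothetical tag vocabulary
    # allowed_transitions() enumerates every legal (from_tag_id, to_tag_id) pair,
    # with start_idx=len(id2label) and end_idx=len(id2label)+1 as virtual boundary states
    constraints = allowed_transitions(id2label, encoding_type='bio', include_start_end=True)
    # transitions outside `constraints` are pinned to -10000 in the CRF and never decoded
    crf = ConditionalRandomField(num_tags=len(id2label), allowed_transitions=constraints)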
@@ -184,21 +190,21 @@ class ConditionalRandomField(nn.Module): alpha = logits[0] if self.include_start_end_trans: alpha = alpha + self.start_scores.view(1, -1) - + flip_mask = mask.eq(0) - + for i in range(1, seq_len): emit_score = logits[i].view(batch_size, 1, n_tags) trans_score = self.trans_m.view(1, n_tags, n_tags) tmp = alpha.view(batch_size, n_tags, 1) + emit_score + trans_score alpha = torch.logsumexp(tmp, 1).masked_fill(flip_mask[i].view(batch_size, 1), 0) + \ alpha.masked_fill(mask[i].byte().view(batch_size, 1), 0) - + if self.include_start_end_trans: alpha = alpha + self.end_scores.view(1, -1) - + return torch.logsumexp(alpha, 1) - + def _gold_score(self, logits, tags, mask): """ Compute the score for the gold path. @@ -210,15 +216,15 @@ class ConditionalRandomField(nn.Module): seq_len, batch_size, _ = logits.size() batch_idx = torch.arange(batch_size, dtype=torch.long, device=logits.device) seq_idx = torch.arange(seq_len, dtype=torch.long, device=logits.device) - + # trans_socre [L-1, B] mask = mask.byte() flip_mask = mask.eq(0) - trans_score = self.trans_m[tags[:seq_len-1], tags[1:]].masked_fill(flip_mask[1:, :], 0) + trans_score = self.trans_m[tags[:seq_len - 1], tags[1:]].masked_fill(flip_mask[1:, :], 0) # emit_score [L, B] - emit_score = logits[seq_idx.view(-1,1), batch_idx.view(1,-1), tags].masked_fill(flip_mask, 0) + emit_score = logits[seq_idx.view(-1, 1), batch_idx.view(1, -1), tags].masked_fill(flip_mask, 0) # score [L-1, B] - score = trans_score + emit_score[:seq_len-1, :] + score = trans_score + emit_score[:seq_len - 1, :] score = score.sum(0) + emit_score[-1].masked_fill(flip_mask[-1], 0) if self.include_start_end_trans: st_scores = self.start_scores.view(1, -1).repeat(batch_size, 1)[batch_idx, tags[0]] @@ -227,24 +233,24 @@ class ConditionalRandomField(nn.Module): score = score + st_scores + ed_scores # return [B,] return score - + def forward(self, feats, tags, mask): """ 用于计算CRF的前向loss,返回值为一个batch_size的FloatTensor,可能需要mean()求得loss。 - :param torch.FloatTensor feats:batch_size x max_len x num_tags,特征矩阵。 + :param torch.FloatTensor feats: batch_size x max_len x num_tags,特征矩阵。 :param torch.LongTensor tags: batch_size x max_len,标签矩阵。 :param torch.ByteTensor mask: batch_size x max_len,为0的位置认为是padding。 - :return:torch.FloatTensor, (batch_size,) + :return: torch.FloatTensor, (batch_size,) """ feats = feats.transpose(0, 1) tags = tags.transpose(0, 1).long() mask = mask.transpose(0, 1).float() all_path_score = self._normalizer_likelihood(feats, mask) gold_path_score = self._gold_score(feats, tags, mask) - + return all_path_score - gold_path_score - + def viterbi_decode(self, logits, mask, unpad=False): """给定一个特征矩阵以及转移分数矩阵,计算出最佳的路径以及对应的分数 @@ -259,9 +265,9 @@ class ConditionalRandomField(nn.Module): """ batch_size, seq_len, n_tags = logits.size() - logits = logits.transpose(0, 1).data # L, B, H - mask = mask.transpose(0, 1).data.byte() # L, B - + logits = logits.transpose(0, 1).data # L, B, H + mask = mask.transpose(0, 1).data.byte() # L, B + # dp vpath = logits.new_zeros((seq_len, batch_size, n_tags), dtype=torch.long) vscore = logits[0] @@ -269,8 +275,8 @@ class ConditionalRandomField(nn.Module): transitions[:n_tags, :n_tags] += self.trans_m.data if self.include_start_end_trans: transitions[n_tags, :n_tags] += self.start_scores.data - transitions[:n_tags, n_tags+1] += self.end_scores.data - + transitions[:n_tags, n_tags + 1] += self.end_scores.data + vscore += transitions[n_tags, :n_tags] trans_score = transitions[:n_tags, :n_tags].view(1, n_tags, n_tags).data for i in 
range(1, seq_len): @@ -280,30 +286,29 @@ class ConditionalRandomField(nn.Module): best_score, best_dst = score.max(1) vpath[i] = best_dst vscore = best_score.masked_fill(mask[i].eq(0).view(batch_size, 1), 0) + \ - vscore.masked_fill(mask[i].view(batch_size, 1), 0) - + vscore.masked_fill(mask[i].view(batch_size, 1), 0) + if self.include_start_end_trans: - vscore += transitions[:n_tags, n_tags+1].view(1, -1) - + vscore += transitions[:n_tags, n_tags + 1].view(1, -1) + # backtrace batch_idx = torch.arange(batch_size, dtype=torch.long, device=logits.device) seq_idx = torch.arange(seq_len, dtype=torch.long, device=logits.device) lens = (mask.long().sum(0) - 1) # idxes [L, B], batched idx from seq_len-1 to 0 - idxes = (lens.view(1,-1) - seq_idx.view(-1,1)) % seq_len - + idxes = (lens.view(1, -1) - seq_idx.view(-1, 1)) % seq_len + ans = logits.new_empty((seq_len, batch_size), dtype=torch.long) ans_score, last_tags = vscore.max(1) ans[idxes[0], batch_idx] = last_tags for i in range(seq_len - 1): last_tags = vpath[idxes[i], batch_idx, last_tags] - ans[idxes[i+1], batch_idx] = last_tags + ans[idxes[i + 1], batch_idx] = last_tags ans = ans.transpose(0, 1) if unpad: paths = [] for idx, seq_len in enumerate(lens): - paths.append(ans[idx, :seq_len+1].tolist()) + paths.append(ans[idx, :seq_len + 1].tolist()) else: paths = ans return paths, ans_score - diff --git a/fastNLP/modules/decoder/MLP.py b/fastNLP/modules/decoder/mlp.py similarity index 77% rename from fastNLP/modules/decoder/MLP.py rename to fastNLP/modules/decoder/mlp.py index 71d899b0..27019432 100644 --- a/fastNLP/modules/decoder/MLP.py +++ b/fastNLP/modules/decoder/mlp.py @@ -3,20 +3,23 @@ import torch.nn as nn from ..utils import initial_parameter +__all__ = [ + "MLP" +] + class MLP(nn.Module): """ - 别名::class:`fastNLP.modules.MLP` :class:`fastNLP.modules.decoder.MLP.MLP` + 别名::class:`fastNLP.modules.MLP` :class:`fastNLP.modules.decoder.mlp.MLP` 多层感知器 - :param list size_layer: 一个int的列表,用来定义MLP的层数,列表中的数字为每一层是hidden数目。MLP的层数为 len(size_layer) - 1 - :param str or list activation: - 一个字符串或者函数或者字符串跟函数的列表,用来定义每一个隐层的激活函数,字符串包括relu,tanh和sigmoid,默认值为relu - :param str or function output_activation : 字符串或者函数,用来定义输出层的激活函数,默认值为None,表示输出层没有激活函数 + :param List[int] size_layer: 一个int的列表,用来定义MLP的层数,列表中的数字为每一层是hidden数目。MLP的层数为 len(size_layer) - 1 + :param Union[str,func,List[str]] activation: 一个字符串或者函数的列表,用来定义每一个隐层的激活函数,字符串包括relu,tanh和sigmoid,默认值为relu + :param Union[str,func] output_activation: 字符串或者函数,用来定义输出层的激活函数,默认值为None,表示输出层没有激活函数 :param str initial_method: 参数初始化方式 :param float dropout: dropout概率,默认值为0 - + .. 
note:: 隐藏层的激活函数通过activation定义。一个str/function或者一个str/function的list可以被传入activation。 如果只传入了一个str/function,那么所有隐藏层的激活函数都由这个str/function定义; @@ -35,10 +38,8 @@ class MLP(nn.Module): >>> y = net(x) >>> print(x) >>> print(y) - >>> - """ - + def __init__(self, size_layer, activation='relu', output_activation=None, initial_method=None, dropout=0.0): super(MLP, self).__init__() self.hiddens = nn.ModuleList() @@ -46,12 +47,12 @@ class MLP(nn.Module): self.output_activation = output_activation for i in range(1, len(size_layer)): if i + 1 == len(size_layer): - self.output = nn.Linear(size_layer[i-1], size_layer[i]) + self.output = nn.Linear(size_layer[i - 1], size_layer[i]) else: - self.hiddens.append(nn.Linear(size_layer[i-1], size_layer[i])) - + self.hiddens.append(nn.Linear(size_layer[i - 1], size_layer[i])) + self.dropout = nn.Dropout(p=dropout) - + actives = { 'relu': nn.ReLU(), 'tanh': nn.Tanh(), @@ -80,7 +81,7 @@ class MLP(nn.Module): else: raise ValueError("should set activation correctly: {}".format(activation)) initial_parameter(self, initial_method) - + def forward(self, x): """ :param torch.Tensor x: MLP接受的输入 @@ -93,16 +94,3 @@ class MLP(nn.Module): x = self.output_activation(x) x = self.dropout(x) return x - - -if __name__ == '__main__': - net1 = MLP([5, 10, 5]) - net2 = MLP([5, 10, 5], 'tanh') - net3 = MLP([5, 6, 7, 8, 5], 'tanh') - net4 = MLP([5, 6, 7, 8, 5], 'relu', output_activation='tanh') - net5 = MLP([5, 6, 7, 8, 5], ['tanh', 'relu', 'tanh'], 'tanh') - for net in [net1, net2, net3, net4, net5]: - x = torch.randn(5, 5) - y = net(x) - print(x) - print(y) diff --git a/fastNLP/modules/decoder/utils.py b/fastNLP/modules/decoder/utils.py index a749fa88..434873c7 100644 --- a/fastNLP/modules/decoder/utils.py +++ b/fastNLP/modules/decoder/utils.py @@ -1,10 +1,13 @@ -__all__ = ["viterbi_decode"] import torch +__all__ = [ + "viterbi_decode" +] + def viterbi_decode(logits, transitions, mask=None, unpad=False): - """ - 别名::class:`fastNLP.modules.viterbi_decode` :class:`fastNLP.modules.decoder.utils.viterbi_decode + r""" + 别名::class:`fastNLP.modules.viterbi_decode` :class:`fastNLP.modules.decoder.utils.viterbi_decode` 给定一个特征矩阵以及转移分数矩阵,计算出最佳的路径以及对应的分数 @@ -20,18 +23,19 @@ def viterbi_decode(logits, transitions, mask=None, unpad=False): """ batch_size, seq_len, n_tags = logits.size() - assert n_tags==transitions.size(0) and n_tags==transitions.size(1), "The shapes of transitions and feats are not " \ - "compatible." + assert n_tags == transitions.size(0) and n_tags == transitions.size( + 1), "The shapes of transitions and feats are not " \ + "compatible." 
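    # Added explanatory comments (not in the original source): the decode below is
    # time-major. logits is transposed to [L, B, H] so each step scores the whole
    # batch at once; vscore keeps the best score of any path ending in each tag,
    # vpath keeps the backpointers, and the final loop walks vpath backwards from
    # each sequence's true length to recover the best-scoring tag path.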
logits = logits.transpose(0, 1).data # L, B, H if mask is not None: mask = mask.transpose(0, 1).data.byte() # L, B else: mask = logits.new_ones((seq_len, batch_size), dtype=torch.uint8) - + # dp vpath = logits.new_zeros((seq_len, batch_size, n_tags), dtype=torch.long) vscore = logits[0] - + trans_score = transitions.view(1, n_tags, n_tags).data for i in range(1, seq_len): prev_score = vscore.view(batch_size, n_tags, 1) @@ -41,14 +45,14 @@ def viterbi_decode(logits, transitions, mask=None, unpad=False): vpath[i] = best_dst vscore = best_score.masked_fill(mask[i].eq(0).view(batch_size, 1), 0) + \ vscore.masked_fill(mask[i].view(batch_size, 1), 0) - + # backtrace batch_idx = torch.arange(batch_size, dtype=torch.long, device=logits.device) seq_idx = torch.arange(seq_len, dtype=torch.long, device=logits.device) lens = (mask.long().sum(0) - 1) # idxes [L, B], batched idx from seq_len-1 to 0 idxes = (lens.view(1, -1) - seq_idx.view(-1, 1)) % seq_len - + ans = logits.new_empty((seq_len, batch_size), dtype=torch.long) ans_score, last_tags = vscore.max(1) ans[idxes[0], batch_idx] = last_tags @@ -62,4 +66,4 @@ def viterbi_decode(logits, transitions, mask=None, unpad=False): paths.append(ans[idx, :seq_len + 1].tolist()) else: paths = ans - return paths, ans_score \ No newline at end of file + return paths, ans_score diff --git a/fastNLP/modules/encoder/__init__.py b/fastNLP/modules/encoder/__init__.py index 67f69850..3d65867a 100644 --- a/fastNLP/modules/encoder/__init__.py +++ b/fastNLP/modules/encoder/__init__.py @@ -1,11 +1,29 @@ +from .bert import BertModel +from .char_encoder import ConvolutionCharEncoder, LSTMCharEncoder from .conv_maxpool import ConvMaxpool from .embedding import Embedding from .lstm import LSTM -from .bert import BertModel +from .star_transformer import StarTransformer +from .transformer import TransformerEncoder +from .variational_rnn import VarRNN, VarLSTM, VarGRU __all__ = [ - "LSTM", - "Embedding", + # "BertModel", + + "ConvolutionCharEncoder", + "LSTMCharEncoder", + "ConvMaxpool", - "BertModel" + + "Embedding", + + "LSTM", + + "StarTransformer", + + "TransformerEncoder", + + "VarRNN", + "VarLSTM", + "VarGRU" ] diff --git a/fastNLP/modules/encoder/char_encoder.py b/fastNLP/modules/encoder/char_encoder.py index b5941547..8aefd284 100644 --- a/fastNLP/modules/encoder/char_encoder.py +++ b/fastNLP/modules/encoder/char_encoder.py @@ -1,8 +1,13 @@ import torch -from torch import nn +import torch.nn as nn from ..utils import initial_parameter +__all__ = [ + "ConvolutionCharEncoder", + "LSTMCharEncoder" +] + # from torch.nn.init import xavier_uniform class ConvolutionCharEncoder(nn.Module): @@ -10,20 +15,22 @@ class ConvolutionCharEncoder(nn.Module): 别名::class:`fastNLP.modules.ConvolutionCharEncoder` :class:`fastNLP.modules.encoder.char_encoder.ConvolutionCharEncoder` char级别的卷积编码器. + :param int char_emb_size: char级别embedding的维度. Default: 50 - 例: 有26个字符, 每一个的embedding是一个50维的向量, 所以输入的向量维度为50. + :例: 有26个字符, 每一个的embedding是一个50维的向量, 所以输入的向量维度为50. :param tuple feature_maps: 一个由int组成的tuple. tuple的长度是char级别卷积操作的数目, 第`i`个int表示第`i`个卷积操作的filter. :param tuple kernels: 一个由int组成的tuple. tuple的长度是char级别卷积操作的数目, 第`i`个int表示第`i`个卷积操作的卷积核. 
:param initial_method: 初始化参数的方式, 默认为`xavier normal` """ + def __init__(self, char_emb_size=50, feature_maps=(40, 30, 30), kernels=(3, 4, 5), initial_method=None): super(ConvolutionCharEncoder, self).__init__() self.convs = nn.ModuleList([ nn.Conv2d(1, feature_maps[i], kernel_size=(char_emb_size, kernels[i]), bias=True, padding=(0, 4)) for i in range(len(kernels))]) - + initial_parameter(self, initial_method) - + def forward(self, x): """ :param torch.Tensor x: ``[batch_size * sent_length, word_length, char_emb_size]`` 输入字符的embedding @@ -34,7 +41,7 @@ class ConvolutionCharEncoder(nn.Module): x = x.transpose(2, 3) # [batch_size*sent_length, channel, height, width] return self._convolute(x).unsqueeze(2) - + def _convolute(self, x): feats = [] for conv in self.convs: @@ -50,7 +57,14 @@ class ConvolutionCharEncoder(nn.Module): class LSTMCharEncoder(nn.Module): - """char级别基于LSTM的encoder.""" + """ + 别名::class:`fastNLP.modules.LSTMCharEncoder` :class:`fastNLP.modules.encoder.char_encoder.LSTMCharEncoder` + + char级别基于LSTM的encoder. + + + """ + def __init__(self, char_emb_size=50, hidden_size=None, initial_method=None): """ :param int char_emb_size: char级别embedding的维度. Default: 50 @@ -60,14 +74,14 @@ class LSTMCharEncoder(nn.Module): """ super(LSTMCharEncoder, self).__init__() self.hidden_size = char_emb_size if hidden_size is None else hidden_size - + self.lstm = nn.LSTM(input_size=char_emb_size, hidden_size=self.hidden_size, num_layers=1, bias=True, batch_first=True) initial_parameter(self, initial_method) - + def forward(self, x): """ :param torch.Tensor x: ``[ n_batch*n_word, word_length, char_emb_size]`` 输入字符的embedding @@ -78,6 +92,6 @@ class LSTMCharEncoder(nn.Module): h0 = nn.init.orthogonal_(h0) c0 = torch.empty(1, batch_size, self.hidden_size) c0 = nn.init.orthogonal_(c0) - + _, hidden = self.lstm(x, (h0, c0)) return hidden[0].squeeze().unsqueeze(2) diff --git a/fastNLP/modules/encoder/conv_maxpool.py b/fastNLP/modules/encoder/conv_maxpool.py index 5ecd376d..5e714e88 100644 --- a/fastNLP/modules/encoder/conv_maxpool.py +++ b/fastNLP/modules/encoder/conv_maxpool.py @@ -1,12 +1,13 @@ -# python: 3.6 -# encoding: utf-8 - import torch import torch.nn as nn import torch.nn.functional as F from ..utils import initial_parameter +__all__ = [ + "ConvMaxpool" +] + class ConvMaxpool(nn.Module): """ @@ -27,22 +28,24 @@ class ConvMaxpool(nn.Module): :param str activation: Convolution后的结果将通过该activation后再经过max-pooling。支持relu, sigmoid, tanh :param str initial_method: str。 """ + def __init__(self, in_channels, out_channels, kernel_sizes, stride=1, padding=0, dilation=1, groups=1, bias=True, activation="relu", initial_method=None): super(ConvMaxpool, self).__init__() - + # convolution if isinstance(kernel_sizes, (list, tuple, int)): if isinstance(kernel_sizes, int) and isinstance(out_channels, int): out_channels = [out_channels] kernel_sizes = [kernel_sizes] elif isinstance(kernel_sizes, (tuple, list)) and isinstance(out_channels, (tuple, list)): - assert len(out_channels)==len(kernel_sizes), "The number of out_channels should be equal to the number" \ - " of kernel_sizes." + assert len(out_channels) == len( + kernel_sizes), "The number of out_channels should be equal to the number" \ + " of kernel_sizes." 
else: raise ValueError("The type of out_channels and kernel_sizes should be the same.") - + self.convs = nn.ModuleList([nn.Conv1d( in_channels=in_channels, out_channels=oc, @@ -53,11 +56,11 @@ class ConvMaxpool(nn.Module): groups=groups, bias=bias) for oc, ks in zip(out_channels, kernel_sizes)]) - + else: raise Exception( 'Incorrect kernel sizes: should be list, tuple or int') - + # activation function if activation == 'relu': self.activation = F.relu @@ -68,9 +71,9 @@ class ConvMaxpool(nn.Module): else: raise Exception( "Undefined activation function: choose from: relu, tanh, sigmoid") - + initial_parameter(self, initial_method) - + def forward(self, x, mask=None): """ @@ -83,9 +86,9 @@ class ConvMaxpool(nn.Module): # convolution xs = [self.activation(conv(x)) for conv in self.convs] # [[N,C,L], ...] if mask is not None: - mask = mask.unsqueeze(1) # B x 1 x L + mask = mask.unsqueeze(1) # B x 1 x L xs = [x.masked_fill_(mask, float('-inf')) for x in xs] # max-pooling xs = [F.max_pool1d(input=i, kernel_size=i.size(2)).squeeze(2) for i in xs] # [[N, C], ...] - return torch.cat(xs, dim=-1) # [N, C] \ No newline at end of file + return torch.cat(xs, dim=-1) # [N, C] diff --git a/fastNLP/modules/encoder/embedding.py b/fastNLP/modules/encoder/embedding.py index c402f318..9fa89e7f 100644 --- a/fastNLP/modules/encoder/embedding.py +++ b/fastNLP/modules/encoder/embedding.py @@ -1,14 +1,19 @@ import torch.nn as nn from ..utils import get_embeddings +__all__ = [ + "Embedding" +] + + class Embedding(nn.Embedding): """ 别名::class:`fastNLP.modules.Embedding` :class:`fastNLP.modules.encoder.embedding.Embedding` Embedding组件. 可以通过self.num_embeddings获取词表大小; self.embedding_dim获取embedding的维度""" - + def __init__(self, init_embed, padding_idx=None, dropout=0.0, sparse=False, max_norm=None, norm_type=2, - scale_grad_by_freq=False): + scale_grad_by_freq=False): """ :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: Embedding的大小(传入tuple(int, int), @@ -22,14 +27,14 @@ class Embedding(nn.Embedding): """ embed = get_embeddings(init_embed) num_embeddings, embedding_dim = embed.weight.size() - + super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx, - max_norm=max_norm, norm_type=norm_type, scale_grad_by_freq=scale_grad_by_freq, - sparse=sparse, _weight=embed.weight.data) + max_norm=max_norm, norm_type=norm_type, scale_grad_by_freq=scale_grad_by_freq, + sparse=sparse, _weight=embed.weight.data) del embed - + self.dropout = nn.Dropout(dropout) - + def forward(self, x): """ :param torch.LongTensor x: [batch, seq_len] diff --git a/fastNLP/modules/encoder/lstm.py b/fastNLP/modules/encoder/lstm.py index c853c142..bc9cb155 100644 --- a/fastNLP/modules/encoder/lstm.py +++ b/fastNLP/modules/encoder/lstm.py @@ -1,4 +1,5 @@ -"""轻量封装的 Pytorch LSTM 模块. +""" +轻量封装的 Pytorch LSTM 模块. 可在 forward 时传入序列的长度, 自动对padding做合适的处理. """ import torch @@ -7,6 +8,10 @@ import torch.nn.utils.rnn as rnn from ..utils import initial_parameter +__all__ = [ + "LSTM" +] + class LSTM(nn.Module): """ @@ -23,6 +28,7 @@ class LSTM(nn.Module): :(batch, seq, feature). Default: ``False`` :param bias: 如果为 ``False``, 模型将不会使用bias. 
Default: ``True`` """ + def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0.0, batch_first=True, bidirectional=False, bias=True, initial_method=None): super(LSTM, self).__init__() @@ -30,7 +36,7 @@ class LSTM(nn.Module): self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=bias, batch_first=batch_first, dropout=dropout, bidirectional=bidirectional) initial_parameter(self, initial_method) - + def forward(self, x, seq_len=None, h0=None, c0=None): """ diff --git a/fastNLP/modules/encoder/star_transformer.py b/fastNLP/modules/encoder/star_transformer.py index f0d8e38b..677af48a 100644 --- a/fastNLP/modules/encoder/star_transformer.py +++ b/fastNLP/modules/encoder/star_transformer.py @@ -1,9 +1,14 @@ -"""Star-Transformer 的encoder部分的 Pytorch 实现 """ +Star-Transformer 的encoder部分的 Pytorch 实现 +""" +import numpy as NP import torch from torch import nn from torch.nn import functional as F -import numpy as NP + +__all__ = [ + "StarTransformer" +] class StarTransformer(nn.Module): @@ -24,10 +29,11 @@ class StarTransformer(nn.Module): 模型会为输入序列加上position embedding。 若为`None`,忽略加上position embedding的步骤. Default: `None` """ + def __init__(self, hidden_size, num_layers, num_head, head_dim, dropout=0.1, max_len=None): super(StarTransformer, self).__init__() self.iters = num_layers - + self.norm = nn.ModuleList([nn.LayerNorm(hidden_size) for _ in range(self.iters)]) self.ring_att = nn.ModuleList( [_MSA1(hidden_size, nhead=num_head, head_dim=head_dim, dropout=dropout) @@ -35,12 +41,12 @@ class StarTransformer(nn.Module): self.star_att = nn.ModuleList( [_MSA2(hidden_size, nhead=num_head, head_dim=head_dim, dropout=dropout) for _ in range(self.iters)]) - + if max_len is not None: self.pos_emb = self.pos_emb = nn.Embedding(max_len, hidden_size) else: self.pos_emb = None - + def forward(self, data, mask): """ :param FloatTensor data: [batch, length, hidden] 输入的序列 @@ -50,20 +56,21 @@ class StarTransformer(nn.Module): [batch, hidden] 全局 relay 节点, 详见论文 """ + def norm_func(f, x): # B, H, L, 1 return f(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) - + B, L, H = data.size() - mask = (mask == 0) # flip the mask for masked_fill_ + mask = (mask == 0) # flip the mask for masked_fill_ smask = torch.cat([torch.zeros(B, 1, ).byte().to(mask), mask], 1) - - embs = data.permute(0, 2, 1)[:,:,:,None] # B H L 1 + + embs = data.permute(0, 2, 1)[:, :, :, None] # B H L 1 if self.pos_emb: - P = self.pos_emb(torch.arange(L, dtype=torch.long, device=embs.device)\ - .view(1, L)).permute(0, 2, 1).contiguous()[:, :, :, None] # 1 H L 1 + P = self.pos_emb(torch.arange(L, dtype=torch.long, device=embs.device) \ + .view(1, L)).permute(0, 2, 1).contiguous()[:, :, :, None] # 1 H L 1 embs = embs + P - + nodes = embs relay = embs.mean(2, keepdim=True) ex_mask = mask[:, None, :, None].expand(B, H, L, 1) @@ -72,11 +79,11 @@ class StarTransformer(nn.Module): ax = torch.cat([r_embs, relay.expand(B, H, 1, L)], 2) nodes = nodes + F.leaky_relu(self.ring_att[i](norm_func(self.norm[i], nodes), ax=ax)) relay = F.leaky_relu(self.star_att[i](relay, torch.cat([relay, nodes], 2), smask)) - + nodes = nodes.masked_fill_(ex_mask, 0) - + nodes = nodes.view(B, H, L).permute(0, 2, 1) - + return nodes, relay.view(B, H) @@ -89,37 +96,37 @@ class _MSA1(nn.Module): self.WK = nn.Conv2d(nhid, nhead * head_dim, 1) self.WV = nn.Conv2d(nhid, nhead * head_dim, 1) self.WO = nn.Conv2d(nhead * head_dim, nhid, 1) - + self.drop = nn.Dropout(dropout) - + # print('NUM_HEAD', nhead, 'DIM_HEAD', head_dim) self.nhid, self.nhead, self.head_dim, self.unfold_size = 
nhid, nhead, head_dim, 3 - + def forward(self, x, ax=None): # x: B, H, L, 1, ax : B, H, X, L append features nhid, nhead, head_dim, unfold_size = self.nhid, self.nhead, self.head_dim, self.unfold_size B, H, L, _ = x.shape - + q, k, v = self.WQ(x), self.WK(x), self.WV(x) # x: (B,H,L,1) - + if ax is not None: aL = ax.shape[2] ak = self.WK(ax).view(B, nhead, head_dim, aL, L) av = self.WV(ax).view(B, nhead, head_dim, aL, L) q = q.view(B, nhead, head_dim, 1, L) - k = F.unfold(k.view(B, nhead * head_dim, L, 1), (unfold_size, 1), padding=(unfold_size // 2, 0))\ - .view(B, nhead, head_dim, unfold_size, L) - v = F.unfold(v.view(B, nhead * head_dim, L, 1), (unfold_size, 1), padding=(unfold_size // 2, 0))\ - .view(B, nhead, head_dim, unfold_size, L) + k = F.unfold(k.view(B, nhead * head_dim, L, 1), (unfold_size, 1), padding=(unfold_size // 2, 0)) \ + .view(B, nhead, head_dim, unfold_size, L) + v = F.unfold(v.view(B, nhead * head_dim, L, 1), (unfold_size, 1), padding=(unfold_size // 2, 0)) \ + .view(B, nhead, head_dim, unfold_size, L) if ax is not None: k = torch.cat([k, ak], 3) v = torch.cat([v, av], 3) - + alphas = self.drop(F.softmax((q * k).sum(2, keepdim=True) / NP.sqrt(head_dim), 3)) # B N L 1 U att = (alphas * v).sum(3).view(B, nhead * head_dim, L, 1) - + ret = self.WO(att) - + return ret @@ -131,19 +138,19 @@ class _MSA2(nn.Module): self.WK = nn.Conv2d(nhid, nhead * head_dim, 1) self.WV = nn.Conv2d(nhid, nhead * head_dim, 1) self.WO = nn.Conv2d(nhead * head_dim, nhid, 1) - + self.drop = nn.Dropout(dropout) - + # print('NUM_HEAD', nhead, 'DIM_HEAD', head_dim) self.nhid, self.nhead, self.head_dim, self.unfold_size = nhid, nhead, head_dim, 3 - + def forward(self, x, y, mask=None): # x: B, H, 1, 1, 1 y: B H L 1 nhid, nhead, head_dim, unfold_size = self.nhid, self.nhead, self.head_dim, self.unfold_size B, H, L, _ = y.shape - + q, k, v = self.WQ(x), self.WK(y), self.WV(y) - + q = q.view(B, nhead, 1, head_dim) # B, H, 1, 1 -> B, N, 1, h k = k.view(B, nhead, head_dim, L) # B, H, L, 1 -> B, N, h, L v = v.view(B, nhead, head_dim, L).permute(0, 1, 3, 2) # B, H, L, 1 -> B, N, L, h diff --git a/fastNLP/modules/encoder/transformer.py b/fastNLP/modules/encoder/transformer.py index 7dcae342..2532d90a 100644 --- a/fastNLP/modules/encoder/transformer.py +++ b/fastNLP/modules/encoder/transformer.py @@ -3,6 +3,10 @@ from torch import nn from ..aggregator.attention import MultiHeadAttention from ..dropout import TimestepDropout +__all__ = [ + "TransformerEncoder" +] + class TransformerEncoder(nn.Module): """ @@ -19,6 +23,7 @@ class TransformerEncoder(nn.Module): :param int num_head: head的数量。 :param float dropout: dropout概率. 
Default: 0.1 """ + class SubLayer(nn.Module): def __init__(self, model_size, inner_size, key_size, value_size, num_head, dropout=0.1): super(TransformerEncoder.SubLayer, self).__init__() @@ -27,9 +32,9 @@ class TransformerEncoder(nn.Module): self.ffn = nn.Sequential(nn.Linear(model_size, inner_size), nn.ReLU(), nn.Linear(inner_size, model_size), - TimestepDropout(dropout),) + TimestepDropout(dropout), ) self.norm2 = nn.LayerNorm(model_size) - + def forward(self, input, seq_mask=None, atte_mask_out=None): """ @@ -44,11 +49,11 @@ class TransformerEncoder(nn.Module): output = self.norm2(output + norm_atte) output *= seq_mask return output - + def __init__(self, num_layers, **kargs): super(TransformerEncoder, self).__init__() self.layers = nn.ModuleList([self.SubLayer(**kargs) for _ in range(num_layers)]) - + def forward(self, x, seq_mask=None): """ :param x: [batch, seq_len, model_size] 输入序列 @@ -60,8 +65,8 @@ class TransformerEncoder(nn.Module): if seq_mask is None: atte_mask_out = None else: - atte_mask_out = (seq_mask < 1)[:,None,:] - seq_mask = seq_mask[:,:,None] + atte_mask_out = (seq_mask < 1)[:, None, :] + seq_mask = seq_mask[:, :, None] for layer in self.layers: output = layer(output, seq_mask, atte_mask_out) return output diff --git a/fastNLP/modules/encoder/variational_rnn.py b/fastNLP/modules/encoder/variational_rnn.py index b926ba9e..60cdf9c5 100644 --- a/fastNLP/modules/encoder/variational_rnn.py +++ b/fastNLP/modules/encoder/variational_rnn.py @@ -1,9 +1,9 @@ -"""Variational RNN 的 Pytorch 实现 +""" +Variational RNN 的 Pytorch 实现 """ import torch import torch.nn as nn from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence, pad_packed_sequence -from ..utils import initial_parameter try: from torch import flip @@ -14,18 +14,27 @@ except ImportError: indices[dim] = torch.arange(x.size(dim) - 1, -1, -1, dtype=torch.long, device=x.device) return x[tuple(indices)] +from ..utils import initial_parameter + +__all__ = [ + "VarRNN", + "VarLSTM", + "VarGRU" +] + class VarRnnCellWrapper(nn.Module): - """Wrapper for normal RNN Cells, make it support variational dropout """ - + Wrapper for normal RNN Cells, make it support variational dropout + """ + def __init__(self, cell, hidden_size, input_p, hidden_p): super(VarRnnCellWrapper, self).__init__() self.cell = cell self.hidden_size = hidden_size self.input_p = input_p self.hidden_p = hidden_p - + def forward(self, input_x, hidden, mask_x, mask_h, is_reversed=False): """ :param PackedSequence input_x: [seq_len, batch_size, input_size] @@ -37,11 +46,13 @@ class VarRnnCellWrapper(nn.Module): hidden: for LSTM, tuple of (h_n, c_n), [batch_size, hidden_size] for other RNN, h_n, [batch_size, hidden_size] """ + def get_hi(hi, h0, size): h0_size = size - hi.size(0) if h0_size > 0: return torch.cat([hi, h0[:h0_size]], dim=0) return hi[:size] + is_lstm = isinstance(hidden, tuple) input, batch_sizes = input_x.data, input_x.batch_sizes output = [] @@ -52,7 +63,7 @@ class VarRnnCellWrapper(nn.Module): else: batch_iter = batch_sizes idx = 0 - + if is_lstm: hn = (hidden[0].clone(), hidden[1].clone()) else: @@ -60,10 +71,10 @@ class VarRnnCellWrapper(nn.Module): hi = hidden for size in batch_iter: if is_reversed: - input_i = input[idx-size: idx] * mask_x[:size] + input_i = input[idx - size: idx] * mask_x[:size] idx -= size else: - input_i = input[idx: idx+size] * mask_x[:size] + input_i = input[idx: idx + size] * mask_x[:size] idx += size mask_hi = mask_h[:size] if is_lstm: @@ -78,7 +89,7 @@ class VarRnnCellWrapper(nn.Module): hi = cell(input_i, hi) 
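                # Added note (not in the upstream file): in a PackedSequence the
                # effective batch shrinks as shorter sequences finish, so only the
                # first `size` rows are active at this timestep; the slice
                # assignment below writes the updated hidden state back for
                # exactly those rows before appending to the output.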
hn[:size] = hi output.append(hi) - + if is_reversed: output = list(reversed(output)) output = torch.cat(output, dim=0) @@ -86,7 +97,9 @@ class VarRnnCellWrapper(nn.Module): class VarRNNBase(nn.Module): - """Variational Dropout RNN 实现. + """ + Variational Dropout RNN 实现. + 论文参考: `A Theoretically Grounded Application of Dropout in Recurrent Neural Networks (Yarin Gal and Zoubin Ghahramani, 2016) https://arxiv.org/abs/1512.05287`. @@ -102,7 +115,7 @@ class VarRNNBase(nn.Module): :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0 :param bidirectional: 若为 ``True``, 使用双向的RNN. Default: ``False`` """ - + def __init__(self, mode, Cell, input_size, hidden_size, num_layers=1, bias=True, batch_first=False, input_dropout=0, hidden_dropout=0, bidirectional=False): @@ -125,7 +138,7 @@ class VarRNNBase(nn.Module): self._all_cells.append(VarRnnCellWrapper(cell, self.hidden_size, input_dropout, hidden_dropout)) initial_parameter(self) self.is_lstm = (self.mode == "LSTM") - + def _forward_one(self, n_layer, n_direction, input, hx, mask_x, mask_h): is_lstm = self.is_lstm idx = self.num_directions * n_layer + n_direction @@ -133,7 +146,7 @@ class VarRNNBase(nn.Module): hi = (hx[0][idx], hx[1][idx]) if is_lstm else hx[idx] output_x, hidden_x = cell(input, hi, mask_x, mask_h, is_reversed=(n_direction == 1)) return output_x, hidden_x - + def forward(self, x, hx=None): """ @@ -152,19 +165,19 @@ class VarRNNBase(nn.Module): else: max_batch_size = int(input.batch_sizes[0]) input, batch_sizes = input.data, input.batch_sizes - + if hx is None: hx = x.new_zeros(self.num_layers * self.num_directions, max_batch_size, self.hidden_size, requires_grad=True) if is_lstm: hx = (hx, hx.new_zeros(hx.size(), requires_grad=True)) - + mask_x = x.new_ones((max_batch_size, self.input_size)) mask_out = x.new_ones((max_batch_size, self.hidden_size * self.num_directions)) mask_h_ones = x.new_ones((max_batch_size, self.hidden_size)) nn.functional.dropout(mask_x, p=self.input_dropout, training=self.training, inplace=True) nn.functional.dropout(mask_out, p=self.hidden_dropout, training=self.training, inplace=True) - + hidden = x.new_zeros((self.num_layers * self.num_directions, max_batch_size, self.hidden_size)) if is_lstm: cellstate = x.new_zeros((self.num_layers * self.num_directions, max_batch_size, self.hidden_size)) @@ -183,18 +196,19 @@ class VarRNNBase(nn.Module): else: hidden[idx] = hidden_x x = torch.cat(output_list, dim=-1) - + if is_lstm: hidden = (hidden, cellstate) - + if is_packed: output = PackedSequence(x, batch_sizes) else: x = PackedSequence(x, batch_sizes) output, _ = pad_packed_sequence(x, batch_first=self.batch_first) - + return output, hidden + class VarLSTM(VarRNNBase): """ 别名::class:`fastNLP.modules.VarLSTM` :class:`fastNLP.modules.encoder.variational_rnn.VarLSTM` @@ -211,10 +225,10 @@ class VarLSTM(VarRNNBase): :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0 :param bidirectional: 若为 ``True``, 使用双向的LSTM. Default: ``False`` """ - + def __init__(self, *args, **kwargs): super(VarLSTM, self).__init__(mode="LSTM", Cell=nn.LSTMCell, *args, **kwargs) - + def forward(self, x, hx=None): return super(VarLSTM, self).forward(x, hx) @@ -235,13 +249,14 @@ class VarRNN(VarRNNBase): :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0 :param bidirectional: 若为 ``True``, 使用双向的RNN. 
Default: ``False`` """ - + def __init__(self, *args, **kwargs): super(VarRNN, self).__init__(mode="RNN", Cell=nn.RNNCell, *args, **kwargs) - + def forward(self, x, hx=None): return super(VarRNN, self).forward(x, hx) + class VarGRU(VarRNNBase): """ 别名::class:`fastNLP.modules.VarGRU` :class:`fastNLP.modules.encoder.variational_rnn.VarGRU` @@ -258,10 +273,9 @@ class VarGRU(VarRNNBase): :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0 :param bidirectional: 若为 ``True``, 使用双向的GRU. Default: ``False`` """ - + def __init__(self, *args, **kwargs): super(VarGRU, self).__init__(mode="GRU", Cell=nn.GRUCell, *args, **kwargs) - + def forward(self, x, hx=None): return super(VarGRU, self).forward(x, hx) - diff --git a/reproduction/Chinese_word_segmentation/models/cws_model.py b/reproduction/Chinese_word_segmentation/models/cws_model.py index 13632207..b41ad87d 100644 --- a/reproduction/Chinese_word_segmentation/models/cws_model.py +++ b/reproduction/Chinese_word_segmentation/models/cws_model.py @@ -3,7 +3,7 @@ import torch from torch import nn from fastNLP.models.base_model import BaseModel -from fastNLP.modules.decoder.MLP import MLP +from fastNLP.modules.decoder.mlp import MLP from reproduction.Chinese_word_segmentation.utils import seq_lens_to_mask @@ -120,8 +120,8 @@ class CWSBiLSTMSegApp(BaseModel): return {'pred_tags': pred_tags} -from fastNLP.modules.decoder.CRF import ConditionalRandomField -from fastNLP.modules.decoder.CRF import allowed_transitions +from fastNLP.modules.decoder.crf import ConditionalRandomField +from fastNLP.modules.decoder.crf import allowed_transitions class CWSBiLSTMCRF(BaseModel): def __init__(self, vocab_num, embed_dim=100, bigram_vocab_num=None, bigram_embed_dim=100, num_bigram_per_char=None, diff --git a/reproduction/Chinese_word_segmentation/models/cws_transformer.py b/reproduction/Chinese_word_segmentation/models/cws_transformer.py index f6c2dab6..e8ae5ecc 100644 --- a/reproduction/Chinese_word_segmentation/models/cws_transformer.py +++ b/reproduction/Chinese_word_segmentation/models/cws_transformer.py @@ -10,8 +10,8 @@ from torch import nn import torch # from fastNLP.modules.encoder.transformer import TransformerEncoder from reproduction.Chinese_word_segmentation.models.transformer import TransformerEncoder -from fastNLP.modules.decoder.CRF import ConditionalRandomField,seq_len_to_byte_mask -from fastNLP.modules.decoder.CRF import allowed_transitions +from fastNLP.modules.decoder.crf import ConditionalRandomField,seq_len_to_byte_mask +from fastNLP.modules.decoder.crf import allowed_transitions class TransformerCWS(nn.Module): def __init__(self, vocab_num, embed_dim=100, bigram_vocab_num=None, bigram_embed_dim=100, num_bigram_per_char=None, diff --git a/reproduction/LSTM+self_attention_sentiment_analysis/main.py b/reproduction/LSTM+self_attention_sentiment_analysis/main.py index 4ca5388f..871dc476 100644 --- a/reproduction/LSTM+self_attention_sentiment_analysis/main.py +++ b/reproduction/LSTM+self_attention_sentiment_analysis/main.py @@ -7,7 +7,7 @@ from fastNLP.io.config_io import ConfigSection from fastNLP.io.dataset_loader import DummyClassificationReader as Dataset_loader from fastNLP.models.base_model import BaseModel from fastNLP.modules.aggregator.self_attention import SelfAttention -from fastNLP.modules.decoder.MLP import MLP +from fastNLP.modules.decoder.mlp import MLP from fastNLP.modules.encoder.embedding import Embedding as Embedding from fastNLP.modules.encoder.lstm import LSTM diff --git a/test/modules/decoder/test_CRF.py 
b/test/modules/decoder/test_CRF.py
index 5fb49253..5dec7d47 100644
--- a/test/modules/decoder/test_CRF.py
+++ b/test/modules/decoder/test_CRF.py
@@ -5,7 +5,7 @@ import unittest
 class TestCRF(unittest.TestCase):
     def test_case1(self):
         # 检查allowed_transitions()能否正确使用
-        from fastNLP.modules.decoder.CRF import allowed_transitions
+        from fastNLP.modules.decoder.crf import allowed_transitions
 
         id2label = {0: 'B', 1: 'I', 2:'O'}
         expected_res = {(0, 0), (0, 1), (0, 2), (0, 4), (1, 0), (1, 1), (1, 2), (1, 4), (2, 0), (2, 2),
@@ -43,7 +43,7 @@ class TestCRF(unittest.TestCase):
         # 测试CRF能否避免解码出非法跃迁, 使用allennlp做了验证。
         pass
         # import torch
-        # from fastNLP.modules.decoder.CRF import seq_len_to_byte_mask
+        # from fastNLP.modules.decoder.crf import seq_len_to_byte_mask
         #
         # labels = ['O']
         # for label in ['X', 'Y']:
@@ -63,7 +63,7 @@ class TestCRF(unittest.TestCase):
         # mask = seq_len_to_byte_mask(seq_lens)
         # allen_res = allen_CRF.viterbi_tags(logits, mask)
         #
-        # from fastNLP.modules.decoder.CRF import ConditionalRandomField, allowed_transitions
+        # from fastNLP.modules.decoder.crf import ConditionalRandomField, allowed_transitions
         # fast_CRF = ConditionalRandomField(num_tags=num_tags, allowed_transitions=allowed_transitions(id2label))
         # fast_CRF.trans_m = trans_m
         # fast_res = fast_CRF.viterbi_decode(logits, mask, get_score=True, unpad=True)
@@ -91,7 +91,7 @@ class TestCRF(unittest.TestCase):
         # mask = seq_len_to_byte_mask(seq_lens)
         # allen_res = allen_CRF.viterbi_tags(logits, mask)
         #
-        # from fastNLP.modules.decoder.CRF import ConditionalRandomField, allowed_transitions
+        # from fastNLP.modules.decoder.crf import ConditionalRandomField, allowed_transitions
         # fast_CRF = ConditionalRandomField(num_tags=num_tags, allowed_transitions=allowed_transitions(id2label,
         #                                                                           encoding_type='BMES'))
         # fast_CRF.trans_m = trans_m
@@ -104,7 +104,7 @@ class TestCRF(unittest.TestCase):
     def test_case3(self):
         # 测试crf的loss不会出现负数
         import torch
-        from fastNLP.modules.decoder.CRF import ConditionalRandomField
+        from fastNLP.modules.decoder.crf import ConditionalRandomField
         from fastNLP.core.utils import seq_len_to_mask
         from torch import optim
         from torch import nn

From 4ac4cda049a85f6cc73b854ac79f2bb549cfee97 Mon Sep 17 00:00:00 2001
From: yunfan
Date: Fri, 17 May 2019 13:22:42 +0800
Subject: [PATCH 5/8] fix var rnn

---
 fastNLP/modules/encoder/variational_rnn.py | 92 +++++++++++++---------
 1 file changed, 53 insertions(+), 39 deletions(-)

diff --git a/fastNLP/modules/encoder/variational_rnn.py b/fastNLP/modules/encoder/variational_rnn.py
index 60cdf9c5..753741de 100644
--- a/fastNLP/modules/encoder/variational_rnn.py
+++ b/fastNLP/modules/encoder/variational_rnn.py
@@ -11,7 +11,8 @@ except ImportError:
     def flip(x, dims):
         indices = [slice(None)] * x.dim()
         for dim in dims:
-            indices[dim] = torch.arange(x.size(dim) - 1, -1, -1, dtype=torch.long, device=x.device)
+            indices[dim] = torch.arange(
+                x.size(dim) - 1, -1, -1, dtype=torch.long, device=x.device)
         return x[tuple(indices)]
 
 from ..utils import initial_parameter
@@ -27,14 +28,14 @@ class VarRnnCellWrapper(nn.Module):
     """
     Wrapper for normal RNN Cells, make it support variational dropout
     """
-    
+
     def __init__(self, cell, hidden_size, input_p, hidden_p):
         super(VarRnnCellWrapper, self).__init__()
         self.cell = cell
         self.hidden_size = hidden_size
         self.input_p = input_p
         self.hidden_p = hidden_p
-    
+
     def forward(self, input_x, hidden, mask_x, mask_h, is_reversed=False):
         """
         :param PackedSequence input_x: [seq_len, batch_size, input_size]
@@ -46,13 +47,13 @@ class VarRnnCellWrapper(nn.Module):
             hidden:
for LSTM, tuple of (h_n, c_n), [batch_size, hidden_size] for other RNN, h_n, [batch_size, hidden_size] """ - + def get_hi(hi, h0, size): h0_size = size - hi.size(0) if h0_size > 0: return torch.cat([hi, h0[:h0_size]], dim=0) return hi[:size] - + is_lstm = isinstance(hidden, tuple) input, batch_sizes = input_x.data, input_x.batch_sizes output = [] @@ -63,7 +64,7 @@ class VarRnnCellWrapper(nn.Module): else: batch_iter = batch_sizes idx = 0 - + if is_lstm: hn = (hidden[0].clone(), hidden[1].clone()) else: @@ -79,7 +80,8 @@ class VarRnnCellWrapper(nn.Module): mask_hi = mask_h[:size] if is_lstm: hx, cx = hi - hi = (get_hi(hx, hidden[0], size) * mask_hi, get_hi(cx, hidden[1], size)) + hi = (get_hi(hx, hidden[0], size) * + mask_hi, get_hi(cx, hidden[1], size)) hi = cell(input_i, hi) hn[0][:size] = hi[0] hn[1][:size] = hi[1] @@ -89,7 +91,7 @@ class VarRnnCellWrapper(nn.Module): hi = cell(input_i, hi) hn[:size] = hi output.append(hi) - + if is_reversed: output = list(reversed(output)) output = torch.cat(output, dim=0) @@ -99,7 +101,7 @@ class VarRnnCellWrapper(nn.Module): class VarRNNBase(nn.Module): """ Variational Dropout RNN 实现. - + 论文参考: `A Theoretically Grounded Application of Dropout in Recurrent Neural Networks (Yarin Gal and Zoubin Ghahramani, 2016) https://arxiv.org/abs/1512.05287`. @@ -115,7 +117,7 @@ class VarRNNBase(nn.Module): :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0 :param bidirectional: 若为 ``True``, 使用双向的RNN. Default: ``False`` """ - + def __init__(self, mode, Cell, input_size, hidden_size, num_layers=1, bias=True, batch_first=False, input_dropout=0, hidden_dropout=0, bidirectional=False): @@ -135,18 +137,20 @@ class VarRNNBase(nn.Module): for direction in range(self.num_directions): input_size = self.input_size if layer == 0 else self.hidden_size * self.num_directions cell = Cell(input_size, self.hidden_size, bias) - self._all_cells.append(VarRnnCellWrapper(cell, self.hidden_size, input_dropout, hidden_dropout)) + self._all_cells.append(VarRnnCellWrapper( + cell, self.hidden_size, input_dropout, hidden_dropout)) initial_parameter(self) self.is_lstm = (self.mode == "LSTM") - + def _forward_one(self, n_layer, n_direction, input, hx, mask_x, mask_h): is_lstm = self.is_lstm idx = self.num_directions * n_layer + n_direction cell = self._all_cells[idx] hi = (hx[0][idx], hx[1][idx]) if is_lstm else hx[idx] - output_x, hidden_x = cell(input, hi, mask_x, mask_h, is_reversed=(n_direction == 1)) + output_x, hidden_x = cell( + input, hi, mask_x, mask_h, is_reversed=(n_direction == 1)) return output_x, hidden_x - + def forward(self, x, hx=None): """ @@ -160,31 +164,38 @@ class VarRNNBase(nn.Module): if not is_packed: seq_len = x.size(1) if self.batch_first else x.size(0) max_batch_size = x.size(0) if self.batch_first else x.size(1) - seq_lens = torch.LongTensor([seq_len for _ in range(max_batch_size)]) - input = pack_padded_sequence(input, seq_lens, batch_first=self.batch_first) + seq_lens = torch.LongTensor( + [seq_len for _ in range(max_batch_size)]) + x = pack_padded_sequence(x, seq_lens, batch_first=self.batch_first) else: - max_batch_size = int(input.batch_sizes[0]) - input, batch_sizes = input.data, input.batch_sizes - + max_batch_size = int(x.batch_sizes[0]) + x, batch_sizes = x.data, x.batch_sizes + if hx is None: hx = x.new_zeros(self.num_layers * self.num_directions, max_batch_size, self.hidden_size, requires_grad=True) if is_lstm: hx = (hx, hx.new_zeros(hx.size(), requires_grad=True)) - + mask_x = x.new_ones((max_batch_size, self.input_size)) - mask_out = 
x.new_ones((max_batch_size, self.hidden_size * self.num_directions)) + mask_out = x.new_ones( + (max_batch_size, self.hidden_size * self.num_directions)) mask_h_ones = x.new_ones((max_batch_size, self.hidden_size)) - nn.functional.dropout(mask_x, p=self.input_dropout, training=self.training, inplace=True) - nn.functional.dropout(mask_out, p=self.hidden_dropout, training=self.training, inplace=True) - - hidden = x.new_zeros((self.num_layers * self.num_directions, max_batch_size, self.hidden_size)) + nn.functional.dropout(mask_x, p=self.input_dropout, + training=self.training, inplace=True) + nn.functional.dropout(mask_out, p=self.hidden_dropout, + training=self.training, inplace=True) + + hidden = x.new_zeros( + (self.num_layers * self.num_directions, max_batch_size, self.hidden_size)) if is_lstm: - cellstate = x.new_zeros((self.num_layers * self.num_directions, max_batch_size, self.hidden_size)) + cellstate = x.new_zeros( + (self.num_layers * self.num_directions, max_batch_size, self.hidden_size)) for layer in range(self.num_layers): output_list = [] input_seq = PackedSequence(x, batch_sizes) - mask_h = nn.functional.dropout(mask_h_ones, p=self.hidden_dropout, training=self.training, inplace=False) + mask_h = nn.functional.dropout( + mask_h_ones, p=self.hidden_dropout, training=self.training, inplace=False) for direction in range(self.num_directions): output_x, hidden_x = self._forward_one(layer, direction, input_seq, hx, mask_x if layer == 0 else mask_out, mask_h) @@ -196,16 +207,16 @@ class VarRNNBase(nn.Module): else: hidden[idx] = hidden_x x = torch.cat(output_list, dim=-1) - + if is_lstm: hidden = (hidden, cellstate) - + if is_packed: output = PackedSequence(x, batch_sizes) else: x = PackedSequence(x, batch_sizes) output, _ = pad_packed_sequence(x, batch_first=self.batch_first) - + return output, hidden @@ -225,10 +236,11 @@ class VarLSTM(VarRNNBase): :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0 :param bidirectional: 若为 ``True``, 使用双向的LSTM. Default: ``False`` """ - + def __init__(self, *args, **kwargs): - super(VarLSTM, self).__init__(mode="LSTM", Cell=nn.LSTMCell, *args, **kwargs) - + super(VarLSTM, self).__init__( + mode="LSTM", Cell=nn.LSTMCell, *args, **kwargs) + def forward(self, x, hx=None): return super(VarLSTM, self).forward(x, hx) @@ -249,10 +261,11 @@ class VarRNN(VarRNNBase): :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0 :param bidirectional: 若为 ``True``, 使用双向的RNN. Default: ``False`` """ - + def __init__(self, *args, **kwargs): - super(VarRNN, self).__init__(mode="RNN", Cell=nn.RNNCell, *args, **kwargs) - + super(VarRNN, self).__init__( + mode="RNN", Cell=nn.RNNCell, *args, **kwargs) + def forward(self, x, hx=None): return super(VarRNN, self).forward(x, hx) @@ -273,9 +286,10 @@ class VarGRU(VarRNNBase): :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0 :param bidirectional: 若为 ``True``, 使用双向的GRU. 
Default: ``False`` """ - + def __init__(self, *args, **kwargs): - super(VarGRU, self).__init__(mode="GRU", Cell=nn.GRUCell, *args, **kwargs) - + super(VarGRU, self).__init__( + mode="GRU", Cell=nn.GRUCell, *args, **kwargs) + def forward(self, x, hx=None): return super(VarGRU, self).forward(x, hx) From aabbdb4df5464cde1cc37e98772260c8a25997cc Mon Sep 17 00:00:00 2001 From: yh Date: Fri, 17 May 2019 14:01:40 +0800 Subject: [PATCH 6/8] =?UTF-8?q?=E5=A2=9E=E5=8A=A0vocab=E7=9A=84clear?= =?UTF-8?q?=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/vocabulary.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index 03759194..3d9598a3 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -322,7 +322,18 @@ class Vocabulary(object): :return str word: the word """ return self.idx2word[idx] - + + def clear(self): + """ + 删除Vocabulary中的词表数据。相当于重新初始化一下。 + + :return: + """ + self.word_count.clear() + self.word2idx = None + self.idx2word = None + self.rebuild = True + def __getstate__(self): """Use to prepare data for pickle. From bdec6187a2a1e2c3d0b7a5095a79f7d4bb9fbbdf Mon Sep 17 00:00:00 2001 From: ChenXin Date: Sat, 18 May 2019 16:40:54 +0800 Subject: [PATCH 7/8] =?UTF-8?q?=E4=B8=80=E4=BA=9B=E7=AC=A6=E5=90=88=20PEP8?= =?UTF-8?q?=20=E7=9A=84=E5=BE=AE=E8=B0=83?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/__init__.py | 4 +- fastNLP/core/batch.py | 8 +-- fastNLP/core/callback.py | 24 ++++----- fastNLP/core/dataset.py | 8 +-- fastNLP/core/field.py | 8 +-- fastNLP/core/losses.py | 24 ++++----- fastNLP/core/metrics.py | 14 +++--- fastNLP/core/optimizer.py | 4 +- fastNLP/core/sampler.py | 8 +-- fastNLP/core/trainer.py | 3 ++ fastNLP/core/utils.py | 9 ++-- fastNLP/core/vocabulary.py | 12 ++--- fastNLP/io/__init__.py | 10 ++-- fastNLP/io/base_loader.py | 6 +-- fastNLP/io/config_io.py | 12 ++--- fastNLP/io/dataset_loader.py | 12 ++--- fastNLP/io/embed_loader.py | 8 +-- fastNLP/io/model_io.py | 8 +-- fastNLP/models/__init__.py | 18 +++---- fastNLP/models/biaffine_parser.py | 10 ++-- fastNLP/models/cnn_text_classification.py | 8 +-- fastNLP/models/enas_utils.py | 1 - fastNLP/models/sequence_labeling.py | 10 ++-- fastNLP/models/snli.py | 8 +-- fastNLP/models/star_transformer.py | 14 +++--- fastNLP/modules/__init__.py | 18 +++---- fastNLP/modules/aggregator/__init__.py | 14 +++--- fastNLP/modules/aggregator/attention.py | 8 +-- fastNLP/modules/aggregator/pooling.py | 9 +++- fastNLP/modules/decoder/__init__.py | 10 ++-- fastNLP/modules/decoder/crf.py | 10 ++-- fastNLP/modules/decoder/mlp.py | 8 +-- fastNLP/modules/decoder/utils.py | 3 +- fastNLP/modules/dropout.py | 6 ++- fastNLP/modules/encoder/__init__.py | 17 +++---- fastNLP/modules/encoder/char_encoder.py | 9 ++-- fastNLP/modules/encoder/conv_maxpool.py | 7 ++- fastNLP/modules/encoder/embedding.py | 5 +- fastNLP/modules/encoder/lstm.py | 8 +-- fastNLP/modules/encoder/star_transformer.py | 8 +-- fastNLP/modules/encoder/transformer.py | 7 ++- fastNLP/modules/encoder/variational_rnn.py | 54 ++++++++++----------- fastNLP/modules/utils.py | 2 +- 43 files changed, 229 insertions(+), 225 deletions(-) diff --git a/fastNLP/__init__.py b/fastNLP/__init__.py index 5dd5fd54..c67e5919 100644 --- a/fastNLP/__init__.py +++ b/fastNLP/__init__.py @@ -52,8 +52,8 @@ __all__ = [ "cache_results" ] +__version__ = '0.4.0' + from .core import * 
from . import models from . import modules - -__version__ = '0.4.0' diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py index b031d051..c1289adf 100644 --- a/fastNLP/core/batch.py +++ b/fastNLP/core/batch.py @@ -2,6 +2,10 @@ batch 模块实现了 fastNLP 所需的 Batch 类。 """ +__all__ = [ + "Batch" +] + import atexit from queue import Empty, Full @@ -11,10 +15,6 @@ import torch.multiprocessing as mp from .sampler import RandomSampler -__all__ = [ - "Batch" -] - _python_is_exit = False diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 51495f23..6825ea6e 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -49,6 +49,18 @@ callback模块实现了 fastNLP 中的许多 callback 类,用于增强 :class: trainer.train() """ +__all__ = [ + "Callback", + "GradientClipCallback", + "EarlyStopCallback", + "TensorboardCallback", + "LRScheduler", + "ControlC", + + "CallbackException", + "EarlyStopError" +] + import os import torch @@ -62,18 +74,6 @@ except: from ..io.model_io import ModelSaver, ModelLoader -__all__ = [ - "Callback", - "GradientClipCallback", - "EarlyStopCallback", - "TensorboardCallback", - "LRScheduler", - "ControlC", - - "CallbackException", - "EarlyStopError" -] - class Callback(object): """ diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index f20dd1f8..2da9f6d9 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -272,6 +272,10 @@ """ +__all__ = [ + "DataSet" +] + import _pickle as pickle import warnings @@ -282,10 +286,6 @@ from .field import FieldArray from .instance import Instance from .utils import _get_func_signature -__all__ = [ - "DataSet" -] - class DataSet(object): """ diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index 14e2538d..21ead327 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -3,10 +3,6 @@ field模块实现了 FieldArray 和若干 Padder。 FieldArray 是 :class:`~fas 原理部分请参考 :doc:`fastNLP.core.dataset` """ -from copy import deepcopy - -import numpy as np - __all__ = [ "FieldArray", "Padder", @@ -14,6 +10,10 @@ __all__ = [ "EngChar2DPadder" ] +from copy import deepcopy + +import numpy as np + class FieldArray(object): """ diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 797b557d..ddc2c49f 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -2,6 +2,18 @@ losses 模块定义了 fastNLP 中所需的各种损失函数,一般做为 :class:`~fastNLP.Trainer` 的参数使用。 """ +__all__ = [ + "LossBase", + + "LossFunc", + "LossInForward", + + "CrossEntropyLoss", + "BCELoss", + "L1Loss", + "NLLLoss" +] + import inspect from collections import defaultdict @@ -15,18 +27,6 @@ from .utils import _check_arg_dict_list from .utils import _check_function_or_method from .utils import _get_func_signature -__all__ = [ - "LossBase", - - "LossFunc", - "LossInForward", - - "CrossEntropyLoss", - "BCELoss", - "L1Loss", - "NLLLoss" -] - class LossBase(object): """ diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 5ea2a5f1..f633a80f 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -2,6 +2,13 @@ metrics 模块实现了 fastNLP 所需的各种常用衡量指标,一般做为 :class:`~fastNLP.Trainer` 的参数使用。 """ +__all__ = [ + "MetricBase", + "AccuracyMetric", + "SpanFPreRecMetric", + "SQuADMetric" +] + import inspect from collections import defaultdict @@ -16,13 +23,6 @@ from .utils import _get_func_signature from .utils import seq_len_to_mask from .vocabulary import Vocabulary -__all__ = [ - "MetricBase", - "AccuracyMetric", - "SpanFPreRecMetric", - "SQuADMetric" -] - class MetricBase(object): """ diff --git a/fastNLP/core/optimizer.py 
b/fastNLP/core/optimizer.py index 28f618f9..ef619042 100644 --- a/fastNLP/core/optimizer.py +++ b/fastNLP/core/optimizer.py @@ -2,14 +2,14 @@ optimizer 模块定义了 fastNLP 中所需的各种优化器,一般做为 :class:`~fastNLP.Trainer` 的参数使用。 """ -import torch - __all__ = [ "Optimizer", "SGD", "Adam" ] +import torch + class Optimizer(object): """ diff --git a/fastNLP/core/sampler.py b/fastNLP/core/sampler.py index c8577722..c5784f59 100644 --- a/fastNLP/core/sampler.py +++ b/fastNLP/core/sampler.py @@ -1,10 +1,6 @@ """ sampler 子类实现了 fastNLP 所需的各种采样器。 """ -from itertools import chain - -import numpy as np - __all__ = [ "Sampler", "BucketSampler", @@ -12,6 +8,10 @@ __all__ = [ "RandomSampler" ] +from itertools import chain + +import numpy as np + class Sampler(object): """ diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 7efa5d28..702cb6e7 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -295,6 +295,9 @@ Example2.3 fastNLP已经自带了很多callback函数供使用,可以参考 :doc:`fastNLP.core.callback` 。 """ +__all__ = [ + "Trainer" +] import os import time diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index 6e2f99ff..14ac409f 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -1,6 +1,11 @@ """ utils模块实现了 fastNLP 内部和外部所需的很多工具。其中用户可以使用的是 :func:`cache_results` 修饰器。 """ +__all__ = [ + "cache_results", + "seq_len_to_mask" +] + import _pickle import inspect import os @@ -11,10 +16,6 @@ import numpy as np import torch import torch.nn as nn -__all__ = [ - "cache_results", - "seq_len_to_mask" -] _CheckRes = namedtuple('_CheckRes', ['missing', 'unused', 'duplicated', 'required', 'all_needed', 'varargs']) diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index 3d9598a3..43f590fd 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -1,12 +1,12 @@ +__all__ = [ + "Vocabulary" +] + from functools import wraps from collections import Counter from .dataset import DataSet -__all__ = [ - "Vocabulary" -] - def _check_build_vocab(func): """A decorator to make sure the indexing is built before used. @@ -322,7 +322,7 @@ class Vocabulary(object): :return str word: the word """ return self.idx2word[idx] - + def clear(self): """ 删除Vocabulary中的词表数据。相当于重新初始化一下。 @@ -333,7 +333,7 @@ class Vocabulary(object): self.word2idx = None self.idx2word = None self.rebuild = True - + def __getstate__(self): """Use to prepare data for pickle. 
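For readers following the vocabulary changes: the ``Vocabulary.clear`` method introduced in PATCH 6/8 (and re-touched above) resets the instance in place. A minimal usage sketch — ``add_word_lst`` and ``build_vocab`` are the standard fastNLP Vocabulary API, and repopulating after ``clear`` is an assumption based on the ``rebuild = True`` flag the method sets::

    from fastNLP import Vocabulary

    vocab = Vocabulary()
    vocab.add_word_lst(["apple", "banana", "banana"])
    vocab.build_vocab()          # builds word2idx / idx2word
    print(len(vocab))            # vocabulary size, incl. <pad> and <unk>

    vocab.clear()                # word_count emptied, index maps dropped, rebuild flag set
    vocab.add_word_lst(["cat", "dog"])
    vocab.build_vocab()          # assumed: the same instance can be repopulated from scratch
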
diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index 6ce7ebc3..c8d6a441 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -9,11 +9,6 @@ 这些类的使用方法如下: """ -from .embed_loader import EmbedLoader -from .dataset_loader import DataSetLoader, CSVLoader, JsonLoader, ConllLoader, SNLILoader, SSTLoader, \ - PeopleDailyCorpusLoader, Conll2003Loader -from .model_io import ModelLoader, ModelSaver - __all__ = [ 'EmbedLoader', @@ -29,3 +24,8 @@ __all__ = [ 'ModelLoader', 'ModelSaver', ] + +from .embed_loader import EmbedLoader +from .dataset_loader import DataSetLoader, CSVLoader, JsonLoader, ConllLoader, SNLILoader, SSTLoader, \ + PeopleDailyCorpusLoader, Conll2003Loader +from .model_io import ModelLoader, ModelSaver diff --git a/fastNLP/io/base_loader.py b/fastNLP/io/base_loader.py index 33f59fe5..4ab1e2d0 100644 --- a/fastNLP/io/base_loader.py +++ b/fastNLP/io/base_loader.py @@ -1,10 +1,10 @@ -import _pickle as pickle -import os - __all__ = [ "BaseLoader" ] +import _pickle as pickle +import os + class BaseLoader(object): """ diff --git a/fastNLP/io/config_io.py b/fastNLP/io/config_io.py index e67511ee..4acdbb96 100644 --- a/fastNLP/io/config_io.py +++ b/fastNLP/io/config_io.py @@ -3,18 +3,18 @@ .. todo:: 这个模块中的类可能被抛弃? """ -import configparser -import json -import os - -from .base_loader import BaseLoader - __all__ = [ "ConfigLoader", "ConfigSection", "ConfigSaver" ] +import configparser +import json +import os + +from .base_loader import BaseLoader + class ConfigLoader(BaseLoader): """ diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index a4b233ad..b820af44 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -10,12 +10,6 @@ dataset_loader模块实现了许多 DataSetLoader, 用于读取不同格式的 # ... do stuff """ -from nltk.tree import Tree - -from ..core.dataset import DataSet -from ..core.instance import Instance -from .file_reader import _read_csv, _read_json, _read_conll - __all__ = [ 'DataSetLoader', 'CSVLoader', @@ -27,6 +21,12 @@ __all__ = [ 'Conll2003Loader', ] +from nltk.tree import Tree + +from ..core.dataset import DataSet +from ..core.instance import Instance +from .file_reader import _read_csv, _read_json, _read_conll + def _download_from_url(url, path): try: diff --git a/fastNLP/io/embed_loader.py b/fastNLP/io/embed_loader.py index 7a845366..fb024e73 100644 --- a/fastNLP/io/embed_loader.py +++ b/fastNLP/io/embed_loader.py @@ -1,3 +1,7 @@ +__all__ = [ + "EmbedLoader" +] + import os import warnings @@ -6,10 +10,6 @@ import numpy as np from ..core.vocabulary import Vocabulary from .base_loader import BaseLoader -__all__ = [ - "EmbedLoader" -] - class EmbedLoader(BaseLoader): """ diff --git a/fastNLP/io/model_io.py b/fastNLP/io/model_io.py index 36393cd4..ffaa4ef5 100644 --- a/fastNLP/io/model_io.py +++ b/fastNLP/io/model_io.py @@ -1,15 +1,15 @@ """ 用于载入和保存模型 """ -import torch - -from .base_loader import BaseLoader - __all__ = [ "ModelLoader", "ModelSaver" ] +import torch + +from .base_loader import BaseLoader + class ModelLoader(BaseLoader): """ diff --git a/fastNLP/models/__init__.py b/fastNLP/models/__init__.py index f9ade153..14314049 100644 --- a/fastNLP/models/__init__.py +++ b/fastNLP/models/__init__.py @@ -7,15 +7,6 @@ fastNLP 在 :mod:`~fastNLP.models` 模块中内置了如 :class:`~fastNLP.models """ -from .base_model import BaseModel -from .bert import BertForMultipleChoice, BertForQuestionAnswering, BertForSequenceClassification, \ - BertForTokenClassification -from .biaffine_parser import BiaffineParser, GraphParser -from 
.cnn_text_classification import CNNText -from .sequence_labeling import SeqLabeling, AdvSeqLabel -from .snli import ESIM -from .star_transformer import StarTransEnc, STSeqCls, STNLICls, STSeqLabel - __all__ = [ "CNNText", @@ -32,3 +23,12 @@ __all__ = [ "BiaffineParser", "GraphParser" ] + +from .base_model import BaseModel +from .bert import BertForMultipleChoice, BertForQuestionAnswering, BertForSequenceClassification, \ + BertForTokenClassification +from .biaffine_parser import BiaffineParser, GraphParser +from .cnn_text_classification import CNNText +from .sequence_labeling import SeqLabeling, AdvSeqLabel +from .snli import ESIM +from .star_transformer import StarTransEnc, STSeqCls, STNLICls, STSeqLabel diff --git a/fastNLP/models/biaffine_parser.py b/fastNLP/models/biaffine_parser.py index 7f16202d..8533e7af 100644 --- a/fastNLP/models/biaffine_parser.py +++ b/fastNLP/models/biaffine_parser.py @@ -1,6 +1,11 @@ """ Biaffine Dependency Parser 的 Pytorch 实现. """ +__all__ = [ + "BiaffineParser", + "GraphParser" +] + import numpy as np import torch import torch.nn as nn @@ -19,11 +24,6 @@ from ..modules.utils import get_embeddings from .base_model import BaseModel from ..core.utils import seq_len_to_mask -__all__ = [ - "BiaffineParser", - "GraphParser" -] - def _mst(scores): """ diff --git a/fastNLP/models/cnn_text_classification.py b/fastNLP/models/cnn_text_classification.py index a9ccc568..3a71a80a 100644 --- a/fastNLP/models/cnn_text_classification.py +++ b/fastNLP/models/cnn_text_classification.py @@ -1,13 +1,13 @@ +__all__ = [ + "CNNText" +] + import torch import torch.nn as nn from ..core.const import Const as C from ..modules import encoder -__all__ = [ - "CNNText" -] - class CNNText(torch.nn.Module): """ diff --git a/fastNLP/models/enas_utils.py b/fastNLP/models/enas_utils.py index 68c170ed..4e402a9a 100644 --- a/fastNLP/models/enas_utils.py +++ b/fastNLP/models/enas_utils.py @@ -1,6 +1,5 @@ # Code Modified from https://github.com/carpedm20/ENAS-pytorch -from __future__ import print_function from collections import defaultdict import collections diff --git a/fastNLP/models/sequence_labeling.py b/fastNLP/models/sequence_labeling.py index 503c79ba..8e6a5db1 100644 --- a/fastNLP/models/sequence_labeling.py +++ b/fastNLP/models/sequence_labeling.py @@ -1,6 +1,11 @@ """ 本模块实现了两种序列标注模型 """ +__all__ = [ + "SeqLabeling", + "AdvSeqLabel" +] + import torch import torch.nn as nn @@ -10,11 +15,6 @@ from ..modules.decoder.crf import allowed_transitions from ..core.utils import seq_len_to_mask from ..core.const import Const as C -__all__ = [ - "SeqLabeling", - "AdvSeqLabel" -] - class SeqLabeling(BaseModel): """ diff --git a/fastNLP/models/snli.py b/fastNLP/models/snli.py index 606bcc42..395a9bbf 100644 --- a/fastNLP/models/snli.py +++ b/fastNLP/models/snli.py @@ -1,3 +1,7 @@ +__all__ = [ + "ESIM" +] + import torch import torch.nn as nn @@ -8,10 +12,6 @@ from ..modules import encoder as Encoder from ..modules import aggregator as Aggregator from ..core.utils import seq_len_to_mask -__all__ = [ - "ESIM" -] - my_inf = 10e12 diff --git a/fastNLP/models/star_transformer.py b/fastNLP/models/star_transformer.py index 2e55f7e4..c67e5938 100644 --- a/fastNLP/models/star_transformer.py +++ b/fastNLP/models/star_transformer.py @@ -1,6 +1,13 @@ """ Star-Transformer 的 Pytorch 实现。 """ +__all__ = [ + "StarTransEnc", + "STNLICls", + "STSeqCls", + "STSeqLabel", +] + import torch from torch import nn @@ -9,13 +16,6 @@ from ..core.utils import seq_len_to_mask from ..modules.utils import get_embeddings from 
..core.const import Const -__all__ = [ - "StarTransEnc", - "STNLICls", - "STSeqCls", - "STSeqLabel", -] - class StarTransEnc(nn.Module): """ diff --git a/fastNLP/modules/__init__.py b/fastNLP/modules/__init__.py index cd54c8db..194fda4e 100644 --- a/fastNLP/modules/__init__.py +++ b/fastNLP/modules/__init__.py @@ -22,15 +22,6 @@ +-----------------------+-----------------------+-----------------------+ """ -from . import aggregator -from . import decoder -from . import encoder -from .aggregator import * -from .decoder import * -from .dropout import TimestepDropout -from .encoder import * -from .utils import get_embeddings - __all__ = [ # "BertModel", "ConvolutionCharEncoder", @@ -54,3 +45,12 @@ __all__ = [ "viterbi_decode", "allowed_transitions", ] + +from . import aggregator +from . import decoder +from . import encoder +from .aggregator import * +from .decoder import * +from .dropout import TimestepDropout +from .encoder import * +from .utils import get_embeddings diff --git a/fastNLP/modules/aggregator/__init__.py b/fastNLP/modules/aggregator/__init__.py index 117dad83..a82138e7 100644 --- a/fastNLP/modules/aggregator/__init__.py +++ b/fastNLP/modules/aggregator/__init__.py @@ -1,10 +1,3 @@ -from .pooling import MaxPool -from .pooling import MaxPoolWithMask -from .pooling import AvgPool -from .pooling import AvgPoolWithMask - -from .attention import MultiHeadAttention - __all__ = [ "MaxPool", "MaxPoolWithMask", @@ -12,3 +5,10 @@ __all__ = [ "MultiHeadAttention", ] + +from .pooling import MaxPool +from .pooling import MaxPoolWithMask +from .pooling import AvgPool +from .pooling import AvgPoolWithMask + +from .attention import MultiHeadAttention diff --git a/fastNLP/modules/aggregator/attention.py b/fastNLP/modules/aggregator/attention.py index a1a7fda8..4101b033 100644 --- a/fastNLP/modules/aggregator/attention.py +++ b/fastNLP/modules/aggregator/attention.py @@ -1,3 +1,7 @@ +__all__ = [ + "MultiHeadAttention" +] + import math import torch @@ -8,10 +12,6 @@ from ..dropout import TimestepDropout from ..utils import initial_parameter -__all__ = [ - "MultiHeadAttention" -] - class DotAttention(nn.Module): """ diff --git a/fastNLP/modules/aggregator/pooling.py b/fastNLP/modules/aggregator/pooling.py index be454d7b..51438aae 100644 --- a/fastNLP/modules/aggregator/pooling.py +++ b/fastNLP/modules/aggregator/pooling.py @@ -1,4 +1,8 @@ -__all__ = ["MaxPool", "MaxPoolWithMask", "AvgPool"] +__all__ = [ + "MaxPool", + "MaxPoolWithMask", + "AvgPool" +] import torch import torch.nn as nn @@ -16,6 +20,7 @@ class MaxPool(nn.Module): :param kernel_size: max pooling的窗口大小,默认为tensor最后k维,其中k为dimension :param ceil_mode: """ + def __init__(self, stride=None, padding=0, dilation=1, dimension=1, kernel_size=None, ceil_mode=False): super(MaxPool, self).__init__() @@ -125,7 +130,7 @@ class AvgPoolWithMask(nn.Module): 给定形如[batch_size, max_len, hidden_size]的输入,在最后一维进行avg pooling. 
输出为[batch_size, hidden_size], pooling 的时候只会考虑mask为1的位置 """ - + def __init__(self): super(AvgPoolWithMask, self).__init__() self.inf = 10e12 diff --git a/fastNLP/modules/decoder/__init__.py b/fastNLP/modules/decoder/__init__.py index 5df48c43..664618b2 100644 --- a/fastNLP/modules/decoder/__init__.py +++ b/fastNLP/modules/decoder/__init__.py @@ -1,11 +1,11 @@ -from .crf import ConditionalRandomField -from .mlp import MLP -from .utils import viterbi_decode -from .crf import allowed_transitions - __all__ = [ "MLP", "ConditionalRandomField", "viterbi_decode", "allowed_transitions" ] + +from .crf import ConditionalRandomField +from .mlp import MLP +from .utils import viterbi_decode +from .crf import allowed_transitions diff --git a/fastNLP/modules/decoder/crf.py b/fastNLP/modules/decoder/crf.py index 130ed40e..beb2b9be 100644 --- a/fastNLP/modules/decoder/crf.py +++ b/fastNLP/modules/decoder/crf.py @@ -1,13 +1,13 @@ -import torch -from torch import nn - -from ..utils import initial_parameter - __all__ = [ "ConditionalRandomField", "allowed_transitions" ] +import torch +from torch import nn + +from ..utils import initial_parameter + def allowed_transitions(id2target, encoding_type='bio', include_start_end=True): """ diff --git a/fastNLP/modules/decoder/mlp.py b/fastNLP/modules/decoder/mlp.py index 27019432..c1579224 100644 --- a/fastNLP/modules/decoder/mlp.py +++ b/fastNLP/modules/decoder/mlp.py @@ -1,12 +1,12 @@ +__all__ = [ + "MLP" +] + import torch import torch.nn as nn from ..utils import initial_parameter -__all__ = [ - "MLP" -] - class MLP(nn.Module): """ diff --git a/fastNLP/modules/decoder/utils.py b/fastNLP/modules/decoder/utils.py index 434873c7..249f3ff6 100644 --- a/fastNLP/modules/decoder/utils.py +++ b/fastNLP/modules/decoder/utils.py @@ -1,8 +1,7 @@ -import torch - __all__ = [ "viterbi_decode" ] +import torch def viterbi_decode(logits, transitions, mask=None, unpad=False): diff --git a/fastNLP/modules/dropout.py b/fastNLP/modules/dropout.py index 34b426fd..1363165c 100644 --- a/fastNLP/modules/dropout.py +++ b/fastNLP/modules/dropout.py @@ -1,6 +1,8 @@ -import torch __all__ = [] +import torch + + class TimestepDropout(torch.nn.Dropout): """ 别名::class:`fastNLP.modules.TimestepDropout` @@ -8,7 +10,7 @@ class TimestepDropout(torch.nn.Dropout): 接受的参数shape为``[batch_size, num_timesteps, embedding_dim)]`` 使用同一个mask(shape为``(batch_size, embedding_dim)``) 在每个timestamp上做dropout。 """ - + def forward(self, x): dropout_mask = x.new_ones(x.shape[0], x.shape[-1]) torch.nn.functional.dropout(dropout_mask, self.p, self.training, inplace=True) diff --git a/fastNLP/modules/encoder/__init__.py b/fastNLP/modules/encoder/__init__.py index 3d65867a..bdc4cbf3 100644 --- a/fastNLP/modules/encoder/__init__.py +++ b/fastNLP/modules/encoder/__init__.py @@ -1,12 +1,3 @@ -from .bert import BertModel -from .char_encoder import ConvolutionCharEncoder, LSTMCharEncoder -from .conv_maxpool import ConvMaxpool -from .embedding import Embedding -from .lstm import LSTM -from .star_transformer import StarTransformer -from .transformer import TransformerEncoder -from .variational_rnn import VarRNN, VarLSTM, VarGRU - __all__ = [ # "BertModel", @@ -27,3 +18,11 @@ __all__ = [ "VarLSTM", "VarGRU" ] +from .bert import BertModel +from .char_encoder import ConvolutionCharEncoder, LSTMCharEncoder +from .conv_maxpool import ConvMaxpool +from .embedding import Embedding +from .lstm import LSTM +from .star_transformer import StarTransformer +from .transformer import TransformerEncoder +from .variational_rnn import VarRNN, VarLSTM, 
VarGRU diff --git a/fastNLP/modules/encoder/char_encoder.py b/fastNLP/modules/encoder/char_encoder.py index 8aefd284..481ad7ad 100644 --- a/fastNLP/modules/encoder/char_encoder.py +++ b/fastNLP/modules/encoder/char_encoder.py @@ -1,12 +1,11 @@ -import torch -import torch.nn as nn - -from ..utils import initial_parameter - __all__ = [ "ConvolutionCharEncoder", "LSTMCharEncoder" ] +import torch +import torch.nn as nn + +from ..utils import initial_parameter # from torch.nn.init import xavier_uniform diff --git a/fastNLP/modules/encoder/conv_maxpool.py b/fastNLP/modules/encoder/conv_maxpool.py index 5e714e88..ae6bea04 100644 --- a/fastNLP/modules/encoder/conv_maxpool.py +++ b/fastNLP/modules/encoder/conv_maxpool.py @@ -1,13 +1,12 @@ +__all__ = [ + "ConvMaxpool" +] import torch import torch.nn as nn import torch.nn.functional as F from ..utils import initial_parameter -__all__ = [ - "ConvMaxpool" -] - class ConvMaxpool(nn.Module): """ diff --git a/fastNLP/modules/encoder/embedding.py b/fastNLP/modules/encoder/embedding.py index 9fa89e7f..f3c1f475 100644 --- a/fastNLP/modules/encoder/embedding.py +++ b/fastNLP/modules/encoder/embedding.py @@ -1,9 +1,8 @@ -import torch.nn as nn -from ..utils import get_embeddings - __all__ = [ "Embedding" ] +import torch.nn as nn +from ..utils import get_embeddings class Embedding(nn.Embedding): diff --git a/fastNLP/modules/encoder/lstm.py b/fastNLP/modules/encoder/lstm.py index bc9cb155..b4f960e7 100644 --- a/fastNLP/modules/encoder/lstm.py +++ b/fastNLP/modules/encoder/lstm.py @@ -2,16 +2,16 @@ 轻量封装的 Pytorch LSTM 模块. 可在 forward 时传入序列的长度, 自动对padding做合适的处理. """ +__all__ = [ + "LSTM" +] + import torch import torch.nn as nn import torch.nn.utils.rnn as rnn from ..utils import initial_parameter -__all__ = [ - "LSTM" -] - class LSTM(nn.Module): """ diff --git a/fastNLP/modules/encoder/star_transformer.py b/fastNLP/modules/encoder/star_transformer.py index 677af48a..5a7f3d67 100644 --- a/fastNLP/modules/encoder/star_transformer.py +++ b/fastNLP/modules/encoder/star_transformer.py @@ -1,15 +1,15 @@ """ Star-Transformer 的encoder部分的 Pytorch 实现 """ +__all__ = [ + "StarTransformer" +] + import numpy as NP import torch from torch import nn from torch.nn import functional as F -__all__ = [ - "StarTransformer" -] - class StarTransformer(nn.Module): """ diff --git a/fastNLP/modules/encoder/transformer.py b/fastNLP/modules/encoder/transformer.py index 2532d90a..698ff95c 100644 --- a/fastNLP/modules/encoder/transformer.py +++ b/fastNLP/modules/encoder/transformer.py @@ -1,12 +1,11 @@ +__all__ = [ + "TransformerEncoder" +] from torch import nn from ..aggregator.attention import MultiHeadAttention from ..dropout import TimestepDropout -__all__ = [ - "TransformerEncoder" -] - class TransformerEncoder(nn.Module): """ diff --git a/fastNLP/modules/encoder/variational_rnn.py b/fastNLP/modules/encoder/variational_rnn.py index 753741de..29b728e5 100644 --- a/fastNLP/modules/encoder/variational_rnn.py +++ b/fastNLP/modules/encoder/variational_rnn.py @@ -1,6 +1,12 @@ """ Variational RNN 的 Pytorch 实现 """ +__all__ = [ + "VarRNN", + "VarLSTM", + "VarGRU" +] + import torch import torch.nn as nn from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence, pad_packed_sequence @@ -17,25 +23,19 @@ except ImportError: from ..utils import initial_parameter -__all__ = [ - "VarRNN", - "VarLSTM", - "VarGRU" -] - class VarRnnCellWrapper(nn.Module): """ Wrapper for normal RNN Cells, make it support variational dropout """ - + def __init__(self, cell, hidden_size, input_p, hidden_p): 
super(VarRnnCellWrapper, self).__init__() self.cell = cell self.hidden_size = hidden_size self.input_p = input_p self.hidden_p = hidden_p - + def forward(self, input_x, hidden, mask_x, mask_h, is_reversed=False): """ :param PackedSequence input_x: [seq_len, batch_size, input_size] @@ -47,13 +47,13 @@ class VarRnnCellWrapper(nn.Module): hidden: for LSTM, tuple of (h_n, c_n), [batch_size, hidden_size] for other RNN, h_n, [batch_size, hidden_size] """ - + def get_hi(hi, h0, size): h0_size = size - hi.size(0) if h0_size > 0: return torch.cat([hi, h0[:h0_size]], dim=0) return hi[:size] - + is_lstm = isinstance(hidden, tuple) input, batch_sizes = input_x.data, input_x.batch_sizes output = [] @@ -64,7 +64,7 @@ class VarRnnCellWrapper(nn.Module): else: batch_iter = batch_sizes idx = 0 - + if is_lstm: hn = (hidden[0].clone(), hidden[1].clone()) else: @@ -91,7 +91,7 @@ class VarRnnCellWrapper(nn.Module): hi = cell(input_i, hi) hn[:size] = hi output.append(hi) - + if is_reversed: output = list(reversed(output)) output = torch.cat(output, dim=0) @@ -117,7 +117,7 @@ class VarRNNBase(nn.Module): :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0 :param bidirectional: 若为 ``True``, 使用双向的RNN. Default: ``False`` """ - + def __init__(self, mode, Cell, input_size, hidden_size, num_layers=1, bias=True, batch_first=False, input_dropout=0, hidden_dropout=0, bidirectional=False): @@ -141,7 +141,7 @@ class VarRNNBase(nn.Module): cell, self.hidden_size, input_dropout, hidden_dropout)) initial_parameter(self) self.is_lstm = (self.mode == "LSTM") - + def _forward_one(self, n_layer, n_direction, input, hx, mask_x, mask_h): is_lstm = self.is_lstm idx = self.num_directions * n_layer + n_direction @@ -150,7 +150,7 @@ class VarRNNBase(nn.Module): output_x, hidden_x = cell( input, hi, mask_x, mask_h, is_reversed=(n_direction == 1)) return output_x, hidden_x - + def forward(self, x, hx=None): """ @@ -170,13 +170,13 @@ class VarRNNBase(nn.Module): else: max_batch_size = int(x.batch_sizes[0]) x, batch_sizes = x.data, x.batch_sizes - + if hx is None: hx = x.new_zeros(self.num_layers * self.num_directions, max_batch_size, self.hidden_size, requires_grad=True) if is_lstm: hx = (hx, hx.new_zeros(hx.size(), requires_grad=True)) - + mask_x = x.new_ones((max_batch_size, self.input_size)) mask_out = x.new_ones( (max_batch_size, self.hidden_size * self.num_directions)) @@ -185,7 +185,7 @@ class VarRNNBase(nn.Module): training=self.training, inplace=True) nn.functional.dropout(mask_out, p=self.hidden_dropout, training=self.training, inplace=True) - + hidden = x.new_zeros( (self.num_layers * self.num_directions, max_batch_size, self.hidden_size)) if is_lstm: @@ -207,16 +207,16 @@ class VarRNNBase(nn.Module): else: hidden[idx] = hidden_x x = torch.cat(output_list, dim=-1) - + if is_lstm: hidden = (hidden, cellstate) - + if is_packed: output = PackedSequence(x, batch_sizes) else: x = PackedSequence(x, batch_sizes) output, _ = pad_packed_sequence(x, batch_first=self.batch_first) - + return output, hidden @@ -236,11 +236,11 @@ class VarLSTM(VarRNNBase): :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0 :param bidirectional: 若为 ``True``, 使用双向的LSTM. Default: ``False`` """ - + def __init__(self, *args, **kwargs): super(VarLSTM, self).__init__( mode="LSTM", Cell=nn.LSTMCell, *args, **kwargs) - + def forward(self, x, hx=None): return super(VarLSTM, self).forward(x, hx) @@ -261,11 +261,11 @@ class VarRNN(VarRNNBase): :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0 :param bidirectional: 若为 ``True``, 使用双向的RNN. 
Default: ``False`` """ - + def __init__(self, *args, **kwargs): super(VarRNN, self).__init__( mode="RNN", Cell=nn.RNNCell, *args, **kwargs) - + def forward(self, x, hx=None): return super(VarRNN, self).forward(x, hx) @@ -286,10 +286,10 @@ class VarGRU(VarRNNBase): :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0 :param bidirectional: 若为 ``True``, 使用双向的GRU. Default: ``False`` """ - + def __init__(self, *args, **kwargs): super(VarGRU, self).__init__( mode="GRU", Cell=nn.GRUCell, *args, **kwargs) - + def forward(self, x, hx=None): return super(VarGRU, self).forward(x, hx) diff --git a/fastNLP/modules/utils.py b/fastNLP/modules/utils.py index 0aba7e62..c9a1f682 100644 --- a/fastNLP/modules/utils.py +++ b/fastNLP/modules/utils.py @@ -1,5 +1,5 @@ from functools import reduce -from collections import OrderedDict + import numpy as np import torch import torch.nn as nn From a6f60d8fead5b8d76721287f9dd53bed0e1dc3e3 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Sun, 19 May 2019 19:03:43 +0800 Subject: [PATCH 8/8] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E4=BA=86cache=5Fresults?= =?UTF-8?q?=E7=9A=84=E4=BE=8B=E5=AD=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/utils.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index 14ac409f..518c8213 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -45,26 +45,28 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1): import time import numpy as np from fastNLP import cache_results - + @cache_results('cache.pkl') def process_data(): # 一些比较耗时的工作,比如读取数据,预处理数据等,这里用time.sleep()代替耗时 time.sleep(1) - return np.random.randint(5, size=(10, 20)) - + return np.random.randint(10, size=(5,)) + start_time = time.time() - process_data() + print("res =",process_data()) print(time.time() - start_time) - + start_time = time.time() - process_data() + print("res =",process_data()) print(time.time() - start_time) - - # 输出内容如下 - # Save cache to cache.pkl. - # 1.0015439987182617 - # Read cache from cache.pkl. - # 0.00013065338134765625 + + # 输出内容如下,可以看到两次结果相同,且第二次几乎没有花费时间 + # Save cache to cache.pkl. + # res = [5 4 9 1 8] + # 1.0042750835418701 + # Read cache from cache.pkl. + # res = [5 4 9 1 8] + # 0.0040721893310546875 可以看到第二次运行的时候,只用了0.0001s左右,是由于第二次运行将直接从cache.pkl这个文件读取数据,而不会经过再次预处理
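
To complement the updated example, a short sketch of refreshing the cache at call time; the ``_refresh=True`` keyword override is an assumption about the decorator's keyword handling and is not shown in this diff::

    import time
    from fastNLP import cache_results

    @cache_results('cache.pkl')
    def process_data():
        time.sleep(1)                       # stands in for expensive preprocessing
        return [1, 2, 3]

    data = process_data()                   # first run: computes the result, saves cache.pkl
    data = process_data()                   # second run: loads cache.pkl, near-instant
    data = process_data(_refresh=True)      # assumed: force recomputation, overwrite the cache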