From 9c078500198e550d72a8b13eb8206aed82a18803 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Thu, 16 May 2019 20:32:10 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86=20models=20?= =?UTF-8?q?=E9=83=A8=E5=88=86=20import=20=E7=9A=84=E9=A1=BA=E5=BA=8F?= =?UTF-8?q?=EF=BC=8C=5F=5Fall=5F=5F=20=E6=9A=B4=E9=9C=B2=E7=9A=84=E5=86=85?= =?UTF-8?q?=E5=AE=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/source/fastNLP.models.base_model.rst | 7 - docs/source/fastNLP.models.bert.rst | 7 - .../source/fastNLP.models.enas_controller.rst | 7 - docs/source/fastNLP.models.enas_model.rst | 7 - docs/source/fastNLP.models.enas_trainer.rst | 7 - docs/source/fastNLP.models.enas_utils.rst | 7 - docs/source/fastNLP.models.rst | 6 - fastNLP/core/batch.py | 4 +- fastNLP/core/callback.py | 1 + fastNLP/core/dataset.py | 3 +- fastNLP/core/field.py | 4 +- fastNLP/core/losses.py | 4 +- fastNLP/core/metrics.py | 4 +- fastNLP/core/predictor.py | 4 +- fastNLP/core/sampler.py | 4 +- fastNLP/core/tester.py | 1 + fastNLP/core/trainer.py | 6 +- fastNLP/core/utils.py | 9 +- fastNLP/io/__init__.py | 9 +- fastNLP/io/base_loader.py | 19 ++- fastNLP/io/config_io.py | 64 ++++--- fastNLP/io/dataset_loader.py | 11 +- fastNLP/io/embed_loader.py | 56 +++--- fastNLP/io/model_io.py | 15 +- fastNLP/models/__init__.py | 20 ++- fastNLP/models/base_model.py | 10 +- fastNLP/models/biaffine_parser.py | 159 ++++++++++-------- fastNLP/models/cnn_text_classification.py | 17 +- fastNLP/models/enas_controller.py | 1 + fastNLP/models/enas_model.py | 139 +++++++-------- fastNLP/models/enas_trainer.py | 141 ++++++++-------- fastNLP/models/enas_utils.py | 2 - fastNLP/models/sequence_labeling.py | 10 +- fastNLP/models/snli.py | 61 +++---- fastNLP/models/star_transformer.py | 67 +++++--- 35 files changed, 465 insertions(+), 428 deletions(-) delete mode 100644 docs/source/fastNLP.models.base_model.rst delete mode 100644 docs/source/fastNLP.models.bert.rst delete mode 100644 docs/source/fastNLP.models.enas_controller.rst delete mode 100644 docs/source/fastNLP.models.enas_model.rst delete mode 100644 docs/source/fastNLP.models.enas_trainer.rst delete mode 100644 docs/source/fastNLP.models.enas_utils.rst diff --git a/docs/source/fastNLP.models.base_model.rst b/docs/source/fastNLP.models.base_model.rst deleted file mode 100644 index e1d4d64f..00000000 --- a/docs/source/fastNLP.models.base_model.rst +++ /dev/null @@ -1,7 +0,0 @@ -fastNLP.models.base\_model -========================== - -.. automodule:: fastNLP.models.base_model - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.models.bert.rst b/docs/source/fastNLP.models.bert.rst deleted file mode 100644 index bba323df..00000000 --- a/docs/source/fastNLP.models.bert.rst +++ /dev/null @@ -1,7 +0,0 @@ -fastNLP.models.bert -=================== - -.. automodule:: fastNLP.models.bert - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.models.enas_controller.rst b/docs/source/fastNLP.models.enas_controller.rst deleted file mode 100644 index 28655bd7..00000000 --- a/docs/source/fastNLP.models.enas_controller.rst +++ /dev/null @@ -1,7 +0,0 @@ -fastNLP.models.enas\_controller -=============================== - -.. 
automodule:: fastNLP.models.enas_controller - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.models.enas_model.rst b/docs/source/fastNLP.models.enas_model.rst deleted file mode 100644 index 35fbe495..00000000 --- a/docs/source/fastNLP.models.enas_model.rst +++ /dev/null @@ -1,7 +0,0 @@ -fastNLP.models.enas\_model -========================== - -.. automodule:: fastNLP.models.enas_model - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.models.enas_trainer.rst b/docs/source/fastNLP.models.enas_trainer.rst deleted file mode 100644 index 7e0ef462..00000000 --- a/docs/source/fastNLP.models.enas_trainer.rst +++ /dev/null @@ -1,7 +0,0 @@ -fastNLP.models.enas\_trainer -============================ - -.. automodule:: fastNLP.models.enas_trainer - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.models.enas_utils.rst b/docs/source/fastNLP.models.enas_utils.rst deleted file mode 100644 index 0a049706..00000000 --- a/docs/source/fastNLP.models.enas_utils.rst +++ /dev/null @@ -1,7 +0,0 @@ -fastNLP.models.enas\_utils -========================== - -.. automodule:: fastNLP.models.enas_utils - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.models.rst b/docs/source/fastNLP.models.rst index 57592bf4..5858ebcd 100644 --- a/docs/source/fastNLP.models.rst +++ b/docs/source/fastNLP.models.rst @@ -12,14 +12,8 @@ fastNLP.models .. toctree:: :titlesonly: - fastNLP.models.base_model - fastNLP.models.bert fastNLP.models.biaffine_parser fastNLP.models.cnn_text_classification - fastNLP.models.enas_controller - fastNLP.models.enas_model - fastNLP.models.enas_trainer - fastNLP.models.enas_utils fastNLP.models.sequence_labeling fastNLP.models.snli fastNLP.models.star_transformer diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py index 90f0fc8c..b031d051 100644 --- a/fastNLP/core/batch.py +++ b/fastNLP/core/batch.py @@ -3,12 +3,12 @@ batch 模块实现了 fastNLP 所需的 Batch 类。 """ import atexit +from queue import Empty, Full + import numpy as np import torch import torch.multiprocessing as mp -from queue import Empty, Full - from .sampler import RandomSampler __all__ = [ diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 0a5ddc52..51495f23 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -50,6 +50,7 @@ callback模块实现了 fastNLP 中的许多 callback 类,用于增强 :class: """ import os + import torch try: diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 63f66019..f20dd1f8 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -273,9 +273,10 @@ """ import _pickle as pickle -import numpy as np import warnings +import numpy as np + from .field import AutoPadder from .field import FieldArray from .instance import Instance diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index 4029a4ca..14e2538d 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -3,10 +3,10 @@ field模块实现了 FieldArray 和若干 Padder。 FieldArray 是 :class:`~fas 原理部分请参考 :doc:`fastNLP.core.dataset` """ -import numpy as np - from copy import deepcopy +import numpy as np + __all__ = [ "FieldArray", "Padder", diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index b98c5ac7..797b557d 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -3,11 +3,11 @@ losses 模块定义了 fastNLP 中所需的各种损失函数,一般做为 :cl """ import inspect +from collections import defaultdict + import torch import torch.nn.functional as F -from collections import defaultdict - from 
.utils import _CheckError from .utils import _CheckRes from .utils import _build_args diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index df85a318..5ea2a5f1 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -3,11 +3,11 @@ metrics 模块实现了 fastNLP 所需的各种常用衡量指标,一般做为 """ import inspect +from collections import defaultdict + import numpy as np import torch -from collections import defaultdict - from .utils import _CheckError from .utils import _CheckRes from .utils import _build_args diff --git a/fastNLP/core/predictor.py b/fastNLP/core/predictor.py index a9ef7924..4f37e105 100644 --- a/fastNLP/core/predictor.py +++ b/fastNLP/core/predictor.py @@ -2,10 +2,10 @@ ..todo:: 检查这个类是否需要 """ -import torch - from collections import defaultdict +import torch + from . import Batch from . import DataSet from . import SequentialSampler diff --git a/fastNLP/core/sampler.py b/fastNLP/core/sampler.py index 0900e733..c8577722 100644 --- a/fastNLP/core/sampler.py +++ b/fastNLP/core/sampler.py @@ -1,10 +1,10 @@ """ sampler 子类实现了 fastNLP 所需的各种采样器。 """ -import numpy as np - from itertools import chain +import numpy as np + __all__ = [ "Sampler", "BucketSampler", diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index 47aef46e..883e0d01 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -33,6 +33,7 @@ Tester在验证进行之前会调用model.eval()提示当前进入了evaluation """ import warnings + import torch import torch.nn as nn diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 87d57f12..7efa5d28 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -297,13 +297,13 @@ Example2.3 """ import os -import numpy as np import time +from datetime import datetime, timedelta + +import numpy as np import torch import torch.nn as nn -from datetime import datetime, timedelta - try: from tqdm.auto import tqdm except: diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index a7ad3326..6e2f99ff 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -3,14 +3,13 @@ utils模块实现了 fastNLP 内部和外部所需的很多工具。其中用户 """ import _pickle import inspect -import numpy as np import os -import torch -import torch.nn as nn import warnings +from collections import Counter, namedtuple -from collections import Counter -from collections import namedtuple +import numpy as np +import torch +import torch.nn as nn __all__ = [ "cache_results", diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index 3baf878c..6ce7ebc3 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -9,6 +9,11 @@ 这些类的使用方法如下: """ +from .embed_loader import EmbedLoader +from .dataset_loader import DataSetLoader, CSVLoader, JsonLoader, ConllLoader, SNLILoader, SSTLoader, \ + PeopleDailyCorpusLoader, Conll2003Loader +from .model_io import ModelLoader, ModelSaver + __all__ = [ 'EmbedLoader', @@ -24,7 +29,3 @@ __all__ = [ 'ModelLoader', 'ModelSaver', ] -from .embed_loader import EmbedLoader -from .dataset_loader import DataSetLoader, CSVLoader, JsonLoader, ConllLoader, SNLILoader, SSTLoader, \ - PeopleDailyCorpusLoader, Conll2003Loader -from .model_io import ModelLoader as ModelLoader, ModelSaver as ModelSaver \ No newline at end of file diff --git a/fastNLP/io/base_loader.py b/fastNLP/io/base_loader.py index 051de281..33f59fe5 100644 --- a/fastNLP/io/base_loader.py +++ b/fastNLP/io/base_loader.py @@ -1,15 +1,20 @@ import _pickle as pickle import os +__all__ = [ + "BaseLoader" +] + class BaseLoader(object): """ 各个 Loader 的基类,提供了 API 的参考。 """ + def __init__(self): super(BaseLoader, 
self).__init__() - + @staticmethod def load_lines(data_path): """ @@ -20,7 +25,7 @@ class BaseLoader(object): with open(data_path, "r", encoding="utf=8") as f: text = f.readlines() return [line.strip() for line in text] - + @classmethod def load(cls, data_path): """ @@ -31,7 +36,7 @@ class BaseLoader(object): with open(data_path, "r", encoding="utf-8") as f: text = f.readlines() return [[word for word in sent.strip()] for sent in text] - + @classmethod def load_with_cache(cls, data_path, cache_path): """缓存版的load @@ -48,16 +53,18 @@ class BaseLoader(object): class DataLoaderRegister: _readers = {} - + @classmethod def set_reader(cls, reader_cls, read_fn_name): # def wrapper(reader_cls): if read_fn_name in cls._readers: - raise KeyError('duplicate reader: {} and {} for read_func: {}'.format(cls._readers[read_fn_name], reader_cls, read_fn_name)) + raise KeyError( + 'duplicate reader: {} and {} for read_func: {}'.format(cls._readers[read_fn_name], reader_cls, + read_fn_name)) if hasattr(reader_cls, 'load'): cls._readers[read_fn_name] = reader_cls().load return reader_cls - + @classmethod def get_reader(cls, read_fn_name): if read_fn_name in cls._readers: diff --git a/fastNLP/io/config_io.py b/fastNLP/io/config_io.py index 8fa30dd4..e67511ee 100644 --- a/fastNLP/io/config_io.py +++ b/fastNLP/io/config_io.py @@ -1,14 +1,20 @@ """ - 用于读入和处理和保存 config 文件 + .. todo:: + 这个模块中的类可能被抛弃? """ -__all__ = ["ConfigLoader","ConfigSection","ConfigSaver"] import configparser import json import os from .base_loader import BaseLoader +__all__ = [ + "ConfigLoader", + "ConfigSection", + "ConfigSaver" +] + class ConfigLoader(BaseLoader): """ @@ -19,15 +25,16 @@ class ConfigLoader(BaseLoader): :param str data_path: 配置文件的路径 """ + def __init__(self, data_path=None): super(ConfigLoader, self).__init__() if data_path is not None: self.config = self.parse(super(ConfigLoader, self).load(data_path)) - + @staticmethod def parse(string): raise NotImplementedError - + @staticmethod def load_config(file_path, sections): """ @@ -81,10 +88,10 @@ class ConfigSection(object): ConfigSection是一个存储了一个section中所有键值对的数据结构,推荐使用此类的实例来配合 :meth:`ConfigLoader.load_config` 使用 """ - + def __init__(self): super(ConfigSection, self).__init__() - + def __getitem__(self, key): """ :param key: str, the name of the attribute @@ -97,7 +104,7 @@ class ConfigSection(object): if key in self.__dict__.keys(): return getattr(self, key) raise AttributeError("do NOT have attribute %s" % key) - + def __setitem__(self, key, value): """ :param key: str, the name of the attribute @@ -112,14 +119,14 @@ class ConfigSection(object): raise AttributeError("attr %s except %s but got %s" % (key, str(type(getattr(self, key))), str(type(value)))) setattr(self, key, value) - + def __contains__(self, item): """ :param item: The key of item. :return: True if the key in self.__dict__.keys() else False. 
""" return item in self.__dict__.keys() - + def __eq__(self, other): """Overwrite the == operator @@ -131,15 +138,15 @@ class ConfigSection(object): return False if getattr(self, k) != getattr(self, k): return False - + for k in other.__dict__.keys(): if k not in self.__dict__.keys(): return False if getattr(self, k) != getattr(self, k): return False - + return True - + def __ne__(self, other): """Overwrite the != operator @@ -147,7 +154,7 @@ class ConfigSection(object): :return: """ return not self.__eq__(other) - + @property def data(self): return self.__dict__ @@ -162,11 +169,12 @@ class ConfigSaver(object): :param str file_path: 配置文件的路径 """ + def __init__(self, file_path): self.file_path = file_path if not os.path.exists(self.file_path): raise FileNotFoundError("file {} NOT found!".__format__(self.file_path)) - + def _get_section(self, sect_name): """ This is the function to get the section with the section name. @@ -177,7 +185,7 @@ class ConfigSaver(object): sect = ConfigSection() ConfigLoader().load_config(self.file_path, {sect_name: sect}) return sect - + def _read_section(self): """ This is the function to read sections from the config file. @@ -187,16 +195,16 @@ class ConfigSaver(object): sect_key_list: A list of names in sect_list. """ sect_name = None - + sect_list = {} sect_key_list = [] - + single_section = {} single_section_key = [] - + with open(self.file_path, 'r') as f: lines = f.readlines() - + for line in lines: if line.startswith('[') and line.endswith(']\n'): if sect_name is None: @@ -208,29 +216,29 @@ class ConfigSaver(object): sect_key_list.append(sect_name) sect_name = line[1: -2] continue - + if line.startswith('#'): single_section[line] = '#' single_section_key.append(line) continue - + if line.startswith('\n'): single_section_key.append('\n') continue - + if '=' not in line: raise RuntimeError("can NOT load config file {}".__format__(self.file_path)) - + key = line.split('=', maxsplit=1)[0].strip() value = line.split('=', maxsplit=1)[1].strip() + '\n' single_section[key] = value single_section_key.append(key) - + if sect_name is not None: sect_list[sect_name] = single_section, single_section_key sect_key_list.append(sect_name) return sect_list, sect_key_list - + def _write_section(self, sect_list, sect_key_list): """ This is the function to write config file with section list and name list. @@ -252,7 +260,7 @@ class ConfigSaver(object): continue f.write(key + ' = ' + single_section[key]) f.write('\n') - + def save_config_file(self, section_name, section): """ 这个方法可以用来修改并保存配置文件中单独的一个 section @@ -284,11 +292,11 @@ class ConfigSaver(object): break if not change_file: return - + sect_list, sect_key_list = self._read_section() if section_name not in sect_key_list: raise AttributeError() - + sect, sect_key = sect_list[section_name] for k in section.__dict__.keys(): if k not in sect_key: diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index 3cd475a5..a4b233ad 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -10,6 +10,12 @@ dataset_loader模块实现了许多 DataSetLoader, 用于读取不同格式的 # ... 
do stuff """ +from nltk.tree import Tree + +from ..core.dataset import DataSet +from ..core.instance import Instance +from .file_reader import _read_csv, _read_json, _read_conll + __all__ = [ 'DataSetLoader', 'CSVLoader', @@ -20,11 +26,6 @@ __all__ = [ 'PeopleDailyCorpusLoader', 'Conll2003Loader', ] -from nltk.tree import Tree - -from ..core.dataset import DataSet -from ..core.instance import Instance -from .file_reader import _read_csv, _read_json, _read_conll def _download_from_url(url, path): diff --git a/fastNLP/io/embed_loader.py b/fastNLP/io/embed_loader.py index 9f3a73dd..7a845366 100644 --- a/fastNLP/io/embed_loader.py +++ b/fastNLP/io/embed_loader.py @@ -1,11 +1,15 @@ import os +import warnings import numpy as np from ..core.vocabulary import Vocabulary from .base_loader import BaseLoader -import warnings +__all__ = [ + "EmbedLoader" +] + class EmbedLoader(BaseLoader): """ @@ -13,10 +17,10 @@ class EmbedLoader(BaseLoader): 用于读取预训练的embedding, 读取结果可直接载入为模型参数。 """ - + def __init__(self): super(EmbedLoader, self).__init__() - + @staticmethod def load_with_vocab(embed_filepath, vocab, dtype=np.float32, normalize=True, error='ignore'): """ @@ -40,11 +44,11 @@ class EmbedLoader(BaseLoader): line = f.readline().strip() parts = line.split() start_idx = 0 - if len(parts)==2: + if len(parts) == 2: dim = int(parts[1]) start_idx += 1 else: - dim = len(parts)-1 + dim = len(parts) - 1 f.seek(0) matrix = np.random.randn(len(vocab), dim).astype(dtype) for idx, line in enumerate(f, start_idx): @@ -63,21 +67,21 @@ class EmbedLoader(BaseLoader): total_hits = sum(hit_flags) print("Found {} out of {} words in the pre-training embedding.".format(total_hits, len(vocab))) found_vectors = matrix[hit_flags] - if len(found_vectors)!=0: + if len(found_vectors) != 0: mean = np.mean(found_vectors, axis=0, keepdims=True) std = np.std(found_vectors, axis=0, keepdims=True) unfound_vec_num = len(vocab) - total_hits - r_vecs = np.random.randn(unfound_vec_num, dim).astype(dtype)*std + mean - matrix[hit_flags==False] = r_vecs - + r_vecs = np.random.randn(unfound_vec_num, dim).astype(dtype) * std + mean + matrix[hit_flags == False] = r_vecs + if normalize: matrix /= np.linalg.norm(matrix, axis=1, keepdims=True) - + return matrix - + @staticmethod def load_without_vocab(embed_filepath, dtype=np.float32, padding='', unknown='', normalize=True, - error='ignore'): + error='ignore'): """ 从embed_filepath中读取预训练的word vector。根据预训练的词表读取embedding并生成一个对应的Vocabulary。 @@ -96,35 +100,35 @@ class EmbedLoader(BaseLoader): vec_dict = {} found_unknown = False found_pad = False - + with open(embed_filepath, 'r', encoding='utf-8') as f: line = f.readline() start = 1 dim = -1 - if len(line.strip().split())!=2: + if len(line.strip().split()) != 2: f.seek(0) start = 0 for idx, line in enumerate(f, start=start): try: parts = line.strip().split() word = parts[0] - if dim==-1: - dim = len(parts)-1 + if dim == -1: + dim = len(parts) - 1 vec = np.fromstring(' '.join(parts[1:]), sep=' ', dtype=dtype, count=dim) vec_dict[word] = vec vocab.add_word(word) - if unknown is not None and unknown==word: + if unknown is not None and unknown == word: found_unknown = True - if found_pad is not None and padding==word: + if found_pad is not None and padding == word: found_pad = True except Exception as e: - if error=='ignore': + if error == 'ignore': warnings.warn("Error occurred at the {} line.".format(idx)) pass else: print("Error occurred at the {} line.".format(idx)) raise e - if dim==-1: + if dim == -1: raise RuntimeError("{} is an empty 
file.".format(embed_filepath)) matrix = np.random.randn(len(vocab), dim).astype(dtype) if (unknown is not None and not found_unknown) or (padding is not None and not found_pad): @@ -133,19 +137,19 @@ class EmbedLoader(BaseLoader): start_idx += 1 if unknown is not None: start_idx += 1 - + mean = np.mean(matrix[start_idx:], axis=0, keepdims=True) std = np.std(matrix[start_idx:], axis=0, keepdims=True) if (unknown is not None and not found_unknown): - matrix[start_idx-1] = np.random.randn(1, dim).astype(dtype)*std + mean + matrix[start_idx - 1] = np.random.randn(1, dim).astype(dtype) * std + mean if (padding is not None and not found_pad): - matrix[0] = np.random.randn(1, dim).astype(dtype)*std + mean - + matrix[0] = np.random.randn(1, dim).astype(dtype) * std + mean + for key, vec in vec_dict.items(): index = vocab.to_index(key) matrix[index] = vec - + if normalize: matrix /= np.linalg.norm(matrix, axis=1, keepdims=True) - + return matrix, vocab diff --git a/fastNLP/io/model_io.py b/fastNLP/io/model_io.py index 48e53ab3..36393cd4 100644 --- a/fastNLP/io/model_io.py +++ b/fastNLP/io/model_io.py @@ -5,6 +5,11 @@ import torch from .base_loader import BaseLoader +__all__ = [ + "ModelLoader", + "ModelSaver" +] + class ModelLoader(BaseLoader): """ @@ -12,10 +17,10 @@ class ModelLoader(BaseLoader): 用于读取模型 """ - + def __init__(self): super(ModelLoader, self).__init__() - + @staticmethod def load_pytorch(empty_model, model_path): """ @@ -25,7 +30,7 @@ class ModelLoader(BaseLoader): :param str model_path: 模型保存的路径 """ empty_model.load_state_dict(torch.load(model_path)) - + @staticmethod def load_pytorch_model(model_path): """ @@ -48,14 +53,14 @@ class ModelSaver(object): saver.save_pytorch(model) """ - + def __init__(self, save_path): """ :param save_path: 模型保存的路径 """ self.save_path = save_path - + def save_pytorch(self, model, param_only=True): """ 把 PyTorch 模型存入 ".pkl" 文件 diff --git a/fastNLP/models/__init__.py b/fastNLP/models/__init__.py index 66af3a46..f9ade153 100644 --- a/fastNLP/models/__init__.py +++ b/fastNLP/models/__init__.py @@ -7,7 +7,6 @@ fastNLP 在 :mod:`~fastNLP.models` 模块中内置了如 :class:`~fastNLP.models """ -__all__ = ["CNNText", "SeqLabeling", "ESIM", "STSeqLabel", "AdvSeqLabel", "STNLICls", "STSeqCls"] from .base_model import BaseModel from .bert import BertForMultipleChoice, BertForQuestionAnswering, BertForSequenceClassification, \ BertForTokenClassification @@ -15,4 +14,21 @@ from .biaffine_parser import BiaffineParser, GraphParser from .cnn_text_classification import CNNText from .sequence_labeling import SeqLabeling, AdvSeqLabel from .snli import ESIM -from .star_transformer import STSeqCls, STNLICls, STSeqLabel +from .star_transformer import StarTransEnc, STSeqCls, STNLICls, STSeqLabel + +__all__ = [ + "CNNText", + + "SeqLabeling", + "AdvSeqLabel", + + "ESIM", + + "StarTransEnc", + "STSeqLabel", + "STNLICls", + "STSeqCls", + + "BiaffineParser", + "GraphParser" +] diff --git a/fastNLP/models/base_model.py b/fastNLP/models/base_model.py index 39ac99a0..d27f1d21 100644 --- a/fastNLP/models/base_model.py +++ b/fastNLP/models/base_model.py @@ -6,13 +6,13 @@ from ..modules.decoder.MLP import MLP class BaseModel(torch.nn.Module): """Base PyTorch model for all models. 
""" - + def __init__(self): super(BaseModel, self).__init__() - + def fit(self, train_data, dev_data=None, **train_args): pass - + def predict(self, *args, **kwargs): raise NotImplementedError @@ -21,9 +21,9 @@ class NaiveClassifier(BaseModel): def __init__(self, in_feature_dim, out_feature_dim): super(NaiveClassifier, self).__init__() self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim]) - + def forward(self, x): return {"predict": torch.sigmoid(self.mlp(x))} - + def predict(self, x): return {"predict": torch.sigmoid(self.mlp(x)) > 0.5} diff --git a/fastNLP/models/biaffine_parser.py b/fastNLP/models/biaffine_parser.py index 100bfb72..7f16202d 100644 --- a/fastNLP/models/biaffine_parser.py +++ b/fastNLP/models/biaffine_parser.py @@ -1,11 +1,12 @@ -"""Biaffine Dependency Parser 的 Pytorch 实现. """ -from collections import defaultdict - +Biaffine Dependency Parser 的 Pytorch 实现. +""" import numpy as np import torch -from torch import nn -from torch.nn import functional as F +import torch.nn as nn +import torch.nn.functional as F + +from collections import defaultdict from ..core.const import Const as C from ..core.losses import LossFunc @@ -18,6 +19,12 @@ from ..modules.utils import get_embeddings from .base_model import BaseModel from ..core.utils import seq_len_to_mask +__all__ = [ + "BiaffineParser", + "GraphParser" +] + + def _mst(scores): """ with some modification to support parser output for MST decoding @@ -44,7 +51,7 @@ def _mst(scores): scores[roots, new_heads] / root_scores)] heads[roots] = new_heads heads[new_root] = 0 - + edges = defaultdict(set) vertices = set((0,)) for dep, head in enumerate(heads[tokens]): @@ -73,7 +80,7 @@ def _mst(scores): heads[changed_cycle] = new_head edges[new_head].add(changed_cycle) edges[old_head].remove(changed_cycle) - + return heads @@ -88,7 +95,7 @@ def _find_cycle(vertices, edges): _lowlinks = {} _onstack = defaultdict(lambda: False) _SCCs = [] - + def _strongconnect(v): nonlocal _index _indices[v] = _index @@ -96,28 +103,28 @@ def _find_cycle(vertices, edges): _index += 1 _stack.append(v) _onstack[v] = True - + for w in edges[v]: if w not in _indices: _strongconnect(w) _lowlinks[v] = min(_lowlinks[v], _lowlinks[w]) elif _onstack[w]: _lowlinks[v] = min(_lowlinks[v], _indices[w]) - + if _lowlinks[v] == _indices[v]: SCC = set() while True: w = _stack.pop() _onstack[w] = False SCC.add(w) - if not(w != v): + if not (w != v): break _SCCs.append(SCC) - + for v in vertices: if v not in _indices: _strongconnect(v) - + return [SCC for SCC in _SCCs if len(SCC) > 1] @@ -125,9 +132,10 @@ class GraphParser(BaseModel): """ 基于图的parser base class, 支持贪婪解码和最大生成树解码 """ + def __init__(self): super(GraphParser, self).__init__() - + @staticmethod def greedy_decoder(arc_matrix, mask=None): """ @@ -146,7 +154,7 @@ class GraphParser(BaseModel): if mask is not None: heads *= mask.long() return heads - + @staticmethod def mst_decoder(arc_matrix, mask=None): """ @@ -176,6 +184,7 @@ class ArcBiaffine(nn.Module): :param hidden_size: 输入的特征维度 :param bias: 是否使用bias. Default: ``True`` """ + def __init__(self, hidden_size, bias=True): super(ArcBiaffine, self).__init__() self.U = nn.Parameter(torch.Tensor(hidden_size, hidden_size), requires_grad=True) @@ -185,7 +194,7 @@ class ArcBiaffine(nn.Module): else: self.register_parameter("bias", None) initial_parameter(self) - + def forward(self, head, dep): """ @@ -209,11 +218,12 @@ class LabelBilinear(nn.Module): :param num_label: 边类别的个数 :param bias: 是否使用bias. 
Default: ``True`` """ + def __init__(self, in1_features, in2_features, num_label, bias=True): super(LabelBilinear, self).__init__() self.bilinear = nn.Bilinear(in1_features, in2_features, num_label, bias=bias) self.lin = nn.Linear(in1_features + in2_features, num_label, bias=False) - + def forward(self, x1, x2): """ @@ -225,13 +235,13 @@ class LabelBilinear(nn.Module): output += self.lin(torch.cat([x1, x2], dim=2)) return output + class BiaffineParser(GraphParser): """ 别名::class:`fastNLP.models.BiaffineParser` :class:`fastNLP.models.baffine_parser.BiaffineParser` Biaffine Dependency Parser 实现. - 论文参考 ` Deep Biaffine Attention for Neural Dependency Parsing (Dozat and Manning, 2016) - `_ . + 论文参考 `Deep Biaffine Attention for Neural Dependency Parsing (Dozat and Manning, 2016) `_ . :param init_embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 embedding的大小和每个词的维度. 也可以传入 nn.Embedding 对象, @@ -248,18 +258,19 @@ class BiaffineParser(GraphParser): :param use_greedy_infer: 是否在inference时使用贪心算法. 若 ``False`` , 使用更加精确但相对缓慢的MST算法. Default: ``False`` """ + def __init__(self, - init_embed, - pos_vocab_size, - pos_emb_dim, - num_label, - rnn_layers=1, - rnn_hidden_size=200, - arc_mlp_size=100, - label_mlp_size=100, - dropout=0.3, - encoder='lstm', - use_greedy_infer=False): + init_embed, + pos_vocab_size, + pos_emb_dim, + num_label, + rnn_layers=1, + rnn_hidden_size=200, + arc_mlp_size=100, + label_mlp_size=100, + dropout=0.3, + encoder='lstm', + use_greedy_infer=False): super(BiaffineParser, self).__init__() rnn_out_size = 2 * rnn_hidden_size word_hid_dim = pos_hid_dim = rnn_hidden_size @@ -295,20 +306,20 @@ class BiaffineParser(GraphParser): if (d_k * n_head) != rnn_out_size: raise ValueError('unsupported rnn_out_size: {} for transformer'.format(rnn_out_size)) self.position_emb = nn.Embedding(num_embeddings=self.max_len, - embedding_dim=rnn_out_size,) + embedding_dim=rnn_out_size, ) self.encoder = TransformerEncoder(num_layers=rnn_layers, model_size=rnn_out_size, inner_size=1024, key_size=d_k, value_size=d_v, num_head=n_head, - dropout=dropout,) + dropout=dropout, ) else: raise ValueError('unsupported encoder type: {}'.format(encoder)) - + self.mlp = nn.Sequential(nn.Linear(rnn_out_size, arc_mlp_size * 2 + label_mlp_size * 2), - nn.ELU(), - TimestepDropout(p=dropout),) + nn.ELU(), + TimestepDropout(p=dropout), ) self.arc_mlp_size = arc_mlp_size self.label_mlp_size = label_mlp_size self.arc_predictor = ArcBiaffine(arc_mlp_size, bias=True) @@ -316,7 +327,7 @@ class BiaffineParser(GraphParser): self.use_greedy_infer = use_greedy_infer self.reset_parameters() self.dropout = dropout - + def reset_parameters(self): for m in self.modules(): if isinstance(m, nn.Embedding): @@ -327,7 +338,7 @@ class BiaffineParser(GraphParser): else: for p in m.parameters(): nn.init.normal_(p, 0, 0.1) - + def forward(self, words1, words2, seq_len, target1=None): """模型forward阶段 @@ -337,50 +348,52 @@ class BiaffineParser(GraphParser): :param target1: [batch_size, seq_len] 输入真实标注的heads, 仅在训练阶段有效, 用于训练label分类器. 
若为 ``None`` , 使用预测的heads输入到label分类器 Default: ``None`` - :return dict: parsing结果:: + :return dict: parsing + 结果:: + + pred1: [batch_size, seq_len, seq_len] 边预测logits + pred2: [batch_size, seq_len, num_label] label预测logits + pred3: [batch_size, seq_len] heads的预测结果, 在 ``target1=None`` 时预测 - pred1: [batch_size, seq_len, seq_len] 边预测logits - pred2: [batch_size, seq_len, num_label] label预测logits - pred3: [batch_size, seq_len] heads的预测结果, 在 ``target1=None`` 时预测 """ # prepare embeddings batch_size, length = words1.shape # print('forward {} {}'.format(batch_size, seq_len)) - + # get sequence mask mask = seq_len_to_mask(seq_len).long() - - word = self.word_embedding(words1) # [N,L] -> [N,L,C_0] - pos = self.pos_embedding(words2) # [N,L] -> [N,L,C_1] - + + word = self.word_embedding(words1) # [N,L] -> [N,L,C_0] + pos = self.pos_embedding(words2) # [N,L] -> [N,L,C_1] + word, pos = self.word_fc(word), self.pos_fc(pos) word, pos = self.word_norm(word), self.pos_norm(pos) - x = torch.cat([word, pos], dim=2) # -> [N,L,C] - + x = torch.cat([word, pos], dim=2) # -> [N,L,C] + # encoder, extract features if self.encoder_name.endswith('lstm'): sort_lens, sort_idx = torch.sort(seq_len, dim=0, descending=True) x = x[sort_idx] x = nn.utils.rnn.pack_padded_sequence(x, sort_lens, batch_first=True) - feat, _ = self.encoder(x) # -> [N,L,C] + feat, _ = self.encoder(x) # -> [N,L,C] feat, _ = nn.utils.rnn.pad_packed_sequence(feat, batch_first=True) _, unsort_idx = torch.sort(sort_idx, dim=0, descending=False) feat = feat[unsort_idx] else: - seq_range = torch.arange(length, dtype=torch.long, device=x.device)[None,:] + seq_range = torch.arange(length, dtype=torch.long, device=x.device)[None, :] x = x + self.position_emb(seq_range) feat = self.encoder(x, mask.float()) - + # for arc biaffine # mlp, reduce dim feat = self.mlp(feat) arc_sz, label_sz = self.arc_mlp_size, self.label_mlp_size - arc_dep, arc_head = feat[:,:,:arc_sz], feat[:,:,arc_sz:2*arc_sz] - label_dep, label_head = feat[:,:,2*arc_sz:2*arc_sz+label_sz], feat[:,:,2*arc_sz+label_sz:] - + arc_dep, arc_head = feat[:, :, :arc_sz], feat[:, :, arc_sz:2 * arc_sz] + label_dep, label_head = feat[:, :, 2 * arc_sz:2 * arc_sz + label_sz], feat[:, :, 2 * arc_sz + label_sz:] + # biaffine arc classifier - arc_pred = self.arc_predictor(arc_head, arc_dep) # [N, L, L] - + arc_pred = self.arc_predictor(arc_head, arc_dep) # [N, L, L] + # use gold or predicted arc to predict label if target1 is None or not self.training: # use greedy decoding in training @@ -390,22 +403,22 @@ class BiaffineParser(GraphParser): heads = self.mst_decoder(arc_pred, mask) head_pred = heads else: - assert self.training # must be training mode + assert self.training # must be training mode if target1 is None: heads = self.greedy_decoder(arc_pred, mask) head_pred = heads else: head_pred = None heads = target1 - + batch_range = torch.arange(start=0, end=batch_size, dtype=torch.long, device=words1.device).unsqueeze(1) label_head = label_head[batch_range, heads].contiguous() - label_pred = self.label_predictor(label_head, label_dep) # [N, L, num_label] + label_pred = self.label_predictor(label_head, label_dep) # [N, L, num_label] res_dict = {C.OUTPUTS(0): arc_pred, C.OUTPUTS(1): label_pred} if head_pred is not None: res_dict[C.OUTPUTS(2)] = head_pred return res_dict - + @staticmethod def loss(pred1, pred2, target1, target2, seq_len): """ @@ -418,7 +431,7 @@ class BiaffineParser(GraphParser): :param seq_len: [batch_size, seq_len] 真实目标的长度 :return loss: scalar """ - + batch_size, length, _ = pred1.shape mask = 
seq_len_to_mask(seq_len) flip_mask = (mask == 0) @@ -430,24 +443,26 @@ class BiaffineParser(GraphParser): child_index = torch.arange(length, device=arc_logits.device, dtype=torch.long).unsqueeze(0) arc_loss = arc_logits[batch_index, child_index, target1] label_loss = label_logits[batch_index, child_index, target2] - + byte_mask = flip_mask.byte() arc_loss.masked_fill_(byte_mask, 0) label_loss.masked_fill_(byte_mask, 0) arc_nll = -arc_loss.mean() label_nll = -label_loss.mean() return arc_nll + label_nll - + def predict(self, words1, words2, seq_len): """模型预测API :param words1: [batch_size, seq_len] 输入word序列 :param words2: [batch_size, seq_len] 输入pos序列 :param seq_len: [batch_size, seq_len] 输入序列长度 - :return dict: parsing结果:: + :return dict: parsing + 结果:: + + pred1: [batch_size, seq_len] heads的预测结果 + pred2: [batch_size, seq_len, num_label] label预测logits - pred1: [batch_size, seq_len] heads的预测结果 - pred2: [batch_size, seq_len, num_label] label预测logits """ res = self(words1, words2, seq_len) output = {} @@ -470,6 +485,7 @@ class ParserLoss(LossFunc): :param seq_len: [batch_size, seq_len] 真实目标的长度 :return loss: scalar """ + def __init__(self, pred1=None, pred2=None, target1=None, target2=None, seq_len=None): @@ -497,9 +513,10 @@ class ParserMetric(MetricBase): UAS: 不带label时, 边预测的准确率 LAS: 同时预测边和label的准确率 """ + def __init__(self, pred1=None, pred2=None, target1=None, target2=None, seq_len=None): - + super().__init__() self._init_param_map(pred1=pred1, pred2=pred2, target1=target1, target2=target2, @@ -507,13 +524,13 @@ class ParserMetric(MetricBase): self.num_arc = 0 self.num_label = 0 self.num_sample = 0 - + def get_metric(self, reset=True): - res = {'UAS': self.num_arc*1.0 / self.num_sample, 'LAS': self.num_label*1.0 / self.num_sample} + res = {'UAS': self.num_arc * 1.0 / self.num_sample, 'LAS': self.num_label * 1.0 / self.num_sample} if reset: self.num_sample = self.num_label = self.num_arc = 0 return res - + def evaluate(self, pred1, pred2, target1, target2, seq_len=None): """Evaluate the performance of prediction. 
""" @@ -522,7 +539,7 @@ class ParserMetric(MetricBase): else: seq_mask = seq_len_to_mask(seq_len.long()).long() # mask out tag - seq_mask[:,0] = 0 + seq_mask[:, 0] = 0 head_pred_correct = (pred1 == target1).long() * seq_mask label_pred_correct = (pred2 == target2).long() * head_pred_correct self.num_arc += head_pred_correct.sum().item() diff --git a/fastNLP/models/cnn_text_classification.py b/fastNLP/models/cnn_text_classification.py index 01b03b9f..a9ccc568 100644 --- a/fastNLP/models/cnn_text_classification.py +++ b/fastNLP/models/cnn_text_classification.py @@ -1,12 +1,13 @@ -# python: 3.6 -# encoding: utf-8 - import torch import torch.nn as nn -from ..core.const import Const as C +from ..core.const import Const as C from ..modules import encoder +__all__ = [ + "CNNText" +] + class CNNText(torch.nn.Module): """ @@ -23,7 +24,7 @@ class CNNText(torch.nn.Module): :param int padding: 对句子前后的pad的大小, 用0填充。 :param float dropout: Dropout的大小 """ - + def __init__(self, init_embed, num_classes, kernel_nums=(3, 4, 5), @@ -31,7 +32,7 @@ class CNNText(torch.nn.Module): padding=0, dropout=0.5): super(CNNText, self).__init__() - + # no support for pre-trained embedding currently self.embed = encoder.Embedding(init_embed) self.conv_pool = encoder.ConvMaxpool( @@ -41,7 +42,7 @@ class CNNText(torch.nn.Module): padding=padding) self.dropout = nn.Dropout(dropout) self.fc = nn.Linear(sum(kernel_nums), num_classes) - + def forward(self, words, seq_len=None): """ @@ -54,7 +55,7 @@ class CNNText(torch.nn.Module): x = self.dropout(x) x = self.fc(x) # [N,C] -> [N, N_class] return {C.OUTPUT: x} - + def predict(self, words, seq_len=None): """ :param torch.LongTensor words: [batch_size, seq_len],句子中word的index diff --git a/fastNLP/models/enas_controller.py b/fastNLP/models/enas_controller.py index 16b970e6..e83c6b51 100644 --- a/fastNLP/models/enas_controller.py +++ b/fastNLP/models/enas_controller.py @@ -5,6 +5,7 @@ import os import torch import torch.nn.functional as F + from . import enas_utils as utils from .enas_utils import Node diff --git a/fastNLP/models/enas_model.py b/fastNLP/models/enas_model.py index 5c667927..b6b683c0 100644 --- a/fastNLP/models/enas_model.py +++ b/fastNLP/models/enas_model.py @@ -1,17 +1,19 @@ -# Code Modified from https://github.com/carpedm20/ENAS-pytorch - -"""Module containing the shared RNN model.""" -import numpy as np +""" +Module containing the shared RNN model. +Code Modified from https://github.com/carpedm20/ENAS-pytorch +""" import collections +import numpy as np import torch -from torch import nn +import torch.nn as nn import torch.nn.functional as F from torch.autograd import Variable from . import enas_utils as utils from .base_model import BaseModel + def _get_dropped_weights(w_raw, dropout_p, is_training): """Drops out weights to implement DropConnect. @@ -35,12 +37,13 @@ def _get_dropped_weights(w_raw, dropout_p, is_training): The above TODO is the reason for the hacky check for `torch.nn.Parameter`. """ dropped_w = F.dropout(w_raw, p=dropout_p, training=is_training) - + if isinstance(dropped_w, torch.nn.Parameter): dropped_w = dropped_w.clone() - + return dropped_w + class EmbeddingDropout(torch.nn.Embedding): """Class for dropping out embeddings by zero'ing out parameters in the embedding matrix. @@ -53,6 +56,7 @@ class EmbeddingDropout(torch.nn.Embedding): See 'A Theoretically Grounded Application of Dropout in Recurrent Neural Networks', (Gal and Ghahramani, 2016). 
""" + def __init__(self, num_embeddings, embedding_dim, @@ -83,14 +87,14 @@ class EmbeddingDropout(torch.nn.Embedding): assert (dropout >= 0.0) and (dropout < 1.0), ('Dropout must be >= 0.0 ' 'and < 1.0') self.scale = scale - + def forward(self, inputs): # pylint:disable=arguments-differ """Embeds `inputs` with the dropped out embedding weight matrix.""" if self.training: dropout = self.dropout else: dropout = 0 - + if dropout: mask = self.weight.data.new(self.weight.size(0), 1) mask.bernoulli_(1 - dropout) @@ -101,7 +105,7 @@ class EmbeddingDropout(torch.nn.Embedding): masked_weight = self.weight if self.scale and self.scale != 1: masked_weight = masked_weight * self.scale - + return F.embedding(inputs, masked_weight, max_norm=self.max_norm, @@ -114,7 +118,7 @@ class LockedDropout(nn.Module): # code from https://github.com/salesforce/awd-lstm-lm/blob/master/locked_dropout.py def __init__(self): super().__init__() - + def forward(self, x, dropout=0.5): if not self.training or not dropout: return x @@ -126,11 +130,12 @@ class LockedDropout(nn.Module): class ENASModel(BaseModel): """Shared RNN model.""" + def __init__(self, embed_num, num_classes, num_blocks=4, cuda=False, shared_hid=1000, shared_embed=1000): super(ENASModel, self).__init__() - + self.use_cuda = cuda - + self.shared_hid = shared_hid self.num_blocks = num_blocks self.decoder = nn.Linear(self.shared_hid, num_classes) @@ -139,16 +144,16 @@ class ENASModel(BaseModel): dropout=0.1) self.lockdrop = LockedDropout() self.dag = None - + # Tie weights # self.decoder.weight = self.encoder.weight - + # Since W^{x, c} and W^{h, c} are always summed, there # is no point duplicating their bias offset parameter. Likewise for # W^{x, h} and W^{h, h}. self.w_xc = nn.Linear(shared_embed, self.shared_hid) self.w_xh = nn.Linear(shared_embed, self.shared_hid) - + # The raw weights are stored here because the hidden-to-hidden weights # are weight dropped on the forward pass. 
self.w_hc_raw = torch.nn.Parameter( @@ -157,10 +162,10 @@ class ENASModel(BaseModel): torch.Tensor(self.shared_hid, self.shared_hid)) self.w_hc = None self.w_hh = None - + self.w_h = collections.defaultdict(dict) self.w_c = collections.defaultdict(dict) - + for idx in range(self.num_blocks): for jdx in range(idx + 1, self.num_blocks): self.w_h[idx][jdx] = nn.Linear(self.shared_hid, @@ -169,48 +174,47 @@ class ENASModel(BaseModel): self.w_c[idx][jdx] = nn.Linear(self.shared_hid, self.shared_hid, bias=False) - + self._w_h = nn.ModuleList([self.w_h[idx][jdx] for idx in self.w_h for jdx in self.w_h[idx]]) self._w_c = nn.ModuleList([self.w_c[idx][jdx] for idx in self.w_c for jdx in self.w_c[idx]]) - + self.batch_norm = None # if args.mode == 'train': # self.batch_norm = nn.BatchNorm1d(self.shared_hid) # else: # self.batch_norm = None - + self.reset_parameters() self.static_init_hidden = utils.keydefaultdict(self.init_hidden) - + def setDAG(self, dag): if self.dag is None: self.dag = dag - + def forward(self, word_seq, hidden=None): inputs = torch.transpose(word_seq, 0, 1) - + time_steps = inputs.size(0) batch_size = inputs.size(1) - - + self.w_hh = _get_dropped_weights(self.w_hh_raw, 0.5, self.training) self.w_hc = _get_dropped_weights(self.w_hc_raw, 0.5, self.training) - + # hidden = self.static_init_hidden[batch_size] if hidden is None else hidden hidden = self.static_init_hidden[batch_size] - + embed = self.encoder(inputs) - + embed = self.lockdrop(embed, 0.65 if self.training else 0) - + # The norm of hidden states are clipped here because # otherwise ENAS is especially prone to exploding activations on the # forward pass. This could probably be fixed in a more elegant way, but @@ -226,7 +230,7 @@ class ENASModel(BaseModel): for step in range(time_steps): x_t = embed[step] logit, hidden = self.cell(x_t, hidden, self.dag) - + hidden_norms = hidden.norm(dim=-1) max_norm = 25.0 if hidden_norms.data.max() > max_norm: @@ -237,60 +241,60 @@ class ENASModel(BaseModel): # because the PyTorch slicing and slice assignment is too # flaky. 
hidden_norms = hidden_norms.data.cpu().numpy() - + clipped_num += 1 if hidden_norms.max() > max_clipped_norm: max_clipped_norm = hidden_norms.max() - + clip_select = hidden_norms > max_norm clip_norms = hidden_norms[clip_select] - + mask = np.ones(hidden.size()) - normalizer = max_norm/clip_norms + normalizer = max_norm / clip_norms normalizer = normalizer[:, np.newaxis] - + mask[clip_select] = normalizer - + if self.use_cuda: hidden *= torch.autograd.Variable( torch.FloatTensor(mask).cuda(), requires_grad=False) else: hidden *= torch.autograd.Variable( - torch.FloatTensor(mask), requires_grad=False) + torch.FloatTensor(mask), requires_grad=False) logits.append(logit) h1tohT.append(hidden) - + h1tohT = torch.stack(h1tohT) output = torch.stack(logits) raw_output = output - + output = self.lockdrop(output, 0.4 if self.training else 0) - - #Pooling + + # Pooling output = torch.mean(output, 0) - + decoded = self.decoder(output) - + extra_out = {'dropped': decoded, 'hiddens': h1tohT, 'raw': raw_output} return {'pred': decoded, 'hidden': hidden, 'extra_out': extra_out} - + def cell(self, x, h_prev, dag): """Computes a single pass through the discovered RNN cell.""" c = {} h = {} f = {} - + f[0] = self.get_f(dag[-1][0].name) c[0] = torch.sigmoid(self.w_xc(x) + F.linear(h_prev, self.w_hc, None)) - h[0] = (c[0]*f[0](self.w_xh(x) + F.linear(h_prev, self.w_hh, None)) + - (1 - c[0])*h_prev) - + h[0] = (c[0] * f[0](self.w_xh(x) + F.linear(h_prev, self.w_hh, None)) + + (1 - c[0]) * h_prev) + leaf_node_ids = [] q = collections.deque() q.append(0) - + # Computes connections from the parent nodes `node_id` # to their child nodes `next_id` recursively, skipping leaf nodes. A # leaf node is a node whose id == `self.num_blocks`. @@ -306,10 +310,10 @@ class ENASModel(BaseModel): while True: if len(q) == 0: break - + node_id = q.popleft() nodes = dag[node_id] - + for next_node in nodes: next_id = next_node.id if next_id == self.num_blocks: @@ -317,38 +321,38 @@ class ENASModel(BaseModel): assert len(nodes) == 1, ('parent of leaf node should have ' 'only one child') continue - + w_h = self.w_h[node_id][next_id] w_c = self.w_c[node_id][next_id] - + f[next_id] = self.get_f(next_node.name) c[next_id] = torch.sigmoid(w_c(h[node_id])) - h[next_id] = (c[next_id]*f[next_id](w_h(h[node_id])) + - (1 - c[next_id])*h[node_id]) - + h[next_id] = (c[next_id] * f[next_id](w_h(h[node_id])) + + (1 - c[next_id]) * h[node_id]) + q.append(next_id) - + # Instead of averaging loose ends, perhaps there should # be a set of separate unshared weights for each "loose" connection # between each node in a cell and the output. # # As it stands, all weights W^h_{ij} are doing double duty by # connecting both from i to j, as well as from i to the output. 
- + # average all the loose ends leaf_nodes = [h[node_id] for node_id in leaf_node_ids] output = torch.mean(torch.stack(leaf_nodes, 2), -1) - + # stabilizing the Updates of omega if self.batch_norm is not None: output = self.batch_norm(output) - + return output, h[self.num_blocks - 1] - + def init_hidden(self, batch_size): zeros = torch.zeros(batch_size, self.shared_hid) return utils.get_variable(zeros, self.use_cuda, requires_grad=False) - + def get_f(self, name): name = name.lower() if name == 'relu': @@ -360,22 +364,21 @@ class ENASModel(BaseModel): elif name == 'sigmoid': f = torch.sigmoid return f - - + @property def num_parameters(self): def size(p): return np.prod(p.size()) + return sum([size(param) for param in self.parameters()]) - - + def reset_parameters(self): init_range = 0.025 # init_range = 0.025 if self.args.mode == 'train' else 0.04 for param in self.parameters(): param.data.uniform_(-init_range, init_range) self.decoder.bias.data.fill_(0) - + def predict(self, word_seq): """ diff --git a/fastNLP/models/enas_trainer.py b/fastNLP/models/enas_trainer.py index 9cd7d8d0..ef596b03 100644 --- a/fastNLP/models/enas_trainer.py +++ b/fastNLP/models/enas_trainer.py @@ -1,12 +1,12 @@ # Code Modified from https://github.com/carpedm20/ENAS-pytorch - -import time -from datetime import datetime -from datetime import timedelta - +import math import numpy as np +import time import torch -import math + +from datetime import datetime, timedelta + +from torch.optim import Adam try: from tqdm.auto import tqdm @@ -21,8 +21,6 @@ from ..core.utils import _move_dict_value_to_device from . import enas_utils as utils from ..core.utils import _build_args -from torch.optim import Adam - def _get_no_grad_ctx_mgr(): """Returns a the `torch.no_grad` context manager for PyTorch version >= @@ -33,6 +31,7 @@ def _get_no_grad_ctx_mgr(): class ENASTrainer(Trainer): """A class to wrap training code.""" + def __init__(self, train_data, model, controller, **kwargs): """Constructor for training algorithm. 
:param DataSet train_data: the training data @@ -45,19 +44,19 @@ class ENASTrainer(Trainer): self.controller_step = 0 self.shared_step = 0 self.max_length = 35 - + self.shared = model self.controller = controller - + self.shared_optim = Adam( self.shared.parameters(), lr=20.0, weight_decay=1e-7) - + self.controller_optim = Adam( self.controller.parameters(), lr=3.5e-4) - + def train(self, load_best_model=True): """ :param bool load_best_model: 该参数只有在初始化提供了dev_data的情况下有效,如果True, trainer将在返回之前重新加载dev表现 @@ -82,21 +81,22 @@ class ENASTrainer(Trainer): self.model = self.model.cuda() self._model_device = self.model.parameters().__next__().device self._mode(self.model, is_test=False) - + self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) start_time = time.time() print("training epochs started " + self.start_time, flush=True) - + try: self.callback_manager.on_train_begin() self._train() self.callback_manager.on_train_end() except (CallbackException, KeyboardInterrupt) as e: self.callback_manager.on_exception(e) - + if self.dev_data is not None: - print("\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) + - self.tester._format_eval_results(self.best_dev_perf),) + print( + "\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) + + self.tester._format_eval_results(self.best_dev_perf), ) results['best_eval'] = self.best_dev_perf results['best_epoch'] = self.best_dev_epoch results['best_step'] = self.best_dev_step @@ -110,9 +110,9 @@ class ENASTrainer(Trainer): finally: pass results['seconds'] = round(time.time() - start_time, 2) - + return results - + def _train(self): if not self.use_tqdm: from fastNLP.core.utils import _pseudo_tqdm as inner_tqdm @@ -126,21 +126,21 @@ class ENASTrainer(Trainer): avg_loss = 0 data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, prefetch=self.prefetch) - for epoch in range(1, self.n_epochs+1): + for epoch in range(1, self.n_epochs + 1): pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs)) last_stage = (epoch > self.n_epochs + 1 - self.final_epochs) if epoch == self.n_epochs + 1 - self.final_epochs: print('Entering the final stage. (Only train the selected structure)') # early stopping self.callback_manager.on_epoch_begin() - + # 1. Training the shared parameters omega of the child models self.train_shared(pbar) - + # 2. Training the controller parameters theta if not last_stage: self.train_controller() - + if ((self.validate_every > 0 and self.step % self.validate_every == 0) or (self.validate_every < 0 and self.step % len(data_iterator) == 0)) \ and self.dev_data is not None: @@ -149,16 +149,15 @@ class ENASTrainer(Trainer): eval_res = self._do_validation(epoch=epoch, step=self.step) eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, total_steps) + \ - self.tester._format_eval_results(eval_res) + self.tester._format_eval_results(eval_res) pbar.write(eval_str) - + # lr decay; early stopping self.callback_manager.on_epoch_end() # =============== epochs end =================== # pbar.close() # ============ tqdm end ============== # - - + def get_loss(self, inputs, targets, hidden, dags): """Computes the loss for the same batch for M models. 
@@ -167,7 +166,7 @@ class ENASTrainer(Trainer): """ if not isinstance(dags, list): dags = [dags] - + loss = 0 for dag in dags: self.shared.setDAG(dag) @@ -175,14 +174,14 @@ class ENASTrainer(Trainer): inputs['hidden'] = hidden result = self.shared(**inputs) output, hidden, extra_out = result['pred'], result['hidden'], result['extra_out'] - + self.callback_manager.on_loss_begin(targets, result) sample_loss = self._compute_loss(result, targets) loss += sample_loss - + assert len(dags) == 1, 'there are multiple `hidden` for multple `dags`' return loss, hidden, extra_out - + def train_shared(self, pbar=None, max_step=None, dag=None): """Train the language model for 400 steps of minibatches of 64 examples. @@ -200,9 +199,9 @@ class ENASTrainer(Trainer): model = self.shared model.train() self.controller.eval() - + hidden = self.shared.init_hidden(self.batch_size) - + abs_max_grad = 0 abs_max_hidden_norm = 0 step = 0 @@ -211,15 +210,15 @@ class ENASTrainer(Trainer): train_idx = 0 avg_loss = 0 data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, - prefetch=self.prefetch) - + prefetch=self.prefetch) + for batch_x, batch_y in data_iterator: _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) indices = data_iterator.get_batch_indices() # negative sampling; replace unknown; re-weight batch_y self.callback_manager.on_batch_begin(batch_x, batch_y, indices) # prediction = self._data_forward(self.model, batch_x) - + dags = self.controller.sample(1) inputs, targets = batch_x, batch_y # self.callback_manager.on_loss_begin(batch_y, prediction) @@ -228,18 +227,18 @@ class ENASTrainer(Trainer): hidden, dags) hidden.detach_() - + avg_loss += loss.item() - + # Is loss NaN or inf? requires_grad = False self.callback_manager.on_backward_begin(loss) self._grad_backward(loss) self.callback_manager.on_backward_end() - + self._update() self.callback_manager.on_step_end() - - if (self.step+1) % self.print_every == 0: + + if (self.step + 1) % self.print_every == 0: if self.use_tqdm: print_output = "loss:{0:<6.5f}".format(avg_loss / self.print_every) pbar.update(self.print_every) @@ -255,30 +254,29 @@ class ENASTrainer(Trainer): self.shared_step += 1 self.callback_manager.on_batch_end() # ================= mini-batch end ==================== # - - + def get_reward(self, dag, entropies, hidden, valid_idx=0): """Computes the perplexity of a single sampled model on a minibatch of validation data. """ if not isinstance(entropies, np.ndarray): entropies = entropies.data.cpu().numpy() - + data_iterator = Batch(self.dev_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, - prefetch=self.prefetch) - + prefetch=self.prefetch) + for inputs, targets in data_iterator: valid_loss, hidden, _ = self.get_loss(inputs, targets, hidden, dag) valid_loss = utils.to_item(valid_loss.data) - + valid_ppl = math.exp(valid_loss) - + R = 80 / valid_ppl - + rewards = R + 1e-4 * entropies - + return rewards, hidden - + def train_controller(self): """Fixes the shared parameters and updates the controller parameters. @@ -296,13 +294,13 @@ class ENASTrainer(Trainer): # Why can't we call shared.eval() here? Leads to loss # being uniformly zero for the controller. 
# self.shared.eval() - + avg_reward_base = None baseline = None adv_history = [] entropy_history = [] reward_history = [] - + hidden = self.shared.init_hidden(self.batch_size) total_loss = 0 valid_idx = 0 @@ -310,7 +308,7 @@ class ENASTrainer(Trainer): # sample models dags, log_probs, entropies = self.controller.sample( with_details=True) - + # calculate reward np_entropies = entropies.data.cpu().numpy() # No gradients should be backpropagated to the @@ -320,40 +318,39 @@ class ENASTrainer(Trainer): np_entropies, hidden, valid_idx) - - + reward_history.extend(rewards) entropy_history.extend(np_entropies) - + # moving average baseline if baseline is None: baseline = rewards else: decay = 0.95 baseline = decay * baseline + (1 - decay) * rewards - + adv = rewards - baseline adv_history.extend(adv) - + # policy loss - loss = -log_probs*utils.get_variable(adv, - 'cuda' in self.device, - requires_grad=False) - + loss = -log_probs * utils.get_variable(adv, + 'cuda' in self.device, + requires_grad=False) + loss = loss.sum() # or loss.mean() - + # update self.controller_optim.zero_grad() loss.backward() - + self.controller_optim.step() - + total_loss += utils.to_item(loss.data) - + if ((step % 50) == 0) and (step > 0): reward_history, adv_history, entropy_history = [], [], [] total_loss = 0 - + self.controller_step += 1 # prev_valid_idx = valid_idx # valid_idx = ((valid_idx + self.max_length) % @@ -362,16 +359,16 @@ class ENASTrainer(Trainer): # # validation data, we reset the hidden states. # if prev_valid_idx > valid_idx: # hidden = self.shared.init_hidden(self.batch_size) - + def derive(self, sample_num=10, valid_idx=0): """We are always deriving based on the very first batch of validation data? This seems wrong... """ hidden = self.shared.init_hidden(self.batch_size) - + dags, _, entropies = self.controller.sample(sample_num, with_details=True) - + max_R = 0 best_dag = None for dag in dags: @@ -379,5 +376,5 @@ class ENASTrainer(Trainer): if R.max() > max_R: max_R = R.max() best_dag = dag - + self.model.setDAG(best_dag) diff --git a/fastNLP/models/enas_utils.py b/fastNLP/models/enas_utils.py index aafcb3a7..68c170ed 100644 --- a/fastNLP/models/enas_utils.py +++ b/fastNLP/models/enas_utils.py @@ -1,12 +1,10 @@ # Code Modified from https://github.com/carpedm20/ENAS-pytorch from __future__ import print_function - from collections import defaultdict import collections import numpy as np - import torch from torch.autograd import Variable diff --git a/fastNLP/models/sequence_labeling.py b/fastNLP/models/sequence_labeling.py index 39f4c3fe..17f02298 100644 --- a/fastNLP/models/sequence_labeling.py +++ b/fastNLP/models/sequence_labeling.py @@ -1,11 +1,19 @@ +""" + 本模块实现了两种序列标注模型 +""" import torch +import torch.nn as nn from .base_model import BaseModel from ..modules import decoder, encoder from ..modules.decoder.CRF import allowed_transitions from ..core.utils import seq_len_to_mask from ..core.const import Const as C -from torch import nn + +__all__ = [ + "SeqLabeling", + "AdvSeqLabel" +] class SeqLabeling(BaseModel): diff --git a/fastNLP/models/snli.py b/fastNLP/models/snli.py index 34b54302..606bcc42 100644 --- a/fastNLP/models/snli.py +++ b/fastNLP/models/snli.py @@ -8,6 +8,9 @@ from ..modules import encoder as Encoder from ..modules import aggregator as Aggregator from ..core.utils import seq_len_to_mask +__all__ = [ + "ESIM" +] my_inf = 10e12 @@ -26,7 +29,7 @@ class ESIM(BaseModel): :param int num_classes: 标签数目,默认为3 :param numpy.array init_embedding: 初始词嵌入矩阵,形状为(vocab_size, 
embed_dim),默认为None,即随机初始化词嵌入矩阵 """ - + def __init__(self, vocab_size, embed_dim, hidden_size, dropout=0.0, num_classes=3, init_embedding=None): super(ESIM, self).__init__() @@ -35,35 +38,36 @@ class ESIM(BaseModel): self.hidden_size = hidden_size self.dropout = dropout self.n_labels = num_classes - + self.drop = nn.Dropout(self.dropout) - + self.embedding = Encoder.Embedding( (self.vocab_size, self.embed_dim), dropout=self.dropout, ) - + self.embedding_layer = nn.Linear(self.embed_dim, self.hidden_size) - + self.encoder = Encoder.LSTM( input_size=self.embed_dim, hidden_size=self.hidden_size, num_layers=1, bias=True, batch_first=True, bidirectional=True ) - + self.bi_attention = Aggregator.BiAttention() self.mean_pooling = Aggregator.AvgPoolWithMask() self.max_pooling = Aggregator.MaxPoolWithMask() - + self.inference_layer = nn.Linear(self.hidden_size * 4, self.hidden_size) - + self.decoder = Encoder.LSTM( input_size=self.hidden_size, hidden_size=self.hidden_size, num_layers=1, bias=True, batch_first=True, bidirectional=True ) - + self.output = Decoder.MLP([4 * self.hidden_size, self.hidden_size, self.n_labels], 'tanh', dropout=self.dropout) - + def forward(self, words1, words2, seq_len1=None, seq_len2=None, target=None): """ Forward function + :param torch.Tensor words1: [batch size(B), premise seq len(PL)] premise的token表示 :param torch.Tensor words2: [B, hypothesis seq len(HL)] hypothesis的token表示 :param torch.LongTensor seq_len1: [B] premise的长度 @@ -71,10 +75,10 @@ class ESIM(BaseModel): :param torch.LongTensor target: [B] 真实目标值 :return: dict prediction: [B, n_labels(N)] 预测结果 """ - + premise0 = self.embedding_layer(self.embedding(words1)) hypothesis0 = self.embedding_layer(self.embedding(words2)) - + if seq_len1 is not None: seq_len1 = seq_len_to_mask(seq_len1) else: @@ -85,55 +89,55 @@ class ESIM(BaseModel): else: seq_len2 = torch.ones(hypothesis0.size(0), hypothesis0.size(1)) seq_len2 = (seq_len2.long()).to(device=hypothesis0.device) - + _BP, _PSL, _HP = premise0.size() _BH, _HSL, _HH = hypothesis0.size() _BPL, _PLL = seq_len1.size() _HPL, _HLL = seq_len2.size() - + assert _BP == _BH and _BPL == _HPL and _BP == _BPL assert _HP == _HH assert _PSL == _PLL and _HSL == _HLL - + B, PL, H = premise0.size() B, HL, H = hypothesis0.size() - + a0 = self.encoder(self.drop(premise0)) # a0: [B, PL, H * 2] b0 = self.encoder(self.drop(hypothesis0)) # b0: [B, HL, H * 2] - + a = torch.mean(a0.view(B, PL, -1, H), dim=2) # a: [B, PL, H] b = torch.mean(b0.view(B, HL, -1, H), dim=2) # b: [B, HL, H] - + ai, bi = self.bi_attention(a, b, seq_len1, seq_len2) - + ma = torch.cat((a, ai, a - ai, a * ai), dim=2) # ma: [B, PL, 4 * H] mb = torch.cat((b, bi, b - bi, b * bi), dim=2) # mb: [B, HL, 4 * H] - + f_ma = self.inference_layer(ma) f_mb = self.inference_layer(mb) - + vat = self.decoder(self.drop(f_ma)) vbt = self.decoder(self.drop(f_mb)) - + va = torch.mean(vat.view(B, PL, -1, H), dim=2) # va: [B, PL, H] vb = torch.mean(vbt.view(B, HL, -1, H), dim=2) # vb: [B, HL, H] - + va_ave = self.mean_pooling(va, seq_len1, dim=1) # va_ave: [B, H] va_max, va_arg_max = self.max_pooling(va, seq_len1, dim=1) # va_max: [B, H] vb_ave = self.mean_pooling(vb, seq_len2, dim=1) # vb_ave: [B, H] vb_max, vb_arg_max = self.max_pooling(vb, seq_len2, dim=1) # vb_max: [B, H] - + v = torch.cat((va_ave, va_max, vb_ave, vb_max), dim=1) # v: [B, 4 * H] - + prediction = torch.tanh(self.output(v)) # prediction: [B, N] - + if target is not None: func = nn.CrossEntropyLoss() loss = func(prediction, target) return {Const.OUTPUT: prediction, 
Const.LOSS: loss} - + return {Const.OUTPUT: prediction} - + def predict(self, words1, words2, seq_len1=None, seq_len2=None, target=None): """ Predict function @@ -146,4 +150,3 @@ class ESIM(BaseModel): """ prediction = self.forward(words1, words2, seq_len1, seq_len2)[Const.OUTPUT] return {Const.OUTPUT: torch.argmax(prediction, dim=-1)} - diff --git a/fastNLP/models/star_transformer.py b/fastNLP/models/star_transformer.py index cdd1f321..2e55f7e4 100644 --- a/fastNLP/models/star_transformer.py +++ b/fastNLP/models/star_transformer.py @@ -1,17 +1,25 @@ -"""Star-Transformer 的 一个 Pytorch 实现. """ +Star-Transformer 的 Pytorch 实现。 +""" +import torch +from torch import nn + from ..modules.encoder.star_transformer import StarTransformer from ..core.utils import seq_len_to_mask from ..modules.utils import get_embeddings from ..core.const import Const -import torch -from torch import nn +__all__ = [ + "StarTransEnc", + "STNLICls", + "STSeqCls", + "STSeqLabel", +] class StarTransEnc(nn.Module): """ - 别名::class:`fastNLP.models.StarTransEnc` :class:`fastNLP.models.start_transformer.StarTransEnc` + 别名::class:`fastNLP.models.StarTransEnc` :class:`fastNLP.models.star_transformer.StarTransEnc` 带word embedding的Star-Transformer Encoder @@ -28,6 +36,7 @@ class StarTransEnc(nn.Module): :param emb_dropout: 词嵌入的dropout概率. :param dropout: 模型除词嵌入外的dropout概率. """ + def __init__(self, init_embed, hidden_size, num_layers, @@ -47,7 +56,7 @@ class StarTransEnc(nn.Module): head_dim=head_dim, dropout=dropout, max_len=max_len) - + def forward(self, x, mask): """ :param FloatTensor data: [batch, length, hidden] 输入的序列 @@ -72,7 +81,7 @@ class _Cls(nn.Module): nn.Dropout(dropout), nn.Linear(hid_dim, num_cls), ) - + def forward(self, x): h = self.fc(x) return h @@ -83,20 +92,21 @@ class _NLICls(nn.Module): super(_NLICls, self).__init__() self.fc = nn.Sequential( nn.Dropout(dropout), - nn.Linear(in_dim*4, hid_dim), #4 + nn.Linear(in_dim * 4, hid_dim), # 4 nn.LeakyReLU(), nn.Dropout(dropout), nn.Linear(hid_dim, num_cls), ) - + def forward(self, x1, x2): - x = torch.cat([x1, x2, torch.abs(x1-x2), x1*x2], 1) + x = torch.cat([x1, x2, torch.abs(x1 - x2), x1 * x2], 1) h = self.fc(x) return h + class STSeqLabel(nn.Module): """ - 别名::class:`fastNLP.models.STSeqLabel` :class:`fastNLP.models.start_transformer.STSeqLabel` + 别名::class:`fastNLP.models.STSeqLabel` :class:`fastNLP.models.star_transformer.STSeqLabel` 用于序列标注的Star-Transformer模型 @@ -112,6 +122,7 @@ class STSeqLabel(nn.Module): :param emb_dropout: 词嵌入的dropout概率. Default: 0.1 :param dropout: 模型除词嵌入外的dropout概率. 
Default: 0.1 """ + def __init__(self, init_embed, num_cls, hidden_size=300, num_layers=4, @@ -120,7 +131,7 @@ class STSeqLabel(nn.Module): max_len=512, cls_hidden_size=600, emb_dropout=0.1, - dropout=0.1,): + dropout=0.1, ): super(STSeqLabel, self).__init__() self.enc = StarTransEnc(init_embed=init_embed, hidden_size=hidden_size, @@ -131,7 +142,7 @@ class STSeqLabel(nn.Module): emb_dropout=emb_dropout, dropout=dropout) self.cls = _Cls(hidden_size, num_cls, cls_hidden_size) - + def forward(self, words, seq_len): """ @@ -142,9 +153,9 @@ class STSeqLabel(nn.Module): mask = seq_len_to_mask(seq_len) nodes, _ = self.enc(words, mask) output = self.cls(nodes) - output = output.transpose(1,2) # make hidden to be dim 1 - return {Const.OUTPUT: output} # [bsz, n_cls, seq_len] - + output = output.transpose(1, 2) # make hidden to be dim 1 + return {Const.OUTPUT: output} # [bsz, n_cls, seq_len] + def predict(self, words, seq_len): """ @@ -159,7 +170,7 @@ class STSeqLabel(nn.Module): class STSeqCls(nn.Module): """ - 别名::class:`fastNLP.models.STSeqCls` :class:`fastNLP.models.start_transformer.STSeqCls` + 别名::class:`fastNLP.models.STSeqCls` :class:`fastNLP.models.star_transformer.STSeqCls` 用于分类任务的Star-Transformer @@ -175,7 +186,7 @@ class STSeqCls(nn.Module): :param emb_dropout: 词嵌入的dropout概率. Default: 0.1 :param dropout: 模型除词嵌入外的dropout概率. Default: 0.1 """ - + def __init__(self, init_embed, num_cls, hidden_size=300, num_layers=4, @@ -184,7 +195,7 @@ class STSeqCls(nn.Module): max_len=512, cls_hidden_size=600, emb_dropout=0.1, - dropout=0.1,): + dropout=0.1, ): super(STSeqCls, self).__init__() self.enc = StarTransEnc(init_embed=init_embed, hidden_size=hidden_size, @@ -195,7 +206,7 @@ class STSeqCls(nn.Module): emb_dropout=emb_dropout, dropout=dropout) self.cls = _Cls(hidden_size, num_cls, cls_hidden_size) - + def forward(self, words, seq_len): """ @@ -206,9 +217,9 @@ class STSeqCls(nn.Module): mask = seq_len_to_mask(seq_len) nodes, relay = self.enc(words, mask) y = 0.5 * (relay + nodes.max(1)[0]) - output = self.cls(y) # [bsz, n_cls] + output = self.cls(y) # [bsz, n_cls] return {Const.OUTPUT: output} - + def predict(self, words, seq_len): """ @@ -223,7 +234,7 @@ class STSeqCls(nn.Module): class STNLICls(nn.Module): """ - 别名::class:`fastNLP.models.STNLICls` :class:`fastNLP.models.start_transformer.STNLICls` + 别名::class:`fastNLP.models.STNLICls` :class:`fastNLP.models.star_transformer.STNLICls` 用于自然语言推断(NLI)的Star-Transformer @@ -239,7 +250,7 @@ class STNLICls(nn.Module): :param emb_dropout: 词嵌入的dropout概率. Default: 0.1 :param dropout: 模型除词嵌入外的dropout概率. 
Default: 0.1 """ - + def __init__(self, init_embed, num_cls, hidden_size=300, num_layers=4, @@ -248,7 +259,7 @@ class STNLICls(nn.Module): max_len=512, cls_hidden_size=600, emb_dropout=0.1, - dropout=0.1,): + dropout=0.1, ): super(STNLICls, self).__init__() self.enc = StarTransEnc(init_embed=init_embed, hidden_size=hidden_size, @@ -259,7 +270,7 @@ class STNLICls(nn.Module): emb_dropout=emb_dropout, dropout=dropout) self.cls = _NLICls(hidden_size, num_cls, cls_hidden_size) - + def forward(self, words1, words2, seq_len1, seq_len2): """ @@ -271,14 +282,16 @@ class STNLICls(nn.Module): """ mask1 = seq_len_to_mask(seq_len1) mask2 = seq_len_to_mask(seq_len2) + def enc(seq, mask): nodes, relay = self.enc(seq, mask) return 0.5 * (relay + nodes.max(1)[0]) + y1 = enc(words1, mask1) y2 = enc(words2, mask2) - output = self.cls(y1, y2) # [bsz, n_cls] + output = self.cls(y1, y2) # [bsz, n_cls] return {Const.OUTPUT: output} - + def predict(self, words1, words2, seq_len1, seq_len2): """
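
The star_transformer.py hunks above only reorder imports, add an __all__ list, and normalize whitespace; the public model API is unchanged. A minimal usage sketch for STSeqCls, based solely on the constructor defaults and docstrings shown in the diff, is below. The (vocab_size, embed_dim) tuple passed as init_embed, the vocabulary size, the batch shapes, and the top-level fastNLP.models import path (per the alias noted in the docstring) are illustrative assumptions, not part of this patch.

    import torch
    from fastNLP.models import STSeqCls          # alias documented as fastNLP.models.STSeqCls
    from fastNLP.core.const import Const

    # Hypothetical sizes: vocabulary of 1000 tokens, 300-d embeddings
    # (StarTransEnc projects embeddings to hidden_size internally), 5 classes.
    model = STSeqCls(init_embed=(1000, 300), num_cls=5)

    words = torch.randint(0, 1000, (2, 7))       # [batch=2, seq_len=7] token ids
    seq_len = torch.tensor([7, 5])               # true lengths, used to build the mask

    out = model(words, seq_len)[Const.OUTPUT]    # [2, 5] unnormalized class scores
    pred = out.argmax(dim=-1)                    # predicted label index per sample

The same pattern applies to STSeqLabel (whose output is transposed to [bsz, n_cls, seq_len]) and to STNLICls, which takes two word/length pairs as shown in its forward signature.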