diff --git a/docs/Makefile b/docs/Makefile index 3050e655..6ba2fa54 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -14,7 +14,7 @@ help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) apidoc: - $(SPHINXAPIDOC) -fM -o source ../$(SPHINXPROJ) + $(SPHINXAPIDOC) -efM -o source ../$(SPHINXPROJ) server: cd build/html && python -m http.server diff --git a/docs/source/conf.py b/docs/source/conf.py index 5fd9e56d..3e9753af 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -68,7 +68,7 @@ master_doc = 'index' # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = "zh_CN" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -89,7 +89,10 @@ html_theme = 'sphinx_rtd_theme' # further. For a list of options available for each theme, see the # documentation. # -# html_theme_options = {} +html_theme_options = { + 'collapse_navigation': False, + 'titles_only': True +} # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, diff --git a/docs/source/fastNLP.api.rst b/docs/source/fastNLP.api.rst deleted file mode 100644 index 955eb8c5..00000000 --- a/docs/source/fastNLP.api.rst +++ /dev/null @@ -1,60 +0,0 @@ -fastNLP.api package -=================== - -.. automodule:: fastNLP.api - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -fastNLP.api.api module ----------------------- - -.. automodule:: fastNLP.api.api - :members: - :undoc-members: - :show-inheritance: - -fastNLP.api.converter module ----------------------------- - -.. automodule:: fastNLP.api.converter - :members: - :undoc-members: - :show-inheritance: - -fastNLP.api.examples module ---------------------------- - -.. automodule:: fastNLP.api.examples - :members: - :undoc-members: - :show-inheritance: - -fastNLP.api.pipeline module ---------------------------- - -.. automodule:: fastNLP.api.pipeline - :members: - :undoc-members: - :show-inheritance: - -fastNLP.api.processor module ----------------------------- - -.. automodule:: fastNLP.api.processor - :members: - :undoc-members: - :show-inheritance: - -fastNLP.api.utils module ------------------------- - -.. automodule:: fastNLP.api.utils - :members: - :undoc-members: - :show-inheritance: - - diff --git a/docs/source/fastNLP.automl.rst b/docs/source/fastNLP.automl.rst deleted file mode 100644 index 3c12e271..00000000 --- a/docs/source/fastNLP.automl.rst +++ /dev/null @@ -1,44 +0,0 @@ -fastNLP.automl package -====================== - -.. automodule:: fastNLP.automl - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -fastNLP.automl.enas\_controller module --------------------------------------- - -.. automodule:: fastNLP.automl.enas_controller - :members: - :undoc-members: - :show-inheritance: - -fastNLP.automl.enas\_model module ---------------------------------- - -.. automodule:: fastNLP.automl.enas_model - :members: - :undoc-members: - :show-inheritance: - -fastNLP.automl.enas\_trainer module ------------------------------------ - -.. automodule:: fastNLP.automl.enas_trainer - :members: - :undoc-members: - :show-inheritance: - -fastNLP.automl.enas\_utils module ---------------------------------- - -.. 
automodule:: fastNLP.automl.enas_utils - :members: - :undoc-members: - :show-inheritance: - - diff --git a/docs/source/fastNLP.component.rst b/docs/source/fastNLP.component.rst deleted file mode 100644 index 81fcf561..00000000 --- a/docs/source/fastNLP.component.rst +++ /dev/null @@ -1,20 +0,0 @@ -fastNLP.component package -========================= - -.. automodule:: fastNLP.component - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -fastNLP.component.bert\_tokenizer module ----------------------------------------- - -.. automodule:: fastNLP.component.bert_tokenizer - :members: - :undoc-members: - :show-inheritance: - - diff --git a/docs/source/fastNLP.core.rst b/docs/source/fastNLP.core.rst deleted file mode 100644 index 540bd03c..00000000 --- a/docs/source/fastNLP.core.rst +++ /dev/null @@ -1,124 +0,0 @@ -fastNLP.core package -==================== - -.. automodule:: fastNLP.core - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -fastNLP.core.batch module -------------------------- - -.. automodule:: fastNLP.core.batch - :members: - :undoc-members: - :show-inheritance: - -fastNLP.core.callback module ----------------------------- - -.. automodule:: fastNLP.core.callback - :members: - :undoc-members: - :show-inheritance: - -fastNLP.core.dataset module ---------------------------- - -.. automodule:: fastNLP.core.dataset - :members: - :undoc-members: - :show-inheritance: - -fastNLP.core.fieldarray module ------------------------------- - -.. automodule:: fastNLP.core.fieldarray - :members: - :undoc-members: - :show-inheritance: - -fastNLP.core.instance module ----------------------------- - -.. automodule:: fastNLP.core.instance - :members: - :undoc-members: - :show-inheritance: - -fastNLP.core.losses module --------------------------- - -.. automodule:: fastNLP.core.losses - :members: - :undoc-members: - :show-inheritance: - -fastNLP.core.metrics module ---------------------------- - -.. automodule:: fastNLP.core.metrics - :members: - :undoc-members: - :show-inheritance: - -fastNLP.core.optimizer module ------------------------------ - -.. automodule:: fastNLP.core.optimizer - :members: - :undoc-members: - :show-inheritance: - -fastNLP.core.predictor module ------------------------------ - -.. automodule:: fastNLP.core.predictor - :members: - :undoc-members: - :show-inheritance: - -fastNLP.core.sampler module ---------------------------- - -.. automodule:: fastNLP.core.sampler - :members: - :undoc-members: - :show-inheritance: - -fastNLP.core.tester module --------------------------- - -.. automodule:: fastNLP.core.tester - :members: - :undoc-members: - :show-inheritance: - -fastNLP.core.trainer module ---------------------------- - -.. automodule:: fastNLP.core.trainer - :members: - :undoc-members: - :show-inheritance: - -fastNLP.core.utils module -------------------------- - -.. automodule:: fastNLP.core.utils - :members: - :undoc-members: - :show-inheritance: - -fastNLP.core.vocabulary module ------------------------------- - -.. automodule:: fastNLP.core.vocabulary - :members: - :undoc-members: - :show-inheritance: - - diff --git a/docs/source/fastNLP.io.rst b/docs/source/fastNLP.io.rst deleted file mode 100644 index 1eb95c6a..00000000 --- a/docs/source/fastNLP.io.rst +++ /dev/null @@ -1,60 +0,0 @@ -fastNLP.io package -================== - -.. automodule:: fastNLP.io - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -fastNLP.io.base\_loader module ------------------------------- - -.. 
automodule:: fastNLP.io.base_loader - :members: - :undoc-members: - :show-inheritance: - -fastNLP.io.config\_io module ----------------------------- - -.. automodule:: fastNLP.io.config_io - :members: - :undoc-members: - :show-inheritance: - -fastNLP.io.dataset\_loader module ---------------------------------- - -.. automodule:: fastNLP.io.dataset_loader - :members: - :undoc-members: - :show-inheritance: - -fastNLP.io.embed\_loader module -------------------------------- - -.. automodule:: fastNLP.io.embed_loader - :members: - :undoc-members: - :show-inheritance: - -fastNLP.io.file\_reader module ------------------------------- - -.. automodule:: fastNLP.io.file_reader - :members: - :undoc-members: - :show-inheritance: - -fastNLP.io.model\_io module ---------------------------- - -.. automodule:: fastNLP.io.model_io - :members: - :undoc-members: - :show-inheritance: - - diff --git a/docs/source/fastNLP.models.rst b/docs/source/fastNLP.models.rst deleted file mode 100644 index 18b8186f..00000000 --- a/docs/source/fastNLP.models.rst +++ /dev/null @@ -1,108 +0,0 @@ -fastNLP.models package -====================== - -.. automodule:: fastNLP.models - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -fastNLP.models.base\_model module ---------------------------------- - -.. automodule:: fastNLP.models.base_model - :members: - :undoc-members: - :show-inheritance: - -fastNLP.models.bert module --------------------------- - -.. automodule:: fastNLP.models.bert - :members: - :undoc-members: - :show-inheritance: - -fastNLP.models.biaffine\_parser module --------------------------------------- - -.. automodule:: fastNLP.models.biaffine_parser - :members: - :undoc-members: - :show-inheritance: - -fastNLP.models.char\_language\_model module -------------------------------------------- - -.. automodule:: fastNLP.models.char_language_model - :members: - :undoc-members: - :show-inheritance: - -fastNLP.models.cnn\_text\_classification module ------------------------------------------------ - -.. automodule:: fastNLP.models.cnn_text_classification - :members: - :undoc-members: - :show-inheritance: - -fastNLP.models.enas\_controller module --------------------------------------- - -.. automodule:: fastNLP.models.enas_controller - :members: - :undoc-members: - :show-inheritance: - -fastNLP.models.enas\_model module ---------------------------------- - -.. automodule:: fastNLP.models.enas_model - :members: - :undoc-members: - :show-inheritance: - -fastNLP.models.enas\_trainer module ------------------------------------ - -.. automodule:: fastNLP.models.enas_trainer - :members: - :undoc-members: - :show-inheritance: - -fastNLP.models.enas\_utils module ---------------------------------- - -.. automodule:: fastNLP.models.enas_utils - :members: - :undoc-members: - :show-inheritance: - -fastNLP.models.sequence\_modeling module ----------------------------------------- - -.. automodule:: fastNLP.models.sequence_modeling - :members: - :undoc-members: - :show-inheritance: - -fastNLP.models.snli module --------------------------- - -.. automodule:: fastNLP.models.snli - :members: - :undoc-members: - :show-inheritance: - -fastNLP.models.star\_transformer module ---------------------------------------- - -.. 
automodule:: fastNLP.models.star_transformer - :members: - :undoc-members: - :show-inheritance: - - diff --git a/docs/source/fastNLP.modules.aggregator.rst b/docs/source/fastNLP.modules.aggregator.rst deleted file mode 100644 index 74ff5aed..00000000 --- a/docs/source/fastNLP.modules.aggregator.rst +++ /dev/null @@ -1,28 +0,0 @@ -fastNLP.modules.aggregator package -================================== - -.. automodule:: fastNLP.modules.aggregator - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -fastNLP.modules.aggregator.attention module -------------------------------------------- - -.. automodule:: fastNLP.modules.aggregator.attention - :members: - :undoc-members: - :show-inheritance: - -fastNLP.modules.aggregator.pooling module ------------------------------------------ - -.. automodule:: fastNLP.modules.aggregator.pooling - :members: - :undoc-members: - :show-inheritance: - - diff --git a/docs/source/fastNLP.modules.decoder.rst b/docs/source/fastNLP.modules.decoder.rst deleted file mode 100644 index 5e467b98..00000000 --- a/docs/source/fastNLP.modules.decoder.rst +++ /dev/null @@ -1,36 +0,0 @@ -fastNLP.modules.decoder package -=============================== - -.. automodule:: fastNLP.modules.decoder - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -fastNLP.modules.decoder.CRF module ----------------------------------- - -.. automodule:: fastNLP.modules.decoder.CRF - :members: - :undoc-members: - :show-inheritance: - -fastNLP.modules.decoder.MLP module ----------------------------------- - -.. automodule:: fastNLP.modules.decoder.MLP - :members: - :undoc-members: - :show-inheritance: - -fastNLP.modules.decoder.utils module ------------------------------------- - -.. automodule:: fastNLP.modules.decoder.utils - :members: - :undoc-members: - :show-inheritance: - - diff --git a/docs/source/fastNLP.modules.encoder.rst b/docs/source/fastNLP.modules.encoder.rst deleted file mode 100644 index ff048be7..00000000 --- a/docs/source/fastNLP.modules.encoder.rst +++ /dev/null @@ -1,100 +0,0 @@ -fastNLP.modules.encoder package -=============================== - -.. automodule:: fastNLP.modules.encoder - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -fastNLP.modules.encoder.bert module ------------------------------------ - -.. automodule:: fastNLP.modules.encoder.bert - :members: - :undoc-members: - :show-inheritance: - -fastNLP.modules.encoder.char\_encoder module --------------------------------------------- - -.. automodule:: fastNLP.modules.encoder.char_encoder - :members: - :undoc-members: - :show-inheritance: - -fastNLP.modules.encoder.conv module ------------------------------------ - -.. automodule:: fastNLP.modules.encoder.conv - :members: - :undoc-members: - :show-inheritance: - -fastNLP.modules.encoder.conv\_maxpool module --------------------------------------------- - -.. automodule:: fastNLP.modules.encoder.conv_maxpool - :members: - :undoc-members: - :show-inheritance: - -fastNLP.modules.encoder.embedding module ----------------------------------------- - -.. automodule:: fastNLP.modules.encoder.embedding - :members: - :undoc-members: - :show-inheritance: - -fastNLP.modules.encoder.linear module -------------------------------------- - -.. automodule:: fastNLP.modules.encoder.linear - :members: - :undoc-members: - :show-inheritance: - -fastNLP.modules.encoder.lstm module ------------------------------------ - -.. 
automodule:: fastNLP.modules.encoder.lstm - :members: - :undoc-members: - :show-inheritance: - -fastNLP.modules.encoder.masked\_rnn module ------------------------------------------- - -.. automodule:: fastNLP.modules.encoder.masked_rnn - :members: - :undoc-members: - :show-inheritance: - -fastNLP.modules.encoder.star\_transformer module ------------------------------------------------- - -.. automodule:: fastNLP.modules.encoder.star_transformer - :members: - :undoc-members: - :show-inheritance: - -fastNLP.modules.encoder.transformer module ------------------------------------------- - -.. automodule:: fastNLP.modules.encoder.transformer - :members: - :undoc-members: - :show-inheritance: - -fastNLP.modules.encoder.variational\_rnn module ------------------------------------------------ - -.. automodule:: fastNLP.modules.encoder.variational_rnn - :members: - :undoc-members: - :show-inheritance: - - diff --git a/docs/source/fastNLP.modules.rst b/docs/source/fastNLP.modules.rst deleted file mode 100644 index 5884e655..00000000 --- a/docs/source/fastNLP.modules.rst +++ /dev/null @@ -1,45 +0,0 @@ -fastNLP.modules package -======================= - -.. automodule:: fastNLP.modules - :members: - :undoc-members: - :show-inheritance: - -Subpackages ------------ - -.. toctree:: - - fastNLP.modules.aggregator - fastNLP.modules.decoder - fastNLP.modules.encoder - -Submodules ----------- - -fastNLP.modules.dropout module ------------------------------- - -.. automodule:: fastNLP.modules.dropout - :members: - :undoc-members: - :show-inheritance: - -fastNLP.modules.other\_modules module -------------------------------------- - -.. automodule:: fastNLP.modules.other_modules - :members: - :undoc-members: - :show-inheritance: - -fastNLP.modules.utils module ----------------------------- - -.. automodule:: fastNLP.modules.utils - :members: - :undoc-members: - :show-inheritance: - - diff --git a/docs/source/fastNLP.rst b/docs/source/fastNLP.rst deleted file mode 100644 index f5247748..00000000 --- a/docs/source/fastNLP.rst +++ /dev/null @@ -1,21 +0,0 @@ -fastNLP package -=============== - -.. automodule:: fastNLP - :members: - :undoc-members: - :show-inheritance: - -Subpackages ------------ - -.. toctree:: - - fastNLP.api - fastNLP.automl - fastNLP.component - fastNLP.core - fastNLP.io - fastNLP.models - fastNLP.modules - diff --git a/fastNLP/__init__.py b/fastNLP/__init__.py index 35309bd3..e7975c9b 100644 --- a/fastNLP/__init__.py +++ b/fastNLP/__init__.py @@ -1,5 +1,15 @@ +""" +fastNLP 由 :mod:`~fastNLP.core` 、 :mod:`~fastNLP.io` 、:mod:`~fastNLP.modules` 等子模块组成,但常用的组件都可以直接 import ,常用组件如下: +""" +__all__ = ["Instance", "FieldArray", "Batch", "Vocabulary", "DataSet", + "Trainer", "Tester", "Callback", + "Padder", "AutoPadder", "EngChar2DPadder", + "AccuracyMetric", "Optimizer", "SGD", "Adam", + "Sampler", "SequentialSampler", "BucketSampler", "RandomSampler", + "LossFunc", "CrossEntropyLoss", "L1Loss", "BCELoss", "NLLLoss", "LossInForward", + "cache_results"] from .core import * from . import models from . 
import modules -__version__ = '0.4.0' \ No newline at end of file +__version__ = '0.4.0' diff --git a/fastNLP/api/__init__.py b/fastNLP/api/__init__.py index a21a4c42..5171d8c2 100644 --- a/fastNLP/api/__init__.py +++ b/fastNLP/api/__init__.py @@ -1 +1,2 @@ +__all__ = ["CWS", "POS", "Parser"] from .api import CWS, POS, Parser diff --git a/fastNLP/api/api.py b/fastNLP/api/api.py index c4c21832..351b210d 100644 --- a/fastNLP/api/api.py +++ b/fastNLP/api/api.py @@ -1,41 +1,3 @@ -""" -api.api的介绍文档 - 直接缩进会把上面的文字变成标题 - -空行缩进的写法比较合理 - - 比较合理 - -*这里是斜体内容* - -**这里是粗体内容** - -数学公式块 - -.. math:: - E = mc^2 - -.. note:: - 注解型提示。 - -.. warning:: - 警告型提示。 - -.. seealso:: - `参考与超链接 `_ - -普通代码块需要空一行, Example:: - - from fitlog import fitlog - fitlog.commit() - -普通下标和上标: - -H\ :sub:`2`\ O - -E = mc\ :sup:`2` - -""" import warnings import torch @@ -43,15 +5,14 @@ import torch warnings.filterwarnings('ignore') import os -from fastNLP.core.dataset import DataSet - -from fastNLP.api.utils import load_url -from fastNLP.api.processor import ModelProcessor -from fastNLP.io.dataset_loader import _cut_long_sentence, ConllLoader -from fastNLP.core.instance import Instance -from fastNLP.api.pipeline import Pipeline -from fastNLP.core.metrics import SpanFPreRecMetric -from fastNLP.api.processor import IndexerProcessor +from ..core.dataset import DataSet +from .utils import load_url +from .processor import ModelProcessor +from ..io.dataset_loader import _cut_long_sentence, ConllLoader +from ..core.instance import Instance +from ..api.pipeline import Pipeline +from ..core.metrics import SpanFPreRecMetric +from .processor import IndexerProcessor # TODO add pretrain urls model_urls = { @@ -63,9 +24,10 @@ model_urls = { class ConllCWSReader(object): """Deprecated. Use ConllLoader for all types of conll-format files.""" + def __init__(self): pass - + def load(self, path, cut_long_sent=False): """ 返回的DataSet只包含raw_sentence这个field,内容为str。 @@ -98,7 +60,7 @@ class ConllCWSReader(object): sample.append(line.strip().split()) if len(sample) > 0: datalist.append(sample) - + ds = DataSet() for sample in datalist: # print(sample) @@ -113,7 +75,7 @@ class ConllCWSReader(object): for raw_sentence in sents: ds.append(Instance(raw_sentence=raw_sentence)) return ds - + def get_char_lst(self, sample): if len(sample) == 0: return None @@ -125,11 +87,13 @@ class ConllCWSReader(object): text.append(t1) return text + class ConllxDataLoader(ConllLoader): """返回“词级别”的标签信息,包括词、词性、(句法)头依赖、(句法)边标签。跟``ZhConllPOSReader``完全不同。 Deprecated. Use ConllLoader for all types of conll-format files. """ + def __init__(self): headers = [ 'words', 'pos_tags', 'heads', 'labels', @@ -141,18 +105,15 @@ class ConllxDataLoader(ConllLoader): class API: - """ - 这是 API 类的文档 - """ def __init__(self): self.pipeline = None self._dict = None - + def predict(self, *args, **kwargs): """Do prediction for the given input. """ raise NotImplementedError - + def test(self, file_path): """Test performance over the given data set. @@ -160,7 +121,7 @@ class API: :return: a dictionary of metric values """ raise NotImplementedError - + def load(self, path, device): if os.path.exists(os.path.expanduser(path)): _dict = torch.load(path, map_location='cpu') @@ -180,14 +141,14 @@ class POS(API): :param str device: device name such as "cpu" or "cuda:0". Use the same notation as PyTorch. 
""" - + def __init__(self, model_path=None, device='cpu'): super(POS, self).__init__() if model_path is None: model_path = model_urls['pos'] - + self.load(model_path, device) - + def predict(self, content): """predict函数的介绍, 函数介绍的第二句,这句话不会换行 @@ -197,48 +158,48 @@ class POS(API): """ if not hasattr(self, "pipeline"): raise ValueError("You have to load model first.") - + sentence_list = content # 1. 检查sentence的类型 for sentence in sentence_list: if not all((type(obj) == str for obj in sentence)): raise ValueError("Input must be list of list of string.") - + # 2. 组建dataset dataset = DataSet() dataset.add_field("words", sentence_list) - + # 3. 使用pipeline self.pipeline(dataset) - + def merge_tag(words_list, tags_list): rtn = [] for words, tags in zip(words_list, tags_list): rtn.append([w + "/" + t for w, t in zip(words, tags)]) return rtn - + output = dataset.field_arrays["tag"].content if isinstance(content, str): return output[0] elif isinstance(content, list): return merge_tag(content, output) - + def test(self, file_path): test_data = ConllxDataLoader().load(file_path) - + save_dict = self._dict tag_vocab = save_dict["tag_vocab"] pipeline = save_dict["pipeline"] index_tag = IndexerProcessor(vocab=tag_vocab, field_name="tag", new_added_field_name="truth", is_input=False) pipeline.pipeline = [index_tag] + pipeline.pipeline - + test_data.rename_field("pos_tags", "tag") pipeline(test_data) test_data.set_target("truth") prediction = test_data.field_arrays["predict"].content truth = test_data.field_arrays["truth"].content seq_len = test_data.field_arrays["word_seq_origin_len"].content - + # padding by hand max_length = max([len(seq) for seq in prediction]) for idx in range(len(prediction)): @@ -252,7 +213,7 @@ class POS(API): f1 = round(test_result['f'] * 100, 2) pre = round(test_result['pre'] * 100, 2) rec = round(test_result['rec'] * 100, 2) - + return {"F1": f1, "precision": pre, "recall": rec} @@ -263,14 +224,15 @@ class CWS(API): :param model_path: 当model_path为None,使用默认位置的model。如果默认位置不存在,则自动下载模型 :param device: str,可以为'cpu', 'cuda'或'cuda:0'等。会将模型load到相应device进行推断。 """ + def __init__(self, model_path=None, device='cpu'): super(CWS, self).__init__() if model_path is None: model_path = model_urls['cws'] - + self.load(model_path, device) - + def predict(self, content): """ 分词接口。 @@ -281,27 +243,27 @@ class CWS(API): """ if not hasattr(self, 'pipeline'): raise ValueError("You have to load model first.") - + sentence_list = [] # 1. 检查sentence的类型 if isinstance(content, str): sentence_list.append(content) elif isinstance(content, list): sentence_list = content - + # 2. 组建dataset dataset = DataSet() dataset.add_field('raw_sentence', sentence_list) - + # 3. 
使用pipeline self.pipeline(dataset) - + output = dataset.get_field('output').content if isinstance(content, str): return output[0] elif isinstance(content, list): return output - + def test(self, filepath): """ 传入一个分词文件路径,返回该数据集上分词f1, precision, recall。 @@ -327,28 +289,28 @@ tag_proc = self._dict['tag_proc'] cws_model = self.pipeline.pipeline[-2].model pipeline = self.pipeline.pipeline[:-2] - + pipeline.insert(1, tag_proc) pp = Pipeline(pipeline) - + reader = ConllCWSReader() - + # te_filename = '/home/hyan/ctb3/test.conllx' te_dataset = reader.load(filepath) pp(te_dataset) - - from fastNLP.core.tester import Tester - from fastNLP.core.metrics import BMESF1PreRecMetric - + + from ..core.tester import Tester + from ..core.metrics import BMESF1PreRecMetric + tester = Tester(data=te_dataset, model=cws_model, metrics=BMESF1PreRecMetric(target='target'), batch_size=64, verbose=0) eval_res = tester.test() - + f1 = eval_res['BMESF1PreRecMetric']['f'] pre = eval_res['BMESF1PreRecMetric']['pre'] rec = eval_res['BMESF1PreRecMetric']['rec'] # print("f1:{:.2f}, pre:{:.2f}, rec:{:.2f}".format(f1, pre, rec)) - + return {"F1": f1, "precision": pre, "recall": rec} @@ -357,25 +319,25 @@ class Parser(API): super(Parser, self).__init__() if model_path is None: model_path = model_urls['parser'] - + self.pos_tagger = POS(device=device) self.load(model_path, device) - + def predict(self, content): if not hasattr(self, 'pipeline'): raise ValueError("You have to load model first.") - + # 1. 利用POS得到分词和pos tagging结果 pos_out = self.pos_tagger.predict(content) # pos_out = ['这里/NN 是/VB 分词/NN 结果/NN'.split()] - + # 2. 组建dataset dataset = DataSet() dataset.add_field('wp', pos_out) dataset.apply(lambda x: ['<BOS>'] + [w.split('/')[0] for w in x['wp']], new_field_name='words') dataset.apply(lambda x: ['<BOS>'] + [w.split('/')[1] for w in x['wp']], new_field_name='pos') dataset.rename_field("words", "raw_words") - + # 3. 
使用pipeline self.pipeline(dataset) dataset.apply(lambda x: [str(arc) for arc in x['arc_pred']], new_field_name='arc_pred') @@ -383,7 +345,7 @@ zip(x['arc_pred'], x['label_pred_seq'])][1:], new_field_name='output') # output like: [['2/top', '0/root', '4/nn', '2/dep']] return dataset.field_arrays['output'].content - + def load_test_file(self, path): def get_one(sample): sample = list(map(list, zip(*sample))) @@ -395,7 +357,7 @@ return None # return word_seq, pos_seq, head_seq, head_tag_seq return sample[1], sample[3], list(map(int, sample[6])), sample[7] - + datalist = [] with open(path, 'r', encoding='utf-8') as f: sample = [] @@ -409,14 +371,14 @@ sample.append(line.split('\t')) if len(sample) > 0: datalist.append(sample) - + data = [get_one(sample) for sample in datalist] data_list = list(filter(lambda x: x is not None, data)) return data_list - + def test(self, filepath): data = self.load_test_file(filepath) - + def convert(data): BOS = '<BOS>' dataset = DataSet() @@ -431,7 +393,7 @@ arc_true=heads, tags=head_tags)) return dataset - + ds = convert(data) pp = self.pipeline for p in pp: @@ -452,23 +414,23 @@ head_cor += 1 if head_pred[i] == head_gold[i] else 0 uas = head_cor / total # print('uas:{:.2f}'.format(uas)) - + for p in pp: if p.field_name == 'gold_words': p.field_name = 'word_list' elif p.field_name == 'gold_pos': p.field_name = 'pos_list' - + return {"UAS": round(uas, 5)} class Analyzer: def __init__(self, device='cpu'): - + self.cws = CWS(device=device) self.pos = POS(device=device) self.parser = Parser(device=device) - + def predict(self, content, seg=False, pos=False, parser=False): if seg is False and pos is False and parser is False: seg = True @@ -482,9 +444,9 @@ if parser: parser_output = self.parser.predict(content) output_dict['parser'] = parser_output - + return output_dict - + def test(self, filepath): output_dict = {} if self.cws: @@ -496,5 +458,5 @@ if self.parser: parser_output = self.parser.test(filepath) output_dict['parser'] = parser_output - + return output_dict diff --git a/fastNLP/api/examples.py b/fastNLP/api/examples.py index a85e7c30..c1b2e155 100644 --- a/fastNLP/api/examples.py +++ b/fastNLP/api/examples.py @@ -3,7 +3,7 @@ api/example.py contains all API examples provided by fastNLP. It is used as a tutorial for API or a test script since it is difficult to test APIs in travis. """ -from fastNLP.api import CWS, POS, Parser +from . 
import CWS, POS, Parser text = ['编者按:7月12日,英国航空航天系统公司公布了该公司研制的第一款高科技隐形无人机雷电之神。', '这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。', diff --git a/fastNLP/api/pipeline.py b/fastNLP/api/pipeline.py index 0c567678..2cec16b3 100644 --- a/fastNLP/api/pipeline.py +++ b/fastNLP/api/pipeline.py @@ -1,4 +1,4 @@ -from fastNLP.api.processor import Processor +from ..api.processor import Processor class Pipeline: diff --git a/fastNLP/api/processor.py b/fastNLP/api/processor.py index 0bba96c0..3c60e621 100644 --- a/fastNLP/api/processor.py +++ b/fastNLP/api/processor.py @@ -3,10 +3,10 @@ from collections import defaultdict import torch -from fastNLP.core.batch import Batch -from fastNLP.core.dataset import DataSet -from fastNLP.core.sampler import SequentialSampler -from fastNLP.core.vocabulary import Vocabulary +from ..core.batch import Batch +from ..core.dataset import DataSet +from ..core.sampler import SequentialSampler +from ..core.vocabulary import Vocabulary class Processor(object): @@ -232,7 +232,7 @@ class SeqLenProcessor(Processor): return dataset -from fastNLP.core.utils import _build_args +from ..core.utils import _build_args class ModelProcessor(Processor): diff --git a/fastNLP/automl/enas_trainer.py b/fastNLP/automl/enas_trainer.py index a6316341..a9b1b8c3 100644 --- a/fastNLP/automl/enas_trainer.py +++ b/fastNLP/automl/enas_trainer.py @@ -11,15 +11,15 @@ import torch try: from tqdm.autonotebook import tqdm except: - from fastNLP.core.utils import _pseudo_tqdm as tqdm + from ..core.utils import _pseudo_tqdm as tqdm -from fastNLP.core.batch import Batch -from fastNLP.core.callback import CallbackException -from fastNLP.core.dataset import DataSet -from fastNLP.core.utils import _move_dict_value_to_device +from ..core.batch import Batch +from ..core.callback import CallbackException +from ..core.dataset import DataSet +from ..core.utils import _move_dict_value_to_device import fastNLP -import fastNLP.automl.enas_utils as utils -from fastNLP.core.utils import _build_args +from . import enas_utils as utils +from ..core.utils import _build_args from torch.optim import Adam diff --git a/fastNLP/core/__init__.py b/fastNLP/core/__init__.py index 087882aa..f11f0e12 100644 --- a/fastNLP/core/__init__.py +++ b/fastNLP/core/__init__.py @@ -1,6 +1,20 @@ +""" +core 模块里实现了 fastNLP 的核心框架,常用的组件都可以从 fastNLP 包中直接 import。当然你也同样可以从 core 模块的子模块中 import, +例如 Batch 组件有两种 import 的方式:: + + # 直接从 fastNLP 中 import + from fastNLP import Batch + + # 从 core 模块的子模块 batch 中 import + from fastNLP.core.batch import Batch + +对于常用的功能,你只需要在 :doc:`fastNLP` 中查看即可。如果想了解各个子模块的分工,您可以阅读以下文档: + + +""" from .batch import Batch from .dataset import DataSet -from .fieldarray import FieldArray +from .field import FieldArray, Padder, AutoPadder, EngChar2DPadder from .instance import Instance from .losses import LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward from .metrics import AccuracyMetric diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py index fbb122e4..4af6d651 100644 --- a/fastNLP/core/batch.py +++ b/fastNLP/core/batch.py @@ -1,24 +1,34 @@ +""" +batch 模块实现了 fastNLP 所需的 Batch 类。 + +""" +__all__ = ["Batch"] import numpy as np import torch import atexit -from fastNLP.core.sampler import RandomSampler, Sampler +from .sampler import RandomSampler, Sampler import torch.multiprocessing as mp _python_is_exit = False + + def _set_python_is_exit(): global _python_is_exit _python_is_exit = True + + atexit.register(_set_python_is_exit) + class Batch(object): """ - - .. 
_Batch: + 别名::class:`fastNLP.Batch` :class:`fastNLP.core.batch.Batch` Batch 用于从 `DataSet` 中按一定的顺序, 依次按 ``batch_size`` 的大小将数据取出. 组成 `x` 和 `y` + Example:: batch = Batch(data_set, batch_size=16, sampler=SequentialSampler()) @@ -26,16 +36,19 @@ class Batch(object): for batch_x, batch_y in batch: # do stuff ... - :param DataSet dataset: `DataSet` 对象, 数据集 + :param dataset: :class:`~fastNLP.DataSet` 对象, 数据集 :param int batch_size: 取出的batch大小 - :param Sampler sampler: 规定使用的 Sample 方式. 若为 ``None`` , 使用 RandomSampler. + :param sampler: 规定使用的 :class:`~fastNLP.Sampler` 方式. 若为 ``None`` , 使用 :class:`~fastNLP.RandomSampler`. + Default: ``None`` - :param bool as_numpy: 若为 ``True`` , 输出batch为 numpy.array. 否则为 torch.Tensor. + :param bool as_numpy: 若为 ``True`` , 输出batch为 numpy.array. 否则为 :class:`torch.Tensor`. + Default: ``False`` :param bool prefetch: 若为 ``True`` 使用多进程预先取出下一batch. + Default: ``False`` """ - + def __init__(self, dataset, batch_size, sampler=None, as_numpy=False, prefetch=False): self.dataset = dataset self.batch_size = batch_size @@ -49,17 +62,17 @@ class Batch(object): self.cur_batch_indices = None self.prefetch = prefetch self.lengths = 0 - - def _fetch_one(self): + + def fetch_one(self): if self.curidx >= len(self.idx_list): return None else: endidx = min(self.curidx + self.batch_size, len(self.idx_list)) batch_x, batch_y = {}, {} - + indices = self.idx_list[self.curidx:endidx] self.cur_batch_indices = indices - + for field_name, field in self.dataset.get_all_fields().items(): if field.is_target or field.is_input: batch = field.get(indices) @@ -69,10 +82,10 @@ class Batch(object): batch_y[field_name] = batch if field.is_input: batch_x[field_name] = batch - + self.curidx = endidx return batch_x, batch_y - + def __iter__(self): """ Iterate on dataset, fetch batch data. Fetch process don't block the iterate process @@ -80,25 +93,28 @@ class Batch(object): """ if self.prefetch: return _run_batch_iter(self) + def batch_iter(): - self._init_iter() + self.init_iter() while 1: - res = self._fetch_one() + res = self.fetch_one() if res is None: break yield res + return batch_iter() - - def _init_iter(self): + + def init_iter(self): self.idx_list = self.sampler(self.dataset) self.curidx = 0 self.lengths = self.dataset.get_length() - + def __len__(self): return self.num_batches - + def get_batch_indices(self): - """取得当前batch在DataSet中所在的index下标序列 + """ + 取得当前batch在DataSet中所在的index下标序列 :return list(int) indexes: 下标序列 """ @@ -118,16 +134,16 @@ def _to_tensor(batch, dtype): def _run_fetch(batch, q): global _python_is_exit - batch._init_iter() + batch.init_iter() # print('start fetch') while 1: - res = batch._fetch_one() + res = batch.fetch_one() # print('fetch one') while 1: try: q.put(res, timeout=3) break - except Exception as e: + except: if _python_is_exit: return if res is None: @@ -159,4 +175,3 @@ def _run_batch_iter(batch): fetch_p.terminate() fetch_p.join() # print('iter done') - diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 914e4d28..b3aaffaa 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -1,87 +1,89 @@ """ -Callback的说明文档 - - .. 
_Callback: - -Callback是fastNLP中被设计用于增强 Trainer_ 的类。如果Callback被传递给了 Trainer_ , 则 Trainer_ 会在对应的阶段调用Callback -的函数,具体调用时机可以通过 Trainer_ 查看。 - +callback模块实现了 fastNLP 中的Callback类,用于增强 :class:`~fastNLP.Trainer` 类, +关于Trainer的详细文档,请参见 :doc:`trainer 模块` """ - import os import torch -from fastNLP.io.model_io import ModelSaver, ModelLoader +from ..io.model_io import ModelSaver, ModelLoader + try: from tensorboardX import SummaryWriter except: pass -class Callback(object): - """这是Callback的基类,所有的callback必须继承自这个类。 +class Callback(object): """ + 别名::class:`fastNLP.Callback` :class:`fastNLP.core.callback.Callback` + + Callback是fastNLP中被设计用于增强 :class:`~fastNLP.Trainer` 的类。 + 如果Callback被传递给了 Trainer , 则 Trainer 会在对应的阶段调用Callback的函数, + 具体调用时机可以通过 :doc:`trainer 模块` 查看。 + 这是Callback的基类,所有的callback必须继承自这个类(参见 :doc:`callback 模块 ` ) + """ + def __init__(self): super(Callback, self).__init__() self._trainer = None # 在Trainer内部被重新赋值 - + @property def trainer(self): """ 该属性可以通过self.trainer获取到,一般情况下不需要使用这个属性。 """ return self._trainer - + @property def step(self): """当前运行到的step, 范围为[1, self.n_steps+1)""" return self._trainer.step - + @property def n_steps(self): """Trainer一共会运行多少步""" return self._trainer.n_steps - + @property def batch_size(self): """train和evaluate时的batch_size为多大""" return self._trainer.batch_size - + @property def epoch(self): """当前运行的epoch数,范围是[1, self.n_epochs+1)""" return self._trainer.epoch - + @property def n_epochs(self): """一共会运行多少个epoch""" return self._trainer.n_epochs - + @property def optimizer(self): """初始化Trainer时传递的Optimizer""" return self._trainer.optimizer - + @property def model(self): """正在被Trainer训练的模型""" return self._trainer.model - + @property def pbar(self): """如果在Callback中需要打印内容,请使用self.pbar.write(str)。否则可能出现命令行显示效果不太好的问题。""" return self._trainer.pbar - + @property def update_every(self): """Trainer中的模型多少次反向传播才进行一次梯度更新,在Trainer初始化时传入的。""" return self._trainer.update_every - + @property def batch_per_epoch(self): """每个epoch一共有多少个batch,只有在on_epoch_begin之后才能调用该属性。""" return self._trainer.batch_per_epoch - + def on_train_begin(self): """ 在Train过程开始之前调用。 @@ -89,7 +91,7 @@ class Callback(object): :return: """ pass - + def on_epoch_begin(self): """ 在每个epoch开始之前调用一次 @@ -97,7 +99,7 @@ class Callback(object): :return: """ pass - + def on_batch_begin(self, batch_x, batch_y, indices): """ 每次采集到一个batch的数据则调用一次。这里对batch_x或batch_y删除添加内容是可以影响到Trainer中内容的。所以在这一步 @@ -110,7 +112,7 @@ class Callback(object): :return: """ pass - + def on_loss_begin(self, batch_y, predict_y): """ 在计算loss前调用,即这里修改batch_y或predict_y的值是可以影响到loss计算的。 @@ -120,7 +122,7 @@ class Callback(object): :return: """ pass - + def on_backward_begin(self, loss): """ 在loss得到之后,但在反向传播之前。可能可以进行loss是否为NaN的检查。 @@ -129,7 +131,7 @@ class Callback(object): :return: """ pass - + def on_backward_end(self): """ 反向梯度传播已完成,但由于update_every的设置,可能并不是每一次调用都有梯度。到这一步,还没有更新参数。 @@ -137,7 +139,7 @@ class Callback(object): :return: """ pass - + def on_step_end(self): """ 到这里模型的参数已经按照梯度更新。但可能受update_every影响,并不是每次都更新了。 @@ -145,14 +147,14 @@ class Callback(object): :return: """ pass - + def on_batch_end(self): """ 这一步与on_step_end是紧接着的。只是为了对称性加上了这一步。 """ pass - + def on_valid_begin(self): """ 如果Trainer中设置了验证,则发生验证前会调用该函数 @@ -160,7 +162,7 @@ class Callback(object): :return: """ pass - + def on_valid_end(self, eval_result, metric_key, optimizer, is_better_eval): """ 每次执行验证集的evaluation后会调用。 @@ -173,19 +175,19 @@ class Callback(object): :return: """ pass - + def on_epoch_end(self): """ 每个epoch结束将会调用该方法 """ pass - + def on_train_end(self): """ 训练结束,调用该方法 """ pass - 
+ def on_exception(self, exception): """ 当训练过程出现异常,会触发该方法 @@ -196,32 +198,31 @@ class Callback(object): def _transfer(func): """装饰器,将对CallbackManager的调用转发到各个Callback子类. + :param func: :return: """ - + def wrapper(manager, *arg): returns = [] for callback in manager.callbacks: returns.append(getattr(callback, func.__name__)(*arg)) return returns - + return wrapper class CallbackManager(Callback): - """内部使用的Callback管理类 - """ - def __init__(self, env, callbacks=None): """ + 内部使用的Callback管理类 :param dict env: The key is the name of the Trainer attribute(str). The value is the attribute itself. :param List[Callback] callbacks: """ super(CallbackManager, self).__init__() # set attribute of trainer environment - + self.callbacks = [] if callbacks is not None: if isinstance(callbacks, list): @@ -232,78 +233,82 @@ class CallbackManager(Callback): raise TypeError(f"Expect sub-classes of Callback. Got {type(obj)}") else: raise TypeError(f"Expect callbacks in CallbackManager(callbacks) to be list. Got {type(callbacks)}.") - + for env_name, env_val in env.items(): for callback in self.callbacks: - setattr(callback, '_'+env_name, env_val) # Callback.trainer - + setattr(callback, '_' + env_name, env_val) # Callback.trainer + @_transfer def on_train_begin(self): pass - + @_transfer def on_epoch_begin(self): pass - + @_transfer def on_batch_begin(self, batch_x, batch_y, indices): pass - + @_transfer def on_loss_begin(self, batch_y, predict_y): pass - + @_transfer def on_backward_begin(self, loss): pass - + @_transfer def on_backward_end(self): pass - + @_transfer def on_step_end(self): pass - + @_transfer def on_batch_end(self): pass - + @_transfer def on_valid_begin(self): pass - + @_transfer def on_valid_end(self, eval_result, metric_key, optimizer, is_better_eval): pass - + @_transfer def on_epoch_end(self): pass - + @_transfer def on_train_end(self): pass - + @_transfer def on_exception(self, exception): pass class GradientClipCallback(Callback): + """每次backward前,将parameter的gradient clip到某个范围。 + + :param None,torch.Tensor,List[torch.Tensor] parameters: 一般通过model.parameters()获得。如果为None则默认对Trainer + 的model中所有参数进行clip + :param float clip_value: 将gradient 限制到[-clip_value, clip_value]。clip_value应该为正数 + :param str clip_type: 支持'norm', 'value'两种:: + + 1 'norm', 将gradient的norm rescale到[-clip_value, clip_value] + + 2 'value', 将gradient限制在[-clip_value, clip_value], 小于-clip_value的gradient被赋值为-clip_value; + 大于clip_value的gradient被赋值为clip_value. + """ + def __init__(self, parameters=None, clip_value=1, clip_type='norm'): - """每次backward前,将parameter的gradient clip到某个范围。 - - :param None,torch.Tensor,List[torch.Tensor] parameters: 一般通过model.parameters()获得。如果为None则默认对Trainer - 的model中所有参数进行clip - :param float clip_value: 将gradient 限制到[-clip_value, clip_value]。clip_value应该为正数 - :param str clip_type: 支持'norm', 'value'两种。 - 1. 'norm', 将gradient的norm rescale到[-clip_value, clip_value] - 2. 'value', 将gradient限制在[-clip_value, clip_value], 小于-clip_value的gradient被赋值为-clip_value; 大于 - clip_value的gradient被赋值为clip_value. 
- """ + super().__init__() - + from torch import nn if clip_type == 'norm': self.clip_fun = nn.utils.clip_grad_norm_ @@ -313,7 +318,7 @@ class GradientClipCallback(Callback): raise ValueError("Only supports `norm` or `value` right now.") self.parameters = parameters self.clip_value = clip_value - + def on_backward_end(self): if self.parameters is None: self.clip_fun(self.model.parameters(), self.clip_value) @@ -321,31 +326,17 @@ class GradientClipCallback(Callback): self.clip_fun(self.parameters, self.clip_value) -class CallbackException(BaseException): - def __init__(self, msg): - """ - 当需要通过callback跳出训练的时候可以通过抛出CallbackException并在on_exception中捕获这个值。 - :param str msg: Exception的信息。 - """ - super(CallbackException, self).__init__(msg) - - -class EarlyStopError(CallbackException): - def __init__(self, msg): - """用于EarlyStop时从Trainer训练循环中跳出。""" - super(EarlyStopError, self).__init__(msg) - - class EarlyStopCallback(Callback): - def __init__(self, patience): - """ + """ - :param int patience: 多少个epoch没有变好就停止训练 - """ + :param int patience: 多少个epoch没有变好就停止训练 + """ + + def __init__(self, patience): super(EarlyStopCallback, self).__init__() self.patience = patience self.wait = 0 - + def on_valid_end(self, eval_result, metric_key, optimizer, is_better_eval): if not is_better_eval: # current result is getting worse @@ -355,7 +346,7 @@ class EarlyStopCallback(Callback): self.wait += 1 else: self.wait = 0 - + def on_exception(self, exception): if isinstance(exception, EarlyStopError): print("Early Stopping triggered in epoch {}!".format(self.epoch)) @@ -364,39 +355,41 @@ class EarlyStopCallback(Callback): class LRScheduler(Callback): - def __init__(self, lr_scheduler): - """对PyTorch LR Scheduler的包装以使得其可以被Trainer所使用 - - Example:: + """对PyTorch LR Scheduler的包装以使得其可以被Trainer所使用 - from fastNLP import LRScheduler + Example:: + from fastNLP import LRScheduler - - :param torch.optim.lr_scheduler._LRScheduler lr_scheduler: PyTorch的lr_scheduler - """ + :param torch.optim.lr_scheduler._LRScheduler lr_scheduler: PyTorch的lr_scheduler + """ + + def __init__(self, lr_scheduler): + super(LRScheduler, self).__init__() import torch.optim if isinstance(lr_scheduler, torch.optim.lr_scheduler._LRScheduler): self.scheduler = lr_scheduler else: raise ValueError(f"Expect torch.optim.lr_scheduler for LRScheduler. Got {type(lr_scheduler)}.") - + def on_epoch_begin(self): self.scheduler.step() class ControlC(Callback): - def __init__(self, quit_all): - """ + """ - :param bool quit_all: 若为True,则检测到control+C 直接退出程序;否则只退出Trainer - """ + :param bool quit_all: 若为True,则检测到control+C 直接退出程序;否则只退出Trainer + """ + + def __init__(self, quit_all): + super(ControlC, self).__init__() if type(quit_all) != bool: raise ValueError("In KeyBoardInterrupt, quit_all arguemnt must be a bool.") self.quit_all = quit_all - + def on_exception(self, exception): if isinstance(exception, KeyboardInterrupt): if self.quit_all is True: @@ -412,7 +405,7 @@ class SmoothValue(object): def __init__(self, beta: float): self.beta, self.n, self.mov_avg = beta, 0, 0 self.smooth = None - + def add_value(self, val: float) -> None: "Add `val` to calculate updated smoothed value." 
self.n += 1 @@ -421,13 +414,15 @@ class SmoothValue(object): class LRFinder(Callback): - def __init__(self, start_lr=1e-6, end_lr=10): - """用第一个 epoch 找最佳的学习率,从第二个epoch开始应用它 + """ + 用第一个 epoch 找最佳的学习率,从第二个epoch开始应用它 - :param int n_batch: 一个epoch内的iteration数 - :param float start_lr: 学习率下界 - :param float end_lr: 学习率上界 - """ + :param float start_lr: 学习率下界 + :param float end_lr: 学习率上界 + """ + + def __init__(self, start_lr=1e-6, end_lr=10): + super(LRFinder, self).__init__() self.start_lr, self.end_lr = start_lr, end_lr self.num_it = self.batch_per_epoch @@ -438,19 +433,19 @@ self.smooth_value = SmoothValue(0.8) self.opt = None scale = (self.end_lr - self.start_lr) / self.num_it - + self.lr_gen = (self.start_lr + scale * (step + 1) for step in range(self.num_it)) self.find = None self.loader = ModelLoader() - + def on_epoch_begin(self): - if self.epoch == 1: # first epoch + if self.epoch == 1: # first epoch self.opt = self.trainer.optimizer # pytorch optimizer self.opt.param_groups[0]["lr"] = self.start_lr # save model ModelSaver("tmp").save_pytorch(self.trainer.model, param_only=True) self.find = True - + def on_backward_begin(self, loss): if self.find: if torch.isnan(loss) or self.stop is True: @@ -462,7 +457,7 @@ if self.best_loss == 0. or self.smooth_value.smooth < self.best_loss: self.best_loss = self.smooth_value.smooth self.best_lr = self.opt.param_groups[0]["lr"] - + def on_batch_end(self, *args): if self.find: lr = next(self.lr_gen, None) @@ -471,9 +466,9 @@ return self.opt.param_groups[0]["lr"] = lr # self.loader.load_pytorch(self.trainer.model, "tmp") - + def on_epoch_end(self): - if self.epoch == 1: # first epoch + if self.epoch == 1: # first epoch self.opt.param_groups[0]["lr"] = self.best_lr self.find = False # reset model @@ -483,12 +478,12 @@ class TensorboardCallback(Callback): """ - 接受以下一个或多个字符串作为参数: - - "model" - - "loss" - - "metric" + 接受以下一个或多个字符串作为参数: + - "model" + - "loss" + - "metric" """ - + def __init__(self, *options): super(TensorboardCallback, self).__init__() args = {"model", "loss", "metric"} @@ -498,7 +493,7 @@ self.options = options self._summary_writer = None self.graph_added = False - + def on_train_begin(self): save_dir = self.trainer.save_path if save_dir is None: @@ -506,7 +501,7 @@ else: path = os.path.join(save_dir, 'tensorboard_logs_{}'.format(self.trainer.start_time)) self._summary_writer = SummaryWriter(path) - + def on_batch_begin(self, batch_x, batch_y, indices): if "model" in self.options and self.graph_added is False: # tensorboardX 这里有大bug,暂时没法画模型图 @@ -516,11 +511,11 @@ # args = args[0] if len(args) == 1 else args # self._summary_writer.add_graph(self.trainer.model, torch.zeros(32, 2)) self.graph_added = True - + def on_backward_begin(self, loss): if "loss" in self.options: self._summary_writer.add_scalar("loss", loss.item(), global_step=self.trainer.step) - + if "model" in self.options: for name, param in self.trainer.model.named_parameters(): if param.requires_grad: @@ -528,21 +523,40 @@ # self._summary_writer.add_scalar(name + "_std", param.std(), global_step=self.trainer.step) self._summary_writer.add_scalar(name + "_grad_mean", param.grad.mean(), global_step=self.trainer.step) - + def on_valid_end(self, eval_result, metric_key, optimizer, is_better_eval): if "metric" in self.options: for name, 
metric in eval_result.items(): for metric_key, metric_val in metric.items(): self._summary_writer.add_scalar("valid_{}_{}".format(name, metric_key), metric_val, global_step=self.trainer.step) - + def on_train_end(self): self._summary_writer.close() del self._summary_writer - + def on_exception(self, exception): if hasattr(self, "_summary_writer"): self._summary_writer.close() del self._summary_writer +class CallbackException(BaseException): + """ + 当需要通过callback跳出训练的时候可以通过抛出CallbackException并在on_exception中捕获这个值。 + + :param str msg: Exception的信息。 + """ + + def __init__(self, msg): + super(CallbackException, self).__init__(msg) + + +class EarlyStopError(CallbackException): + """ + 用于EarlyStop时从Trainer训练循环中跳出。 + + """ + + def __init__(self, msg): + super(EarlyStopError, self).__init__(msg) diff --git a/fastNLP/core/const.py b/fastNLP/core/const.py new file mode 100644 index 00000000..dcb0a786 --- /dev/null +++ b/fastNLP/core/const.py @@ -0,0 +1,46 @@ +class Const: + """fastNLP中field命名常量。 + 具体列表:: + + INPUT 模型的序列输入 words(复数words1, words2) + CHAR_INPUT 模型character输入 chars(复数chars1, chars2) + INPUT_LEN 序列长度 seq_len(复数seq_len1,seq_len2) + OUTPUT 模型输出 pred(复数pred1, pred2) + TARGET 真实目标 target(复数target1,target2) + + """ + INPUT = 'words' + CHAR_INPUT = 'chars' + INPUT_LEN = 'seq_len' + OUTPUT = 'pred' + TARGET = 'target' + + @staticmethod + def INPUTS(i): + """得到第 i 个 ``INPUT`` 的命名""" + i = int(i) + 1 + return Const.INPUT + str(i) + + @staticmethod + def CHAR_INPUTS(i): + """得到第 i 个 ``CHAR_INPUT`` 的命名""" + i = int(i) + 1 + return Const.CHAR_INPUT + str(i) + + @staticmethod + def INPUT_LENS(i): + """得到第 i 个 ``INPUT_LEN`` 的命名""" + i = int(i) + 1 + return Const.INPUT_LEN + str(i) + + @staticmethod + def OUTPUTS(i): + """得到第 i 个 ``OUTPUT`` 的命名""" + i = int(i) + 1 + return Const.OUTPUT + str(i) + + @staticmethod + def TARGETS(i): + """得到第 i 个 ``TARGET`` 的命名""" + i = int(i) + 1 + return Const.TARGET + str(i) diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 00f8ce04..013f7602 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -1,7 +1,7 @@ """ -DataSet是fastNLP中用于承载数据的容器。可以将DataSet看做是一个表格,每一行是一个sample(在fastNLP中被称为Instance),每一列是一个feature(在fastNLP中称为field)。 - - .. _DataSet: +:class:`~fastNLP.core.dataset.DataSet` 是fastNLP中用于承载数据的容器。可以将DataSet看做是一个表格, +每一行是一个sample (在fastNLP中被称为 :mod:`~.instance` ), +每一列是一个feature (在fastNLP中称为 :mod:`.field` )。 .. csv-table:: Following is a demo layout of DataSet :header: "sentence", "words", "seq_len" @@ -11,285 +11,294 @@ DataSet是fastNLP中用于承载数据的容器。可以将DataSet看做是一 "Third instance .", "[Third, instance, .]", 3 "...", "[...]", "..." -在fastNLP内部每一行是一个 Instance_ 对象; 每一列是一个 FieldArray_ 对象。 - -1. DataSet的创建 +在fastNLP内部每一行是一个 :class:`~fastNLP.Instance` 对象; 每一列是一个 :class:`~fastNLP.FieldArray` 对象。 +1 DataSet的创建 创建DataSet主要有以下的3种方式 - 1. 传入dict +1.1 传入dict - Example:: + Example:: - from fastNLP import DataSet - data = {'sentence':["This is the first instance .", "Second instance .", "Third instance ."], - 'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'], ['Third', 'instance', '.'], - 'seq_len': [6, 3, 3]} - dataset = DataSet(data) - # 传入的dict的每个key的value应该为具有相同长度的list + from fastNLP import DataSet + data = {'sentence':["This is the first instance .", "Second instance .", "Third instance ."], + 'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'], ['Third', 'instance', '.'], + 'seq_len': [6, 3, 3]} + dataset = DataSet(data) + # 传入的dict的每个key的value应该为具有相同长度的list - 2. 
通过构建Instance +1.2 通过构建Instance - Example:: + Example:: - from fastNLP import DataSet - from fastNLP import Instance - dataset = DataSet() - instance = Instance(sentence="This is the first instance", - words=['this', 'is', 'the', 'first', 'instance', '.'], - seq_len=6) - dataset.append(instance) - # 可以继续append更多内容,但是append的instance应该和第一个instance拥有完全相同的field + from fastNLP import DataSet + from fastNLP import Instance + dataset = DataSet() + instance = Instance(sentence="This is the first instance", + words=['this', 'is', 'the', 'first', 'instance', '.'], + seq_len=6) + dataset.append(instance) + # 可以继续append更多内容,但是append的instance应该和第一个instance拥有完全相同的field + +1.3 通过list(Instance) + + Example:: - 3. 通过list(Instance) + from fastNLP import DataSet + from fastNLP import Instance + instances = [] + instances.append(Instance(sentence="This is the first instance", + words=['this', 'is', 'the', 'first', 'instance', '.'], + seq_len=6)) + instances.append(Instance(sentence="Second instance .", + words=['Second', 'instance', '.'], + seq_len=3)) + dataset = DataSet(instances) - Example:: +2 DataSet与预处理 + 常见的预处理有如下几种 + +2.1 从某个文本文件读取内容 # TODO 引用DataLoader + + Example:: from fastNLP import DataSet from fastNLP import Instance - instances = [] - instances.append(Instance(sentence="This is the first instance", - words=['this', 'is', 'the', 'first', 'instance', '.'], - seq_len=6)) - instances.append(Instance(sentence="Second instance .", - words=['Second', 'instance', '.'], - seq_len=3)) - dataset = DataSet(instances) - -2. DataSet与预处理 - 1. 从某个文本文件读取内容 # TODO 引用DataLoader + dataset = DataSet() + filepath='some/text/file' + # 假设文件中每行内容如下(sentence label): + # This is a fantastic day positive + # The bad weather negative + # ..... + with open(filepath, 'r') as f: + for line in f: + sent, label = line.strip().split('\t') + dataset.append(Instance(sentence=sent, label=label)) - Example:: +2.2 index, 返回结果为对DataSet对象的浅拷贝 - from fastNLP import DataSet - from fastNLP import Instance - dataset = DataSet() - filepath='some/text/file' - # 假设文件中每行内容如下(sentence label): - # This is a fantastic day positive - # The bad weather negative - # ..... - with open(filepath, 'r') as f: - for line in f: - sent, label = line.strip().split('\t') - dataset.append(Instance(sentence=sent, label=label)) - - 2. index, 返回结果为对DataSet对象的浅拷贝 + Example:: - Example:: + import numpy as np + from fastNLP import DataSet + dataset = DataSet({'a': np.arange(10), 'b': [[_] for _ in range(10)]}) + d[0] # 使用一个下标获取一个instance + >>{'a': 0 type=int,'b': [2] type=list} # 得到一个instance + d[1:3] # 使用slice获取一个新的DataSet + >>DataSet({'a': 1 type=int, 'b': [2] type=list}, {'a': 2 type=int, 'b': [2] type=list}) - import numpy as np - from fastNLP import DataSet - dataset = DataSet({'a': np.arange(10), 'b': [[_] for _ in range(10)]}) - d[0] # 使用一个下标获取一个instance - >>{'a': 0 type=int,'b': [2] type=list} # 得到一个instance - d[1:3] # 使用slice获取一个新的DataSet - >>DataSet({'a': 1 type=int, 'b': [2] type=list}, {'a': 2 type=int, 'b': [2] type=list}) +2.3 对DataSet中的内容处理 - 3. 
对DataSet中的内容处理 + Example:: - Example:: + from fastNLP import DataSet + data = {'sentence':["This is the first instance .", "Second instance .", "Third instance ."]} + dataset = DataSet(data) + # 将句子分成单词形式, 详见DataSet.apply()方法 + dataset.apply(lambda ins: ins['sentence'].split(), new_field_name='words') + # 或使用DataSet.apply_field() + dataset.apply(lambda sent:sent.split(), field_name='sentence', new_field_name='words') + # 除了匿名函数,也可以定义函数传递进去 + def get_words(instance): + sentence = instance['sentence'] + words = sentence.split() + return words + dataset.apply(get_words, new_field_name='words') - from fastNLP import DataSet - data = {'sentence':["This is the first instance .", "Second instance .", "Third instance ."]} - dataset = DataSet(data) - # 将句子分成单词形式, 详见DataSet.apply()方法 - dataset.apply(lambda ins: ins['sentence'].split(), new_field_name='words') - # 或使用DataSet.apply_field() - dataset.apply(lambda sent:sent.split(), field_name='sentence', new_field_name='words') - # 除了匿名函数,也可以定义函数传递进去 - def get_words(instance): - sentence = instance['sentence'] - words = sentence.split() - return words - dataset.apply(get_words, new_field_name='words') - - 4. 删除DataSet的内容 +2.4 删除DataSet的内容 - Example:: + Example:: - from fastNLP import DataSet - dataset = DataSet({'a': list(range(-5, 5))}) - # 返回满足条件的instance,并放入DataSet中 - dropped_dataset = dataset.drop(lambda ins:ins['a']<0, inplace=False) - # 在dataset中删除满足条件的instance - dataset.drop(lambda ins:ins['a']<0) # dataset的instance数量减少 - # 删除第3个instance - dataset.delete_instance(2) - # 删除名为'a'的field - dataset.delete_field('a') + from fastNLP import DataSet + dataset = DataSet({'a': list(range(-5, 5))}) + # 返回满足条件的instance,并放入DataSet中 + dropped_dataset = dataset.drop(lambda ins:ins['a']<0, inplace=False) + # 在dataset中删除满足条件的instance + dataset.drop(lambda ins:ins['a']<0) # dataset的instance数量减少 + # 删除第3个instance + dataset.delete_instance(2) + # 删除名为'a'的field + dataset.delete_field('a') - 5. 遍历DataSet的内容 +2.5 遍历DataSet的内容 - Example:: + Example:: - for instance in dataset: - # do something + for instance in dataset: + # do something - 6. 一些其它操作 +2.6 一些其它操作 - Example:: + Example:: - # 检查是否存在名为'a'的field - dataset.has_field('a') # 或 ('a' in dataset) - # 将名为'a'的field改名为'b' - dataset.rename_field('a', 'b') - # DataSet的长度 - len(dataset) + # 检查是否存在名为'a'的field + dataset.has_field('a') # 或 ('a' in dataset) + # 将名为'a'的field改名为'b' + dataset.rename_field('a', 'b') + # DataSet的长度 + len(dataset) -3. DataSet与自然语言处理(NLP) +3 DataSet与自然语言处理(NLP) 在目前深度学习的模型中,大都依赖于随机梯度下降法(SGD)进行模型的优化。随机梯度下降需要将数据切分成一个一个的Batch, 一个Batch进行一次前向计算(forward)与梯度后向传播(backward)。在自然语言处理的场景下,往往还需要对数据进行pad。这是 由于句子的长度一般是不同的,但是一次Batch中的每个field都必须是一个tensor,所以需要将所有句子都补齐到相同的长度。 - 1. 
DataSet与Batch +3.1 DataSet与Batch + + 我们先看fastNLP中如何将数据分成一个一个的Batch的例子, 这里我们使用随机生成的数据来模拟一个二分类文本分类任务, + words和characters是输入,labels是文本类别 - 我们先看fastNLP中如何将数据分成一个一个的Batch的例子, 这里我们使用随机生成的数据来模拟一个二分类文本分类任务, - words和characters是输入,labels是文本类别 + Example:: + + from fastNLP import DataSet + from fastNLP import Batch + from fastNLP import SequentialSampler + from fastNLP import EngChar2DPadder + + num_instances = 100 + # 假设每句话最少2个词,最多5个词; 词表的大小是100个; 一共26个字母,每个单词最短1个字母,最长5个字母 + lengths = [random.randint(2, 5) for _ in range(num_instances)] + data = {'words': [[random.randint(1, 100) for _ in range(lengths[idx]) ] for idx in range(num_instances)], + 'chars': [ + [[random.randint(1, 27) for _ in range(random.randint(1, 5))] + for _ in range(lengths[idx])] + for idx in range(num_instances)], + 'label': [random.randint(0, 1) for _ in range(num_instances)]} + + d = DataSet(data) + d.set_padder('chars', EngChar2DPadder()) # 因为英文character的pad方式与word的pad方式不一样 + + d.set_target('label') + d.set_input('words', 'chars') + + for batch_x, batch_y in Batch(d, sampler=SequentialSampler(), batch_size=2): + print("batch_x:", batch_x) + print("batch_y:", batch_y) + break + # 输出为 + # {'words': tensor([[49, 27, 20, 36, 63], + # [53, 82, 23, 11, 0]]), 'chars': tensor([[[13, 3, 14, 25, 1], + # [ 8, 20, 12, 0, 0], + # [27, 8, 0, 0, 0], + # [ 1, 15, 26, 0, 0], + # [11, 24, 17, 0, 0]], + # + # [[ 6, 14, 11, 27, 22], + # [18, 6, 4, 19, 0], + # [19, 22, 9, 0, 0], + # [10, 25, 0, 0, 0], + # [ 0, 0, 0, 0, 0]]])} + # {'label': tensor([0, 0])} + + 其中 :class:`~fastNLP.Batch` 是用于从DataSet中按照batch_size为大小取出batch的迭代器, + :class:`~fastNLP.SequentialSampler` 用于指示 Batch 以怎样的 + 顺序从DataSet中取出instance以组成一个batch, + 更详细的说明请参照 :class:`~fastNLP.Batch` 和 :class:`~fastNLP.SequentialSampler` 文档。 + + 通过DataSet.set_input('words', 'chars'), fastNLP将认为'words'和'chars'这两个field都是input,并将它们都放入迭代器 + 生成的第一个dict中; DataSet.set_target('labels'), fastNLP将认为'labels'这个field是target,并将其放入到迭代器的第 + 二个dict中。如上例中所打印结果。分为input和target的原因是由于它们在被 :class:`~fastNLP.Trainer` 所使用时会有所差异, + 详见 :class:`~fastNLP.Trainer` + + 当把某个field设置为'target'或者'input'的时候(两者不是互斥的,可以同时设为input和target),fastNLP不仅仅只是将其放 + 置到不同的dict中,而还会对被设置为input或target的field进行类型检查。类型检查的目的是为了看能否把该field转为 + pytorch的torch.LongTensor或torch.FloatTensor类型(也可以在Batch中设置输出numpy类型,参考 :class:`~fastNLP.Batch` ),如上例所示, + fastNLP已将words,chars和label转为了Tensor类型。如果field在每个instance都拥有相同的维度(不能超过两维),且最内层 + 的元素都为相同的type(int, float, np.int*, np.float*),则fastNLP默认将对该field进行pad。也支持全为str的field作为 + target和input,这种情况下,fastNLP默认不进行pad。另外,当某个field已经被设置为了target或者input后,之后append的 + instance对应的field必须要和前面已有的内容一致,否则会报错。 + + 可以查看field的dtype + + Example:: + + from fastNLP import DataSet + + d = DataSet({'a': [0, 1, 3], 'b':[[1.0, 2.0], [0.1, 0.2], [3]]}) + d.set_input('a', 'b') + d.a.dtype + >> numpy.int64 + d.b.dtype + >> numpy.float64 + # 默认情况下'a'这个field将被转换为torch.LongTensor,但如果需要其为torch.FloatTensor可以手动修改dtype + d.a.dtype = float # 请确保该field的确可以全部转换为float。 + + 如果某个field中出现了多种类型混合(比如一部分为str,一部分为int)的情况,fastNLP无法判断该field的类型,会报如下的 + 错误: Example:: from fastNLP import DataSet - from fastNLP import Batch - from fastNLP import SequentialSampler - from fastNLP import EngChar2DPadder + d = DataSet({'data': [1, 'a']}) + d.set_input('data') + >> RuntimeError: Mixed data types in Field data: [, ] - num_instances = 100 - # 假设每句话最少2个词,最多5个词; 词表的大小是100个; 一共26个字母,每个单词最短1个字母,最长5个字母 - lengths = [random.randint(2, 5) for _ in range(num_instances)] - data = {'words': [[random.randint(1, 100) for _ in range(lengths[idx]) ] for idx in range(num_instances)], - 'chars': [ - 
-            'chars': [
-                    [[random.randint(1, 27) for _ in range(random.randint(1, 5))]
-                    for _ in range(lengths[idx])]
-                for idx in range(num_instances)],
-            'label': [random.randint(0, 1) for _ in range(num_instances)]}
-
-        d = DataSet(data)
-        d.set_padder('chars', EngChar2DPadder())  # 因为英文character的pad方式与word的pad方式不一样
-
-        d.set_target('label')
-        d.set_input('words', 'chars')
-
-        for batch_x, batch_y in Batch(d, sampler=SequentialSampler(), batch_size=2):
-            print("batch_x:", batch_x)
-            print("batch_y:", batch_y)
-            break
-        # 输出为
-        # {'words': tensor([[49, 27, 20, 36, 63],
-        #                   [53, 82, 23, 11,  0]]), 'chars': tensor([[[13,  3, 14, 25,  1],
-        #          [ 8, 20, 12,  0,  0],
-        #          [27,  8,  0,  0,  0],
-        #          [ 1, 15, 26,  0,  0],
-        #          [11, 24, 17,  0,  0]],
-        #
-        #         [[ 6, 14, 11, 27, 22],
-        #          [18,  6,  4, 19,  0],
-        #          [19, 22,  9,  0,  0],
-        #          [10, 25,  0,  0,  0],
-        #          [ 0,  0,  0,  0,  0]]])}
-        # {'label': tensor([0, 0])}
-
-    其中 Batch_ 是用于从DataSet中按照batch_size为大小取出batch的迭代器, SequentialSampler_ 用于指示 Batch_ 以怎样的
-    顺序从DataSet中取出instance以组成一个batch,更详细的说明请参照 Batch_ 和 SequentialSampler_ 文档。
-
-    通过DataSet.set_input('words', 'chars'), fastNLP将认为'words'和'chars'这两个field都是input,并将它们都放入迭代器
-    生成的第一个dict中; DataSet.set_target('labels'), fastNLP将认为'labels'这个field是target,并将其放入到迭代器的第
-    二个dict中。如上例中所打印结果。分为input和target的原因是由于它们在被 Trainer_ 所使用时会有所差异,详见 Trainer_
-    。
-
-    当把某个field设置为'target'或者'input'的时候(两者不是互斥的,可以同时设为input和target),fastNLP不仅仅只是将其放
-    置到不同的dict中,而还会对被设置为input或target的field进行类型检查。类型检查的目的是为了看能否把该field转为
-    pytorch的torch.LongTensor或torch.FloatTensor类型(也可以在Batch中设置输出numpy类型,参考 Batch_ ),如上例所示,
-    fastNLP已将words,chars和label转为了Tensor类型。如果field在每个instance都拥有相同的维度(不能超过两维),且最内层
-    的元素都为相同的type(int, float, np.int*, np.float*),则fastNLP默认将对该field进行pad。也支持全为str的field作为
-    target和input,这种情况下,fastNLP默认不进行pad。另外,当某个field已经被设置为了target或者input后,之后append的
-    instance对应的field必须要和前面已有的内容一致,否则会报错。
-
-    可以查看field的dtype
-
-    Example::
-        from fastNLP import DataSet
-
-        d = DataSet({'a': [0, 1, 3], 'b':[[1.0, 2.0], [0.1, 0.2], [3]]})
-        d.set_input('a', 'b')
-        d.a.dtype
-        >>>numpy.int64
-        d.b.dtype
-        >>>numpy.float64
-        # 默认情况下'a'这个field将被转换为torch.LongTensor,但如果需要其为torch.FloatTensor可以手动修改dtype
-        d.a.dtype = float  # 请确保该field的确可以全部转换为float。
-
-    如果某个field中出现了多种类型混合(比如一部分为str,一部分为int)的情况,fastNLP无法判断该field的类型,会报如下的
-    错误:
-
-    Example::
-
-        from fastNLP import DataSet
-        d = DataSet({'data': [1, 'a']})
-        d.set_input('data')
-        >>> RuntimeError: Mixed data types in Field data: [<class 'int'>, <class 'str'>]
-
-    可以通过设置以忽略对该field进行类型检查
-
-    Example::
-
-        from fastNLP import DataSet
-        d = DataSet({'data': [1, 'a']})
-        d.set_ignore_type('data')
-        d.set_input('data')
-
-    当某个field被设置为忽略type之后,fastNLP将不对其进行pad。
-
-    2.
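+    (补充说明:下面的例子演示上文提到的"在Batch中设置输出numpy类型"。这只是一个示意性的用法,
+    这里假设使用的是Batch的as_numpy参数,具体参数名与默认值以 :class:`~fastNLP.Batch` 的文档为准。)
+
+    Example::
+
+        from fastNLP import DataSet
+        from fastNLP import Batch
+        from fastNLP import SequentialSampler
+
+        d = DataSet({'x': [[1, 2], [3, 4, 5]], 'y': [0, 1]})
+        d.set_input('x')
+        d.set_target('y')
+        # 假设as_numpy=True时,batch_x与batch_y中的值为numpy.ndarray而非torch.Tensor
+        for batch_x, batch_y in Batch(d, sampler=SequentialSampler(), batch_size=2, as_numpy=True):
+            print(type(batch_x['x']))  # 期望输出 <class 'numpy.ndarray'>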
DataSet与pad - - 在fastNLP里,pad是与一个field绑定的。即不同的field可以使用不同的pad方式,比如在英文任务中word需要的pad和 - character的pad方式往往是不同的。fastNLP是通过一个叫做 Padder_ 的子类来完成的。默认情况下,所有field使用 AutoPadder_ - 。可以通过使用以下方式设置Padder(如果将padder设置为None,则该field不会进行pad操作)。大多数情况下直接使用 AutoPadder_ - 就可以了。如果 AutoPadder_ 或 EngChar2DPadder_ 无法满足需求,也可以自己写一个 Padder_ 。 + 可以通过设置以忽略对该field进行类型检查 Example:: from fastNLP import DataSet - from fastNLP import EngChar2DPadder - import random - dataset = DataSet() - max_chars, max_words, sent_num = 5, 10, 20 - contents = [[ - [random.randint(1, 27) for _ in range(random.randint(1, max_chars))] - for _ in range(random.randint(1, max_words)) - ] for _ in range(sent_num)] - # 初始化时传入 - dataset.add_field('chars', contents, padder=EngChar2DPadder()) - # 直接设置 - dataset.set_padder('chars', EngChar2DPadder()) - # 也可以设置pad的value - dataset.set_pad_val('chars', -1) + d = DataSet({'data': [1, 'a']}) + d.set_ignore_type('data') + d.set_input('data') -""" + 当某个field被设置为忽略type之后,fastNLP将不对其进行pad。 + +3.2 DataSet与pad + 在fastNLP里,pad是与一个field绑定的。即不同的field可以使用不同的pad方式,比如在英文任务中word需要的pad和 + character的pad方式往往是不同的。fastNLP是通过一个叫做 :class:`~fastNLP.Padder` 的子类来完成的。 + 默认情况下,所有field使用 :class:`~fastNLP.AutoPadder` + 。可以通过使用以下方式设置Padder(如果将padder设置为None,则该field不会进行pad操作)。 + 大多数情况下直接使用 :class:`~fastNLP.AutoPadder` 就可以了。 + 如果 :class:`~fastNLP.AutoPadder` 或 :class:`~fastNLP.EngChar2DPadder` 无法满足需求, + 也可以自己写一个 :class:`~fastNLP.Padder` 。 + Example:: + + from fastNLP import DataSet + from fastNLP import EngChar2DPadder + import random + dataset = DataSet() + max_chars, max_words, sent_num = 5, 10, 20 + contents = [[ + [random.randint(1, 27) for _ in range(random.randint(1, max_chars))] + for _ in range(random.randint(1, max_words)) + ] for _ in range(sent_num)] + # 初始化时传入 + dataset.add_field('chars', contents, padder=EngChar2DPadder()) + # 直接设置 + dataset.set_padder('chars', EngChar2DPadder()) + # 也可以设置pad的value + dataset.set_pad_val('chars', -1) + + +""" +__all__ = ["DataSet"] import _pickle as pickle import numpy as np import warnings -from fastNLP.core.fieldarray import AutoPadder -from fastNLP.core.fieldarray import FieldArray -from fastNLP.core.instance import Instance -from fastNLP.core.utils import _get_func_signature +from .field import AutoPadder +from .field import FieldArray +from .instance import Instance +from .utils import _get_func_signature -class DataSet(object): - """fastNLP的数据容器 +class DataSet(object): """ + 别名::class:`fastNLP.DataSet` :class:`fastNLP.core.dataset.DataSet` - def __init__(self, data=None): - """ + fastNLP的数据容器,详细的使用方法见文档 :doc:`fastNLP.core.dataset` + + :param data: 如果为dict类型,则每个key的value应该为等长的list; 如果为list, + 每个元素应该为具有相同field的 :class:`~fastNLP.Instance` 。 - :param dict,list(Instance) data: 如果为dict类型,则每个key的value应该为等长的list; 如果为list,每个元素应该为具 - :有相同field的 instance_ 。 - """ + """ + + def __init__(self, data=None): self.field_arrays = {} if data is not None: if isinstance(data, dict): @@ -303,41 +312,41 @@ class DataSet(object): for ins in data: assert isinstance(ins, Instance), "Must be Instance type, not {}.".format(type(ins)) self.append(ins) - + else: raise ValueError("data only be dict or list type.") - + def __contains__(self, item): return item in self.field_arrays - + def __iter__(self): def iter_func(): for idx in range(len(self)): yield self[idx] - + return iter_func() - + def _inner_iter(self): class Iter_ptr: def __init__(self, dataset, idx): self.dataset = dataset self.idx = idx - + def __getitem__(self, item): assert item in self.dataset.field_arrays, "no such field:{} in Instance {}".format(item, 
self.dataset[ self.idx]) assert self.idx < len(self.dataset.field_arrays[item]), "index:{} out of range".format(self.idx) return self.dataset.field_arrays[item][self.idx] - + def __repr__(self): return self.dataset[self.idx].__repr__() - + def inner_iter_func(): for idx in range(len(self)): yield Iter_ptr(self, idx) - + return inner_iter_func() - + def __getitem__(self, idx): """给定int的index,返回一个Instance; 给定slice,返回包含这个slice内容的新的DataSet。 @@ -349,7 +358,7 @@ class DataSet(object): return Instance(**{name: self.field_arrays[name][idx] for name in self.field_arrays}) elif isinstance(idx, slice): if idx.start is not None and (idx.start >= len(self) or idx.start <= -len(self)): - raise RuntimeError(f"Start index {idx.start} out of range 0-{len(self)-1}") + raise RuntimeError(f"Start index {idx.start} out of range 0-{len(self) - 1}") data_set = DataSet() for field in self.field_arrays.values(): data_set.add_field(field_name=field.name, fields=field.content[idx], padder=field.padder, @@ -361,20 +370,20 @@ class DataSet(object): return self.field_arrays[idx] else: raise KeyError("Unrecognized type {} for idx in __getitem__ method".format(type(idx))) - + def __getattr__(self, item): # Not tested. Don't use !! if item == "field_arrays": raise AttributeError if isinstance(item, str) and item in self.field_arrays: return self.field_arrays[item] - + def __setstate__(self, state): self.__dict__ = state - + def __getstate__(self): return self.__dict__ - + def __len__(self): """Fetch the length of the dataset. @@ -384,20 +393,21 @@ class DataSet(object): return 0 field = iter(self.field_arrays.values()).__next__() return len(field) - + def __inner_repr__(self): if len(self) < 20: return ",\n".join([ins.__repr__() for ins in self]) else: return self[:5].__inner_repr__() + "\n...\n" + self[-5:].__inner_repr__() - + def __repr__(self): return "DataSet(" + self.__inner_repr__() + ")" - + def append(self, instance): - """将一个instance对象append到DataSet后面。 + """ + 将一个instance对象append到DataSet后面。 - :param Instance instance: 若DataSet不为空,则instance应该拥有和DataSet完全一样的field。 + :param instance: :class:`~fastNLP.Instance` 类型。若DataSet不为空,则instance应该拥有和DataSet完全一样的field。 """ if len(self.field_arrays) == 0: @@ -413,12 +423,13 @@ class DataSet(object): for name, field in instance.fields.items(): assert name in self.field_arrays self.field_arrays[name].append(field) - + def add_fieldarray(self, field_name, fieldarray): - """将fieldarray添加到DataSet中. + """ + 将fieldarray添加到DataSet中. :param str field_name: 新加入的field的名称 - :param FieldArray fieldarray: 需要加入DataSet的field的内容 + :param fieldarray: :class:`~fastNLP.FieldArray` 类型。需要加入DataSet的field的内容 :return: """ if not isinstance(fieldarray, FieldArray): @@ -427,89 +438,99 @@ class DataSet(object): raise RuntimeError(f"The field to add must have the same size as dataset. 
" f"Dataset size {len(self)} != field size {len(fieldarray)}") self.field_arrays[field_name] = fieldarray - - + def add_field(self, field_name, fields, padder=AutoPadder(), is_input=False, is_target=False, ignore_type=False): - """新增一个field + """ + 新增一个field :param str field_name: 新增的field的名称 :param list fields: 需要新增的field的内容 - :param None,Padder padder: 如果为None,则不进行pad,默认使用 AutoPadder_ 自动判断是否需要做pad。 + :param None, padder: :class:`~fastNLP.Padder` 类型, + 如果为None,则不进行pad,默认使用 :class:`~fastNLP.AutoPadder` 自动判断是否需要做pad。 :param bool is_input: 新加入的field是否是input :param bool is_target: 新加入的field是否是target :param bool ignore_type: 是否忽略对新加入的field的类型检查 - :return: DataSet """ - + if len(self.field_arrays) != 0: if len(self) != len(fields): raise RuntimeError(f"The field to add must have the same size as dataset. " f"Dataset size {len(self)} != field size {len(fields)}") self.field_arrays[field_name] = FieldArray(field_name, fields, is_target=is_target, is_input=is_input, padder=padder, ignore_type=ignore_type) - return self - + def delete_instance(self, index): - """删除第index个instance + """ + 删除第index个instance :param int index: 需要删除的instance的index,从0开始 - :return: DataSet """ assert isinstance(index, int), "Only integer supported." - if len(self)<=index: + if len(self) <= index: raise IndexError("{} is too large for as DataSet with {} instances.".format(index, len(self))) - if len(self)==1: + if len(self) == 1: self.field_arrays.clear() else: for field in self.field_arrays.values(): field.pop(index) - return self - + def delete_field(self, field_name): - """删除名为field_name的field + """ + 删除名为field_name的field :param str field_name: 需要删除的field的名称. - :return: DataSet """ self.field_arrays.pop(field_name) - return self - + def has_field(self, field_name): - """判断DataSet中是否有field_name这个field + """ + 判断DataSet中是否有名为field_name这个field :param str field_name: field的名称 - :return: bool + :return bool: 表示是否有名为field_name这个field """ if isinstance(field_name, str): return field_name in self.field_arrays return False - + def get_field(self, field_name): - """获取field_name这个field + """ + 获取field_name这个field :param str field_name: field的名称 - :return: FieldArray + :return: :class:`~fastNLP.FieldArray` """ if field_name not in self.field_arrays: raise KeyError("Field name {} not found in DataSet".format(field_name)) return self.field_arrays[field_name] - + def get_all_fields(self): - """返回一个dict,key为field_name, value为对应的FieldArray + """ + 返回一个dict,key为field_name, value为对应的 :class:`~fastNLP.FieldArray` - :return: dict: + :return: dict: 返回如上所述的字典 """ return self.field_arrays + + def get_field_names(self) -> list: + """ + 返回一个list,包含所有 field 的名字 + :return: list: 返回如上所述的列表 + """ + return sorted(self.field_arrays.keys()) + def get_length(self): - """获取DataSet的元素数量 + """ + 获取DataSet的元素数量 - :return: int length: DataSet中Instance的个数。 + :return: int: DataSet中Instance的个数。 """ return len(self) - + def rename_field(self, old_name, new_name): - """将某个field重新命名. + """ + 将某个field重新命名. 
:param str old_name: 原来的field名称。 :param str new_name: 修改为new_name。 @@ -519,9 +540,10 @@ class DataSet(object): self.field_arrays[new_name].name = new_name else: raise KeyError("DataSet has no field named {}.".format(old_name)) - + def set_target(self, *field_names, flag=True): - """将field_names的field设置为target + """ + 将field_names的field设置为target Example:: @@ -537,9 +559,10 @@ class DataSet(object): self.field_arrays[name].is_target = flag else: raise KeyError("{} is not a valid field name.".format(name)) - + def set_input(self, *field_names, flag=True): - """将field_names的field设置为input + """ + 将field_names的field设置为input Example:: @@ -554,10 +577,11 @@ class DataSet(object): self.field_arrays[name].is_input = flag else: raise KeyError("{} is not a valid field name.".format(name)) - + def set_ignore_type(self, *field_names, flag=True): - """将field设置为忽略类型状态。当某个field被设置了ignore_type, 则在被设置为target或者input时将不进行类型检查,默 - 认情况下也不进行pad。 + """ + 将field设置为忽略类型状态。当某个field被设置了ignore_type, 则在被设置为target或者input时将不进行类型检查, + 默认情况下也不进行pad。 :param str field_names: field的名称 :param bool flag: 将field_name的ignore_type状态设置为flag @@ -569,9 +593,10 @@ class DataSet(object): self.field_arrays[name].ignore_type = flag else: raise KeyError("{} is not a valid field name.".format(name)) - + def set_padder(self, field_name, padder): - """为field_name设置padder + """ + 为field_name设置padder Example:: @@ -581,55 +606,57 @@ class DataSet(object): :param str field_name: 设置field的padding方式为padder :param None, Padder padder: 设置为None即删除padder, 即对该field不进行pad操作。 - :return: """ if field_name not in self.field_arrays: raise KeyError("There is no field named {}.".format(field_name)) self.field_arrays[field_name].set_padder(padder) - + def set_pad_val(self, field_name, pad_val): - """为某个field设置对应的pad_val. + """ + 为某个field设置对应的pad_val. :param str field_name: 修改该field的pad_val :param int pad_val: 该field的padder会以pad_val作为padding index - :return: """ if field_name not in self.field_arrays: raise KeyError("There is no field named {}.".format(field_name)) self.field_arrays[field_name].set_pad_val(pad_val) - + def get_input_name(self): - """返回所有is_input被设置为True的field名称 + """ + 返回所有is_input被设置为True的field名称 - :return: list, 里面的元素为被设置为input的field名称 + :return list: 里面的元素为被设置为input的field名称 """ return [name for name, field in self.field_arrays.items() if field.is_input] - + def get_target_name(self): - """返回所有is_target被设置为True的field名称 + """ + 返回所有is_target被设置为True的field名称 - :return list, 里面的元素为被设置为target的field名称 + :return list: 里面的元素为被设置为target的field名称 """ return [name for name, field in self.field_arrays.items() if field.is_target] - + def apply_field(self, func, field_name, new_field_name=None, **kwargs): - """将DataSet中的每个instance中的`field_name`这个field传给func,并获取它的返回值。 + """ + 将DataSet中的每个instance中的名为 `field_name` 的field传给func,并获取它的返回值。 - :param callable func: input是instance的`field_name`这个field的内容。 + :param callable func: input是instance中名为 `field_name` 的field的内容。 :param str field_name: 传入func的是哪个field。 - :param None,str new_field_name: 将func返回的内容放入到new_field_name这个field中,如果名称与已有的field相同,则覆 + :param None,str new_field_name: 将func返回的内容放入到 `new_field_name` 这个field中,如果名称与已有的field相同,则覆 盖之前的field。如果为None则不创建新的field。 :param optional kwargs: 支持输入is_input,is_target,ignore_type - 1. is_input: bool, 如果为True则将`new_field_name`的field设置为input + 1. is_input: bool, 如果为True则将名为 `new_field_name` 的field设置为input - 2. is_target: bool, 如果为True则将`new_field_name`的field设置为target + 2. is_target: bool, 如果为True则将名为 `new_field_name` 的field设置为target - 3. 
ignore_type: bool, 如果为True则将`new_field_name`的field的ignore_type设置为true, 忽略其类型 + 3. ignore_type: bool, 如果为True则将名为 `new_field_name` 的field的ignore_type设置为true, 忽略其类型 :return: list(Any), 里面的元素为func的返回值,所以list长度为DataSet的长度 """ - assert len(self)!=0, "Null DataSet cannot use apply_field()." + assert len(self) != 0, "Null DataSet cannot use apply_field()." if field_name not in self: raise KeyError("DataSet has no field named `{}`.".format(field_name)) results = [] @@ -638,19 +665,20 @@ class DataSet(object): for idx, ins in enumerate(self._inner_iter()): results.append(func(ins[field_name])) except Exception as e: - if idx!=-1: + if idx != -1: print("Exception happens at the `{}`th instance.".format(idx)) raise e if not (new_field_name is None) and len(list(filter(lambda x: x is not None, results))) == 0: # all None raise ValueError("{} always return None.".format(_get_func_signature(func=func))) - + if new_field_name is not None: self._add_apply_field(results, new_field_name, kwargs) - + return results - + def _add_apply_field(self, results, new_field_name, kwargs): - """将results作为加入到新的field中,field名称为new_field_name + """ + 将results作为加入到新的field中,field名称为new_field_name :param list(str) results: 一般是apply*()之后的结果 :param str new_field_name: 新加入的field的名称 @@ -679,43 +707,46 @@ class DataSet(object): self.add_field(field_name=new_field_name, fields=results, is_input=extra_param.get("is_input", None), is_target=extra_param.get("is_target", None), ignore_type=extra_param.get("ignore_type", False)) - + def apply(self, func, new_field_name=None, **kwargs): - """ 将DataSet中每个instance传入到func中,并获取它的返回值. + """ + 将DataSet中每个instance传入到func中,并获取它的返回值. :param callable func: 参数是DataSet中的Instance :param None,str new_field_name: 将func返回的内容放入到new_field_name这个field中,如果名称与已有的field相同,则覆 盖之前的field。如果为None则不创建新的field。 :param optional kwargs: 支持输入is_input,is_target,ignore_type - 1. is_input: bool, 如果为True则将`new_field_name`的field设置为input + 1. is_input: bool, 如果为True则将 `new_field_name` 的field设置为input - 2. is_target: bool, 如果为True则将`new_field_name`的field设置为target + 2. is_target: bool, 如果为True则将 `new_field_name` 的field设置为target - 3. ignore_type: bool, 如果为True则将`new_field_name`的field的ignore_type设置为true, 忽略其类型 + 3. ignore_type: bool, 如果为True则将 `new_field_name` 的field的ignore_type设置为true, 忽略其类型 + :return: list(Any), 里面的元素为func的返回值,所以list长度为DataSet的长度 """ - assert len(self)!=0, "Null DataSet cannot use apply()." + assert len(self) != 0, "Null DataSet cannot use apply()." 
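        # 用idx记录当前处理到的instance下标,这样当func抛出异常时能在报错信息中给出具体出错的位置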
        idx = -1
        try:
            results = []
            for idx, ins in enumerate(self._inner_iter()):
                results.append(func(ins))
        except Exception as e:
-            if idx!=-1:
+            if idx != -1:
                print("Exception happens at the `{}`th instance.".format(idx))
            raise e
        # results = [func(ins) for ins in self._inner_iter()]
        if not (new_field_name is None) and len(list(filter(lambda x: x is not None, results))) == 0:  # all None
            raise ValueError("{} always return None.".format(_get_func_signature(func=func)))
-
+
        if new_field_name is not None:
            self._add_apply_field(results, new_field_name, kwargs)
-
+
        return results
-
+
    def drop(self, func, inplace=True):
-        """func接受一个Instance,返回bool值。返回值为True时,该Instance会被移除或者加入到返回的DataSet中。
+        """
+        func接受一个Instance,返回bool值。返回值为True时,该Instance会被移除或者加入到返回的DataSet中。

        :param callable func: 接受一个Instance作为参数,返回bool值。为True时删除该instance
        :param bool inplace: 是否在当前DataSet中直接删除instance。如果为False,被删除的Instance的组成的新DataSet将作为
@@ -730,18 +761,19 @@
                return self
        else:
            results = [ins for ins in self if not func(ins)]
-            if len(results)!=0:
+            if len(results) != 0:
                dataset = DataSet(results)
                for field_name, field in self.field_arrays.items():
                    dataset.field_arrays[field_name].to(field)
                return dataset
            else:
                return DataSet()
-
+
    def split(self, ratio):
-        """将DataSet按照ratio的比例拆分,返回两个DataSet
+        """
+        将DataSet按照ratio的比例拆分,返回两个DataSet

-        :param float ratio: 0<ratio<1

diff --git a/fastNLP/core/instance.py b/fastNLP/core/instance.py

+        >>>from fastNLP import Instance
+        >>>ins = Instance(field_1=[1, 1, 1], field_2=[2, 2, 2])
+        >>>ins["field_1"]
+        [1, 1, 1]
+        >>>ins.add_field("field_3", [3, 3, 3])
+        >>>ins = Instance(**{'x1': 1, 'x2':np.zeros((3, 4))})
+    """
+
    def __init__(self, **fields):
-        """Instance的初始化如下面的Example所示
-
-        Example::
-
-            ins = Instance(field_1=[1, 1, 1], field_2=[2, 2, 2])
-            ins["field_1"]
-            >>[1, 1, 1]
-            ins.add_field("field_3", [3, 3, 3])
-
-            ins = Instance(**{'x1': 1, 'x2':np.zeros((3, 4))})
-        """
+
        self.fields = fields
-
+
    def add_field(self, field_name, field):
-        """向Instance中增加一个field
+        """
+        向Instance中增加一个field

        :param str field_name: 新增field的名称
        :param Any field: 新增field的内容
        """
        self.fields[field_name] = field
-
+
    def __getitem__(self, name):
        if name in self.fields:
            return self.fields[name]
        else:
            raise KeyError("{} not found".format(name))
-
+
    def __setitem__(self, name, field):
        return self.add_field(name, field)
-
+
    def __repr__(self):
        s = '\''
        return "{" + ",\n".join(
diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py
index abb1cd29..ac08b46f 100644
--- a/fastNLP/core/losses.py
+++ b/fastNLP/core/losses.py
@@ -1,36 +1,34 @@
 """
-
-    .. _LossBase:
-
-    .. _Loss:
+losses 模块定义了 fastNLP 中所需的各种损失函数,一般做为 :class:`~fastNLP.Trainer` 的参数使用。
 """
-
+__all__ = ["LossBase", "L1Loss", "LossFunc", "LossInForward", "BCELoss", "CrossEntropyLoss", "NLLLoss"]
 import inspect
from collections import defaultdict

import torch
import torch.nn.functional as F

-from fastNLP.core.utils import _CheckError
-from fastNLP.core.utils import _CheckRes
-from fastNLP.core.utils import _build_args
-from fastNLP.core.utils import _check_arg_dict_list
-from fastNLP.core.utils import _check_function_or_method
-from fastNLP.core.utils import _get_func_signature
+from .utils import _CheckError
+from .utils import _CheckRes
+from .utils import _build_args
+from .utils import _check_arg_dict_list
+from .utils import _check_function_or_method
+from .utils import _get_func_signature


class LossBase(object):
-    """所有loss的基类.
- """ + 所有loss的基类。如果想了解其中的原理,请查看源码。 + """ + def __init__(self): self.param_map = {} self._checked = False - + def get_loss(self, *args, **kwargs): raise NotImplementedError - + def _init_param_map(self, key_map=None, **kwargs): """检查key_map和其他参数map,并将这些映射关系添加到self.param_map @@ -63,7 +61,7 @@ class LossBase(object): for value, key_set in value_counter.items(): if len(key_set) > 1: raise ValueError(f"Several parameters:{key_set} are provided with one output {value}.") - + # check consistence between signature and param_map func_spect = inspect.getfullargspec(self.get_loss) func_args = [arg for arg in func_spect.args if arg != 'self'] @@ -72,12 +70,12 @@ class LossBase(object): raise NameError( f"Parameter `{func_param}` is not in {_get_func_signature(self.get_loss)}. Please check the " f"initialization parameters, or change its signature.") - + # evaluate should not have varargs. # if func_spect.varargs: # raise NameError(f"Delete `*{func_spect.varargs}` in {get_func_signature(self.get_loss)}(Do not use " # f"positional argument.).") - + def _fast_param_map(self, pred_dict, target_dict): """Only used as inner function. When the pred_dict, target is unequivocal. Don't need users to pass key_map. such as pred_dict has one element, target_dict has one element @@ -92,7 +90,7 @@ class LossBase(object): fast_param['target'] = list(target_dict.values())[0] return fast_param return fast_param - + def __call__(self, pred_dict, target_dict, check=False): """ :param dict pred_dict: 模型的forward函数返回的dict @@ -104,7 +102,7 @@ class LossBase(object): if fast_param: loss = self.get_loss(**fast_param) return loss - + if not self._checked: # 1. check consistence between signature and param_map func_spect = inspect.getfullargspec(self.get_loss) @@ -112,14 +110,14 @@ class LossBase(object): for func_arg, input_arg in self.param_map.items(): if func_arg not in func_args: raise NameError(f"`{func_arg}` not in {_get_func_signature(self.get_loss)}.") - + # 2. only part of the param_map are passed, left are not for arg in func_args: if arg not in self.param_map: self.param_map[arg] = arg # This param does not need mapping. self._evaluate_args = func_args self._reverse_param_map = {input_arg: func_arg for func_arg, input_arg in self.param_map.items()} - + # need to wrap inputs in dict. 
mapped_pred_dict = {} mapped_target_dict = {} @@ -139,7 +137,7 @@ class LossBase(object): not_duplicate_flag += 1 if not_duplicate_flag == 3: duplicated.append(input_arg) - + # missing if not self._checked: check_res = _check_arg_dict_list(self.get_loss, [mapped_pred_dict, mapped_target_dict]) @@ -149,47 +147,50 @@ class LossBase(object): for idx, func_arg in enumerate(missing): # Don't delete `` in this information, nor add `` replaced_missing[idx] = f"{self.param_map[func_arg]}" + f"(assign to `{func_arg}` " \ - f"in `{self.__class__.__name__}`)" - + f"in `{self.__class__.__name__}`)" + check_res = _CheckRes(missing=replaced_missing, unused=check_res.unused, duplicated=duplicated, required=check_res.required, all_needed=check_res.all_needed, varargs=check_res.varargs) - + if check_res.missing or check_res.duplicated: raise _CheckError(check_res=check_res, func_signature=_get_func_signature(self.get_loss)) refined_args = _build_args(self.get_loss, **mapped_pred_dict, **mapped_target_dict) - + loss = self.get_loss(**refined_args) self._checked = True - + return loss class LossFunc(LossBase): - """提供给用户使用自定义损失函数的类 """ - def __init__(self, func, key_map=None, **kwargs): - """ + 别名::class:`fastNLP.LossFunc` :class:`fastNLP.core.losses.LossFunc` - :param func: 用户自行定义的损失函数,应当为一个函数或者callable(func)为True的ojbect - :param dict key_map: 参数映射表。键为Model/DataSet参数名,值为损失函数参数名。 - fastNLP的trainer将在训练时从模型返回值或者训练数据DataSet的target=True的field中 - 找到相对应的参数名为value的参数,并传入func中作为参数名为key的参数 - :param kwargs: 除了参数映射表以外可以用key word args的方式设置参数映射关系 + 提供给用户使用自定义损失函数的类 - Example:: + :param func: 用户自行定义的损失函数,应当为一个函数或者callable(func)为True的ojbect + :param dict key_map: 参数映射表。键为Model/DataSet参数名,值为损失函数参数名。 + fastNLP的trainer将在训练时从模型返回值或者训练数据DataSet的target=True的field中 + 找到相对应的参数名为value的参数,并传入func中作为参数名为key的参数 + :param kwargs: 除了参数映射表以外可以用key word args的方式设置参数映射关系 - >>> func = torch.nn.CrossEntropyLoss() - >>> loss_func = LossFunc(func, input="pred", target="label") - >>> # 这表示构建了一个损失函数类,由func计算损失函数,其中将从模型返回值或者DataSet的target=True的field - >>> # 当中找到一个参数名为`pred`的参数传入func一个参数名为`input`的参数;找到一个参数名为`label`的参数 - >>> # 传入func作为一个名为`target`的参数 + Example:: - """ + >>> func = torch.nn.CrossEntropyLoss() + >>> loss_func = LossFunc(func, input="pred", target="label") + # 这表示构建了一个损失函数类,由func计算损失函数,其中将从模型返回值或者DataSet的target=True的field + # 当中找到一个参数名为`pred`的参数传入func一个参数名为`input`的参数;找到一个参数名为`label`的参数 + # 传入func作为一个名为`target`的参数 + + """ + + def __init__(self, func, key_map=None, **kwargs): + super(LossFunc, self).__init__() _check_function_or_method(func) if key_map is not None: @@ -199,94 +200,108 @@ class LossFunc(LossBase): if len(kwargs) > 0: for key, val in kwargs.items(): self.param_map.update({key: val}) - + self.get_loss = func class CrossEntropyLoss(LossBase): """ - .. 
_CrossEntropyLoss: + 别名::class:`fastNLP.CrossEntropyLoss` :class:`fastNLP.core.losses.CrossEntropyLoss` - 交叉熵损失函数""" - def __init__(self, pred=None, target=None, padding_idx=-100): - """ - :param pred: 参数映射表中`pred`的映射关系,None表示映射关系为`pred`->`pred` - :param target: 参数映射表中`target`的映射关系,None表示映射关系为`target`->`target` - :param padding_idx: padding的index,在计算loss时将忽略target中标号为padding_idx的内容 + 交叉熵损失函数 + + :param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` + :param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` -> `target` + :param padding_idx: padding的index,在计算loss时将忽略target中标号为padding_idx的内容 - Example:: + Example:: - >>> loss = CrossEntropyLoss(pred='pred', target='label', padding_idx=0) - """ + >>> loss = CrossEntropyLoss(pred='pred', target='label', padding_idx=0) + + """ + + def __init__(self, pred=None, target=None, padding_idx=-100): # TODO 需要做一些检查,F.cross_entropy在计算时,如果pred是(16, 10 ,4), target的形状按道理应该是(16, 10), 但实际却需要 # TODO (16, 4) super(CrossEntropyLoss, self).__init__() self._init_param_map(pred=pred, target=target) self.padding_idx = padding_idx - + def get_loss(self, pred, target): return F.cross_entropy(input=pred, target=target, ignore_index=self.padding_idx) class L1Loss(LossBase): - """L1损失函数""" + """ + 别名::class:`fastNLP.L1Loss` :class:`fastNLP.core.losses.L1Loss` + + L1损失函数 + + :param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` + :param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` >`target` + + """ + def __init__(self, pred=None, target=None): - """ - :param pred: 参数映射表中`pred`的映射关系,None表示映射关系为`pred`->`pred` - :param target: 参数映射表中`target`的映射关系,None表示映射关系为`target`->`target` - """ super(L1Loss, self).__init__() self._init_param_map(pred=pred, target=target) - + def get_loss(self, pred, target): return F.l1_loss(input=pred, target=target) class BCELoss(LossBase): - """二分类交叉熵损失函数""" + """ + 别名::class:`fastNLP.BCELoss` :class:`fastNLP.core.losses.BCELoss` + + 二分类交叉熵损失函数 + + :param pred: 参数映射表中`pred`的映射关系,None表示映射关系为`pred`->`pred` + :param target: 参数映射表中`target`的映射关系,None表示映射关系为`target`->`target` + """ + def __init__(self, pred=None, target=None): - """ - :param pred: 参数映射表中`pred`的映射关系,None表示映射关系为`pred`->`pred` - :param target: 参数映射表中`target`的映射关系,None表示映射关系为`target`->`target` - """ super(BCELoss, self).__init__() self._init_param_map(pred=pred, target=target) - + def get_loss(self, pred, target): return F.binary_cross_entropy(input=pred, target=target) class NLLLoss(LossBase): - """负对数似然损失函数""" + """ + 别名::class:`fastNLP.NLLLoss` :class:`fastNLP.core.losses.NLLLoss` + + 负对数似然损失函数 + + :param pred: 参数映射表中`pred`的映射关系,None表示映射关系为`pred`->`pred` + :param target: 参数映射表中`target`的映射关系,None表示映射关系为`target`->`target` + """ + def __init__(self, pred=None, target=None): - """ - :param pred: 参数映射表中`pred`的映射关系,None表示映射关系为`pred`->`pred` - :param target: 参数映射表中`target`的映射关系,None表示映射关系为`target`->`target` - """ super(NLLLoss, self).__init__() self._init_param_map(pred=pred, target=target) - + def get_loss(self, pred, target): return F.nll_loss(input=pred, target=target) class LossInForward(LossBase): """ - - .. 
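+    (补充说明:上面的 L1Loss、BCELoss、NLLLoss 没有附带用法示例。它们的用法与 CrossEntropyLoss 相同,
+    都是在初始化时建立pred/target的映射,再作为loss参数传给 :class:`~fastNLP.Trainer` 。
+    下面是一个示意性的例子,其中'pred'与'label'这两个名字只是假设的field名。)
+
+    Example::
+
+        >>> from fastNLP import BCELoss
+        >>> loss = BCELoss(pred='pred', target='label')
+        >>> # 训练时传入Trainer即可: Trainer(..., loss=loss)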
_LossInForward: + 别名::class:`fastNLP.LossInForward` :class:`fastNLP.core.losses.LossInForward` 从forward()函数返回结果中获取loss + + :param str loss_key: 在forward函数中loss的键名,默认为loss """ + def __init__(self, loss_key='loss'): - """ - :param str loss_key: 在forward函数中loss的键名,默认为loss - """ super().__init__() if not isinstance(loss_key, str): raise TypeError(f"Only str allowed for loss_key, got {type(loss_key)}.") self.loss_key = loss_key - + def get_loss(self, **kwargs): if self.loss_key not in kwargs: check_res = _CheckRes( @@ -298,17 +313,17 @@ class LossInForward(LossBase): varargs=[]) raise _CheckError(check_res=check_res, func_signature=_get_func_signature(self.get_loss)) return kwargs[self.loss_key] - + def __call__(self, pred_dict, target_dict, check=False): - + loss = self.get_loss(**pred_dict) - + if not (isinstance(loss, torch.Tensor) and len(loss.size()) == 0): if not isinstance(loss, torch.Tensor): raise TypeError(f"Loss excepted to be a torch.Tensor, got {type(loss)}") loss = torch.sum(loss) / (loss.view(-1)).size(0) # raise RuntimeError(f"The size of loss excepts to be torch.Size([]), got {loss.size()}") - + return loss @@ -378,13 +393,13 @@ def mask(predict, truth, **kwargs): if kwargs.get("mask") is None: return predict, truth mask = kwargs["mask"] - + predict, truth = squash(predict, truth) mask = mask.view(-1, ) - + predict = torch.masked_select(predict.permute(1, 0), mask).view(predict.size()[-1], -1).permute(1, 0) truth = torch.masked_select(truth, mask) - + return predict, truth @@ -399,4 +414,3 @@ def make_mask(lens, tar_len): mask = [torch.ge(lens, i + 1) for i in range(tar_len)] mask = torch.stack(mask, 1) return mask - diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 938a67be..829684d3 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -1,31 +1,25 @@ """ - - .. _Metric: +metrics 模块实现了 fastNLP 所需的各种常用衡量指标,一般做为 :class:`~fastNLP.Trainer` 的参数使用。 """ - - - - import inspect from collections import defaultdict import numpy as np import torch -from fastNLP.core.utils import _CheckError -from fastNLP.core.utils import _CheckRes -from fastNLP.core.utils import _build_args -from fastNLP.core.utils import _check_arg_dict_list -from fastNLP.core.utils import _get_func_signature -from fastNLP.core.utils import seq_lens_to_masks -from fastNLP.core.vocabulary import Vocabulary +from .utils import _CheckError +from .utils import _CheckRes +from .utils import _build_args +from .utils import _check_arg_dict_list +from .utils import _get_func_signature +from .utils import seq_lens_to_masks +from .vocabulary import Vocabulary class MetricBase(object): - """所有metrics的基类 - - 所有的传入到Trainer, Tester的Metric需要继承自该对象。需要覆盖写入evaluate(), get_metric()方法。 + """ + 所有metrics的基类,,所有的传入到Trainer, Tester的Metric需要继承自该对象,需要覆盖写入evaluate(), get_metric()方法。 evaluate(xxx)中传入的是一个batch的数据。 @@ -94,17 +88,17 @@ class MetricBase(object): return {'acc': acc} # 需要返回一个dict,key为该metric的名称,该名称会显示到Trainer的progress bar中 - ``MetricBase`` 将会在输入的字典``pred_dict``和``target_dict``中进行检查. - ``pred_dict`` 是模型当中``forward()``函数或者``predict()``函数的返回值. - ``target_dict`` 是DataSet当中的ground truth, 判定ground truth的条件是field的``is_target``被设置为True. + ``MetricBase`` 将会在输入的字典 ``pred_dict`` 和 ``target_dict`` 中进行检查. + ``pred_dict`` 是模型当中 ``forward()`` 函数或者 ``predict()`` 函数的返回值. + ``target_dict`` 是DataSet当中的ground truth, 判定ground truth的条件是field的 ``is_target`` 被设置为True. ``MetricBase`` 会进行以下的类型检测: 1. self.evaluate当中是否有varargs, 这是不支持的. - 2. self.evaluate当中所需要的参数是否既不在``pred_dict``也不在``target_dict``. - 3. 
self.evaluate当中所需要的参数是否既在``pred_dict``也在``target_dict``. + 2. self.evaluate当中所需要的参数是否既不在 ``pred_dict`` 也不在 ``target_dict`` . + 3. self.evaluate当中所需要的参数是否既在 ``pred_dict`` 也在 ``target_dict`` . - 除此以外,在参数被传入self.evaluate以前,这个函数会检测``pred_dict``和``target_dict``当中没有被用到的参数 + 除此以外,在参数被传入self.evaluate以前,这个函数会检测 ``pred_dict`` 和 ``target_dict`` 当中没有被用到的参数 如果kwargs是self.evaluate的参数,则不会检测 @@ -267,13 +261,18 @@ class MetricBase(object): class AccuracyMetric(MetricBase): - """准确率Metric""" + """ + + 别名::class:`fastNLP.AccuracyMetric` :class:`fastNLP.core.metrics.AccuracyMetric` + + 准确率Metric(其它的Metric参见 :doc:`fastNLP.core.metrics` ) + + :param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` + :param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` -> `target` + :param seq_len: 参数映射表中 `seq_lens` 的映射关系,None表示映射关系为 `seq_len` -> `seq_len` + """ def __init__(self, pred=None, target=None, seq_len=None): - """ - :param pred: 参数映射表中`pred`的映射关系,None表示映射关系为`pred`->`pred` - :param target: 参数映射表中`target`的映射关系,None表示映射关系为`target`->`target` - :param seq_len: 参数映射表中`seq_lens`的映射关系,None表示映射关系为`seq_len`->`seq_len` - """ + super().__init__() self._init_param_map(pred=pred, target=target, seq_len=seq_len) @@ -282,7 +281,8 @@ class AccuracyMetric(MetricBase): self.acc_count = 0 def evaluate(self, pred, target, seq_len=None): - """evaluate函数将针对一个批次的预测结果做评价指标的累计 + """ + evaluate函数将针对一个批次的预测结果做评价指标的累计 :param torch.Tensor pred: 预测的tensor, tensor的形状可以是torch.Size([B,]), torch.Size([B, n_classes]), torch.Size([B, max_len]), 或者torch.Size([B, max_len, n_classes]) @@ -327,7 +327,8 @@ class AccuracyMetric(MetricBase): self.total += np.prod(list(pred.size())) def get_metric(self, reset=True): - """get_metric函数将根据evaluate函数累计的评价指标统计量来计算最终的评价结果. + """ + get_metric函数将根据evaluate函数累计的评价指标统计量来计算最终的评价结果. :param bool reset: 在调用完get_metric后是否清空评价指标统计量. :return dict evaluate_result: {"acc": float} @@ -430,8 +431,6 @@ def _bio_tag_to_spans(tags, ignore_labels=None): class SpanFPreRecMetric(MetricBase): """ - .. _SpanFPreRecMetric: - 在序列标注问题中,以span的方式计算F, pre, rec. 比如中文Part of speech中,会以character的方式进行标注,句子'中国在亚洲'对应的POS可能为(以BMES为例) ['B-NN', 'E-NN', 'S-DET', 'B-NN', 'E-NN']。该metric就是为类似情况下的F1计算。 @@ -455,26 +454,24 @@ class SpanFPreRecMetric(MetricBase): ... } + :param tag_vocab: 标签的 :class:`~fastNLP.Vocabulary` 。支持的标签为"B"(没有label);或"B-xxx"(xxx为某种label,比如POS中的NN), + 在解码时,会将相同xxx的认为是同一个label,比如['B-NN', 'E-NN']会被合并为一个'NN'. + :param str pred: 用该key在evaluate()时从传入dict中取出prediction数据。 为None,则使用'pred'取数据 + :param str target: 用该key在evaluate()时从传入dict中取出target数据。 为None,则使用'target'取数据 + :param str seq_len: 用该key在evaluate()时从传入dict中取出sequence length数据。为None,则使用'seq_lens'取数据。 + :param str encoding_type: 目前支持bio, bmes + :param list ignore_labels: str 组成的list. 这个list中的class不会被用于计算。例如在POS tagging时传入['NN'],则不会计算'NN'这 + 个label + :param bool only_gross: 是否只计算总的f1, precision, recall的值;如果为False,不仅返回总的f1, pre, rec, 还会返回每个 + label的f1, pre, rec + :param str f_type: 'micro'或'macro'. 'micro':通过先计算总体的TP,FN和FP的数量,再计算f, precision, recall; 'macro': + 分布计算每个类别的f, precision, recall,然后做平均(各类别f的权重相同) + :param float beta: f_beta分数,f_beta = (1 + beta^2)*(pre*rec)/(beta^2*pre + rec). 常用为beta=0.5, 1, 2. 若为0.5 + 则精确率的权重高于召回率;若为1,则两者平等;若为2,则召回率权重高于精确率。 """ def __init__(self, tag_vocab, pred=None, target=None, seq_len=None, encoding_type='bio', ignore_labels=None, only_gross=True, f_type='micro', beta=1): - """ - - :param Vocabulary tag_vocab: 标签的vocabulary。支持的标签为"B"(没有label);或"B-xxx"(xxx为某种label,比如POS中的NN), - 在解码时,会将相同xxx的认为是同一个label,比如['B-NN', 'E-NN']会被合并为一个'NN'. 
- :param str pred: 用该key在evaluate()时从传入dict中取出prediction数据。 为None,则使用'pred'取数据 - :param str target: 用该key在evaluate()时从传入dict中取出target数据。 为None,则使用'target'取数据 - :param str seq_len: 用该key在evaluate()时从传入dict中取出sequence length数据。为None,则使用'seq_lens'取数据。 - :param str encoding_type: 目前支持bio, bmes - :param list ignore_labels: str 组成的list. 这个list中的class不会被用于计算。例如在POS tagging时传入['NN'],则不会计算'NN'这 - 个label - :param bool only_gross: 是否只计算总的f1, precision, recall的值;如果为False,不仅返回总的f1, pre, rec, 还会返回每个 - label的f1, pre, rec - :param str f_type: 'micro'或'macro'. 'micro':通过先计算总体的TP,FN和FP的数量,再计算f, precision, recall; 'macro': - 分布计算每个类别的f, precision, recall,然后做平均(各类别f的权重相同) - :param float beta: f_beta分数,f_beta = (1 + beta^2)*(pre*rec)/(beta^2*pre + rec). 常用为beta=0.5, 1, 2. 若为0.5 - 则精确率的权重高于召回率;若为1,则两者平等;若为2,则召回率权重高于精确率。 - """ + encoding_type = encoding_type.lower() if not isinstance(tag_vocab, Vocabulary): @@ -647,20 +644,18 @@ class BMESF1PreRecMetric(MetricBase): target形状为 (batch_size, max_len) seq_lens形状为 (batch_size, ) - """ + 需要申明BMES这四种tag中,各种tag对应的idx。所有不为b_idx, m_idx, e_idx, s_idx的数字都认为是s_idx。 + :param b_idx: int, Begin标签所对应的tag idx. + :param m_idx: int, Middle标签所对应的tag idx. + :param e_idx: int, End标签所对应的tag idx. + :param s_idx: int, Single标签所对应的tag idx + :param pred: str, 用该key在evaluate()时从传入dict中取出prediction数据。 为None,则使用'pred'取数据 + :param target: str, 用该key在evaluate()时从传入dict中取出target数据。 为None,则使用'target'取数据 + :param seq_len: str, 用该key在evaluate()时从传入dict中取出seqence length数据。为None,则使用'seq_len'取数据。 + """ + def __init__(self, b_idx=0, m_idx=1, e_idx=2, s_idx=3, pred=None, target=None, seq_len=None): - """ - 需要申明BMES这四种tag中,各种tag对应的idx。所有不为b_idx, m_idx, e_idx, s_idx的数字都认为是s_idx。 - - :param b_idx: int, Begin标签所对应的tag idx. - :param m_idx: int, Middle标签所对应的tag idx. - :param e_idx: int, End标签所对应的tag idx. - :param s_idx: int, Single标签所对应的tag idx - :param pred: str, 用该key在evaluate()时从传入dict中取出prediction数据。 为None,则使用'pred'取数据 - :param target: str, 用该key在evaluate()时从传入dict中取出target数据。 为None,则使用'target'取数据 - :param seq_len: str, 用该key在evaluate()时从传入dict中取出seqence length数据。为None,则使用'seq_len'取数据。 - """ super().__init__() self._init_param_map(pred=pred, target=target, seq_len=seq_len) @@ -831,21 +826,23 @@ def _pred_topk(y_prob, k=1): class SQuADMetric(MetricBase): - """SQuAD数据集metric + """ + SQuAD数据集metric + + :param pred1: 参数映射表中`pred1`的映射关系,None表示映射关系为`pred1`->`pred1` + :param pred2: 参数映射表中`pred2`的映射关系,None表示映射关系为`pred2`->`pred2` + :param target1: 参数映射表中`target1`的映射关系,None表示映射关系为`target1`->`target1` + :param target2: 参数映射表中`target2`的映射关系,None表示映射关系为`target2`->`target2` + :param float beta: f_beta分数,f_beta = (1 + beta^2)*(pre*rec)/(beta^2*pre + rec). 常用为beta=0.5, 1, 2. 若为0.5 + 则精确率的权重高于召回率;若为1,则两者平等;若为2,则召回率权重高于精确率。 + :param bool right_open: right_open为true表示start跟end指针指向一个左闭右开区间,为false表示指向一个左闭右闭区间。 + :param bool print_predict_stat: True则输出预测答案是否为空与正确答案是否为空的统计信息, False则不输出 + """ def __init__(self, pred1=None, pred2=None, target1=None, target2=None, beta=1, right_open=True, print_predict_stat=False): - """ - :param pred1: 参数映射表中`pred1`的映射关系,None表示映射关系为`pred1`->`pred1` - :param pred2: 参数映射表中`pred2`的映射关系,None表示映射关系为`pred2`->`pred2` - :param target1: 参数映射表中`target1`的映射关系,None表示映射关系为`target1`->`target1` - :param target2: 参数映射表中`target2`的映射关系,None表示映射关系为`target2`->`target2` - :param float beta: f_beta分数,f_beta = (1 + beta^2)*(pre*rec)/(beta^2*pre + rec). 常用为beta=0.5, 1, 2. 
若为0.5 - 则精确率的权重高于召回率;若为1,则两者平等;若为2,则召回率权重高于精确率。 - :param bool right_open: right_open为true表示start跟end指针指向一个左闭右开区间,为false表示指向一个左闭右闭区间。 - :param bool print_predict_stat: True则输出预测答案是否为空与正确答案是否为空的统计信息, False则不输出 - """ + super(SQuADMetric, self).__init__() self._init_param_map(pred1=pred1, pred2=pred2, target1=target1, target2=target2) diff --git a/fastNLP/core/optimizer.py b/fastNLP/core/optimizer.py index 584aa5ff..ea4905eb 100644 --- a/fastNLP/core/optimizer.py +++ b/fastNLP/core/optimizer.py @@ -1,11 +1,16 @@ +""" +optimizer 模块定义了 fastNLP 中所需的各种优化器,一般做为 :class:`~fastNLP.Trainer` 的参数使用。 + +""" import torch class Optimizer(object): """ + 别名::class:`fastNLP.Optimizer` :class:`fastNLP.core.optimizer.Optimizer` - :param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models. - :param kwargs: additional parameters. + :param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models. + :param kwargs: additional parameters. """ def __init__(self, model_params, **kwargs): if model_params is not None and not hasattr(model_params, "__next__"): @@ -26,10 +31,11 @@ class Optimizer(object): class SGD(Optimizer): """ + 别名::class:`fastNLP.SGD` :class:`fastNLP.core.optimizer.SGD` - :param float lr: learning rate. Default: 0.01 - :param float momentum: momentum. Default: 0 - :param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models. + :param float lr: learning rate. Default: 0.01 + :param float momentum: momentum. Default: 0 + :param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models. """ def __init__(self, lr=0.001, momentum=0, model_params=None): @@ -47,10 +53,11 @@ class SGD(Optimizer): class Adam(Optimizer): """ + 别名::class:`fastNLP.Adam` :class:`fastNLP.core.optimizer.Adam` - :param float lr: learning rate - :param float weight_decay: - :param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models. + :param float lr: learning rate + :param float weight_decay: + :param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models. """ def __init__(self, lr=0.001, weight_decay=0, betas=(0.9, 0.999), eps=1e-8, amsgrad=False, model_params=None): diff --git a/fastNLP/core/predictor.py b/fastNLP/core/predictor.py index ae648e47..34784b7c 100644 --- a/fastNLP/core/predictor.py +++ b/fastNLP/core/predictor.py @@ -2,10 +2,10 @@ from collections import defaultdict import torch -from fastNLP.core import Batch -from fastNLP.core import DataSet -from fastNLP.core import SequentialSampler -from fastNLP.core.utils import _build_args +from . import Batch +from . import DataSet +from . import SequentialSampler +from .utils import _build_args class Predictor(object): diff --git a/fastNLP/core/sampler.py b/fastNLP/core/sampler.py index 1f9b92fb..2182ae1c 100644 --- a/fastNLP/core/sampler.py +++ b/fastNLP/core/sampler.py @@ -1,22 +1,24 @@ """ +sampler 子类实现了 fastNLP 所需的各种采样器。 - .. _Sampler: """ - - - +__all__ = ["Sampler", "BucketSampler", "SequentialSampler", "RandomSampler"] from itertools import chain import numpy as np -import torch + class Sampler(object): - """ `Sampler` 类的基类. 规定以何种顺序取出data中的元素 + """ + 别名::class:`fastNLP.Sampler` :class:`fastNLP.core.sampler.Sampler` + + + `Sampler` 类的基类. 规定以何种顺序取出data中的元素 子类必须实现 ``__call__`` 方法. 输入 `DataSet` 对象, 返回其中元素的下标序列 """ - + def __call__(self, data_set): """ :param DataSet data_set: `DataSet` 对象, 需要Sample的数据 @@ -26,56 +28,62 @@ class Sampler(object): class SequentialSampler(Sampler): - """顺序取出元素的 `Sampler` - - .. 
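+    (补充说明:上面的 Sampler 基类只规定了协议:子类实现 ``__call__``,输入DataSet、返回其中元素的下标序列。
+    下面是一个示意性的自定义Sampler,按逆序取出元素;类名与实现只是假设的例子,并非fastNLP自带。
+    这里假设 Sampler 可以像 __all__ 中声明的那样从 fastNLP 顶层导入。)
+
+    Example::
+
+        from fastNLP import Sampler
+
+        class ReverseSampler(Sampler):
+            def __call__(self, data_set):
+                # 返回从后往前的下标序列
+                return list(range(len(data_set) - 1, -1, -1))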
_SequentialSampler: + """ + 别名::class:`fastNLP.SequentialSampler` :class:`fastNLP.core.sampler.SequentialSampler` + + 顺序取出元素的 `Sampler` """ + def __call__(self, data_set): return list(range(len(data_set))) class RandomSampler(Sampler): """ - - .. _RandomSampler: + 别名::class:`fastNLP.RandomSampler` :class:`fastNLP.core.sampler.RandomSampler` 随机化取元素的 `Sampler` """ + def __call__(self, data_set): return list(np.random.permutation(len(data_set))) class BucketSampler(Sampler): - """带Bucket的 `Random Sampler`. 可以随机地取出长度相似的元素 + """ + 别名::class:`fastNLP.BucketSampler` :class:`fastNLP.core.sampler.BucketSampler` + + 带Bucket的 `Random Sampler`. 可以随机地取出长度相似的元素 :param int num_buckets: bucket的数量 :param int batch_size: batch的大小 :param str seq_lens_field_name: 对应序列长度的 `field` 的名字 """ + def __init__(self, num_buckets=10, batch_size=32, seq_lens_field_name='seq_len'): self.num_buckets = num_buckets self.batch_size = batch_size self.seq_lens_field_name = seq_lens_field_name - + def __call__(self, data_set): seq_lens = data_set.get_all_fields()[self.seq_lens_field_name].content total_sample_num = len(seq_lens) - + bucket_indexes = [] - assert total_sample_num>=self.num_buckets, "The number of samples is smaller than the number of buckets." + assert total_sample_num >= self.num_buckets, "The number of samples is smaller than the number of buckets." num_sample_per_bucket = total_sample_num // self.num_buckets for i in range(self.num_buckets): bucket_indexes.append([num_sample_per_bucket * i, num_sample_per_bucket * (i + 1)]) bucket_indexes[-1][1] = total_sample_num - + sorted_seq_lens = list(sorted([(idx, seq_len) for idx, seq_len in zip(range(total_sample_num), seq_lens)], key=lambda x: x[1])) - + batchs = [] - + left_init_indexes = [] for b_idx in range(self.num_buckets): start_idx = bucket_indexes[b_idx][0] @@ -90,7 +98,7 @@ class BucketSampler(Sampler): if (left_init_indexes) != 0: batchs.append(left_init_indexes) np.random.shuffle(batchs) - + return list(chain(*batchs)) @@ -128,10 +136,10 @@ def k_means_1d(x, k, max_iter=100): if len(sorted_x) < k: raise ValueError("too few buckets") gap = len(sorted_x) / k - + centroids = np.array([sorted_x[int(x * gap)] for x in range(k)]) assign = None - + for i in range(max_iter): # Cluster Assignment step assign = np.array([np.argmin([np.absolute(x_i - x) for x in centroids]) for x_i in x]) @@ -163,7 +171,7 @@ def k_means_bucketing(lengths, buckets): bucket_data = [[] for _ in buckets] num_buckets = len(buckets) _, assignments = k_means_1d(lengths, num_buckets) - + for idx, bucket_id in enumerate(assignments): if buckets[bucket_id] is None or lengths[idx] <= buckets[bucket_id]: bucket_data[bucket_id].append(idx) diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index c2aae37b..6eaa5add 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -1,81 +1,81 @@ -import torch -from torch import nn +""" +tester模块实现了 fastNLP 所需的Tester类,能在提供数据、模型以及metric的情况下进行性能测试。 -from fastNLP.core.batch import Batch -from fastNLP.core.dataset import DataSet -from fastNLP.core.metrics import _prepare_metrics -from fastNLP.core.sampler import SequentialSampler -from fastNLP.core.utils import _CheckError -from fastNLP.core.utils import _build_args -from fastNLP.core.utils import _check_loss_evaluate -from fastNLP.core.utils import _move_dict_value_to_device -from fastNLP.core.utils import _get_func_signature -from fastNLP.core.utils import _get_model_device -from fastNLP.core.utils import _move_model_to_device +Example:: + import numpy as np + import torch + from torch 
import nn + from fastNLP import Tester + from fastNLP import DataSet + from fastNLP import AccuracyMetric -class Tester(object): - """ - Tester是在提供数据,模型以及metric的情况下进行性能测试的类 + class Model(nn.Module): + def __init__(self): + super().__init__() + self.fc = nn.Linear(1, 1) + def forward(self, a): + return {'pred': self.fc(a.unsqueeze(1)).squeeze(1)} - Example:: + model = Model() - import numpy as np - import torch - from torch import nn - from fastNLP import Tester - from fastNLP import DataSet - from fastNLP import AccuracyMetric + dataset = DataSet({'a': np.arange(10, dtype=float), 'b':np.arange(10, dtype=float)*2}) + dataset.set_input('a') + dataset.set_target('b') - class Model(nn.Module): - def __init__(self): - super().__init__() - self.fc = nn.Linear(1, 1) - def forward(self, a): - return {'pred': self.fc(a.unsqueeze(1)).squeeze(1)} + tester = Tester(dataset, model, metrics=AccuracyMetric()) + eval_results = tester.test() - model = Model() +这里Metric的映射规律是和 :class:`fastNLP.Trainer` 中一致的,具体使用请参考 :doc:`trainer 模块` 的1.3部分 - dataset = DataSet({'a': np.arange(10, dtype=float), 'b':np.arange(10, dtype=float)*2}) - dataset.set_input('a') - dataset.set_target('b') - tester = Tester(dataset, model, metrics=AccuracyMetric()) - eval_results = tester.test() - - 这里Metric的映射规律是和 Trainer_ 中一致的,请参考 Trainer_ 使用metrics。 +""" +import torch +from torch import nn +from .batch import Batch +from .dataset import DataSet +from .metrics import _prepare_metrics +from .sampler import SequentialSampler +from .utils import _CheckError +from .utils import _build_args +from .utils import _check_loss_evaluate +from .utils import _move_dict_value_to_device +from .utils import _get_func_signature +from .utils import _get_model_device +from .utils import _move_model_to_device +class Tester(object): """ + 别名::class:`fastNLP.Tester` :class:`fastNLP.core.tester.Tester` - def __init__(self, data, model, metrics, batch_size=16, device=None, verbose=1): - """传入模型,数据以及metric进行验证。 - - :param DataSet data: 需要测试的数据集 - :param torch.nn.module model: 使用的模型 - :param MetricBase metrics: 一个Metric或者一个列表的metric对象 - :param int batch_size: evaluation时使用的batch_size有多大。 - :param str,int,torch.device,list(int) device: 将模型load到哪个设备。默认为None,即Trainer不对模型 - 的计算位置进行管理。支持以下的输入: + Tester是在提供数据,模型以及metric的情况下进行性能测试的类。需要传入模型,数据以及metric进行验证。 - 1. str: ['cpu', 'cuda', 'cuda:0', 'cuda:1', ...] 依次为'cpu'中, 可见的第一个GPU中, 可见的第一个GPU中, - 可见的第二个GPU中; + :param data: 需要测试的数据集, :class:`~fastNLP.DataSet` 类型 + :param torch.nn.module model: 使用的模型 + :param metrics: :class:`~fastNLP.core.metrics.MetricBase` 或者一个列表的 :class:`~fastNLP.core.metrics.MetricBase` + :param int batch_size: evaluation时使用的batch_size有多大。 + :param str,int,torch.device,list(int) device: 将模型load到哪个设备。默认为None,即Trainer不对模型 + 的计算位置进行管理。支持以下的输入: - 2. torch.device:将模型装载到torch.device上。 + 1. str: ['cpu', 'cuda', 'cuda:0', 'cuda:1', ...] 依次为'cpu'中, 可见的第一个GPU中, 可见的第一个GPU中, + 可见的第二个GPU中; - 3. int: 将使用device_id为该值的gpu进行训练 + 2. torch.device:将模型装载到torch.device上。 - 4. list(int):如果多于1个device,将使用torch.nn.DataParallel包裹model, 并使用传入的device。 + 3. int: 将使用device_id为该值的gpu进行训练 - 5. None. 为None则不对模型进行任何处理,如果传入的model为torch.nn.DataParallel该值必须为None。 + 4. list(int):如果多于1个device,将使用torch.nn.DataParallel包裹model, 并使用传入的device。 - :param int verbose: 如果为0不输出任何信息; 如果为1,打印出验证结果。 + 5. None. 
为None则不对模型进行任何处理,如果传入的model为torch.nn.DataParallel该值必须为None。
-        """
+    :param int verbose: 如果为0不输出任何信息; 如果为1,打印出验证结果。
+    """
+
+    def __init__(self, data, model, metrics, batch_size=16, device=None, verbose=1):
        super(Tester, self).__init__()

        if not isinstance(data, DataSet):
@@ -103,7 +103,7 @@ class Tester(object):

    def test(self):
        """开始进行验证,并返回验证结果。

-        :return dict(dict) eval_results: dict为二层嵌套结构,dict的第一层是metric的名称; 第二层是这个metric的指标。
+        :return Dict[Dict] : dict的二层嵌套结构,dict的第一层是metric的名称; 第二层是这个metric的指标。
            一个AccuracyMetric的例子为{'AccuracyMetric': {'acc': 1.0}}。
        """
        # turn on the testing mode; clean up the history
diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py
index 7ace58d9..90aa0c19 100644
--- a/fastNLP/core/trainer.py
+++ b/fastNLP/core/trainer.py
@@ -1,13 +1,17 @@
 """
-Trainer的说明文档
-
-    .. _Trainer:
-
-Trainer在fastNLP中用于组织单任务的训练过程,可以避免用户在不同训练任务中重复撰写 (1) epoch循环; (2) 将数据分成不同的Batch; (3)
-对Batch进行pad; (4) 每个epoch结束或一定step后进行验证集验证; (5) 保存获得更好验证性能的模型等。
-
-1. Trainer的基本使用
-
+Trainer在fastNLP中用于组织单任务的训练过程,可以避免用户在不同训练任务中重复撰写以下步骤的代码
+
+    (1) epoch循环;
+
+    (2) 将数据分成不同的Batch;
+
+    (3) 对Batch进行pad;
+
+    (4) 每个epoch结束或一定step后进行验证集验证;
+
+    (5) 保存获得更好验证性能的模型。
+
+1 Trainer的基本使用
    下面的例子是使用神经网络来预测一个序列中是否有偶数个1。

    Example::
@@ -20,8 +24,8 @@

        from fastNLP import DataSet
        from fastNLP import Trainer
-        from fastNLP.core.losses import CrossEntropyLoss
-        from fastNLP.core.metrics import AccuracyMetric
+        from fastNLP import CrossEntropyLoss
+        from fastNLP import AccuracyMetric
        from fastNLP.modules.decoder import MLP

        # 模型
@@ -56,208 +60,214 @@

    由上面的例子可以看出通过使用Trainer,可以使得训练部分的代码大幅减少。
    使用Trainer需要满足以下几个条件:
-    1. 模型
+1.1 模型
+    1 模型的forward()的参数名需要与DataSet中的名字对应。实际上fastNLP在将DataSet中的数据传递给模型forward()时,是
+    通过匹配名称实现的。所以上例中,如果Model的forward函数修改为forward(self, data), 则DataSet中的'x'这个field就应该
+    改名为'data'。
-    1. 模型的forward()的参数名需要与DataSet中的名字对应。实际上fastNLP在将DataSet中的数据传递给模型forward()时,是
-    通过匹配名称实现的。所以上例中,如果Model的forward函数修改为forward(self, data), 则DataSet中的'x'这个field就应该
-    改名为'data'。
+    2 传递给forward()的参数是DataSet中被设置为input的那些field。但如果forward()中没有对应的参数,则不会将数据传递
+    给forward()。例如,DataSet中'x1', 'x2'都是input,但是模型的函数为forward(self, x1), 那么'x2'不会传递给forward()。
-    2. 传递给forward()的参数是DataSet中被设置为input的那些field。但如果forward()中没有对应的参数,则不会将数据传递
-    给forward()。例如,DataSet中'x1', 'x2'都是input,但是模型的函数为forward(self, x1), 那么'x2'不会传递给forward()。
+    3 模型的forward()返回值需要为一个dict。
-    3. 模型的forward()返回值需要为一个dict。
+1.2 Loss
+    fastNLP中为了不限制forward函数的返回内容数量(比如一些复杂任务需要返回多个内容,如Dependency Parsing),
+    :mod:`Loss` 与 :mod:`Metric` 都使用了通过名称来匹配相应内容的策略。如上面的例子中
-    2.
Loss + Example:: - fastNLP中的为了不限制forward函数的返回内容数量(比如一些复杂任务需要返回多个内容,如Dependency Parsing, Loss_ 与 Metric_ 都使 - 用了通过名称来匹配相应内容的策略。如上面的例子中 + trainer = Trainer(tr_dataset, model, loss=CrossEntropyLoss(target='label'), + optimizer=SGD(model.parameters(), lr=0.1),n_epochs=1000, + dev_data = dev_data, metrics=AccuracyMetric(target='label')) + + loss被设置为了 :class:`~fastNLP.CrossEntropyLoss` , 但在初始化的时候传入了target='label'这个参数, + :class:`~fastNLP.CrossEntropyLoss` 的初始化参数为(pred=None, target=None, padding_idx=-100)。 + + 这里的两个参数分别为计算CrossEntropy时需要使用到的模型的预测值与真实值。 + 其中 `pred` 一般来自于模型forward()的返回结果,`target` 一般是来自于DataSet中被设置为target的field。 + 由于每个人对真实值或者model的返回值取名并不一样,所以fastNLP的 :mod:`Loss` 提供一种类似于映射的机制来匹配对应的值, + 比如这里 :class:`~fastNLP.CrossEntropyLoss` 将尝试找到名为'label'的内容来作为真实值得到loss; + 而pred=None, 则 :class:`~fastNLP.CrossEntropyLoss` 使用'pred'作为名称匹配预测值, + 正好forward的返回值也叫pred,所以这里不需要申明pred。 + + 尽管fastNLP使用了映射机制来使得loss的计算变得比较灵活,但有些情况下loss必须在模型中进行计算,比如使用了CRF的模型。 + fastNLP中提供了 :class:`~fastNLP.LossInForward` 这个loss。 + 这个loss的原理是直接在forward()的返回结果中找到loss_key(默认寻找'loss')指定的那个tensor,并使用它作为loss。 + 如果Trainer初始化没有提供loss则默认使用 :class:`~fastNLP.LossInForward` 。TODO 补充一个例子 详细例子可以参照 + +1.3 Metric + :mod:`Metric` 使用了与上述Loss一样的策略,即使用名称进行匹配。 + AccuracyMetric(target='label')的情况与CrossEntropyLoss 是同理的。 + + 在进行验证时,可能用到的计算与forward()中不太一致,没有办法直接从forward()的结果中得到预测值,这时模型可以提供一个predict()方法, + 如果提供的模型具有predict方法,则在模型验证时将调用predict()方法获取预测结果, + 传入到predict()的参数也是从DataSet中被设置为input的field中选择出来的; + 与forward()一样,返回值需要为一个dict。 TODO 补充一个例子 具体例子可以参考 - Example:: +2 Trainer的代码检查 + 由于在fastNLP中采取了映射的机制,所以难免可能存在对应出错的情况。Trainer提供一种映射检查机制,可以通过check_code_level来进行控制 + 比如下面的例子中,由于各种原因产生的报错 + +Example2.1 + :: + + import numpy as np + from torch import nn + import torch + from torch.optim import SGD + from fastNLP import Trainer + from fastNLP import DataSet - trainer = Trainer(tr_dataset, model, loss=CrossEntropyLoss(target='label'), - optimizer=SGD(model.parameters(), lr=0.1),n_epochs=1000, - dev_data = dev_data, metrics=AccuracyMetric(target='label')) + class Model(nn.Module): + def __init__(self): + super().__init__() + self.fc = nn.Linear(1, 1) + def forward(self, x, b): + loss = torch.mean((self.fc(x)-b)**2) + return {'loss': loss} + model = Model() - loss被设置为了 CrossEntropyLoss_ , 但在初始化的时候传入了target='label'这个参数, CrossEntropyLoss_ 的初始化 - 参数为(pred=None, target=None, padding_idx=-100)。这里的两个参数分别为计算CrossEntropy时需要使用到的模型的预测值 - 与真实值。其中'pred'一般来自于模型forward()的返回结果,'target'一般是来自于DataSet中被设置为target的 - field。由于每个人对真实值或者model的返回值取名并不一样,所以fastNLP的 Loss_ 提供一种类似于映射的机制来匹配 - 对应的值,比如这里 CrossEntropyLoss_ 将尝试找到名为'label'的内容来作为真实值得到loss;而pred=None, 则 CrossEntropyLoss_ - 使用'pred'作为名称匹配预测值,正好forward的返回值也叫pred,所以这里不需要申明pred。 + dataset = DataSet({'a': np.arange(10), 'b':np.arange(10)*2}) + dataset.set_input('a', 'b') - 尽管fastNLP使用了映射机制来使得loss的计算变得比较灵活,但有些情况下loss必须在模型中进行计算,比如使用了CRF的模型。fastNLP中提供了 LossInForward_ 这 - 个loss。这个loss的原理是直接在forward()的返回结果中找到loss_key(默认寻找'loss')指定的那个tensor, - 并使用它作为loss。 如果Trainer初始化没有提供loss则默认使用 LossInForward_ 。详细例子可以参照 TODO 补充一个例子 + trainer = Trainer(dataset, model, loss=None, optimizer=SGD(model.parameters(), lr=0.001)) - 3. Metric + trainer = Trainer(dataset, model, SGD(model.parameters())) + # 会报以下的错误 + # input fields after batch(if batch size is 2): + # a: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) + # b: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) + # There is no target field. + # .... 
+ # NameError: + # Problems occurred when calling Model.forward(self, x, b) + # missing param: ['x'] + # unused field: ['a'] + # Suggestion: You need to provide ['x'] in DataSet and set it as input. - Metric_ 使用了与上述Loss一样的策略,即使用名称进行匹配。AccuracyMetric(target='label')的情况与CrossEntropyLoss 是同理的。 + 这里就是由于在Trainer初始化的时候,fastNLP会尝试使用一个batch_size=2的batch去运行一遍forward()以及backward()。这里有两类 + 信息可以为你提供参考 - 在进行验证时,可能用到的计算与forward()中不太一致,没有办法直接从forward()的结果中得到预测值,这时模型可以提供一个predict()方法, - 如果提供的模型具有predict方法,则在模型验证时将调用predict()方法获取预测结果,传入到predict()的参数也是从DataSet中被设置为input - 的field中选择出来的; 与forward()一样,返回值需要为一个dict。具体例子可以参考 TODO 补充一个例子 + 1 'input fields after batch...'这部分显示的是train dataset经过Batch操作后,每个field对应的类型以及进行shape。这里 + 因为train dataset没有target所以没有显示。根据这里可以看出是否正确将需要的内容设置为了input或target。 -2. Trainer的代码检查 + 2 NameError,NameError发生在映射出错的情况。这里报错的原因是由于尝试进行forward计算时(可以通过Model.forward(self, x, b)判断 + 出当前是在调取forward),却没有获取到forward()函数中需要的'x';在报错信息中同时指出了缺'x',而'a'没有被使用,那么可能 + 就是由于field的名称不对。这里将dataset中'a'这个field的名称改为'x',或者model的参数从'x'修改为'a'都可以解决问题。 - 由于在fastNLP中采取了映射的机制,所以难免可能存在对应出错的情况。Trainer提供一种映射检查机制,可以通过check_code_level来进行控制 - 比如下面的例子中,由于各种原因产生的报错 + 下面的例子是由于loss计算的时候找不到需要的值 - Example1:: - - import numpy as np - from torch import nn - import torch - from torch.optim import SGD - from fastNLP import Trainer - from fastNLP import DataSet - - class Model(nn.Module): - def __init__(self): - super().__init__() - self.fc = nn.Linear(1, 1) - def forward(self, x, b): - loss = torch.mean((self.fc(x)-b)**2) - return {'loss': loss} - model = Model() - - dataset = DataSet({'a': np.arange(10), 'b':np.arange(10)*2}) - dataset.set_input('a', 'b') - - trainer = Trainer(dataset, model, loss=None, optimizer=SGD(model.parameters(), lr=0.001)) - - trainer = Trainer(dataset, model, SGD(model.parameters())) - # 会报以下的错误 - # input fields after batch(if batch size is 2): - # a: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) - # b: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) - # There is no target field. - # .... - # NameError: - # Problems occurred when calling Model.forward(self, x, b) - # missing param: ['x'] - # unused field: ['a'] - # Suggestion: You need to provide ['x'] in DataSet and set it as input. - - 这里就是由于在Trainer初始化的时候,fastNLP会尝试使用一个batch_size=2的batch去运行一遍forward()以及backward()。这里有两类 - 信息可以为你提供参考 - - 1. 'input fields after batch...'这部分显示的是train dataset经过Batch操作后,每个field对应的类型以及进行shape。这里 - 因为train dataset没有target所以没有显示。根据这里可以看出是否正确将需要的内容设置为了input或target。 - - 2. 
NameError,NameError发生在映射出错的情况。这里报错的原因是由于尝试进行forward计算时(可以通过Model.forward(self, x, b)判断 - 出当前是在调取forward),却没有获取到forward()函数中需要的'x';在报错信息中同时指出了缺'x',而'a'没有被使用,那么可能 - 就是由于field的名称不对。这里将dataset中'a'这个field的名称改为'x',或者model的参数从'x'修改为'a'都可以解决问题。 - - 下面的例子是由于loss计算的时候找不到需要的值 - - Example2:: - - import numpy as np - from torch import nn - from torch.optim import SGD - from fastNLP import Trainer - from fastNLP import DataSet - from fastNLP.core.losses import L1Loss - import torch - - class Model(nn.Module): - def __init__(self): - super().__init__() - self.fc = nn.Linear(1, 1) - def forward(self, a): - return {'pred_b': self.fc(a.unsqueeze(1)).squeeze(1), 'No use':1} - - model = Model() - - dataset = DataSet({'a': np.arange(10, dtype=float), 'b':np.arange(10, dtype=float)*2}) - - dataset.set_input('a') - dataset.set_target('b') - - trainer = Trainer(dataset, model, loss=L1Loss(target='label'), optimizer=SGD(model.parameters(), lr=0.001)) - # 报错信息如下 - # input fields after batch(if batch size is 2): - # a: (1)type:torch.Tensor (2)dtype:torch.float32, (3)shape:torch.Size([2]) - # target fields after batch(if batch size is 2): - # b: (1)type:torch.Tensor (2)dtype:torch.float32, (3)shape:torch.Size([2]) - # .... - # NameError: - # Problems occurred when calling L1Loss.get_loss(self, pred, target) - # missing param: ['pred(assign to `pred` in `L1Loss`)', 'label(assign to `target` in `L1Loss`)'] - # unused field: ['b'] - # unused param: ['pred_b', 'No use'] - # target field: ['b'] - # param from Model.forward(self, a): ['pred_b', 'No use'] - # Suggestion: (1). Check key assignment for `target` when initialize L1Loss. Or provide `label` in DataSet or output of Model.forward(self, a). - # (2). Check key assignment for `pred` when initialize L1Loss. Or provide `pred` in DataSet or output of Model.forward(self, a). - - 报错信息也包含两部分: - - 1. 第一部分与上面是一样的 - - 2. 这里报错的原因是由于计算loss的时候找不到相应的值(通过L1Loss.get_loss(self, pred, target)判断出来的);报错的原因是因为 - `pred`和`label`(我们在初始化L1Loss时将target指定为了label)都没有找到。这里'unused field'是DataSet中出现了,但却没有 - 被设置为input或者target的field;'unused param'是forward()中返回且没有被使用到的内容;'target field'是被设置为了 - target的field; 'param from Model.forward(self, a)'是forward()返回的所有key。"Suggestion"是关于当前错误处理的建议。 - - 但是在一些情况下,比如forward()返回值只有一个,target也只有一个,fastNLP不会进行匹配,而直接将forward()的结果作为pred, 将 - DataSet中的target设置为target。上面的例子在返回值中加入了一个'No use'则只是为了使得Loss去匹配结果。 - - - 下面是带有dev dataset时如果出现错误会发生的报错, - - Example3:: - - import numpy as np - from torch import nn - from torch.optim import SGD - from fastNLP import Trainer - from fastNLP import DataSet - from fastNLP import AccuracyMetric - import torch - - class Model(nn.Module): - def __init__(self): - super().__init__() - self.fc = nn.Linear(1, 1) - def forward(self, a, b): - loss = torch.mean((self.fc(a.float().unsqueeze(1))-b.float())**2) - return {'loss': loss} - def predict(self, a): # 使用predict()进行验证 - return {'output':self.fc(a.float().unsqueeze(1))} #这里return的值不包含'pred'这个key - model = Model() - - dataset = DataSet({'a': np.arange(10), 'b':np.arange(10)*2}) - dev_data = DataSet({'a': np.arange(10, 20), 'b':np.arange(10, 20)*2}) - - dataset.set_input('a', 'b') - dev_data.set_input('a') # 这里没有设置target - - trainer = Trainer(dataset, model, loss=None, optimizer=SGD(model.parameters(), lr=0.001), - dev_data=dev_data, metrics=AccuracyMetric()) - - # 报错信息 - # ... 
- # NameError: - # Problems occurred when calling AccuracyMetric.evaluate(self, pred, target, seq_len=None) - # missing param: ['pred(assign to `pred` in `AccuracyMetric`)', 'target(assign to `target` in `AccuracyMetric`)'] - # unused param: ['output'] - # target field: [] - # param from Model.predict(self, a): ['output'] - # Suggestion: (1). Check key assignment for `pred` when initialize AccuracyMetric. Or provide `pred` in DataSet or output of Model.predict(self, a). - # (2). Check key assignment for `target` when initialize AccuracyMetric. Or provide `target` in DataSet or output of Model.predict(self, a). - - 报错信息和前面都是类似的,但是可以通过'AccuracyMetric.evaluate(self, pred, target, seq_len=None)'看出这里是evaluation - 的时候发生了错误。这样避免了需要在完成一整个epoch的训练才能发现evaluation弄错的情况。这里的修改是通过在初始化metric的时候 - 指明通过'output'获取`pred`, 即AccuracyMetric(pred='output')。 +Example2.2 + :: - 可以通过check_code_level调节检查的强度。默认为0,即进行检查。 + import numpy as np + from torch import nn + from torch.optim import SGD + from fastNLP import Trainer + from fastNLP import DataSet + from fastNLP import L1Loss + import torch + + class Model(nn.Module): + def __init__(self): + super().__init__() + self.fc = nn.Linear(1, 1) + def forward(self, a): + return {'pred_b': self.fc(a.unsqueeze(1)).squeeze(1), 'No use':1} + + model = Model() + + dataset = DataSet({'a': np.arange(10, dtype=float), 'b':np.arange(10, dtype=float)*2}) + + dataset.set_input('a') + dataset.set_target('b') + + trainer = Trainer(dataset, model, loss=L1Loss(target='label'), optimizer=SGD(model.parameters(), lr=0.001)) + # 报错信息如下 + # input fields after batch(if batch size is 2): + # a: (1)type:torch.Tensor (2)dtype:torch.float32, (3)shape:torch.Size([2]) + # target fields after batch(if batch size is 2): + # b: (1)type:torch.Tensor (2)dtype:torch.float32, (3)shape:torch.Size([2]) + # .... + # NameError: + # Problems occurred when calling L1Loss.get_loss(self, pred, target) + # missing param: ['pred(assign to `pred` in `L1Loss`)', 'label(assign to `target` in `L1Loss`)'] + # unused field: ['b'] + # unused param: ['pred_b', 'No use'] + # target field: ['b'] + # param from Model.forward(self, a): ['pred_b', 'No use'] + # Suggestion: (1). Check key assignment for `target` when initialize L1Loss. Or provide `label` in DataSet or output of Model.forward(self, a). + # (2). Check key assignment for `pred` when initialize L1Loss. Or provide `pred` in DataSet or output of Model.forward(self, a). + + 报错信息也包含两部分: + + 1 第一部分与上面是一样的 + + 2 这里报错的原因是由于计算loss的时候找不到相应的值(通过L1Loss.get_loss(self, pred, target)判断出来的); + 报错的原因是因为 `pred` 和 `label` (我们在初始化L1Loss时将target指定为了label)都没有找到。 + 这里'unused field'是DataSet中出现了,但却没有被设置为input或者target的field; + 'unused param'是forward()中返回且没有被使用到的内容;'target field'是被设置为了target的field; + 'param from Model.forward(self, a)'是forward()返回的所有key。"Suggestion"是关于当前错误处理的建议。 + + 但是在一些情况下,比如forward()返回值只有一个,target也只有一个,fastNLP不会进行匹配,而直接将forward()的结果作为pred, + 将DataSet中的target设置为target。上面的例子在返回值中加入了一个'No use'则只是为了使得Loss去匹配结果。 -3. 
Trainer与callback + 下面是带有dev dataset时如果出现错误会发生的报错, + +Example2.3 + :: + + import numpy as np + from torch import nn + from torch.optim import SGD + from fastNLP import Trainer + from fastNLP import DataSet + from fastNLP import AccuracyMetric + import torch + + class Model(nn.Module): + def __init__(self): + super().__init__() + self.fc = nn.Linear(1, 1) + def forward(self, a, b): + loss = torch.mean((self.fc(a.float().unsqueeze(1))-b.float())**2) + return {'loss': loss} + def predict(self, a): # 使用predict()进行验证 + return {'output':self.fc(a.float().unsqueeze(1))} #这里return的值不包含'pred'这个key + model = Model() + + dataset = DataSet({'a': np.arange(10), 'b':np.arange(10)*2}) + dev_data = DataSet({'a': np.arange(10, 20), 'b':np.arange(10, 20)*2}) + + dataset.set_input('a', 'b') + dev_data.set_input('a') # 这里没有设置target + + trainer = Trainer(dataset, model, loss=None, optimizer=SGD(model.parameters(), lr=0.001), + dev_data=dev_data, metrics=AccuracyMetric()) + + # 报错信息 + # ... + # NameError: + # Problems occurred when calling AccuracyMetric.evaluate(self, pred, target, seq_len=None) + # missing param: ['pred(assign to `pred` in `AccuracyMetric`)', 'target(assign to `target` in `AccuracyMetric`)'] + # unused param: ['output'] + # target field: [] + # param from Model.predict(self, a): ['output'] + # Suggestion: (1). Check key assignment for `pred` when initialize AccuracyMetric. Or provide `pred` in DataSet or output of Model.predict(self, a). + # (2). Check key assignment for `target` when initialize AccuracyMetric. Or provide `target` in DataSet or output of Model.predict(self, a). + + 报错信息和前面都是类似的,但是可以通过'AccuracyMetric.evaluate(self, pred, target, seq_len=None)'看出这里是evaluation + 的时候发生了错误。这样避免了需要在完成一整个epoch的训练才能发现evaluation弄错的情况。这里的修改是通过在初始化metric的时候 + 指明通过'output'获取`pred`, 即AccuracyMetric(pred='output')。 + + 可以通过check_code_level调节检查的强度。默认为0,即进行检查。 + +3 Trainer与callback 虽然Trainer本身已经集成了一些功能,但仍然不足以囊括训练过程中可能需要到的功能,比如负采样,learning rate decay, Early Stop等。 - 为了解决这个问题fastNLP引入了callback的机制,Callback_ 是一种在Trainer训练过程中特定阶段会运行的函数集合,所有的 Callback_ 都具有 - on_*(比如on_train_start, on_backward_begin)等函数。如果 Callback 实现了该函数,则Trainer运行至对应阶段,会进行调用。 + 为了解决这个问题fastNLP引入了callback的机制,:class:`~fastNLP.Callback` 是一种在Trainer训练过程中特定阶段会运行的函数集合, + 所有的 :class:`~fastNLP.Callback` 都具有on_*(比如on_train_start, on_backward_begin)等函数。 + 如果 Callback 实现了该函数,则Trainer运行至对应阶段,会进行调用。 我们将Train.train()这个函数内部分为以下的阶段,在对应阶段会触发相应的调用。 @@ -286,12 +296,11 @@ Trainer在fastNLP中用于组织单任务的训练过程,可以避免用户在 callback.on_train_end() # 训练结束 callback.on_exception() # 这是一个特殊的步骤,在训练过程中遭遇exception会跳转到这里 - fastNLP已经自带了很多callback函数供使用,可以参考 Callback_ 。一些关于callback的例子,请参考 #TODO callback的例子 + fastNLP已经自带了很多callback函数供使用,可以参考 :class:`~fastNLP.Callback` 。 + TODO callback的例子 一些关于callback的例子,请参考 """ - - import os import time from datetime import datetime @@ -300,32 +309,91 @@ from datetime import timedelta import numpy as np import torch from torch import nn -import warnings + try: from tqdm.autonotebook import tqdm except: - from fastNLP.core.utils import _pseudo_tqdm as tqdm - -from fastNLP.core.batch import Batch -from fastNLP.core.callback import CallbackManager, CallbackException -from fastNLP.core.dataset import DataSet -from fastNLP.core.losses import _prepare_losser -from fastNLP.core.metrics import _prepare_metrics -from fastNLP.core.sampler import Sampler -from fastNLP.core.sampler import RandomSampler -from fastNLP.core.sampler import SequentialSampler -from fastNLP.core.tester import Tester -from fastNLP.core.utils import _CheckError -from fastNLP.core.utils import 
_build_args -from fastNLP.core.utils import _check_forward_error -from fastNLP.core.utils import _check_loss_evaluate -from fastNLP.core.utils import _move_dict_value_to_device -from fastNLP.core.utils import _get_func_signature -from fastNLP.core.utils import _get_model_device -from fastNLP.core.optimizer import Optimizer -from fastNLP.core.utils import _move_model_to_device + from .utils import _pseudo_tqdm as tqdm + +from .batch import Batch +from .callback import CallbackManager, CallbackException +from .dataset import DataSet +from .losses import _prepare_losser +from .metrics import _prepare_metrics +from .sampler import Sampler +from .sampler import RandomSampler +from .sampler import SequentialSampler +from .tester import Tester +from .utils import _CheckError +from .utils import _build_args +from .utils import _check_forward_error +from .utils import _check_loss_evaluate +from .utils import _move_dict_value_to_device +from .utils import _get_func_signature +from .utils import _get_model_device +from .optimizer import Optimizer +from .utils import _move_model_to_device + class Trainer(object): + """ + 别名::class:`fastNLP.Trainer` :class:`fastNLP.core.trainer.Trainer` + + Trainer在fastNLP中用于组织单任务的训练过程,可以避免用户在不同训练任务中重复撰写 + (1) epoch循环; + (2) 将数据分成不同的Batch; + (3) 对Batch进行pad; + (4) 每个epoch结束或一定step后进行验证集验证; + (5) 保存获得更好验证性能的模型等。 + + 详细的介绍参见 :doc:`fastNLP.core.trainer` + + :param train_data: 训练集, :class:`~fastNLP.DataSet` 类型。 + :param nn.modules model: 待训练的模型 + :param torch.optim.Optimizer optimizer: 优化器。如果为None,则Trainer使用默认的Adam(model.parameters(), lr=4e-3)这个优化器 + :param int batch_size: 训练和验证的时候的batch大小。 + :param loss: 使用的 :class:`~fastNLP.core.losses.LossBase` 对象。当为None时,默认使用 :class:`~fastNLP.LossInForward` + :param sampler: Batch数据生成的顺序, :class:`~fastNLP.Sampler` 类型。如果为None,默认使用 :class:`~fastNLP.RandomSampler` + :param update_every: int, 多少步更新一次梯度。用于希望累计梯度的场景,比如需要128的batch_size, 但是直接设为128 + 会导致内存不足,通过设置batch_size=32, update_every=4达到目的。当optimizer为None时,该参数无效。 + :param int n_epochs: 需要优化迭代多少次。 + :param int print_every: 多少次反向传播更新tqdm显示的loss; 如果use_tqdm=False, 则多少次反向传播打印loss。 + :param dev_data: 用于做验证的DataSet, :class:`~fastNLP.DataSet` 类型。 + :param metrics: 验证的评估函数。可以只使用一个 :class:`Metric` , + 也可以使用多个 :class:`Metric` ,通过列表传入。 + 如验证时取得了更好的验证结果(如果有多个Metric,以列表中第一个Metric为准),且save_path不为None, + 则保存当前模型。Metric种类详见 :doc:`metrics模块 ` 。仅在传入dev_data时有效。 + :param str,None metric_key: :class:`Metric` 有时会有多个指标, + 比如 :class:`~fastNLP.core.metrics.SpanFPreRecMetric` 中包含了'f', 'pre', 'rec'。此时需 + 要指定以哪个指标为准。另外有些指标是越小效果越好,比如语言模型的困惑度,这种情况下,在key前面增加一个'-'来表 + 明验证时,值越小越好(比如: "-ppl")。仅在传入dev_data时有效。 + :param int validate_every: 多少个step在验证集上验证一次; 如果为-1,则每个epoch结束验证一次。仅在传入dev_data时有效。 + :param str,None save_path: 将模型保存路径。如果为None,则不保存模型。如果dev_data为None,则保存最后一次迭代的模型。 + 保存的时候不仅保存了参数,还保存了模型结构。即便使用DataParallel,这里也只保存模型。 + :param prefetch: bool, 是否使用额外的进程对产生batch数据。理论上会使得Batch迭代更快。 + :param bool use_tqdm: 是否使用tqdm来显示训练进度; 如果为False,则将loss打印在终端中。 + :param str,int,torch.device,list(int) device: 将模型load到哪个设备。默认为None,即Trainer不对模型 + 的计算位置进行管理。支持以下的输入: + + 1. str: ['cpu', 'cuda', 'cuda:0', 'cuda:1', ...] 依次为'cpu'中, 可见的第一个GPU中, 可见的第一个GPU中, + 可见的第二个GPU中; + + 2. torch.device:将模型装载到torch.device上。 + + 3. int: 将使用device_id为该值的gpu进行训练 + + 4. list(int):如果多于1个device,将使用torch.nn.DataParallel包裹model, 并使用传入的device。 + + 5. None. 
为None则不对模型进行任何处理,如果传入的model为torch.nn.DataParallel该值必须为None。 + + :param list(callbacks) callbacks: 用于在train过程中起调节作用的回调函数。比如early stop,negative sampling等可以 + 通过callback机制实现。 可使用的callback参见 :doc:`callback模块 ` + :param int check_code_level: 模型检查等级. -1: 不进行检查; 0: 仅出现错误时停止; 1: 如果有field没有被使用, + 报告警告信息; 2: 有任何field没有被使用都报错. 检查的原理是通过使用很小的batch(默认2个sample)来运行代码,但是 + 这个过程理论上不会修改任何参数,只是会检查能否运行。但如果(1)模型中存在将batch_size写为某个固定值的情况; + (2)模型中存在累加前向计算次数的,可能会多计算1次。以上情况建议将check_code_level设置为-1。 + """ + def __init__(self, train_data, model, optimizer=None, loss=None, batch_size=32, sampler=None, update_every=1, n_epochs=10, print_every=5, @@ -334,74 +402,30 @@ class Trainer(object): prefetch=False, use_tqdm=True, device=None, callbacks=None, check_code_level=0): - """ - :param DataSet train_data: 训练集 - :param nn.modules model: 待训练的模型 - :param torch.optim.Optimizer,None optimizer: 优化器。如果为None,则Trainer使用默认的Adam(model.parameters(), lr=4e-3)这个优化器 - :param int batch_size: 训练和验证的时候的batch大小。 - :param LossBase loss: 使用的Loss对象。 详见 LossBase_ 。当loss为None时,默认使用 LossInForward_ 。 - :param Sampler sampler: Batch数据生成的顺序。详见 Sampler_ 。如果为None,默认使用 RandomSampler_ 。 - :param update_every: int, 多少步更新一次梯度。用于希望累计梯度的场景,比如需要128的batch_size, 但是直接设为128 - 会导致内存不足,通过设置batch_size=32, update_every=4达到目的。当optimizer为None时,该参数无效。 - :param int n_epochs: 需要优化迭代多少次。 - :param int print_every: 多少次反向传播更新tqdm显示的loss; 如果use_tqdm=False, 则多少次反向传播打印loss。 - :param DataSet dev_data: 用于做验证的DataSet。 - :param MetricBase,list(MetricBase) metrics: 验证的评估函数。可以只使用一个Metric,也可以使用多个Metric,通过 - 列表传入。如验证时取得了更好的验证结果(如果有多个Metric,以列表中第一个Metric为准),且save_path不为None, - 则保存当前模型。Metric种类详见 Metric_ 。仅在传入dev_data时有效。 - :param str,None metric_key: Metric_ 有时会有多个指标,比如 SpanFPreRecMetric_ 中包含了'f', 'pre', 'rec'。此时需 - 要指定以哪个指标为准。另外有些指标是越小效果越好,比如语言模型的困惑度,这种情况下,在key前面增加一个'-'来表 - 明验证时,值越小越好(比如: "-ppl")。仅在传入dev_data时有效。 - :param int validate_every: 多少个step在验证集上验证一次; 如果为-1,则每个epoch结束验证一次。仅在传入dev_data时有 - 效。 - :param str,None save_path: 将模型保存路径。如果为None,则不保存模型。如果dev_data为None,则保存最后一次迭代的模 - 型。保存的时候不仅保存了参数,还保存了模型结构。即便使用DataParallel,这里也只保存模型。 - :param prefetch: bool, 是否使用额外的进程对产生batch数据。理论上会使得Batch迭代更快。 - :param bool use_tqdm: 是否使用tqdm来显示训练进度; 如果为False,则将loss打印在终端中。 - :param str,int,torch.device,list(int) device: 将模型load到哪个设备。默认为None,即Trainer不对模型 - 的计算位置进行管理。支持以下的输入: - - 1. str: ['cpu', 'cuda', 'cuda:0', 'cuda:1', ...] 依次为'cpu'中, 可见的第一个GPU中, 可见的第一个GPU中, - 可见的第二个GPU中; - - 2. torch.device:将模型装载到torch.device上。 - - 3. int: 将使用该device的gpu进行训练 - - 4. list(int):如果多于1个device,将使用torch.nn.DataParallel包裹model, 并使用传入的device。 - - 5. None. 为None则不对模型进行任何处理,如果传入的model为torch.nn.DataParallel该值必须为None。 - - :param list(callbacks) callbacks: 用于在train过程中起调节作用的回调函数。比如early stop,negative sampling等可以 - 通过callback机制实现。 可使用的callback参见 Callback_ 。 - :param int check_code_level: 模型检查等级. -1: 不进行检查; 0: 仅出现错误时停止; 1: 如果有field没有被使用, - 报告警告信息; 2: 有任何field没有被使用都报错. 
检查的原理是通过使用很小的batch(默认2个sample)来运行代码,但是 - 这个过程理论上不会修改任何参数,只是会检查能否运行。但如果(1)模型中存在将batch_size写为某个固定值的情况; - (2)模型中存在累加前向计算次数的,可能会多计算1次。以上情况建议将check_code_level设置为-1。 - """ - super(Trainer, self).__init__() + super(Trainer, self).__init__() + if not isinstance(train_data, DataSet): raise TypeError(f"The type of train_data must be fastNLP.DataSet, got {type(train_data)}.") if not isinstance(model, nn.Module): raise TypeError(f"The type of model must be torch.nn.Module, got {type(model)}.") - + # check metrics and dev_data if (not metrics) and dev_data is not None: raise ValueError("No metric for dev_data evaluation.") if metrics and (dev_data is None): raise ValueError("No dev_data for evaluations, pass dev_data or set metrics to None. ") - + # check update every assert update_every >= 1, "update_every must be no less than 1." self.update_every = int(update_every) - + # check save_path if not (save_path is None or isinstance(save_path, str)): raise ValueError("save_path can only be None or `str`.") # prepare evaluate metrics = _prepare_metrics(metrics) - + # parse metric_key # increase_better is True. It means the exp result gets better if the indicator increases. # It is true by default. @@ -411,19 +435,19 @@ class Trainer(object): self.metric_key = metric_key[1:] if metric_key[0] == "+" or metric_key[0] == "-" else metric_key elif len(metrics) > 0: self.metric_key = metrics[0].__class__.__name__.lower().strip('metric') - + # prepare loss losser = _prepare_losser(loss) - + # sampler check if sampler is not None and not isinstance(sampler, Sampler): raise ValueError("The type of sampler should be fastNLP.BaseSampler, got {}.".format(type(sampler))) - + if check_code_level > -1: _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data, metric_key=metric_key, check_level=check_code_level, batch_size=min(batch_size, DEFAULT_CHECK_BATCH_SIZE)) - + self.train_data = train_data self.dev_data = dev_data # If None, No validation. 
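        # 一个最小的使用示意(假设 tr_dataset / dev_data / model 已按本模块文档开头的方式构造好,
        # 仅为帮助理解上述参数如何组合,并非 __init__ 的逻辑本身):
        #
        #     trainer = Trainer(tr_dataset, model,
        #                       loss=CrossEntropyLoss(target='label'),
        #                       optimizer=SGD(model.parameters(), lr=0.1),
        #                       dev_data=dev_data,
        #                       metrics=AccuracyMetric(target='label'),
        #                       n_epochs=10, check_code_level=0)
        #     trainer.train()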
self.model = model @@ -443,10 +467,9 @@ class Trainer(object): self.callback_manager = CallbackManager(env={"trainer": self}, callbacks=callbacks) self.n_steps = (len(self.train_data) // self.batch_size + int( len(self.train_data) % self.batch_size != 0)) * self.n_epochs - - # 是否一开始就是DataParallel的。 + self.model = _move_model_to_device(self.model, device=device) - + if isinstance(optimizer, torch.optim.Optimizer): self.optimizer = optimizer elif isinstance(optimizer, Optimizer): @@ -455,11 +478,11 @@ class Trainer(object): self.optimizer = torch.optim.Adam(model.parameters(), lr=4e-3) else: raise TypeError("optimizer can only be torch.optim.Optimizer type, not {}.".format(type(optimizer))) - + self.use_tqdm = use_tqdm self.pbar = None self.print_every = abs(self.print_every) - + if self.dev_data is not None: self.tester = Tester(model=self.model, data=self.dev_data, @@ -467,13 +490,13 @@ class Trainer(object): batch_size=self.batch_size, device=None, # 由上面的部分处理device verbose=0) - + self.step = 0 self.start_time = None # start timestamp - + self.callback_manager = CallbackManager(env={"trainer": self}, callbacks=callbacks) - + def train(self, load_best_model=True): """ 使用该函数使Trainer开始训练。 @@ -502,14 +525,14 @@ class Trainer(object): self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) start_time = time.time() print("training epochs started " + self.start_time, flush=True) - + try: self.callback_manager.on_train_begin() self._train() self.callback_manager.on_train_end() except (CallbackException, KeyboardInterrupt) as e: self.callback_manager.on_exception(e) - + if self.dev_data is not None and hasattr(self, 'best_dev_perf'): print( "\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) + @@ -527,9 +550,9 @@ class Trainer(object): finally: pass results['seconds'] = round(time.time() - start_time, 2) - + return results - + def _train(self): if not self.use_tqdm: from fastNLP.core.utils import _pseudo_tqdm as inner_tqdm @@ -538,7 +561,7 @@ class Trainer(object): self.step = 0 self.epoch = 0 start = time.time() - + with inner_tqdm(total=self.n_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar: self.pbar = pbar if isinstance(pbar, tqdm) else None avg_loss = 0 @@ -557,21 +580,21 @@ class Trainer(object): # negative sampling; replace unknown; re-weight batch_y self.callback_manager.on_batch_begin(batch_x, batch_y, indices) prediction = self._data_forward(self.model, batch_x) - + # edit prediction self.callback_manager.on_loss_begin(batch_y, prediction) loss = self._compute_loss(prediction, batch_y).mean() avg_loss += loss.item() loss = loss / self.update_every - + # Is loss NaN or inf? 
requires_grad = False self.callback_manager.on_backward_begin(loss) self._grad_backward(loss) self.callback_manager.on_backward_end() - + self._update() self.callback_manager.on_step_end() - + if self.step % self.print_every == 0: avg_loss = float(avg_loss) / self.print_every if self.use_tqdm: @@ -585,7 +608,7 @@ class Trainer(object): pbar.set_postfix_str(print_output) avg_loss = 0 self.callback_manager.on_batch_end() - + if ((self.validate_every > 0 and self.step % self.validate_every == 0) or (self.validate_every < 0 and self.step % len(data_iterator) == 0)) \ and self.dev_data is not None: @@ -594,20 +617,20 @@ class Trainer(object): self.n_steps) + \ self.tester._format_eval_results(eval_res) pbar.write(eval_str + '\n') - + # ================= mini-batch end ==================== # - + # lr decay; early stopping self.callback_manager.on_epoch_end() # =============== epochs end =================== # pbar.close() self.pbar = None # ============ tqdm end ============== # - + def _do_validation(self, epoch, step): self.callback_manager.on_valid_begin() res = self.tester.test() - + is_better_eval = False if self._better_eval_result(res): if self.save_path is not None: @@ -622,7 +645,7 @@ class Trainer(object): # get validation results; adjust optimizer self.callback_manager.on_valid_end(res, self.metric_key, self.optimizer, is_better_eval) return res - + def _mode(self, model, is_test=False): """Train mode or Test mode. This is for PyTorch currently. @@ -634,21 +657,22 @@ class Trainer(object): model.eval() else: model.train() - + def _update(self): """Perform weight update on a model. """ if self.optimizer is not None and (self.step + 1) % self.update_every == 0: self.optimizer.step() - + def _data_forward(self, network, x): x = _build_args(network.forward, **x) y = network(**x) if not isinstance(y, dict): - raise TypeError(f"The return value of {_get_func_signature(network.forward)} should be dict, got {type(y)}.") + raise TypeError( + f"The return value of {_get_func_signature(network.forward)} should be dict, got {type(y)}.") return y - + def _grad_backward(self, loss): """Compute gradient with link rules. @@ -659,7 +683,7 @@ class Trainer(object): if self.step % self.update_every == 0: self.model.zero_grad() loss.backward() - + def _compute_loss(self, predict, truth): """Compute loss given prediction and ground truth. @@ -668,7 +692,7 @@ class Trainer(object): :return: a scalar """ return self.losser(predict, truth) - + def _save_model(self, model, model_name, only_param=False): """ 存储不含有显卡信息的state_dict或model :param model: @@ -691,7 +715,7 @@ class Trainer(object): model.cpu() torch.save(model, model_path) model.to(self._model_device) - + def _load_model(self, model, model_name, only_param=False): # 返回bool值指示是否成功reload模型 if self.save_path is not None: @@ -709,7 +733,7 @@ class Trainer(object): else: return False return True - + def _better_eval_result(self, metrics): """Check if the current epoch yields better validation results. 
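
        例如,metric_key 的正负号约定可以用下面的简化示意理解
        (这里的 _is_better 只是为说明而写的假设函数,并非本方法的实现)::

            def _is_better(new_val, old_val, metric_key):
                # metric_key 以'-'开头表示该指标越小越好,比如语言模型的困惑度 "-ppl"
                if metric_key.startswith('-'):
                    return new_val < old_val
                return new_val > old_val

            assert _is_better(50.2, 60.8, '-ppl')  # 困惑度下降,验证结果更好
            assert _is_better(0.92, 0.90, 'acc')   # 准确率上升,验证结果更好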
@@ -760,7 +784,7 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ check_level=0): # check get_loss 方法 model_devcie = model.parameters().__next__().device - + batch = Batch(dataset=dataset, batch_size=batch_size, sampler=SequentialSampler()) for batch_count, (batch_x, batch_y) in enumerate(batch): _move_dict_value_to_device(batch_x, batch_y, device=model_devcie) @@ -784,13 +808,13 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ print(info_str) _check_forward_error(forward_func=model.forward, dataset=dataset, batch_x=batch_x, check_level=check_level) - + refined_batch_x = _build_args(model.forward, **batch_x) pred_dict = model(**refined_batch_x) func_signature = _get_func_signature(model.forward) if not isinstance(pred_dict, dict): raise TypeError(f"The return value of {func_signature} should be `dict`, not `{type(pred_dict)}`.") - + # loss check try: loss = losser(pred_dict, batch_y) @@ -814,7 +838,7 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ model.zero_grad() if batch_count + 1 >= DEFAULT_CHECK_NUM_BATCH: break - + if dev_data is not None: tester = Tester(data=dev_data[:batch_size * DEFAULT_CHECK_NUM_BATCH], model=model, metrics=metrics, batch_size=batch_size, verbose=-1) @@ -828,7 +852,7 @@ def _check_eval_results(metrics, metric_key, metric_list): # metric_list: 多个用来做评价的指标,来自Trainer的初始化 if isinstance(metrics, tuple): loss, metrics = metrics - + if isinstance(metrics, dict): if len(metrics) == 1: # only single metric, just use it @@ -839,7 +863,7 @@ def _check_eval_results(metrics, metric_key, metric_list): if metrics_name not in metrics: raise RuntimeError(f"{metrics_name} is chosen to do validation, but got {metrics}") metric_dict = metrics[metrics_name] - + if len(metric_dict) == 1: indicator_val, indicator = list(metric_dict.values())[0], list(metric_dict.keys())[0] elif len(metric_dict) > 1 and metric_key is None: diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index efb4faa7..23743ecf 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -1,3 +1,7 @@ +""" +utils模块实现了 fastNLP 内部和外部所需的很多工具。其中用户可以使用的是 :func:`cache_results` 修饰器。 +""" +__all__ = ["cache_results"] import _pickle import inspect import os @@ -29,6 +33,8 @@ def _prepare_cache_filepath(filepath): # TODO 可以保存下缓存时的参数,如果load的时候发现参数不一致,发出警告。 def cache_results(_cache_fp, _refresh=False, _verbose=1): """ + 别名::class:`fastNLP.cache_results` :class:`fastNLP.core.uitls.cache_results` + cache_results是fastNLP中用于cache数据的装饰器。通过下面的例子看一下如何使用 Example:: @@ -193,13 +199,14 @@ def _move_model_to_device(model, device): if isinstance(model, torch.nn.parallel.DistributedDataParallel): raise RuntimeError("model of `torch.nn.parallel.DistributedDataParallel` is not supported right now.") - if not torch.cuda.is_available() and (device!='cpu' or (isinstance(device, torch.device) and device.type!='cpu')): - raise ValueError("There is no usable gpu. set `device` as `cpu`.") - if device is None: if isinstance(model, torch.nn.DataParallel): model.cuda() return model + else: + if not torch.cuda.is_available() and ( + device != 'cpu' or (isinstance(device, torch.device) and device.type != 'cpu')): + raise ValueError("There is no usable gpu. 
set `device` as `cpu`.") if isinstance(model, torch.nn.DataParallel): raise RuntimeError("When model is `torch.nn.DataParallel`, the device has to be `None`.") diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index 633a748f..c82c316e 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -1,6 +1,6 @@ from functools import wraps from collections import Counter -from fastNLP.core.dataset import DataSet +from .dataset import DataSet def _check_build_vocab(func): """A decorator to make sure the indexing is built before used. @@ -34,6 +34,8 @@ def _check_build_status(func): class Vocabulary(object): """ + 别名::class:`fastNLP.Vocabulary` :class:`fastNLP.core.vocabulary.Vocabulary` + 用于构建, 存储和使用 `str` 到 `int` 的一一映射 Example:: @@ -98,7 +100,7 @@ class Vocabulary(object): """ 依次增加序列中词在词典中的出现频率 - :param list(str) word_lst: 词的序列 + :param list[str] word_lst: 词的序列 """ self.update(word_lst) @@ -185,12 +187,11 @@ class Vocabulary(object): # remember to use `field_name` vocab.index_dataset(train_data, dev_data, test_data, field_name='words') - :param DataSet datasets: 需要转index的 DataSet, 支持一个或多个 + :param datasets: 需要转index的 class:`~fastNLP.DataSet` , 支持一个或多个(list) :param str field_name: 需要转index的field, 若有多个 DataSet, 每个DataSet都必须有此 field. 目前仅支持 ``str`` , ``list(str)`` , ``list(list(str))`` :param str new_field_name: 保存结果的field_name. 若为 ``None`` , 将覆盖原field. Default: ``None`` - :return self: """ def index_instance(ins): """ @@ -230,7 +231,7 @@ class Vocabulary(object): # remember to use `field_name` vocab.from_dataset(train_data1, train_data2, field_name='words') - :param DataSet datasets: 需要转index的 DataSet, 支持一个或多个. + :param datasets: 需要转index的 class:`~fastNLP.DataSet` , 支持一个或多个(list) :param field_name: 可为 ``str`` 或 ``list(str)`` . 构建词典所使用的 field(s), 支持一个或多个field 若有多个 DataSet, 每个DataSet都必须有这些field. diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index e8ccca30..dc1e3d15 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -12,13 +12,14 @@ 这些类的使用方法可以在对应module的文档下查看. """ from .embed_loader import EmbedLoader -from .dataset_loader import * -from .config_io import * -from .model_io import * +from .dataset_loader import DataSetLoader, CSVLoader, JsonLoader, ConllLoader, SNLILoader, SSTLoader, \ + PeopleDailyCorpusLoader, Conll2003Loader +from .config_io import ConfigLoader, ConfigSection, ConfigSaver +from .model_io import ModelLoader as ModelLoader, ModelSaver as ModelSaver __all__ = [ 'EmbedLoader', - + 'DataSetLoader', 'CSVLoader', 'JsonLoader', @@ -27,11 +28,11 @@ __all__ = [ 'SSTLoader', 'PeopleDailyCorpusLoader', 'Conll2003Loader', - + 'ConfigLoader', 'ConfigSection', 'ConfigSaver', - + 'ModelLoader', 'ModelSaver', -] \ No newline at end of file +] diff --git a/fastNLP/io/base_loader.py b/fastNLP/io/base_loader.py index 5d5fe63a..569f7e2e 100644 --- a/fastNLP/io/base_loader.py +++ b/fastNLP/io/base_loader.py @@ -3,7 +3,8 @@ import os class BaseLoader(object): - """Base loader for all loaders. 
+ """ + 各个 Loader 的基类,提供了 API 的参考。 """ def __init__(self): @@ -11,7 +12,10 @@ class BaseLoader(object): @staticmethod def load_lines(data_path): - """按行读取,舍弃每行两侧空白字符,返回list of str + """ + 按行读取,舍弃每行两侧空白字符,返回list of str + + :param data_path: 读取数据的路径 """ with open(data_path, "r", encoding="utf=8") as f: text = f.readlines() @@ -19,7 +23,10 @@ class BaseLoader(object): @classmethod def load(cls, data_path): - """先按行读取,去除一行两侧空白,再提取每行的字符。返回list of list of str + """ + 先按行读取,去除一行两侧空白,再提取每行的字符。返回list of list of str + + :param data_path: """ with open(data_path, "r", encoding="utf-8") as f: text = f.readlines() @@ -40,9 +47,7 @@ class BaseLoader(object): class DataLoaderRegister: - """Register for all data sets. - - """ + # TODO 这个类使用在何处? _readers = {} @classmethod diff --git a/fastNLP/io/config_io.py b/fastNLP/io/config_io.py index f303f0e9..8fa30dd4 100644 --- a/fastNLP/io/config_io.py +++ b/fastNLP/io/config_io.py @@ -1,19 +1,22 @@ """ -.. _config-io: 用于读入和处理和保存 config 文件 """ +__all__ = ["ConfigLoader","ConfigSection","ConfigSaver"] import configparser import json import os -from fastNLP.io.base_loader import BaseLoader +from .base_loader import BaseLoader class ConfigLoader(BaseLoader): - """Loader for configuration. + """ + 别名::class:`fastNLP.io.ConfigLoader` :class:`fastNLP.io.config_io.ConfigLoader` + + 读取配置文件的Loader - :param str data_path: path to the config + :param str data_path: 配置文件的路径 """ def __init__(self, data_path=None): @@ -27,14 +30,16 @@ class ConfigLoader(BaseLoader): @staticmethod def load_config(file_path, sections): - """Load section(s) of configuration into the ``sections`` provided. No returns. + """ + 把配置文件的section 存入提供的 ``sections`` 中 - :param str file_path: the path of config file - :param dict sections: the dict of ``{section_name(string): ConfigSection object}`` - Example:: - - test_args = ConfigSection() - ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args}) + :param str file_path: 配置文件的路径 + :param dict sections: 符合如下键值对组成的字典 `section_name(string)` : :class:`~fastNLP.io.ConfigSection` + + Example:: + + test_args = ConfigSection() + ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args}) """ assert isinstance(sections, dict) @@ -70,7 +75,10 @@ class ConfigLoader(BaseLoader): class ConfigSection(object): - """ConfigSection is the data structure storing all key-value pairs in one section in a config file. + """ + 别名::class:`fastNLP.io.ConfigSection` :class:`fastNLP.io.config_io.ConfigSection` + + ConfigSection是一个存储了一个section中所有键值对的数据结构,推荐使用此类的实例来配合 :meth:`ConfigLoader.load_config` 使用 """ @@ -146,9 +154,12 @@ class ConfigSection(object): class ConfigSaver(object): - """ConfigSaver is used to save config file and solve related conflicts. + """ + 别名::class:`fastNLP.io.ConfigSaver` :class:`fastNLP.io.config_io.ConfigSaver` + + ConfigSaver 是用来存储配置文件并解决相关冲突的类 - :param str file_path: path to the config file + :param str file_path: 配置文件的路径 """ def __init__(self, file_path): @@ -157,7 +168,8 @@ class ConfigSaver(object): raise FileNotFoundError("file {} NOT found!".__format__(self.file_path)) def _get_section(self, sect_name): - """This is the function to get the section with the section name. + """ + This is the function to get the section with the section name. :param sect_name: The name of section what wants to load. :return: The section. @@ -167,7 +179,8 @@ class ConfigSaver(object): return sect def _read_section(self): - """This is the function to read sections from the config file. 
+ """ + This is the function to read sections from the config file. :return: sect_list, sect_key_list sect_list: A list of ConfigSection(). @@ -219,7 +232,8 @@ class ConfigSaver(object): return sect_list, sect_key_list def _write_section(self, sect_list, sect_key_list): - """This is the function to write config file with section list and name list. + """ + This is the function to write config file with section list and name list. :param sect_list: A list of ConfigSection() need to be writen into file. :param sect_key_list: A list of name of sect_list. @@ -240,10 +254,11 @@ class ConfigSaver(object): f.write('\n') def save_config_file(self, section_name, section): - """This is the function to be called to change the config file with a single section and its name. + """ + 这个方法可以用来修改并保存配置文件中单独的一个 section - :param str section_name: The name of section what needs to be changed and saved. - :param ConfigSection section: The section with key and value what needs to be changed and saved. + :param str section_name: 需要保存的 section 的名字. + :param section: 你需要修改并保存的 section, :class:`~fastNLP.io.ConfigSaver` 类型 """ section_file = self._get_section(section_name) if len(section_file.__dict__.keys()) == 0: # the section not in the file before diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index bb5e2f64..6d64ede2 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -1,8 +1,6 @@ """ -.. _dataset-loader: - -DataSetLoader 的 API, 用于读取不同格式的数据, 并返回 `DataSet` , -得到的 `DataSet` 对象可以直接传入 `Trainer`, `Tester`, 用于模型的训练和测试 +dataset_loader模块实现了许多 DataSetLoader, 用于读取不同格式的数据, 并返回 `DataSet` , +得到的 :class:`~fastNLP.DataSet` 对象可以直接传入 :class:`~fastNLP.Trainer`, :class:`~fastNLP.Tester`, 用于模型的训练和测试 Example:: @@ -13,50 +11,50 @@ Example:: # ... 
do stuff """ -import os -import json + from nltk.tree import Tree -from fastNLP.core.dataset import DataSet -from fastNLP.core.instance import Instance -from fastNLP.io.file_reader import _read_csv, _read_json, _read_conll +from ..core.dataset import DataSet +from ..core.instance import Instance +from .file_reader import _read_csv, _read_json, _read_conll def _download_from_url(url, path): from tqdm import tqdm import requests - + """Download file""" r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, stream=True) chunk_size = 16 * 1024 total_size = int(r.headers.get('Content-length', 0)) - with open(path, "wb") as file ,\ - tqdm(total=total_size, unit='B', unit_scale=1, desc=path.split('/')[-1]) as t: + with open(path, "wb") as file, \ + tqdm(total=total_size, unit='B', unit_scale=1, desc=path.split('/')[-1]) as t: for chunk in r.iter_content(chunk_size): if chunk: file.write(chunk) t.update(len(chunk)) return + def _uncompress(src, dst): import zipfile, gzip, tarfile, os - + def unzip(src, dst): with zipfile.ZipFile(src, 'r') as f: f.extractall(dst) - + def ungz(src, dst): with gzip.open(src, 'rb') as f, open(dst, 'wb') as uf: - length = 16 * 1024 # 16KB + length = 16 * 1024 # 16KB buf = f.read(length) while buf: uf.write(buf) buf = f.read(length) - + def untar(src, dst): with tarfile.open(src, 'r:gz') as f: f.extractall(dst) - + fn, ext = os.path.splitext(src) _, ext_2 = os.path.splitext(fn) if ext == '.zip': @@ -71,42 +69,48 @@ def _uncompress(src, dst): class DataSetLoader: """ + 别名::class:`fastNLP.io.DataSetLoader` :class:`fastNLP.io.dataset_loader.DataSetLoader` - 所有`DataSetLoader`的接口 + 所有 DataSetLoader 的 API 接口,你可以继承它实现自己的 DataSetLoader """ - + def load(self, path): """从指定 ``path`` 的文件中读取数据,返回DataSet - :param str path: file path - :return: a DataSet object + :param str path: 文件路径 + :return: 一个 :class:`~fastNLP.DataSet` 类型的对象 """ raise NotImplementedError - + def convert(self, data): - """用Python数据对象创建DataSet + """ + 用Python数据对象创建DataSet,各个子类需要自行实现这个方法 - :param data: inner data structure (user-defined) to represent the data. - :return: a DataSet object + :param data: Python 内置的数据结构 + :return: 一个 :class:`~fastNLP.DataSet` 类型的对象 """ raise NotImplementedError class PeopleDailyCorpusLoader(DataSetLoader): - """读取人民日报数据集 """ + 别名::class:`fastNLP.io.PeopleDailyCorpusLoader` :class:`fastNLP.io.dataset_loader.PeopleDailyCorpusLoader` + + 读取人民日报数据集 + """ + def __init__(self): super(PeopleDailyCorpusLoader, self).__init__() self.pos = True self.ner = True - + def load(self, data_path, pos=True, ner=True): """ :param str data_path: 数据路径 :param bool pos: 是否使用词性标签 :param bool ner: 是否使用命名实体标签 - :return: a DataSet object + :return: 一个 :class:`~fastNLP.DataSet` 类型的对象 """ self.pos, self.ner = pos, ner with open(data_path, "r", encoding="utf-8") as f: @@ -152,8 +156,13 @@ class PeopleDailyCorpusLoader(DataSetLoader): example.append(sent_ner) examples.append(example) return self.convert(examples) - + def convert(self, data): + """ + + :param data: python 内置对象 + :return: 一个 :class:`~fastNLP.DataSet` 类型的对象 + """ data_set = DataSet() for item in data: sent_words = item[0] @@ -172,6 +181,8 @@ class PeopleDailyCorpusLoader(DataSetLoader): class ConllLoader(DataSetLoader): """ + 别名::class:`fastNLP.io.ConllLoader` :class:`fastNLP.io.dataset_loader.ConllLoader` + 读取Conll格式的数据. 
数据格式详见 http://conll.cemantix.org/2012/data.html 列号从0开始, 每列对应内容为:: @@ -193,9 +204,10 @@ class ConllLoader(DataSetLoader): :param headers: 每一列数据的名称,需为List or Tuple of str。``header`` 与 ``indexs`` 一一对应 :param indexs: 需要保留的数据列下标,从0开始。若为 ``None`` ,则所有列都保留。Default: ``None`` - :param dropna: 是否忽略非法数据,若 ``False`` ,遇到非法数据时抛出 ``ValueError`` 。Default: ``True`` + :param dropna: 是否忽略非法数据,若 ``False`` ,遇到非法数据时抛出 ``ValueError`` 。Default: ``False`` """ - def __init__(self, headers, indexs=None, dropna=True): + + def __init__(self, headers, indexs=None, dropna=False): super(ConllLoader, self).__init__() if not isinstance(headers, (list, tuple)): raise TypeError('invalid headers: {}, should be list of strings'.format(headers)) @@ -207,21 +219,25 @@ class ConllLoader(DataSetLoader): if len(indexs) != len(headers): raise ValueError self.indexs = indexs - + def load(self, path): ds = DataSet() for idx, data in _read_conll(path, indexes=self.indexs, dropna=self.dropna): - ins = {h:data[i] for i, h in enumerate(self.headers)} + ins = {h: data[i] for i, h in enumerate(self.headers)} ds.append(Instance(**ins)) return ds class Conll2003Loader(ConllLoader): - """读取Conll2003数据 + """ + 别名::class:`fastNLP.io.Conll2003Loader` :class:`fastNLP.io.dataset_loader.Conll2003Loader` + + 读取Conll2003数据 关于数据集的更多信息,参考: https://sites.google.com/site/ermasoftware/getting-started/ne-tagging-conll2003-data """ + def __init__(self): headers = [ 'tokens', 'pos', 'chunks', 'ner', @@ -260,7 +276,10 @@ def _cut_long_sentence(sent, max_sample_length=200): class SSTLoader(DataSetLoader): - """读取SST数据集, DataSet包含fields:: + """ + 别名::class:`fastNLP.io.SSTLoader` :class:`fastNLP.io.dataset_loader.SSTLoader` + + 读取SST数据集, DataSet包含fields:: words: list(str) 需要分类的文本 target: str 文本的标签 @@ -270,21 +289,22 @@ class SSTLoader(DataSetLoader): :param subtree: 是否将数据展开为子树,扩充数据量. Default: ``False`` :param fine_grained: 是否使用SST-5标准,若 ``False`` , 使用SST-2。Default: ``False`` """ + def __init__(self, subtree=False, fine_grained=False): self.subtree = subtree - - tag_v = {'0':'very negative', '1':'negative', '2':'neutral', - '3':'positive', '4':'very positive'} + + tag_v = {'0': 'very negative', '1': 'negative', '2': 'neutral', + '3': 'positive', '4': 'very positive'} if not fine_grained: tag_v['0'] = tag_v['1'] tag_v['4'] = tag_v['3'] self.tag_v = tag_v - + def load(self, path): """ - :param path: str,存储数据的路径 - :return: DataSet。 + :param str path: 存储数据的路径 + :return: 一个 :class:`~fastNLP.DataSet` 类型的对象 """ datalist = [] with open(path, 'r', encoding='utf-8') as f: @@ -296,7 +316,7 @@ class SSTLoader(DataSetLoader): for words, tag in datas: ds.append(Instance(words=words, target=tag)) return ds - + @staticmethod def _get_one(data, subtree): tree = Tree.fromstring(data) @@ -307,15 +327,18 @@ class SSTLoader(DataSetLoader): class JsonLoader(DataSetLoader): """ + 别名::class:`fastNLP.io.JsonLoader` :class:`fastNLP.io.dataset_loader.JsonLoader` + 读取json格式数据.数据必须按行存储,每行是一个包含各类属性的json对象 :param dict fields: 需要读入的json属性名称, 和读入后在DataSet中存储的field_name - ``fields`` 的`key`必须是json对象的属性名. ``fields`` 的`value`为读入后在DataSet存储的`field_name`, - `value`也可为 ``None`` , 这时读入后的`field_name`与json对象对应属性同名 + ``fields`` 的 `key` 必须是json对象的属性名. ``fields`` 的 `value` 为读入后在DataSet存储的 `field_name` , + `value` 也可为 ``None`` , 这时读入后的 `field_name` 与json对象对应属性同名 ``fields`` 可为 ``None`` , 这时,json对象所有属性都保存在DataSet中. Default: ``None`` :param bool dropna: 是否忽略非法数据,若 ``True`` 则忽略,若 ``False`` ,在遇到非法数据时,抛出 ``ValueError`` . 
- Default: ``True`` + Default: ``False`` """ + def __init__(self, fields=None, dropna=False): super(JsonLoader, self).__init__() self.dropna = dropna @@ -326,17 +349,22 @@ class JsonLoader(DataSetLoader): for k, v in fields.items(): self.fields[k] = k if v is None else v self.fields_list = list(self.fields.keys()) - + def load(self, path): ds = DataSet() for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna): - ins = {self.fields[k]:v for k,v in d.items()} + if self.fields: + ins = {self.fields[k]: v for k, v in d.items()} + else: + ins = d ds.append(Instance(**ins)) return ds class SNLILoader(JsonLoader): """ + 别名::class:`fastNLP.io.SNLILoader` :class:`fastNLP.io.dataset_loader.SNLILoader` + 读取SNLI数据集,读取的DataSet包含fields:: words1: list(str),第一句文本, premise @@ -345,6 +373,7 @@ class SNLILoader(JsonLoader): 数据来源: https://nlp.stanford.edu/projects/snli/snli_1.0.zip """ + def __init__(self): fields = { 'sentence1_parse': 'words1', @@ -352,12 +381,14 @@ class SNLILoader(JsonLoader): 'gold_label': 'target', } super(SNLILoader, self).__init__(fields=fields) - + def load(self, path): ds = super(SNLILoader, self).load(path) + def parse_tree(x): t = Tree.fromstring(x) return t.leaves() + ds.apply(lambda ins: parse_tree(ins['words1']), new_field_name='words1') ds.apply(lambda ins: parse_tree(ins['words2']), new_field_name='words2') ds.drop(lambda x: x['target'] == '-') @@ -366,19 +397,22 @@ class SNLILoader(JsonLoader): class CSVLoader(DataSetLoader): """ + 别名::class:`fastNLP.io.CSVLoader` :class:`fastNLP.io.dataset_loader.CSVLoader` + 读取CSV格式的数据集。返回 ``DataSet`` :param List[str] headers: CSV文件的文件头.定义每一列的属性名称,即返回的DataSet中`field`的名称 若为 ``None`` ,则将读入文件的第一行视作 ``headers`` . Default: ``None`` :param str sep: CSV文件中列与列之间的分隔符. Default: "," :param bool dropna: 是否忽略非法数据,若 ``True`` 则忽略,若 ``False`` ,在遇到非法数据时,抛出 ``ValueError`` . 
- Default: ``True`` + Default: ``False`` """ - def __init__(self, headers=None, sep=",", dropna=True): + + def __init__(self, headers=None, sep=",", dropna=False): self.headers = headers self.sep = sep self.dropna = dropna - + def load(self, path): ds = DataSet() for idx, data in _read_csv(path, headers=self.headers, @@ -393,7 +427,7 @@ def _add_seg_tag(data): :param data: list of ([word], [pos], [heads], [head_tags]) :return: list of ([word], [pos]) """ - + _processed = [] for word_list, pos_list, _, _ in data: new_sample = [] @@ -407,4 +441,3 @@ def _add_seg_tag(data): new_sample.append((word[-1], 'E-' + pos)) _processed.append(list(map(list, zip(*new_sample)))) return _processed - diff --git a/fastNLP/io/embed_loader.py b/fastNLP/io/embed_loader.py index 39d93fab..4cc8f596 100644 --- a/fastNLP/io/embed_loader.py +++ b/fastNLP/io/embed_loader.py @@ -7,13 +7,17 @@ import os import numpy as np -from fastNLP.core.vocabulary import Vocabulary -from fastNLP.io.base_loader import BaseLoader +from ..core.vocabulary import Vocabulary +from .base_loader import BaseLoader import warnings class EmbedLoader(BaseLoader): - """这个类用于从预训练的Embedding中load数据。""" + """ + 别名::class:`fastNLP.io.EmbedLoader` :class:`fastNLP.io.embed_loader.EmbedLoader` + + 这个类用于从预训练的Embedding中load数据。 + """ def __init__(self): super(EmbedLoader, self).__init__() @@ -25,13 +29,13 @@ class EmbedLoader(BaseLoader): word2vec(第一行只有两个元素)还是glove格式的数据。 :param str embed_filepath: 预训练的embedding的路径。 - :param Vocabulary vocab: 词表,读取出现在vocab中的词的embedding。没有出现在vocab中的词的embedding将通过找到的词的 - embedding的正态分布采样出来,以使得整个Embedding是同分布的。 + :param vocab: 词表 :class:`~fastNLP.Vocabulary` 类型,读取出现在vocab中的词的embedding。 + 没有出现在vocab中的词的embedding将通过找到的词的embedding的正态分布采样出来,以使得整个Embedding是同分布的。 :param dtype: 读出的embedding的类型 :param bool normalize: 是否将每个vector归一化到norm为1 - :param str error: 'ignore', 'strict'; 如果'ignore',错误将自动跳过; 如果strict, 错误将抛出。这里主要可能出错的地 - 方在于词表有空行或者词表出现了维度不一致。 - :return: numpy.ndarray, shape为 [len(vocab), dimension], dimension由pretrain的embedding决定。 + :param str error: `ignore` , `strict` ; 如果 `ignore` ,错误将自动跳过; 如果 `strict` , 错误将抛出。 + 这里主要可能出错的地方在于词表有空行或者词表出现了维度不一致。 + :return numpy.ndarray: shape为 [len(vocab), dimension], dimension由pretrain的embedding决定。 """ assert isinstance(vocab, Vocabulary), "Only fastNLP.Vocabulary is supported." if not os.path.exists(embed_filepath): @@ -87,11 +91,11 @@ class EmbedLoader(BaseLoader): :param str padding: the padding tag for vocabulary. :param str unknown: the unknown tag for vocabulary. 
:param bool normalize: 是否将每个vector归一化到norm为1 - :param str error: 'ignore', 'strict'; 如果'ignore',错误将自动跳过; 如果strict, 错误将抛出。这里主要可能出错的地 + :param str error: `ignore` , `strict` ; 如果 `ignore` ,错误将自动跳过; 如果 `strict` , 错误将抛出。这里主要可能出错的地 方在于词表有空行或者词表出现了维度不一致。 - :return: numpy.ndarray, shape为 [len(vocab), dimension], dimension由pretrain的embedding决定。 - :return: numpy.ndarray,Vocabulary embedding的shape是[词表大小+x, 词表维度], "词表大小+x"是由于最终的大小还取决与 - 是否使用padding, 以及unknown有没有在词表中找到对应的词。Vocabulary中的词的顺序与Embedding的顺序是一一对应的。 + :return numpy.ndarray: shape为 [len(vocab), dimension], dimension由pretrain的embedding决定。 + :return numpy.ndarray: Vocabulary Embedding的shape是[词表大小+x, 词表维度], "词表大小+x"是由于最终的大小还取决与 + 是否使用padding, 以及unknown有没有在词表中找到对应的词。 Vocabulary中的词的顺序与Embedding的顺序是一一对应的。 """ vocab = Vocabulary(padding=padding, unknown=unknown) vec_dict = {} diff --git a/fastNLP/io/file_reader.py b/fastNLP/io/file_reader.py index ffbab510..5963bb56 100644 --- a/fastNLP/io/file_reader.py +++ b/fastNLP/io/file_reader.py @@ -1,3 +1,6 @@ +""" +此模块用于给其它模块提供读取文件的函数,没有为用户提供 API +""" import json diff --git a/fastNLP/io/model_io.py b/fastNLP/io/model_io.py index d28034c8..48e53ab3 100644 --- a/fastNLP/io/model_io.py +++ b/fastNLP/io/model_io.py @@ -1,16 +1,16 @@ """ -.. _model-io: - 用于载入和保存模型 """ import torch -from fastNLP.io.base_loader import BaseLoader +from .base_loader import BaseLoader class ModelLoader(BaseLoader): """ - Loader for models. + 别名::class:`fastNLP.io.ModelLoader` :class:`fastNLP.io.model_io.ModelLoader` + + 用于读取模型 """ def __init__(self): @@ -18,24 +18,30 @@ class ModelLoader(BaseLoader): @staticmethod def load_pytorch(empty_model, model_path): - """Load model parameters from ".pkl" files into the empty PyTorch model. + """ + 从 ".pkl" 文件读取 PyTorch 模型 - :param empty_model: a PyTorch model with initialized parameters. - :param str model_path: the path to the saved model. + :param empty_model: 初始化参数的 PyTorch 模型 + :param str model_path: 模型保存的路径 """ empty_model.load_state_dict(torch.load(model_path)) @staticmethod def load_pytorch_model(model_path): - """Load the entire model. + """ + 读取整个模型 - :param str model_path: the path to the saved model. + :param str model_path: 模型保存的路径 """ return torch.load(model_path) class ModelSaver(object): - """Save a model + """ + 别名::class:`fastNLP.io.ModelSaver` :class:`fastNLP.io.model_io.ModelSaver` + + 用于保存模型 + Example:: saver = ModelSaver("./save/model_ckpt_100.pkl") @@ -46,15 +52,16 @@ class ModelSaver(object): def __init__(self, save_path): """ - :param save_path: the path to the saving directory. + :param save_path: 模型保存的路径 """ self.save_path = save_path def save_pytorch(self, model, param_only=True): - """Save a pytorch model into ".pkl" file. + """ + 把 PyTorch 模型存入 ".pkl" 文件 - :param model: a PyTorch model - :param bool param_only: whether only to save the model parameters or the entire model. 
+ :param model: PyTorch 模型 + :param bool param_only: 是否只保存模型的参数(否则保存整个模型) """ if param_only is True: diff --git a/fastNLP/models/base_model.py b/fastNLP/models/base_model.py index ec532014..39ac99a0 100644 --- a/fastNLP/models/base_model.py +++ b/fastNLP/models/base_model.py @@ -1,6 +1,6 @@ import torch -from fastNLP.modules.decoder.MLP import MLP +from ..modules.decoder.MLP import MLP class BaseModel(torch.nn.Module): diff --git a/fastNLP/models/bert.py b/fastNLP/models/bert.py index 42626934..7934b435 100644 --- a/fastNLP/models/bert.py +++ b/fastNLP/models/bert.py @@ -6,7 +6,7 @@ import torch from torch import nn from .base_model import BaseModel -from fastNLP.modules.encoder import BertModel +from ..modules.encoder import BertModel class BertForSequenceClassification(BaseModel): diff --git a/fastNLP/models/biaffine_parser.py b/fastNLP/models/biaffine_parser.py index f2329dca..b8d1e7a9 100644 --- a/fastNLP/models/biaffine_parser.py +++ b/fastNLP/models/biaffine_parser.py @@ -7,16 +7,17 @@ import torch from torch import nn from torch.nn import functional as F -from fastNLP.core.losses import LossFunc -from fastNLP.core.metrics import MetricBase -from fastNLP.core.utils import seq_lens_to_masks -from fastNLP.models.base_model import BaseModel -from fastNLP.modules.dropout import TimestepDropout -from fastNLP.modules.encoder.transformer import TransformerEncoder -from fastNLP.modules.encoder.variational_rnn import VarLSTM -from fastNLP.modules.utils import initial_parameter -from fastNLP.modules.utils import seq_mask -from fastNLP.modules.utils import get_embeddings +from ..core.const import Const as C +from ..core.losses import LossFunc +from ..core.metrics import MetricBase +from ..core.utils import seq_lens_to_masks +from ..modules.dropout import TimestepDropout +from ..modules.encoder.transformer import TransformerEncoder +from ..modules.encoder.variational_rnn import VarLSTM +from ..modules.utils import initial_parameter +from ..modules.utils import seq_mask +from ..modules.utils import get_embeddings +from .base_model import BaseModel def _mst(scores): """ @@ -325,21 +326,20 @@ class BiaffineParser(GraphParser): for p in m.parameters(): nn.init.normal_(p, 0, 0.1) - def forward(self, words1, words2, seq_len, gold_heads=None): + def forward(self, words1, words2, seq_len, target1=None): """模型forward阶段 :param words1: [batch_size, seq_len] 输入word序列 :param words2: [batch_size, seq_len] 输入pos序列 :param seq_len: [batch_size, seq_len] 输入序列长度 - :param gold_heads: [batch_size, seq_len] 输入真实标注的heads, 仅在训练阶段有效, + :param target1: [batch_size, seq_len] 输入真实标注的heads, 仅在训练阶段有效, 用于训练label分类器. 
若为 ``None`` , 使用预测的heads输入到label分类器 Default: ``None`` :return dict: parsing结果:: - arc_pred: [batch_size, seq_len, seq_len] 边预测logits - label_pred: [batch_size, seq_len, num_label] label预测logits - mask: [batch_size, seq_len] 预测结果的mask - head_pred: [batch_size, seq_len] heads的预测结果, 在 ``gold_heads=None`` 时预测 + pred1: [batch_size, seq_len, seq_len] 边预测logits + pred2: [batch_size, seq_len, num_label] label预测logits + pred3: [batch_size, seq_len] heads的预测结果, 在 ``target1=None`` 时预测 """ # prepare embeddings batch_size, length = words1.shape @@ -365,7 +365,7 @@ class BiaffineParser(GraphParser): _, unsort_idx = torch.sort(sort_idx, dim=0, descending=False) feat = feat[unsort_idx] else: - seq_range = torch.arange(seq_len, dtype=torch.long, device=x.device)[None,:] + seq_range = torch.arange(length, dtype=torch.long, device=x.device)[None,:] x = x + self.position_emb(seq_range) feat = self.encoder(x, mask.float()) @@ -380,7 +380,7 @@ class BiaffineParser(GraphParser): arc_pred = self.arc_predictor(arc_head, arc_dep) # [N, L, L] # use gold or predicted arc to predict label - if gold_heads is None or not self.training: + if target1 is None or not self.training: # use greedy decoding in training if self.training or self.use_greedy_infer: heads = self.greedy_decoder(arc_pred, mask) @@ -389,44 +389,45 @@ class BiaffineParser(GraphParser): head_pred = heads else: assert self.training # must be training mode - if gold_heads is None: + if target1 is None: heads = self.greedy_decoder(arc_pred, mask) head_pred = heads else: head_pred = None - heads = gold_heads + heads = target1 batch_range = torch.arange(start=0, end=batch_size, dtype=torch.long, device=words1.device).unsqueeze(1) label_head = label_head[batch_range, heads].contiguous() label_pred = self.label_predictor(label_head, label_dep) # [N, L, num_label] - res_dict = {'arc_pred': arc_pred, 'label_pred': label_pred, 'mask': mask} + res_dict = {C.OUTPUTS(0): arc_pred, C.OUTPUTS(1): label_pred} if head_pred is not None: - res_dict['head_pred'] = head_pred + res_dict[C.OUTPUTS(2)] = head_pred return res_dict @staticmethod - def loss(arc_pred, label_pred, arc_true, label_true, mask): + def loss(pred1, pred2, target1, target2, seq_len): """ - Compute loss. 
- - :param arc_pred: [batch_size, seq_len, seq_len] 边预测logits - :param label_pred: [batch_size, seq_len, num_label] label预测logits - :param arc_true: [batch_size, seq_len] 真实边的标注 - :param label_true: [batch_size, seq_len] 真实类别的标注 - :param mask: [batch_size, seq_len] 预测结果的mask - :return: loss value + 计算parser的loss + + :param pred1: [batch_size, seq_len, seq_len] 边预测logits + :param pred2: [batch_size, seq_len, num_label] label预测logits + :param target1: [batch_size, seq_len] 真实边的标注 + :param target2: [batch_size, seq_len] 真实类别的标注 + :param seq_len: [batch_size, seq_len] 真实目标的长度 + :return loss: scalar """ - batch_size, seq_len, _ = arc_pred.shape + batch_size, length, _ = pred1.shape + mask = seq_mask(seq_len, length) flip_mask = (mask == 0) - _arc_pred = arc_pred.clone() + _arc_pred = pred1.clone() _arc_pred.masked_fill_(flip_mask.unsqueeze(1), -float('inf')) arc_logits = F.log_softmax(_arc_pred, dim=2) - label_logits = F.log_softmax(label_pred, dim=2) + label_logits = F.log_softmax(pred2, dim=2) batch_index = torch.arange(batch_size, device=arc_logits.device, dtype=torch.long).unsqueeze(1) - child_index = torch.arange(seq_len, device=arc_logits.device, dtype=torch.long).unsqueeze(0) - arc_loss = arc_logits[batch_index, child_index, arc_true] - label_loss = label_logits[batch_index, child_index, label_true] + child_index = torch.arange(length, device=arc_logits.device, dtype=torch.long).unsqueeze(0) + arc_loss = arc_logits[batch_index, child_index, target1] + label_loss = label_logits[batch_index, child_index, target2] byte_mask = flip_mask.byte() arc_loss.masked_fill_(byte_mask, 0) @@ -441,21 +442,16 @@ class BiaffineParser(GraphParser): :param words1: [batch_size, seq_len] 输入word序列 :param words2: [batch_size, seq_len] 输入pos序列 :param seq_len: [batch_size, seq_len] 输入序列长度 - :param gold_heads: [batch_size, seq_len] 输入真实标注的heads, 仅在训练阶段有效, - 用于训练label分类器. 
若为 ``None`` , 使用预测的heads输入到label分类器 - Default: ``None`` :return dict: parsing结果:: - arc_pred: [batch_size, seq_len, seq_len] 边预测logits - label_pred: [batch_size, seq_len, num_label] label预测logits - mask: [batch_size, seq_len] 预测结果的mask - head_pred: [batch_size, seq_len] heads的预测结果, 在 ``gold_heads=None`` 时预测 + pred1: [batch_size, seq_len] heads的预测结果 + pred2: [batch_size, seq_len, num_label] label预测logits """ res = self(words1, words2, seq_len) output = {} - output['arc_pred'] = res.pop('head_pred') - _, label_pred = res.pop('label_pred').max(2) - output['label_pred'] = label_pred + output[C.OUTPUTS(0)] = res.pop(C.OUTPUTS(2)) + _, label_pred = res.pop(C.OUTPUTS(1)).max(2) + output[C.OUTPUTS(1)] = label_pred return output @@ -463,41 +459,44 @@ class ParserLoss(LossFunc): """ 计算parser的loss - :param arc_pred: [batch_size, seq_len, seq_len] 边预测logits - :param label_pred: [batch_size, seq_len, num_label] label预测logits - :param arc_true: [batch_size, seq_len] 真实边的标注 - :param label_true: [batch_size, seq_len] 真实类别的标注 - :param mask: [batch_size, seq_len] 预测结果的mask + :param pred1: [batch_size, seq_len, seq_len] 边预测logits + :param pred2: [batch_size, seq_len, num_label] label预测logits + :param target1: [batch_size, seq_len] 真实边的标注 + :param target2: [batch_size, seq_len] 真实类别的标注 + :param seq_len: [batch_size, seq_len] 真实目标的长度 :return loss: scalar """ - def __init__(self, arc_pred=None, label_pred=None, arc_true=None, label_true=None): + def __init__(self, pred1=None, pred2=None, + target1=None, target2=None, + seq_len=None): super(ParserLoss, self).__init__(BiaffineParser.loss, - arc_pred=arc_pred, - label_pred=label_pred, - arc_true=arc_true, - label_true=label_true) + pred1=pred1, + pred2=pred2, + target1=target1, + target2=target2, + seq_len=seq_len) class ParserMetric(MetricBase): """ 评估parser的性能 - :param arc_pred: 边预测logits - :param label_pred: label预测logits - :param arc_true: 真实边的标注 - :param label_true: 真实类别的标注 + :param pred1: 边预测logits + :param pred2: label预测logits + :param target1: 真实边的标注 + :param target2: 真实类别的标注 :param seq_len: 序列长度 :return dict: 评估结果:: UAS: 不带label时, 边预测的准确率 LAS: 同时预测边和label的准确率 """ - def __init__(self, arc_pred=None, label_pred=None, - arc_true=None, label_true=None, seq_len=None): + def __init__(self, pred1=None, pred2=None, + target1=None, target2=None, seq_len=None): super().__init__() - self._init_param_map(arc_pred=arc_pred, label_pred=label_pred, - arc_true=arc_true, label_true=label_true, + self._init_param_map(pred1=pred1, pred2=pred2, + target1=target1, target2=target2, seq_len=seq_len) self.num_arc = 0 self.num_label = 0 @@ -509,17 +508,17 @@ class ParserMetric(MetricBase): self.num_sample = self.num_label = self.num_arc = 0 return res - def evaluate(self, arc_pred, label_pred, arc_true, label_true, seq_len=None): + def evaluate(self, pred1, pred2, target1, target2, seq_len=None): """Evaluate the performance of prediction. 
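
        :param pred1: [batch_size, seq_len] 预测出的heads
        :param pred2: [batch_size, seq_len] 预测出的labels
        :param target1: [batch_size, seq_len] 真实heads的标注
        :param target2: [batch_size, seq_len] 真实labels的标注
        :param seq_len: [batch_size, ] 序列长度;为 ``None`` 时视为全部位置有效(第0个位置恒被mask)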
""" if seq_len is None: - seq_mask = arc_pred.new_ones(arc_pred.size(), dtype=torch.long) + seq_mask = pred1.new_ones(pred1.size(), dtype=torch.long) else: seq_mask = seq_lens_to_masks(seq_len.long(), float=False).long() # mask out tag seq_mask[:,0] = 0 - head_pred_correct = (arc_pred == arc_true).long() * seq_mask - label_pred_correct = (label_pred == label_true).long() * head_pred_correct + head_pred_correct = (pred1 == target1).long() * seq_mask + label_pred_correct = (pred2 == target2).long() * head_pred_correct self.num_arc += head_pred_correct.sum().item() self.num_label += label_pred_correct.sum().item() self.num_sample += seq_mask.sum().item() diff --git a/fastNLP/models/char_language_model.py b/fastNLP/models/char_language_model.py index d5e3359d..d0b4c426 100644 --- a/fastNLP/models/char_language_model.py +++ b/fastNLP/models/char_language_model.py @@ -2,7 +2,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from fastNLP.modules.encoder.lstm import LSTM +from ..modules.encoder.lstm import LSTM class Highway(nn.Module): diff --git a/fastNLP/models/enas_controller.py b/fastNLP/models/enas_controller.py index ae9bcfd2..16b970e6 100644 --- a/fastNLP/models/enas_controller.py +++ b/fastNLP/models/enas_controller.py @@ -5,9 +5,8 @@ import os import torch import torch.nn.functional as F -import fastNLP -import fastNLP.models.enas_utils as utils -from fastNLP.models.enas_utils import Node +from . import enas_utils as utils +from .enas_utils import Node def _construct_dags(prev_nodes, activations, func_names, num_blocks): diff --git a/fastNLP/models/enas_model.py b/fastNLP/models/enas_model.py index cc91e675..5c667927 100644 --- a/fastNLP/models/enas_model.py +++ b/fastNLP/models/enas_model.py @@ -9,9 +9,8 @@ from torch import nn import torch.nn.functional as F from torch.autograd import Variable -import fastNLP.models.enas_utils as utils -from fastNLP.models.base_model import BaseModel -import fastNLP.modules.encoder as encoder +from . import enas_utils as utils +from .base_model import BaseModel def _get_dropped_weights(w_raw, dropout_p, is_training): """Drops out weights to implement DropConnect. diff --git a/fastNLP/models/enas_trainer.py b/fastNLP/models/enas_trainer.py index d8110db0..824b8184 100644 --- a/fastNLP/models/enas_trainer.py +++ b/fastNLP/models/enas_trainer.py @@ -1,6 +1,5 @@ # Code Modified from https://github.com/carpedm20/ENAS-pytorch -import os import time from datetime import datetime from datetime import timedelta @@ -8,21 +7,19 @@ from datetime import timedelta import numpy as np import torch import math -from torch import nn try: from tqdm.autonotebook import tqdm except: - from fastNLP.core.utils import _pseudo_tqdm as tqdm + from ..core.utils import _pseudo_tqdm as tqdm -from fastNLP.core.batch import Batch -from fastNLP.core.callback import CallbackManager, CallbackException -from fastNLP.core.dataset import DataSet -from fastNLP.core.utils import _CheckError -from fastNLP.core.utils import _move_dict_value_to_device -import fastNLP -import fastNLP.models.enas_utils as utils -from fastNLP.core.utils import _build_args +from ..core.trainer import Trainer +from ..core.batch import Batch +from ..core.callback import CallbackManager, CallbackException +from ..core.dataset import DataSet +from ..core.utils import _move_dict_value_to_device +from . 
diff --git a/fastNLP/models/enas_trainer.py b/fastNLP/models/enas_trainer.py
index d8110db0..824b8184 100644
--- a/fastNLP/models/enas_trainer.py
+++ b/fastNLP/models/enas_trainer.py
@@ -1,6 +1,5 @@
 # Code Modified from https://github.com/carpedm20/ENAS-pytorch
 
-import os
 import time
 from datetime import datetime
 from datetime import timedelta
@@ -8,21 +7,19 @@ from datetime import timedelta
 import numpy as np
 import torch
 import math
-from torch import nn
 
 try:
     from tqdm.autonotebook import tqdm
 except:
-    from fastNLP.core.utils import _pseudo_tqdm as tqdm
+    from ..core.utils import _pseudo_tqdm as tqdm
 
-from fastNLP.core.batch import Batch
-from fastNLP.core.callback import CallbackManager, CallbackException
-from fastNLP.core.dataset import DataSet
-from fastNLP.core.utils import _CheckError
-from fastNLP.core.utils import _move_dict_value_to_device
-import fastNLP
-import fastNLP.models.enas_utils as utils
-from fastNLP.core.utils import _build_args
+from ..core.trainer import Trainer
+from ..core.batch import Batch
+from ..core.callback import CallbackManager, CallbackException
+from ..core.dataset import DataSet
+from ..core.utils import _move_dict_value_to_device
+from . import enas_utils as utils
+from ..core.utils import _build_args
 
 from torch.optim import Adam
 
@@ -34,7 +31,7 @@ def _get_no_grad_ctx_mgr():
     return torch.no_grad()
 
 
-class ENASTrainer(fastNLP.Trainer):
+class ENASTrainer(Trainer):
     """A class to wrap training code."""
     def __init__(self, train_data, model, controller, **kwargs):
         """Constructor for training algorithm.
diff --git a/fastNLP/models/enas_utils.py b/fastNLP/models/enas_utils.py
index e5027d81..aafcb3a7 100644
--- a/fastNLP/models/enas_utils.py
+++ b/fastNLP/models/enas_utils.py
@@ -4,21 +4,20 @@ from __future__ import print_function
 
 from collections import defaultdict
 import collections
-from datetime import datetime
-import os
-import json
 
 import numpy as np
 import torch
 from torch.autograd import Variable
 
+
 def detach(h):
     if type(h) == Variable:
         return Variable(h.data)
     else:
         return tuple(detach(v) for v in h)
 
+
 def get_variable(inputs, cuda=False, **kwargs):
     if type(inputs) in [list, np.ndarray]:
         inputs = torch.Tensor(inputs)
@@ -28,10 +27,12 @@ def get_variable(inputs, cuda=False, **kwargs):
         out = Variable(inputs, **kwargs)
     return out
 
+
 def update_lr(optimizer, lr):
     for param_group in optimizer.param_groups:
         param_group['lr'] = lr
 
+
 Node = collections.namedtuple('Node', ['id', 'name'])
 
 
@@ -48,9 +49,9 @@ def to_item(x):
     """Converts x, possibly scalar and possibly tensor, to a Python scalar."""
     if isinstance(x, (float, int)):
         return x
-    
+
     if float(torch.__version__[0:3]) < 0.4:
         assert (x.dim() == 1) and (len(x) == 1)
         return x[0]
-    
+
     return x.item()
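
`detach` recurses because an RNN hidden state is typically a tuple, and truncated backpropagation needs every tensor in it cut from the previous computation graph. A minimal sketch of the same idea with the modern `Tensor.detach` API (the file itself still uses the legacy `Variable` wrapper)::

    import torch
    import torch.nn as nn

    lstm = nn.LSTM(input_size=8, hidden_size=8, batch_first=True)
    out, hidden = lstm(torch.randn(4, 10, 8))    # hidden == (h_n, c_n)

    hidden = tuple(h.detach() for h in hidden)   # same effect as enas_utils.detach
    out2, hidden2 = lstm(torch.randn(4, 10, 8), hidden)  # next segment, fresh graph
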
""" -from fastNLP.modules.encoder.star_transformer import StarTransformer -from fastNLP.core.utils import seq_lens_to_masks +from ..modules.encoder.star_transformer import StarTransformer +from ..core.utils import seq_lens_to_masks from ..modules.utils import get_embeddings +from ..core.const import Const import torch from torch import nn -import torch.nn.functional as F class StarTransEnc(nn.Module): @@ -107,7 +107,7 @@ class STSeqLabel(nn.Module): :param emb_dropout: 词嵌入的dropout概率. Default: 0.1 :param dropout: 模型除词嵌入外的dropout概率. Default: 0.1 """ - def __init__(self, vocab_size, emb_dim, num_cls, + def __init__(self, init_embed, num_cls, hidden_size=300, num_layers=4, num_head=8, @@ -117,8 +117,7 @@ class STSeqLabel(nn.Module): emb_dropout=0.1, dropout=0.1,): super(STSeqLabel, self).__init__() - self.enc = StarTransEnc(vocab_size=vocab_size, - emb_dim=emb_dim, + self.enc = StarTransEnc(init_embed=init_embed, hidden_size=hidden_size, num_layers=num_layers, num_head=num_head, @@ -139,7 +138,7 @@ class STSeqLabel(nn.Module): nodes, _ = self.enc(words, mask) output = self.cls(nodes) output = output.transpose(1,2) # make hidden to be dim 1 - return {'output': output} # [bsz, n_cls, seq_len] + return {Const.OUTPUT: output} # [bsz, n_cls, seq_len] def predict(self, words, seq_len): """ @@ -149,8 +148,8 @@ class STSeqLabel(nn.Module): :return output: [batch, seq_len] 输出序列中每个元素的分类 """ y = self.forward(words, seq_len) - _, pred = y['output'].max(1) - return {'output': pred} + _, pred = y[Const.OUTPUT].max(1) + return {Const.OUTPUT: pred} class STSeqCls(nn.Module): @@ -169,7 +168,7 @@ class STSeqCls(nn.Module): :param dropout: 模型除词嵌入外的dropout概率. Default: 0.1 """ - def __init__(self, vocab_size, emb_dim, num_cls, + def __init__(self, init_embed, num_cls, hidden_size=300, num_layers=4, num_head=8, @@ -179,8 +178,7 @@ class STSeqCls(nn.Module): emb_dropout=0.1, dropout=0.1,): super(STSeqCls, self).__init__() - self.enc = StarTransEnc(vocab_size=vocab_size, - emb_dim=emb_dim, + self.enc = StarTransEnc(init_embed=init_embed, hidden_size=hidden_size, num_layers=num_layers, num_head=num_head, @@ -201,7 +199,7 @@ class STSeqCls(nn.Module): nodes, relay = self.enc(words, mask) y = 0.5 * (relay + nodes.max(1)[0]) output = self.cls(y) # [bsz, n_cls] - return {'output': output} + return {Const.OUTPUT: output} def predict(self, words, seq_len): """ @@ -211,8 +209,8 @@ class STSeqCls(nn.Module): :return output: [batch, num_cls] 输出序列的分类 """ y = self.forward(words, seq_len) - _, pred = y['output'].max(1) - return {'output': pred} + _, pred = y[Const.OUTPUT].max(1) + return {Const.OUTPUT: pred} class STNLICls(nn.Module): @@ -231,7 +229,7 @@ class STNLICls(nn.Module): :param dropout: 模型除词嵌入外的dropout概率. 
diff --git a/fastNLP/models/star_transformer.py b/fastNLP/models/star_transformer.py
index e4fbeb28..93ee72f6 100644
--- a/fastNLP/models/star_transformer.py
+++ b/fastNLP/models/star_transformer.py
@@ -1,12 +1,12 @@
 """Star-Transformer 的 一个 Pytorch 实现.
 """
-from fastNLP.modules.encoder.star_transformer import StarTransformer
-from fastNLP.core.utils import seq_lens_to_masks
+from ..modules.encoder.star_transformer import StarTransformer
+from ..core.utils import seq_lens_to_masks
 from ..modules.utils import get_embeddings
+from ..core.const import Const
 
 import torch
 from torch import nn
-import torch.nn.functional as F
 
 
 class StarTransEnc(nn.Module):
@@ -107,7 +107,7 @@ class STSeqLabel(nn.Module):
     :param emb_dropout: 词嵌入的dropout概率. Default: 0.1
     :param dropout: 模型除词嵌入外的dropout概率. Default: 0.1
     """
-    def __init__(self, vocab_size, emb_dim, num_cls,
+    def __init__(self, init_embed, num_cls,
                  hidden_size=300,
                  num_layers=4,
                  num_head=8,
@@ -117,8 +117,7 @@ class STSeqLabel(nn.Module):
                  emb_dropout=0.1,
                  dropout=0.1,):
         super(STSeqLabel, self).__init__()
-        self.enc = StarTransEnc(vocab_size=vocab_size,
-                                emb_dim=emb_dim,
+        self.enc = StarTransEnc(init_embed=init_embed,
                                 hidden_size=hidden_size,
                                 num_layers=num_layers,
                                 num_head=num_head,
@@ -139,7 +138,7 @@ class STSeqLabel(nn.Module):
         nodes, _ = self.enc(words, mask)
         output = self.cls(nodes)
         output = output.transpose(1,2)  # make hidden to be dim 1
-        return {'output': output}  # [bsz, n_cls, seq_len]
+        return {Const.OUTPUT: output}  # [bsz, n_cls, seq_len]
 
     def predict(self, words, seq_len):
         """
@@ -149,8 +148,8 @@ class STSeqLabel(nn.Module):
         :return output: [batch, seq_len] 输出序列中每个元素的分类
         """
         y = self.forward(words, seq_len)
-        _, pred = y['output'].max(1)
-        return {'output': pred}
+        _, pred = y[Const.OUTPUT].max(1)
+        return {Const.OUTPUT: pred}
 
 
 class STSeqCls(nn.Module):
@@ -169,7 +168,7 @@ class STSeqCls(nn.Module):
     :param dropout: 模型除词嵌入外的dropout概率. Default: 0.1
     """
 
-    def __init__(self, vocab_size, emb_dim, num_cls,
+    def __init__(self, init_embed, num_cls,
                  hidden_size=300,
                  num_layers=4,
                  num_head=8,
@@ -179,8 +178,7 @@ class STSeqCls(nn.Module):
                  emb_dropout=0.1,
                  dropout=0.1,):
         super(STSeqCls, self).__init__()
-        self.enc = StarTransEnc(vocab_size=vocab_size,
-                                emb_dim=emb_dim,
+        self.enc = StarTransEnc(init_embed=init_embed,
                                 hidden_size=hidden_size,
                                 num_layers=num_layers,
                                 num_head=num_head,
@@ -201,7 +199,7 @@ class STSeqCls(nn.Module):
         nodes, relay = self.enc(words, mask)
         y = 0.5 * (relay + nodes.max(1)[0])
         output = self.cls(y)  # [bsz, n_cls]
-        return {'output': output}
+        return {Const.OUTPUT: output}
 
     def predict(self, words, seq_len):
         """
@@ -211,8 +209,8 @@ class STSeqCls(nn.Module):
         :return output: [batch, num_cls] 输出序列的分类
         """
         y = self.forward(words, seq_len)
-        _, pred = y['output'].max(1)
-        return {'output': pred}
+        _, pred = y[Const.OUTPUT].max(1)
+        return {Const.OUTPUT: pred}
 
 
 class STNLICls(nn.Module):
@@ -231,7 +229,7 @@ class STNLICls(nn.Module):
     :param dropout: 模型除词嵌入外的dropout概率. Default: 0.1
     """
 
-    def __init__(self, vocab_size, emb_dim, num_cls,
+    def __init__(self, init_embed, num_cls,
                  hidden_size=300,
                  num_layers=4,
                  num_head=8,
@@ -241,8 +239,7 @@ class STNLICls(nn.Module):
                  emb_dropout=0.1,
                  dropout=0.1,):
         super(STNLICls, self).__init__()
-        self.enc = StarTransEnc(vocab_size=vocab_size,
-                                emb_dim=emb_dim,
+        self.enc = StarTransEnc(init_embed=init_embed,
                                 hidden_size=hidden_size,
                                 num_layers=num_layers,
                                 num_head=num_head,
@@ -269,7 +266,7 @@ class STNLICls(nn.Module):
         y1 = enc(words1, mask1)
         y2 = enc(words2, mask2)
         output = self.cls(y1, y2)  # [bsz, n_cls]
-        return {'output': output}
+        return {Const.OUTPUT: output}
 
     def predict(self, words1, words2, seq_len1, seq_len2):
         """
@@ -281,5 +278,5 @@ class STNLICls(nn.Module):
         :return output: [batch, num_cls] 输出分类的概率
         """
         y = self.forward(words1, words2, seq_len1, seq_len2)
-        _, pred = y['output'].max(1)
-        return {'output': pred}
+        _, pred = y[Const.OUTPUT].max(1)
+        return {Const.OUTPUT: pred}
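
Besides switching the output keys from the literal `'output'` to the shared `Const.OUTPUT` constant, note why `STSeqLabel.forward` transposes to `[bsz, n_cls, seq_len]`: PyTorch's `CrossEntropyLoss` expects the class dimension second when scoring per-position targets. A self-contained check::

    import torch
    import torch.nn as nn

    bsz, seq_len, n_cls = 2, 5, 7
    logits = torch.randn(bsz, seq_len, n_cls)        # typical classifier output
    target = torch.randint(0, n_cls, (bsz, seq_len))

    # [bsz, n_cls, seq_len] is what the loss wants for sequence targets
    loss = nn.CrossEntropyLoss()(logits.transpose(1, 2), target)
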
diff --git a/fastNLP/modules/aggregator/attention.py b/fastNLP/modules/aggregator/attention.py
index f2f2ac68..67f68ff2 100644
--- a/fastNLP/modules/aggregator/attention.py
+++ b/fastNLP/modules/aggregator/attention.py
@@ -4,10 +4,10 @@ import torch
 import torch.nn.functional as F
 from torch import nn
 
-from fastNLP.modules.dropout import TimestepDropout
-from fastNLP.modules.utils import mask_softmax
+from ..dropout import TimestepDropout
+from ..utils import mask_softmax
 
-from fastNLP.modules.utils import initial_parameter
+from ..utils import initial_parameter
 
 
 class Attention(torch.nn.Module):
diff --git a/fastNLP/modules/aggregator/pooling.py b/fastNLP/modules/aggregator/pooling.py
index 9961b87f..fd4414b7 100644
--- a/fastNLP/modules/aggregator/pooling.py
+++ b/fastNLP/modules/aggregator/pooling.py
@@ -1,17 +1,12 @@
-# python: 3.6
-# encoding: utf-8
-
 import torch
 import torch.nn as nn
 
 
 class MaxPool(nn.Module):
     """Max-pooling模块。"""
-
-    def __init__(
-            self, stride=None, padding=0, dilation=1, dimension=1, kernel_size=None,
-            return_indices=False, ceil_mode=False
-    ):
+
+    def __init__(self, stride=None, padding=0, dilation=1, dimension=1, kernel_size=None,
+                 return_indices=False, ceil_mode=False):
         """
         :param stride: 窗口移动大小,默认为kernel_size
         :param padding: padding的内容,默认为0
@@ -30,7 +25,7 @@ class MaxPool(nn.Module):
         self.kernel_size = kernel_size
         self.return_indices = return_indices
         self.ceil_mode = ceil_mode
-    
+
     def forward(self, x):
         if self.dimension == 1:
             pooling = nn.MaxPool1d(
@@ -57,10 +52,11 @@ class MaxPool(nn.Module):
 
 class MaxPoolWithMask(nn.Module):
     """带mask矩阵的1维max pooling"""
+
     def __init__(self):
         super(MaxPoolWithMask, self).__init__()
         self.inf = 10e12
-    
+
     def forward(self, tensor, mask, dim=1):
         """
         :param torch.FloatTensor tensor: [batch_size, seq_len, channels] 初始tensor
@@ -75,11 +71,11 @@ class MaxPoolWithMask(nn.Module):
 
 class KMaxPool(nn.Module):
     """K max-pooling module."""
-    
+
     def __init__(self, k=1):
         super(KMaxPool, self).__init__()
         self.k = k
-    
+
     def forward(self, x):
         """
         :param torch.Tensor x: [N, C, L] 初始tensor
@@ -92,12 +88,12 @@ class KMaxPool(nn.Module):
 
 class AvgPool(nn.Module):
     """1-d average pooling module."""
-    
+
    def __init__(self, stride=None, padding=0):
         super(AvgPool, self).__init__()
         self.stride = stride
         self.padding = padding
-    
+
     def forward(self, x):
         """
         :param torch.Tensor x: [N, C, L] 初始tensor
@@ -117,7 +113,7 @@ class MeanPoolWithMask(nn.Module):
     def __init__(self):
         super(MeanPoolWithMask, self).__init__()
         self.inf = 10e12
-    
+
     def forward(self, tensor, mask, dim=1):
         """
         :param torch.FloatTensor tensor: [batch_size, seq_len, channels] 初始tensor
@@ -127,7 +123,3 @@ class MeanPoolWithMask(nn.Module):
         """
         masks = mask.view(mask.size(0), mask.size(1), -1).float()
         return torch.sum(tensor * masks.float(), dim=dim) / torch.sum(masks.float(), dim=1)
-
-
-
-
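
`MeanPoolWithMask.forward` is fully visible above: it broadcasts the mask over the channel dimension, then divides the masked sum by the number of real timesteps. A toy check that padded positions do not contribute::

    import torch

    tensor = torch.tensor([[[1.], [3.], [100.]]])   # [batch=1, seq_len=3, channels=1]
    mask = torch.tensor([[1, 1, 0]])                # last step is padding

    masks = mask.view(mask.size(0), mask.size(1), -1).float()
    mean = torch.sum(tensor * masks, dim=1) / torch.sum(masks, dim=1)
    print(mean)   # tensor([[2.]]) — the padded 100. is ignored
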
diff --git a/fastNLP/modules/decoder/CRF.py b/fastNLP/modules/decoder/CRF.py
index cc713bc6..4c3ac122 100644
--- a/fastNLP/modules/decoder/CRF.py
+++ b/fastNLP/modules/decoder/CRF.py
@@ -1,8 +1,8 @@
 import torch
 from torch import nn
 
-from fastNLP.modules.utils import initial_parameter
-from fastNLP.modules.decoder.utils import log_sum_exp
+from ..utils import initial_parameter
+from ..decoder.utils import log_sum_exp
 
 
 def seq_len_to_byte_mask(seq_lens):
diff --git a/fastNLP/modules/decoder/MLP.py b/fastNLP/modules/decoder/MLP.py
index e7fafd68..35484932 100644
--- a/fastNLP/modules/decoder/MLP.py
+++ b/fastNLP/modules/decoder/MLP.py
@@ -1,7 +1,7 @@
 import torch
 import torch.nn as nn
 
-from fastNLP.modules.utils import initial_parameter
+from ..utils import initial_parameter
 
 
 class MLP(nn.Module):
diff --git a/fastNLP/modules/encoder/char_encoder.py b/fastNLP/modules/encoder/char_encoder.py
index 39e4b43e..54b702ea 100644
--- a/fastNLP/modules/encoder/char_encoder.py
+++ b/fastNLP/modules/encoder/char_encoder.py
@@ -1,7 +1,7 @@
 import torch
 from torch import nn
 
-from fastNLP.modules.utils import initial_parameter
+from ..utils import initial_parameter
 
 # from torch.nn.init import xavier_uniform
 
diff --git a/fastNLP/modules/encoder/conv_maxpool.py b/fastNLP/modules/encoder/conv_maxpool.py
index d7a8b286..d01eddea 100644
--- a/fastNLP/modules/encoder/conv_maxpool.py
+++ b/fastNLP/modules/encoder/conv_maxpool.py
@@ -5,7 +5,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-from fastNLP.modules.utils import initial_parameter
+from ..utils import initial_parameter
 
 
 class ConvMaxpool(nn.Module):
diff --git a/fastNLP/modules/encoder/embedding.py b/fastNLP/modules/encoder/embedding.py
index 098788a8..8cc53b0b 100644
--- a/fastNLP/modules/encoder/embedding.py
+++ b/fastNLP/modules/encoder/embedding.py
@@ -1,5 +1,5 @@
 import torch.nn as nn
-from fastNLP.modules.utils import get_embeddings
+from ..utils import get_embeddings
 
 
 class Embedding(nn.Embedding):
     """Embedding组件. 可以通过self.num_embeddings获取词表大小; self.embedding_dim获取embedding的维度"""
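
`get_embeddings`, imported here, is the hook behind the new `init_embed` arguments used throughout this change: from its uses in the diff, a `(vocab_size, embed_dim)` tuple builds a fresh embedding (the new tests pass `(VOCAB_SIZE, 50)` and the like). The sketch below is illustrative of that convention; fastNLP's actual helper may accept more input types::

    import torch
    import torch.nn as nn

    def build_embeddings(init_embed):
        if isinstance(init_embed, tuple):
            return nn.Embedding(*init_embed)             # e.g. (VOCAB_SIZE, 50)
        if isinstance(init_embed, torch.Tensor):
            return nn.Embedding.from_pretrained(init_embed)  # pre-trained weights
        return init_embed                                # assume a ready nn.Module

    emb = build_embeddings((100, 50))
    print(emb.num_embeddings, emb.embedding_dim)         # 100 50
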
diff --git a/fastNLP/modules/encoder/linear.py b/fastNLP/modules/encoder/linear.py
index 2dc31eea..06edf81b 100644
--- a/fastNLP/modules/encoder/linear.py
+++ b/fastNLP/modules/encoder/linear.py
@@ -1,6 +1,6 @@
 import torch.nn as nn
 
-from fastNLP.modules.utils import initial_parameter
+from ..utils import initial_parameter
 
 
 class Linear(nn.Module):
diff --git a/fastNLP/modules/encoder/lstm.py b/fastNLP/modules/encoder/lstm.py
index cff39c84..cc6b1183 100644
--- a/fastNLP/modules/encoder/lstm.py
+++ b/fastNLP/modules/encoder/lstm.py
@@ -5,7 +5,7 @@ import torch
 import torch.nn as nn
 import torch.nn.utils.rnn as rnn
 
-from fastNLP.modules.utils import initial_parameter
+from ..utils import initial_parameter
 
 
 class LSTM(nn.Module):
diff --git a/fastNLP/modules/encoder/variational_rnn.py b/fastNLP/modules/encoder/variational_rnn.py
index 89ab44d9..2657ebf4 100644
--- a/fastNLP/modules/encoder/variational_rnn.py
+++ b/fastNLP/modules/encoder/variational_rnn.py
@@ -3,7 +3,7 @@ import torch
 import torch.nn as nn
 from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence, pad_packed_sequence
 
-from fastNLP.modules.utils import initial_parameter
+from ..utils import initial_parameter
 
 try:
     from torch import flip
diff --git a/test/core/test_batch.py b/test/core/test_batch.py
index 2bf2dcea..072f7c83 100644
--- a/test/core/test_batch.py
+++ b/test/core/test_batch.py
@@ -142,13 +142,12 @@ class TestCase1(unittest.TestCase):
 
     def test_sequential_batch(self):
         batch_size = 32
-        pause_seconds = 0.01
         num_samples = 1000
         dataset = generate_fake_dataset(num_samples)
 
         batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler())
         for batch_x, batch_y in batch:
-            time.sleep(pause_seconds)
+            pass
 
     """
     def test_multi_workers_batch(self):
diff --git a/test/core/test_callbacks.py b/test/core/test_callbacks.py
index 3329e7a1..cf3e2fff 100644
--- a/test/core/test_callbacks.py
+++ b/test/core/test_callbacks.py
@@ -3,7 +3,7 @@ import unittest
 import numpy as np
 import torch
 
-from fastNLP.core.callback import EchoCallback, EarlyStopCallback, GradientClipCallback, LRScheduler, ControlC, \
+from fastNLP.core.callback import EarlyStopCallback, GradientClipCallback, LRScheduler, ControlC, \
     LRFinder, \
     TensorboardCallback
 from fastNLP.core.dataset import DataSet
diff --git a/test/core/test_metrics.py b/test/core/test_metrics.py
index 3f37c495..d4422ec4 100644
--- a/test/core/test_metrics.py
+++ b/test/core/test_metrics.py
@@ -132,6 +132,19 @@ class TestAccuracyMetric(unittest.TestCase):
             return
         self.assertTrue(True, False), "No exception catches."
 
+    def test_seq_len(self):
+        N = 256
+        seq_len = torch.zeros(N).long()
+        seq_len[0] = 2
+        pred = {'pred': torch.ones(N, 2)}
+        target = {'target': torch.ones(N, 2), 'seq_len': seq_len}
+        metric = AccuracyMetric()
+        metric(pred_dict=pred, target_dict=target)
+        self.assertDictEqual(metric.get_metric(), {'acc': 1.})
+        seq_len[1:] = 1
+        metric(pred_dict=pred, target_dict=target)
+        self.assertDictEqual(metric.get_metric(), {'acc': 1.})
+
 class SpanF1PreRecMetric(unittest.TestCase):
     def test_case1(self):
         from fastNLP.core.metrics import _bmes_tag_to_spans
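
The new `test_seq_len` relies on `AccuracyMetric` masking out positions beyond each sample's `seq_len`, so accuracy stays 1.0 even though most positions are never compared. A toy version of that length-masked accuracy (illustrative, not fastNLP's internals)::

    import torch

    pred = torch.ones(2, 2)           # predictions per position
    target = torch.ones(2, 2)         # targets match everywhere
    seq_len = torch.tensor([2, 1])    # second sample has one real position

    mask = torch.arange(2).unsqueeze(0) < seq_len.unsqueeze(1)  # [[T, T], [T, F]]
    correct = ((pred == target) & mask).sum().item()            # 3
    acc = correct / mask.sum().item()                           # 3 / 3 = 1.0
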
diff --git a/test/io/test_dataset_loader.py b/test/io/test_dataset_loader.py
index 97379a7d..2e367567 100644
--- a/test/io/test_dataset_loader.py
+++ b/test/io/test_dataset_loader.py
@@ -1,7 +1,7 @@
 import unittest
 
 from fastNLP.io.dataset_loader import Conll2003Loader, PeopleDailyCorpusLoader, \
-    CSVLoader, SNLILoader
+    CSVLoader, SNLILoader, JsonLoader
 
 
 class TestDatasetLoader(unittest.TestCase):
@@ -24,3 +24,8 @@ class TestDatasetLoader(unittest.TestCase):
     def test_SNLILoader(self):
         ds = SNLILoader().load('test/data_for_tests/sample_snli.jsonl')
         assert len(ds) == 3
+
+    def test_JsonLoader(self):
+        ds = JsonLoader().load('test/data_for_tests/sample_snli.jsonl')
+        assert len(ds) == 3
+
diff --git a/test/models/__init__.py b/test/models/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/test/models/model_runner.py b/test/models/model_runner.py
new file mode 100644
index 00000000..7a8d0593
--- /dev/null
+++ b/test/models/model_runner.py
@@ -0,0 +1,151 @@
+"""
+此模块可以非常方便的测试模型。
+若你的模型属于:文本分类,序列标注,自然语言推理(NLI),可以直接使用此模块测试
+若模型不属于上述类别,也可以自己准备假数据,设定loss和metric进行测试
+
+此模块的测试仅保证模型能使用fastNLP进行训练和测试,不测试模型实际性能
+
+Example::
+
+    # import 全大写变量...
+    from model_runner import *
+
+    # 测试一个文本分类模型
+    init_emb = (VOCAB_SIZE, 50)
+    model = SomeModel(init_emb, num_cls=NUM_CLS)
+    RUNNER.run_model_with_task(TEXT_CLS, model)
+
+    # 序列标注模型
+    RUNNER.run_model_with_task(POS_TAGGING, model)
+
+    # NLI模型
+    RUNNER.run_model_with_task(NLI, model)
+
+    # 自定义模型
+    RUNNER.run_model(model, data=get_mydata(),
+                     loss=Myloss(), metrics=Mymetric())
+"""
+from fastNLP import Trainer, Tester, DataSet
+from fastNLP import AccuracyMetric
+from fastNLP import CrossEntropyLoss
+from fastNLP.core.const import Const as C
+from random import randrange
+
+VOCAB_SIZE = 100
+NUM_CLS = 100
+MAX_LEN = 10
+N_SAMPLES = 100
+N_EPOCHS = 1
+BATCH_SIZE = 5
+
+TEXT_CLS = 'text_cls'
+POS_TAGGING = 'pos_tagging'
+NLI = 'nli'
+
+class ModelRunner():
+    def gen_seq(self, length, vocab_size):
+        """generate fake sequence indexes with given length"""
+        # reserve 0 for padding
+        return [randrange(1, vocab_size) for _ in range(length)]
+
+    def gen_var_seq(self, max_len, vocab_size):
+        """generate fake sequence indexes in variant length"""
+        length = randrange(3, max_len)  # at least 3 words in a seq
+        return self.gen_seq(length, vocab_size)
+
+    def prepare_text_classification_data(self):
+        index = 'index'
+        ds = DataSet({index: list(range(N_SAMPLES))})
+        ds.apply_field(lambda x: self.gen_var_seq(MAX_LEN, VOCAB_SIZE),
+                       field_name=index, new_field_name=C.INPUT,
+                       is_input=True)
+        ds.apply_field(lambda x: randrange(NUM_CLS),
+                       field_name=index, new_field_name=C.TARGET,
+                       is_target=True)
+        ds.apply_field(len, C.INPUT, C.INPUT_LEN,
+                       is_input=True)
+        return ds
+
+    def prepare_pos_tagging_data(self):
+        index = 'index'
+        ds = DataSet({index: list(range(N_SAMPLES))})
+        ds.apply_field(lambda x: self.gen_var_seq(MAX_LEN, VOCAB_SIZE),
+                       field_name=index, new_field_name=C.INPUT,
+                       is_input=True)
+        ds.apply_field(lambda x: self.gen_seq(len(x), NUM_CLS),
+                       field_name=C.INPUT, new_field_name=C.TARGET,
+                       is_target=True)
+        ds.apply_field(len, C.INPUT, C.INPUT_LEN,
+                       is_input=True, is_target=True)
+        return ds
+
+    def prepare_nli_data(self):
+        index = 'index'
+        ds = DataSet({index: list(range(N_SAMPLES))})
+        ds.apply_field(lambda x: self.gen_var_seq(MAX_LEN, VOCAB_SIZE),
+                       field_name=index, new_field_name=C.INPUTS(0),
+                       is_input=True)
+        ds.apply_field(lambda x: self.gen_var_seq(MAX_LEN, VOCAB_SIZE),
+                       field_name=index, new_field_name=C.INPUTS(1),
+                       is_input=True)
+        ds.apply_field(lambda x: randrange(NUM_CLS),
+                       field_name=index, new_field_name=C.TARGET,
+                       is_target=True)
+        ds.apply_field(len, C.INPUTS(0), C.INPUT_LENS(0),
+                       is_input=True, is_target=True)
+        ds.apply_field(len, C.INPUTS(1), C.INPUT_LENS(1),
+                       is_input=True, is_target=True)
+        ds.set_input(C.INPUTS(0), C.INPUTS(1))
+        ds.set_target(C.TARGET)
+        return ds
+
+    def run_text_classification(self, model, data=None):
+        if data is None:
+            data = self.prepare_text_classification_data()
+        loss = CrossEntropyLoss(pred=C.OUTPUT, target=C.TARGET)
+        metric = AccuracyMetric(pred=C.OUTPUT, target=C.TARGET)
+        self.run_model(model, data, loss, metric)
+
+    def run_pos_tagging(self, model, data=None):
+        if data is None:
+            data = self.prepare_pos_tagging_data()
+        loss = CrossEntropyLoss(pred=C.OUTPUT, target=C.TARGET, padding_idx=0)
+        metric = AccuracyMetric(pred=C.OUTPUT, target=C.TARGET, seq_len=C.INPUT_LEN)
+        self.run_model(model, data, loss, metric)
+
+    def run_nli(self, model, data=None):
+        if data is None:
+            data = self.prepare_nli_data()
+        loss = CrossEntropyLoss(pred=C.OUTPUT, target=C.TARGET)
+        metric = AccuracyMetric(pred=C.OUTPUT, target=C.TARGET)
+        self.run_model(model, data, loss, metric)
+
+    def run_model(self, model, data, loss, metrics):
+        """run a model, test if it can run with fastNLP"""
+        print('testing model:', model.__class__.__name__)
+        tester = Tester(data=data, model=model, metrics=metrics,
+                        batch_size=BATCH_SIZE, verbose=0)
+        before_train = tester.test()
+        trainer = Trainer(model=model, train_data=data, dev_data=None,
+                          n_epochs=N_EPOCHS, batch_size=BATCH_SIZE,
+                          loss=loss,
+                          save_path=None,
+                          use_tqdm=False)
+        trainer.train(load_best_model=False)
+        after_train = tester.test()
+        for metric_name, v1 in before_train.items():
+            assert metric_name in after_train
+            # # at least we can sure model params changed, even if we don't know performance
+            # v2 = after_train[metric_name]
+            # assert v1 != v2
+
+    def run_model_with_task(self, task, model):
+        """run a model with certain task"""
+        TASKS = {
+            TEXT_CLS: self.run_text_classification,
+            POS_TAGGING: self.run_pos_tagging,
+            NLI: self.run_nli,
+        }
+        assert task in TASKS
+        TASKS[task](model)
+
+RUNNER = ModelRunner()
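
A reading aid for `model_runner`: `Const` (aliased `C`) centralizes the field-name strings that tie datasets, losses, and metrics together. Based on the convention visible in this PR (`predict()` maps `C.OUTPUTS(...)` onto the old `arc_pred`/`label_pred` keys, and `forward()` takes `words1`/`words2`), the names expand roughly as printed below; treat the exact strings as an assumption and check `fastNLP/core/const.py`::

    from fastNLP.core.const import Const as C

    print(C.INPUT, C.INPUT_LEN, C.OUTPUT, C.TARGET)   # e.g. words seq_len pred target
    print(C.INPUTS(0), C.OUTPUTS(1), C.TARGETS(0))    # e.g. words1 pred2 target1
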
diff --git a/test/models/test_biaffine_parser.py b/test/models/test_biaffine_parser.py
index 5d6c2102..e4746391 100644
--- a/test/models/test_biaffine_parser.py
+++ b/test/models/test_biaffine_parser.py
@@ -2,90 +2,33 @@ import unittest
 
 import fastNLP
 from fastNLP.models.biaffine_parser import BiaffineParser, ParserLoss, ParserMetric
-
-data_file = """
-1 The _ DET DT _ 3 det _ _
-2 new _ ADJ JJ _ 3 amod _ _
-3 rate _ NOUN NN _ 6 nsubj _ _
-4 will _ AUX MD _ 6 aux _ _
-5 be _ VERB VB _ 6 cop _ _
-6 payable _ ADJ JJ _ 0 root _ _
-7 mask _ ADJ JJ _ 6 punct _ _
-8 mask _ ADJ JJ _ 6 punct _ _
-9 cents _ NOUN NNS _ 4 nmod _ _
-10 from _ ADP IN _ 12 case _ _
-11 seven _ NUM CD _ 12 nummod _ _
-12 cents _ NOUN NNS _ 4 nmod _ _
-13 a _ DET DT _ 14 det _ _
-14 share _ NOUN NN _ 12 nmod:npmod _ _
-15 . _ PUNCT . _ 4 punct _ _
-
-1 The _ DET DT _ 3 det _ _
-2 new _ ADJ JJ _ 3 amod _ _
-3 rate _ NOUN NN _ 6 nsubj _ _
-4 will _ AUX MD _ 6 aux _ _
-5 be _ VERB VB _ 6 cop _ _
-6 payable _ ADJ JJ _ 0 root _ _
-7 Feb. _ PROPN NNP _ 6 nmod:tmod _ _
-8 15 _ NUM CD _ 7 nummod _ _
-9 . _ PUNCT . _ 6 punct _ _
-
-1 A _ DET DT _ 3 det _ _
-2 record _ NOUN NN _ 3 compound _ _
-3 date _ NOUN NN _ 7 nsubjpass _ _
-4 has _ AUX VBZ _ 7 aux _ _
-5 n't _ PART RB _ 7 neg _ _
-6 been _ AUX VBN _ 7 auxpass _ _
-7 set _ VERB VBN _ 0 root _ _
-8 . _ PUNCT . _ 7 punct _ _
-
-"""
-
-
-def init_data():
-    ds = fastNLP.DataSet()
-    v = {'words1': fastNLP.Vocabulary(),
-         'words2': fastNLP.Vocabulary(),
-         'label_true': fastNLP.Vocabulary()}
-    data = []
-    for line in data_file.split('\n'):
-        line = line.split()
-        if len(line) == 0 and len(data) > 0:
-            data = list(zip(*data))
-            ds.append(fastNLP.Instance(words1=data[1],
-                                       words2=data[4],
-                                       arc_true=data[6],
-                                       label_true=data[7]))
-            data = []
-        elif len(line) > 0:
-            data.append(line)
-
-    for name in ['words1', 'words2', 'label_true']:
-        ds.apply(lambda x: [''] + list(x[name]), new_field_name=name)
-        ds.apply(lambda x: v[name].add_word_lst(x[name]))
-
-    for name in ['words1', 'words2', 'label_true']:
-        ds.apply(lambda x: [v[name].to_index(w) for w in x[name]], new_field_name=name)
-
-    ds.apply(lambda x: [0] + list(map(int, x['arc_true'])), new_field_name='arc_true')
-    ds.apply(lambda x: len(x['words1']), new_field_name='seq_len')
-    ds.set_input('words1', 'words2', 'seq_len', flag=True)
-    ds.set_target('arc_true', 'label_true', 'seq_len', flag=True)
-    return ds, v['words1'], v['words2'], v['label_true']
-
+from .model_runner import *
+
+
+def prepare_parser_data():
+    index = 'index'
+    ds = DataSet({index: list(range(N_SAMPLES))})
+    ds.apply_field(lambda x: RUNNER.gen_var_seq(MAX_LEN, VOCAB_SIZE),
+                   field_name=index, new_field_name=C.INPUTS(0),
+                   is_input=True)
+    ds.apply_field(lambda x: RUNNER.gen_seq(len(x), NUM_CLS),
+                   field_name=C.INPUTS(0), new_field_name=C.INPUTS(1),
+                   is_input=True)
+    # target1 is heads, which should be in range(0, len(words))
+    ds.apply_field(lambda x: RUNNER.gen_seq(len(x), len(x)),
+                   field_name=C.INPUTS(0), new_field_name=C.TARGETS(0),
+                   is_target=True)
+    ds.apply_field(lambda x: RUNNER.gen_seq(len(x), NUM_CLS),
+                   field_name=C.INPUTS(0), new_field_name=C.TARGETS(1),
+                   is_target=True)
+    ds.apply_field(len, field_name=C.INPUTS(0), new_field_name=C.INPUT_LEN,
+                   is_input=True, is_target=True)
+    return ds
 
 
 class TestBiaffineParser(unittest.TestCase):
     def test_train(self):
-        ds, v1, v2, v3 = init_data()
-        model = BiaffineParser(word_vocab_size=len(v1), word_emb_dim=30,
-                               pos_vocab_size=len(v2), pos_emb_dim=30,
-                               num_label=len(v3), encoder='var-lstm')
-        trainer = fastNLP.Trainer(model=model, train_data=ds, dev_data=ds,
-                                  loss=ParserLoss(), metrics=ParserMetric(), metric_key='UAS',
-                                  batch_size=1, validate_every=10,
-                                  n_epochs=10, use_cuda=False, use_tqdm=False)
-        trainer.train(load_best_model=False)
-
-
-if __name__ == '__main__':
-    unittest.main()
+        model = BiaffineParser(init_embed=(VOCAB_SIZE, 30),
+                               pos_vocab_size=VOCAB_SIZE, pos_emb_dim=30,
+                               num_label=NUM_CLS, encoder='var-lstm')
+        ds = prepare_parser_data()
+        RUNNER.run_model(model, ds, loss=ParserLoss(), metrics=ParserMetric())
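
The head targets use `gen_seq(len(x), len(x))` because a head must be a valid token index for its own sentence; since `gen_seq` reserves index 0, the generated heads fall in `[1, len-1]`. A toy run of the same helpers::

    from random import randrange, seed

    seed(0)
    gen_seq = lambda length, top: [randrange(1, top) for _ in range(length)]

    words = gen_seq(5, 100)                   # token ids, 0 kept free for padding
    heads = gen_seq(len(words), len(words))   # head indices, all < len(words)
    assert all(0 < h < len(words) for h in heads)
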
diff --git a/test/models/test_star_trans.py b/test/models/test_star_trans.py
new file mode 100644
index 00000000..b08e2efe
--- /dev/null
+++ b/test/models/test_star_trans.py
@@ -0,0 +1,16 @@
+from .model_runner import *
+from fastNLP.models.star_transformer import STNLICls, STSeqCls, STSeqLabel
+
+
+# add star-transformer tests, for 3 kinds of tasks.
+def test_cls():
+    model = STSeqCls((VOCAB_SIZE, 100), NUM_CLS, dropout=0)
+    RUNNER.run_model_with_task(TEXT_CLS, model)
+
+def test_nli():
+    model = STNLICls((VOCAB_SIZE, 100), NUM_CLS, dropout=0)
+    RUNNER.run_model_with_task(NLI, model)
+
+def test_seq_label():
+    model = STSeqLabel((VOCAB_SIZE, 100), NUM_CLS, dropout=0)
+    RUNNER.run_model_with_task(POS_TAGGING, model)
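
These new test files use bare pytest-style functions and a package-relative import (`from .model_runner import *`), which is why the PR also adds `test/models/__init__.py`. A programmatic invocation, assuming pytest is installed and the working directory is the repository root::

    import pytest

    pytest.main(['-q', 'test/models/test_star_trans.py',
                 'test/models/test_biaffine_parser.py'])
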