diff --git a/docs/source/tutorials/tutorial_2_load_dataset.rst b/docs/source/tutorials/tutorial_2_load_dataset.rst
index d964d8c7..7f4509f2 100644
--- a/docs/source/tutorials/tutorial_2_load_dataset.rst
+++ b/docs/source/tutorials/tutorial_2_load_dataset.rst
@@ -91,11 +91,11 @@ Part IV: DataSetLoader examples
 Take the Matching task as an example:
 
-    :class:`~fastNLP.io.data_loader.matching.MatchingLoader`
-        fastNLP provides a data-loading class for Matching-task datasets: :class:`~fastNLP.io.data_loader.matching.MatchingLoader` .
+    :class:`~fastNLP.io.data_loader.MatchingLoader`
+        fastNLP provides a data-loading class for Matching-task datasets: :class:`~fastNLP.io.data_loader.MatchingLoader` .
 
         The MatchingLoader class also wraps a function that further preprocesses the text contents of a dataset:
-        :meth:`~fastNLP.io.data_loader.matching.MatchingLoader.process`
+        :meth:`~fastNLP.io.data_loader.MatchingLoader.process`
         This function provides various preprocessing options, such as:
 
         - whether to convert the text to all lowercase
         - whether sequence-length information is needed, and of what type
@@ -104,21 +104,58 @@ Part IV: DataSetLoader examples
     See :meth:`fastNLP.io.MatchingLoader.process` for details.
 
-    :class:`~fastNLP.io.data_loader.matching.SNLILoader`
+    :class:`~fastNLP.io.data_loader.SNLILoader`
         A DataSetLoader for the SNLI dataset. The SNLI dataset comes from
         `SNLI Data Set <https://nlp.stanford.edu/projects/snli/>`_ .
 
-        In the :meth:`~fastNLP.io.data_loader.matching.SNLILoader._load` method of :class:`~fastNLP.io.data_loader.matching.SNLILoader`,
-        we read the dataset contents from text files into memory with the following code
+        In the :meth:`~fastNLP.io.data_loader.SNLILoader._load` method of :class:`~fastNLP.io.data_loader.SNLILoader`,
+        we read the dataset contents from text files into memory with the following code:
 
         .. code-block:: python
 
             data = SNLILoader().process(
-                paths='path/to/snli/data', to_lower=False, seq_len_type=arg.seq_len_type,
+                paths='path/to/snli/data', to_lower=False, seq_len_type='seq_len',
                 get_index=True, concat=False,
             )
+            print(data)
 
-    Here, data can be passed directly to :class:`~fastNLP.Trainer` for
+        The output is::
+
+            In total 3 datasets:
+                train has 549367 instances.
+                dev has 9842 instances.
+                test has 9824 instances.
+            In total 2 vocabs:
+                words has 43154 entries.
+                target has 3 entries.
+
+
+        Here, data is a :class:`~fastNLP.io.base_loader.DataBundle`; the entries of its ``datasets`` dict can be passed directly to
+        :class:`~fastNLP.Trainer` or :class:`~fastNLP.Tester` for training or testing.
+
+    :class:`~fastNLP.io.data_loader.IMDBLoader`
+        Taking the IMDB dataset as an example, in the :meth:`~fastNLP.io.data_loader.IMDBLoader._load` method of :class:`~fastNLP.io.data_loader.IMDBLoader`,
+        we read the dataset contents from text files into memory with the following code:
+
+        .. code-block:: python
+
+            data = IMDBLoader().process(
+                paths={'train': 'path/to/train/file', 'test': 'path/to/test/file'}
+            )
+            print(data)
+
+        The output is::
+
+            In total 3 datasets:
+                train has 22500 instances.
+                test has 25000 instances.
+                dev has 2500 instances.
+            In total 2 vocabs:
+                words has 82846 entries.
+                target has 2 entries.
+
+
+        Here, the original train split is divided into a training set and a validation set at a ratio of 9:1.
+
 
 ------------------------------------------
 
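The hunk above states that the entries of the returned ``datasets`` dict can be fed directly to :class:`~fastNLP.Trainer` or :class:`~fastNLP.Tester`. The following is only a rough sketch of that workflow and is not part of the patch: ``model`` stands for any fastNLP model whose input/target fields match what ``SNLILoader().process()`` produces, and the loss/metric choices (``CrossEntropyLoss``, ``AccuracyMetric``) are illustrative assumptions.

.. code-block:: python

    from fastNLP import Trainer, Tester, CrossEntropyLoss, AccuracyMetric
    from fastNLP.io.data_loader import SNLILoader

    # Load and preprocess SNLI as in the tutorial hunk above;
    # 'path/to/snli/data' is still a placeholder path.
    data = SNLILoader().process(
        paths='path/to/snli/data', to_lower=False, seq_len_type='seq_len',
        get_index=True, concat=False,
    )

    train_set = data.datasets['train']   # the DataBundle keeps the splits in a dict
    dev_set = data.datasets['dev']
    test_set = data.datasets['test']
    vocab = data.vocabs['words']         # e.g. len(vocab) can size an embedding layer

    # `model` is a placeholder: any fastNLP model whose forward() arguments match
    # the input/target fields set by SNLILoader.process(). It is not defined here.
    trainer = Trainer(train_data=train_set, model=model, loss=CrossEntropyLoss(),
                      metrics=AccuracyMetric(), dev_data=dev_set)
    trainer.train()

    Tester(data=test_set, model=model, metrics=AccuracyMetric()).test()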
diff --git a/fastNLP/core/optimizer.py b/fastNLP/core/optimizer.py
index f0dfdef0..3036257c 100644
--- a/fastNLP/core/optimizer.py
+++ b/fastNLP/core/optimizer.py
@@ -104,25 +104,29 @@ class Adam(Optimizer):
 
 
 class AdamW(TorchOptimizer):
-    r"""An implementation of AdamW, which is expected to appear in a later PyTorch release, https://github.com/pytorch/pytorch/pull/21250. It is added here ahead of time.
+    r"""
+    Alias: :class:`fastNLP.AdamW`   :class:`fastNLP.core.optimizer.AdamW`
+
+    An implementation of AdamW, which is expected to appear in a later PyTorch release, https://github.com/pytorch/pytorch/pull/21250. It is added here ahead of time.
 
     .. todo::
         Translate into Chinese
 
     The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.
     The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.
-    Arguments:
-        params (iterable): iterable of parameters to optimize or dicts defining
-            parameter groups
-        lr (float, optional): learning rate (default: 1e-3)
-        betas (Tuple[float, float], optional): coefficients used for computing
-            running averages of gradient and its square (default: (0.9, 0.99))
-        eps (float, optional): term added to the denominator to improve
-            numerical stability (default: 1e-8)
-        weight_decay (float, optional): weight decay coefficient (default: 1e-2)
-        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
-            algorithm from the paper `On the Convergence of Adam and Beyond`_
-            (default: False)
+
+    :param params (iterable): iterable of parameters to optimize or dicts defining
+        parameter groups
+    :param lr (float, optional): learning rate (default: 1e-3)
+    :param betas (Tuple[float, float], optional): coefficients used for computing
+        running averages of gradient and its square (default: (0.9, 0.99))
+    :param eps (float, optional): term added to the denominator to improve
+        numerical stability (default: 1e-8)
+    :param weight_decay (float, optional): weight decay coefficient (default: 1e-2)
+    :param amsgrad (boolean, optional): whether to use the AMSGrad variant of this
+        algorithm from the paper `On the Convergence of Adam and Beyond`_
+        (default: False)
+
     .. _Adam\: A Method for Stochastic Optimization:
         https://arxiv.org/abs/1412.6980
     .. _Decoupled Weight Decay Regularization:
@@ -152,9 +156,9 @@ class AdamW(TorchOptimizer):
 
     def step(self, closure=None):
         """Performs a single optimization step.
-        Arguments:
-            closure (callable, optional): A closure that reevaluates the model
-                and returns the loss.
+
+        :param closure: (callable, optional) A closure that reevaluates the model
+            and returns the loss.
         """
         loss = None
         if closure is not None:
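The new docstring above only documents the constructor arguments. As a minimal usage sketch, assuming ``fastNLP.AdamW`` behaves like a standard torch optimizer (it subclasses ``TorchOptimizer`` and exposes ``step()``), one could write the following; the toy linear model and random batch are invented purely for illustration and are not part of the patch.

.. code-block:: python

    import torch
    import torch.nn.functional as F

    from fastNLP import AdamW  # alias documented in the new docstring

    # Toy model and data, purely for illustration.
    model = torch.nn.Linear(10, 1)
    inputs, targets = torch.randn(32, 10), torch.randn(32, 1)

    # Defaults mirror the docstring: betas=(0.9, 0.99), eps=1e-8, weight_decay=1e-2.
    optimizer = AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)

    for _ in range(5):
        optimizer.zero_grad()
        loss = F.mse_loss(model(inputs), targets)
        loss.backward()
        optimizer.step()

Because AdamW decouples weight decay from the gradient update, the decay is configured on the optimizer itself rather than being folded into the loss term.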