diff --git a/docs/source/tutorials/文本分类.rst b/docs/source/tutorials/文本分类.rst
index c056d038..f4995dc1 100644
--- a/docs/source/tutorials/文本分类.rst
+++ b/docs/source/tutorials/文本分类.rst
@@ -375,4 +375,4 @@ fastNLP提供了Trainer对象来组织训练过程，包括完成loss计算(所
 
 .. raw:: html
 
-    <a href="../_static/notebooks/%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB.ipynb" download="文本分类.ipynb">点击下载 IPython Notebook 文件 </a>
+    <a href="../_static/notebooks/%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB.ipynb" download="文本分类.ipynb">点击下载 IPython Notebook 文件 </a><hr>
diff --git a/docs/source/user/example.rst b/docs/source/user/example.rst
index 70ebe628..63535058 100644
--- a/docs/source/user/example.rst
+++ b/docs/source/user/example.rst
@@ -154,3 +154,9 @@ csv 表格
 
 :meth:`fastNLP.DataSet.apply`
 
+下面这个代码是不可行的，必须要用 r""" 才行:
+
+.. code::
+
+    :param float beta: f_beta分数， :math:`f_{beta} = \frac{(1 + {beta}^{2})*(pre*rec)}{({beta}^{2}*pre + rec)}` . 常用为 `beta=0.5, 1, 2` 若为0.5则精确率的权重高于召回率；若为1，则两者平等；若为2，则召回率权重高于精确率。
+
diff --git a/fastNLP/__init__.py b/fastNLP/__init__.py
index 077a4d95..c7cca71d 100644
--- a/fastNLP/__init__.py
+++ b/fastNLP/__init__.py
@@ -44,7 +44,7 @@ __all__ = [
     "AutoPadder",
     "EngChar2DPadder",
 
-    "CollectFn",
+    # "CollectFn",
     "ConcatCollectFn",
 
     "MetricBase",
diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py
index 58d304c9..10f4b966 100644
--- a/fastNLP/core/batch.py
+++ b/fastNLP/core/batch.py
@@ -260,51 +260,51 @@ class DataSetIter(BatchIter):
 class TorchLoaderIter(BatchIter):
     """
     与DataSetIter类似，但可以用于非fastNLP的数据容器对象，然后将其传入到Trainer中。
-        只需要保证数据容器实现了实现了以下的方法
+    只需要保证数据容器实现了以下的方法
 
-        Example::
-
-            import random
-            from fastNLP import TorchLoaderIter
-            import torch
-            class UdfDataSet:
-                def __init__(self, num_samples):
-                    self.num_samples = num_samples
-
-                def __getitem__(self, idx):  # 必须实现的方法，输入参数是一个int，范围为[0, len(self))
-                    x = [random.random() for _ in range(3)]
-                    y = random.random()
-                    return x,y
-
-                def __len__(self):  # 需要实现该方法返回值需要是一个int数据
-                    return self.num_samples
-
-            # 需要实现collact_fn将数据转换为tensor
-            def collact_fn(data_list):
-                # [(x1,y1), (x2,y2), ...], 这里的输入实际上是将UdfDataSet的__getitem__输入结合为list
-                xs, ys = [], []
-                for l in data_list:
-                    x, y = l
-                    xs.append(x)
-                    ys.append(y)
-                # 不需要转移到gpu，Trainer或Tester会将其转移到model所在的device
-                x,y = torch.FloatTensor(xs), torch.FloatTensor(ys)
-                return {'x':x, 'y':y}, {'y':y}
+    Example::
 
-            udf_dataset = UdfDataSet(10)
-            dataset = TorchLoaderIter(udf_dataset, collate_fn=collact_fn)
-            class Model(nn.Module):
-                def __init__(self):
-                    super().__init__()
-                    self.fc = nn.Linear(3, 1)
-                def forward(self, x, y):
-                    return {'loss':torch.pow(self.fc(x).squeeze(-1)-y, 2).sum()}
-                def predict(self, x):
-                    return {'pred':self.fc(x).squeeze(0)}
-            model = Model()
-            trainer = Trainer(train_data=dataset, model=model, loss=None, print_every=2, dev_data=dataset,
-                              metrics=AccuracyMetric(target='y'), use_tqdm=False)
-            trainer.train(load_best_model=False)
+        import random
+        from fastNLP import TorchLoaderIter
+        import torch
+        class UdfDataSet:
+            def __init__(self, num_samples):
+                self.num_samples = num_samples
+
+            def __getitem__(self, idx):  # 必须实现的方法，输入参数是一个int，范围为[0, len(self))
+                x = [random.random() for _ in range(3)]
+                y = random.random()
+                return x,y
+
+            def __len__(self):  # 需要实现该方法返回值需要是一个int数据
+                return self.num_samples
+
+        # 需要实现collact_fn将数据转换为tensor
+        def collact_fn(data_list):
+            # [(x1,y1), (x2,y2), ...], 这里的输入实际上是将UdfDataSet的__getitem__输入结合为list
+            xs, ys = [], []
+            for l in data_list:
+                x, y = l
+                xs.append(x)
+                ys.append(y)
+            # 不需要转移到gpu，Trainer或Tester会将其转移到model所在的device
+            x,y = torch.FloatTensor(xs), torch.FloatTensor(ys)
+            return {'x':x, 'y':y}, {'y':y}
+
+        udf_dataset = UdfDataSet(10)
+        dataset = TorchLoaderIter(udf_dataset, collate_fn=collact_fn)
+        class Model(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.fc = nn.Linear(3, 1)
+            def forward(self, x, y):
+                return {'loss':torch.pow(self.fc(x).squeeze(-1)-y, 2).sum()}
+            def predict(self, x):
+                return {'pred':self.fc(x).squeeze(0)}
+        model = Model()
+        trainer = Trainer(train_data=dataset, model=model, loss=None, print_every=2, dev_data=dataset,
+                          metrics=AccuracyMetric(target='y'), use_tqdm=False)
+        trainer.train(load_best_model=False)
 
     除此之外，还可以通过该方法实现OnTheFly的训练，如下面的代码所示
 
@@ -321,7 +321,7 @@ class TorchLoaderIter(BatchIter):
                 data.append(x + [y])
             with open(tmp_file_path, 'w') as f:
                 for d in data:
-                    f.write(' '.join(map(str, d)) + '\n')
+                    f.write(' '.join(map(str, d)) + '\\n')
 
             class FileDataSet:
                 def __init__(self, tmp_file):
@@ -382,6 +382,7 @@ class TorchLoaderIter(BatchIter):
             import os
             if os.path.exists(tmp_file_path):
                 os.remove(tmp_file_path)
+    
     """
     def __init__(self, dataset, batch_size=1, sampler=None,
                  num_workers=0, pin_memory=False, drop_last=False,
@@ -391,7 +392,6 @@ class TorchLoaderIter(BatchIter):
         :param dataset: :class:`~fastNLP.DataSet` 对象, 数据集
         :param int batch_size: 取出的batch大小
         :param sampler: 规定使用的 :class:`~fastNLP.Sampler` 方式. 若为 ``None`` , 使用 :class:`~fastNLP.SequentialSampler`.
-
             Default: ``None``
         :param int num_workers: 使用多少个进程来预处理数据
         :param bool pin_memory: 是否将产生的tensor使用pin memory, 可能会加快速度。
diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py
index f57766c4..14f94e19 100644
--- a/fastNLP/core/callback.py
+++ b/fastNLP/core/callback.py
@@ -806,7 +806,7 @@ class TensorboardCallback(Callback):
     
     .. warning::
         fastNLP 已停止对此功能的维护，请等待 fastNLP 兼容 PyTorch1.1 的下一个版本。
-        或者使用和 fastNLP 高度配合的 fitlog（参见 :doc:`/tutorials/tutorial_11_fitlog` ）。
+        或者使用和 fastNLP 高度配合的 fitlog（参见 :doc:`/tutorials/extend_2_fitlog` ）。
         
     """
     
diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py
index c2c5048e..1275574b 100644
--- a/fastNLP/core/dataset.py
+++ b/fastNLP/core/dataset.py
@@ -282,7 +282,7 @@
         dataset.set_pad_val('chars', -1)
 
 3.3 根据DataSet中多个field合成新的field
---------------------------------------
+------------------------------------------------------------
 
     DataSet支持在进行batch时，默认只能看到当前的field的值，但在某些训练中可能存在以下的情况: (1)需要两个field拼接成为一个field;
     (2)需要在batch中进行负采样。这时候就需要能够同时利用多个field进行batch的操作，DataSet中的add_collect_fn()函数支持添加
diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py
index 97d181f9..3dfd9574 100644
--- a/fastNLP/core/metrics.py
+++ b/fastNLP/core/metrics.py
@@ -282,21 +282,31 @@ class MetricBase(object):
 class ConfusionMatrixMetric(MetricBase):
     r"""
     分类问题计算混淆矩阵的Metric（其它的Metric参见 :mod:`fastNLP.core.metrics` ）
-    最后返回结果为dict,{'confusion_matrix': ConfusionMatrix实例}
+    最后返回结果为::
+        
+        dict,{'confusion_matrix': ConfusionMatrix实例}
+        
     ConfusionMatrix实例的print()函数将输出矩阵字符串。
-    pred_dict = {"pred": torch.Tensor([2,1,3])}
-    target_dict = {'target': torch.Tensor([2,2,1])}
-    metric = ConfusionMatrixMetric()
-    metric(pred_dict=pred_dict, target_dict=target_dict, )
-    print(metric.get_metric())
-    {'confusion_matrix': 
-     target  1.0     2.0     3.0     all
-       pred
-        1.0    0       1       0       1
-        2.0    0       1       0       1
-        3.0    1       0       0       1
-        all    1       2       0       3
-}
+    
+    .. code::
+    
+        pred_dict = {"pred": torch.Tensor([2,1,3])}
+        target_dict = {'target': torch.Tensor([2,2,1])}
+        metric = ConfusionMatrixMetric()
+        metric(pred_dict=pred_dict, target_dict=target_dict, )
+        print(metric.get_metric())
+        
+    .. code::
+        
+        {'confusion_matrix':
+         target  1.0     2.0     3.0     all
+           pred
+            1.0    0       1       0       1
+            2.0    0       1       0       1
+            3.0    1       0       0       1
+            all    1       2       0       3
+        }
+        
     """
     def __init__(self,
                  vocab=None,
@@ -322,12 +332,12 @@ class ConfusionMatrixMetric(MetricBase):
     def evaluate(self, pred, target, seq_len=None):
         """
         evaluate函数将针对一个批次的预测结果做评价指标的累计
+        
         :param torch.Tensor pred: 预测的tensor, tensor的形状可以是torch.Size([B,]), torch.Size([B, n_classes]),
-                torch.Size([B, max_len]), 或者torch.Size([B, max_len, n_classes])
+            torch.Size([B, max_len]), 或者torch.Size([B, max_len, n_classes])
         :param torch.Tensor target: 真实值的tensor, tensor的形状可以是Element's can be: torch.Size([B,]),
-                torch.Size([B,]), torch.Size([B, max_len]), 或者torch.Size([B, max_len])
+            torch.Size([B,]), torch.Size([B, max_len]), 或者torch.Size([B, max_len])
         :param torch.Tensor seq_len: 序列长度标记, 标记的形状可以是None, torch.Size([B]), 或者torch.Size([B]).
-                
         """
         if not isinstance(pred, torch.Tensor):
             raise TypeError(
@@ -489,11 +499,12 @@ class ClassifyFPreRecMetric(MetricBase):
             'rec-label':xxx,
             ...
         }
+    
     """
 
     def __init__(self, tag_vocab=None, pred=None, target=None, seq_len=None, ignore_labels=None,
                  only_gross=True, f_type='micro', beta=1):
-        """
+        r"""
 
         :param tag_vocab: 标签的 :class:`~fastNLP.Vocabulary` . 默认值为None。若为None则使用数字来作为标签内容，否则使用vocab来作为标签内容。
         :param str pred: 用该key在evaluate()时从传入dict中取出prediction数据。 为None，则使用 `pred` 取数据
@@ -504,6 +515,7 @@ class ClassifyFPreRecMetric(MetricBase):
         :param str f_type: `micro` 或 `macro` . `micro` :通过先计算总体的TP，FN和FP的数量，再计算f, precision, recall; `macro` : 分布计算每个类别的f, precision, recall，然后做平均（各类别f的权重相同）
         :param float beta: f_beta分数， :math:`f_{beta} = \frac{(1 + {beta}^{2})*(pre*rec)}{({beta}^{2}*pre + rec)}` . 常用为 `beta=0.5, 1, 2` 若为0.5则精确率的权重高于召回率；若为1，则两者平等；若为2，则召回率权重高于精确率。
         """
+        
         if tag_vocab:
             if not isinstance(tag_vocab, Vocabulary):
                 raise TypeError("tag_vocab can only be fastNLP.Vocabulary, not {}.".format(type(tag_vocab)))
diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py
index 002a6930..8e8c2101 100644
--- a/fastNLP/core/trainer.py
+++ b/fastNLP/core/trainer.py
@@ -355,6 +355,7 @@ from .utils import _move_model_to_device
 from ._parallel_utils import _model_contains_inner_module
 from ._logger import logger
 
+
 class Trainer(object):
     """
     Trainer在fastNLP中用于组织单任务的训练过程，可以避免用户在不同训练任务中重复撰写
@@ -373,9 +374,8 @@ class Trainer(object):
                  dev_data=None, metrics=None, metric_key=None,
                  validate_every=-1, save_path=None, use_tqdm=True, device=None,
                  callbacks=None, check_code_level=0, **kwargs):
-        """
-        
-        :param train_data: 训练集， :class:`~fastNLP.DataSet` 类型或 :class:`~fastNLP.BatchIter`的子类
+        r"""
+        :param train_data: 训练集， :class:`~fastNLP.DataSet` 类型或 :class:`~fastNLP.BatchIter` 的子类
         :param nn.modules model: 待训练的模型
         :param optimizer: `torch.optim.Optimizer` 优化器。如果为None，则Trainer使用默认的Adam(model.parameters(), lr=4e-3)这个优化器
         :param int batch_size: 训练和验证的时候的batch大小。
diff --git a/fastNLP/embeddings/elmo_embedding.py b/fastNLP/embeddings/elmo_embedding.py
index ce077ebf..06cc04f5 100644
--- a/fastNLP/embeddings/elmo_embedding.py
+++ b/fastNLP/embeddings/elmo_embedding.py
@@ -26,6 +26,9 @@ class ElmoEmbedding(ContextualEmbedding):
     """
     使用ELMo的embedding。初始化之后，只需要传入words就可以得到对应的embedding。
     当前支持的使用名称初始化的模型:
+    
+    .. code::
+    
         en: 即en-medium hidden_size 1024; output_size 12
         en-medium: hidden_size 2048; output_size 256
         en-origial: hidden_size 4096; output_size 512
diff --git a/fastNLP/embeddings/static_embedding.py b/fastNLP/embeddings/static_embedding.py
index 1338f492..17fc6dff 100644
--- a/fastNLP/embeddings/static_embedding.py
+++ b/fastNLP/embeddings/static_embedding.py
@@ -27,6 +27,9 @@ class StaticEmbedding(TokenEmbedding):
     StaticEmbedding组件. 给定预训练embedding的名称或路径，根据vocab从embedding中抽取相应的数据(只会将出现在vocab中的词抽取出来，
     如果没有找到，则会随机初始化一个值(但如果该word是被标记为no_create_entry的话，则不会单独创建一个值，而是会被指向unk的index))。
     当前支持自动下载的预训练vector有:
+    
+    .. code::
+    
         en: 实际为en-glove-840b-300d(常用)
         en-glove-6b-50d: glove官方的50d向量
         en-glove-6b-100d: glove官方的100d向量
@@ -88,8 +91,7 @@ class StaticEmbedding(TokenEmbedding):
         :param dict kwargs:
                 bool only_train_min_freq: 仅对train中的词语使用min_freq筛选;
                 bool only_norm_found_vector: 是否仅对在预训练中找到的词语使用normalize;
-                bool only_use_pretrain_word: 仅使用出现在pretrain词表中的词，如果该词没有在预训练的词表中出现则为unk。如果
-                    embedding不需要更新建议设置为True。
+                bool only_use_pretrain_word: 仅使用出现在pretrain词表中的词，如果该词没有在预训练的词表中出现则为unk。如果embedding不需要更新建议设置为True。
         """
         super(StaticEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
         if embedding_dim > 0:
diff --git a/fastNLP/io/loader/conll.py b/fastNLP/io/loader/conll.py
index 2e0bb038..7a206b68 100644
--- a/fastNLP/io/loader/conll.py
+++ b/fastNLP/io/loader/conll.py
@@ -227,7 +227,7 @@ class OntoNotesNERLoader(ConllLoader):
         :header: "raw_words", "target"
 
         "['Hi', 'everyone', '.']", "['O', 'O', 'O']"
-        "['first', 'up', 'on', 'the', 'docket'], "['O', 'O', 'O', 'O', 'O']"
+        "['first', 'up', 'on', 'the', 'docket']", "['O', 'O', 'O', 'O', 'O']"
         "[...]", "[...]"
 
     """
diff --git a/fastNLP/io/loader/qa.py b/fastNLP/io/loader/qa.py
index 782a2701..ec248e0a 100644
--- a/fastNLP/io/loader/qa.py
+++ b/fastNLP/io/loader/qa.py
@@ -29,10 +29,10 @@ class CMRC2018Loader(Loader):
     验证集DataSet将具备以下的内容，每个问题的答案可能有三个(有时候只是3个重复的答案)
 
     .. csv-table::
-       :header:"title", "context", "question", "answers", "answer_starts", "id"
+       :header: "title", "context", "question", "answers", "answer_starts", "id"
 
-       "战国无双3", "《战国无双3》（）是由光荣和ω-force开发...", "《战国无双3》是由哪两个公司合作开发的？", ["光荣和ω-force", "光荣和ω-force", "光荣和ω-force"], ["30", "30", "30"], "DEV_0_QUERY_0"
-       "战国无双3", "《战国无双3》（）是由光荣和ω-force开发...", "男女主角亦有专属声优这一模式是由谁改编的？", ["村雨城", "村雨城", "任天堂游戏谜之村雨城"], ["226", "226", "219"], "DEV_0_QUERY_1"
+       "战国无双3", "《战国无双3》（）是由光荣和ω-force开发...", "《战国无双3》是由哪两个公司合作开发的？", "['光荣和ω-force', '光荣和ω-force', '光荣和ω-force']", "[30, 30, 30]", "DEV_0_QUERY_0"
+       "战国无双3", "《战国无双3》（）是由光荣和ω-force开发...", "男女主角亦有专属声优这一模式是由谁改编的？", "['村雨城', '村雨城', '任天堂游戏谜之村雨城']", "[226, 226, 219]", "DEV_0_QUERY_1"
        "...", "...", "...","...", ".", "..."
 
     其中answer_starts是从0开始的index。例如"我来自a复旦大学？"，其中"复"的开始index为4。另外"Russell评价说"中的说的index为9, 因为
diff --git a/fastNLP/io/pipe/qa.py b/fastNLP/io/pipe/qa.py
index e8c0c69b..949f143b 100644
--- a/fastNLP/io/pipe/qa.py
+++ b/fastNLP/io/pipe/qa.py
@@ -83,15 +83,19 @@ class CMRC2018BertPipe(Pipe):
 
     .. csv-table::
         :header: "context_len", "raw_chars",  "target_start", "target_end", "chars"
-        492, ['范', '廷', '颂... ], 30, 34, [21, 25, ...]
-        491, ['范', '廷', '颂... ], 41, 61, [21, 25, ...]
+        
+        492, "['范', '廷', '颂... ]", 30, 34, "[21, 25, ...]"
+        491, "['范', '廷', '颂... ]", 41, 61, "[21, 25, ...]"
 
-       ".", "...", "...","...", "..."
+        ".", "...", "...","...", "..."
 
     raw_words列是context与question拼起来的结果(连接的地方加入了[SEP])，words是转为index的值, target_start为答案start的index，target_end为答案end的index
     （闭区间）；context_len指示的是words列中context的长度。
 
     其中各列的meta信息如下:
+    
+    .. code::
+    
         +-------------+-------------+-----------+--------------+------------+-------+---------+
         | field_names | context_len | raw_chars | target_start | target_end | chars | answers |
         +-------------+-------------+-----------+--------------+------------+-------+---------|
@@ -100,7 +104,7 @@ class CMRC2018BertPipe(Pipe):
         | ignore_type |    False    |    True   |    False     |   False    | False |  True   |
         |  pad_value  |      0      |     0     |      0       |     0      |   0   |   0     |
         +-------------+-------------+-----------+--------------+------------+-------+---------+
-
+    
     """
     def __init__(self, max_len=510):
         super().__init__()