修复了一系列文档中的 bug

5 years ago · a096e70242
--- a/docs/source/tutorials/文本分类.rst
+++ b/docs/source/tutorials/文本分类.rst
@@ -375,4 +375,4 @@ fastNLP提供了Trainer对象来组织训练过程，包括完成loss计算(所

 .. raw:: html

    <a href="../_static/notebooks/%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB.ipynb" download="文本分类.ipynb">点击下载 IPython Notebook 文件 </a>
    <a href="../_static/notebooks/%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB.ipynb" download="文本分类.ipynb">点击下载 IPython Notebook 文件 </a><hr>
--- a/docs/source/user/example.rst
+++ b/docs/source/user/example.rst
@@ -154,3 +154,9 @@ csv 表格

 :meth:`fastNLP.DataSet.apply`

 下面这个代码是不可行的，必须要用 r""" 才行:

 .. code::

    :param float beta: f_beta分数， :math:`f_{beta} = \frac{(1 + {beta}^{2})*(pre*rec)}{({beta}^{2}*pre + rec)}` . 常用为 `beta=0.5, 1, 2` 若为0.5则精确率的权重高于召回率；若为1，则两者平等；若为2，则召回率权重高于精确率。

--- a/fastNLP/__init__.py
+++ b/fastNLP/__init__.py
@@ -44,7 +44,7 @@ __all__ = [
    "AutoPadder",
    "EngChar2DPadder",

    "CollectFn",
    # "CollectFn",
    "ConcatCollectFn",

    "MetricBase",
--- a/fastNLP/core/batch.py
+++ b/fastNLP/core/batch.py
@@ -260,51 +260,51 @@ class DataSetIter(BatchIter):
 class TorchLoaderIter(BatchIter):
    """
    与DataSetIter类似，但可以用于非fastNLP的数据容器对象，然后将其传入到Trainer中。
        只需要保证数据容器实现了实现了以下的方法
    只需要保证数据容器实现了以下的方法

        Example::

            import random
            from fastNLP import TorchLoaderIter
            import torch
            class UdfDataSet:
                def __init__(self, num_samples):
                    self.num_samples = num_samples

                def __getitem__(self, idx):  # 必须实现的方法，输入参数是一个int，范围为[0, len(self))
                    x = [random.random() for _ in range(3)]
                    y = random.random()
                    return x,y

                def __len__(self):  # 需要实现该方法返回值需要是一个int数据
                    return self.num_samples

            # 需要实现collact_fn将数据转换为tensor
            def collact_fn(data_list):
                # [(x1,y1), (x2,y2), ...], 这里的输入实际上是将UdfDataSet的__getitem__输入结合为list
                xs, ys = [], []
                for l in data_list:
                    x, y = l
                    xs.append(x)
                    ys.append(y)
                # 不需要转移到gpu，Trainer或Tester会将其转移到model所在的device
                x,y = torch.FloatTensor(xs), torch.FloatTensor(ys)
                return {'x':x, 'y':y}, {'y':y}
    Example::

            udf_dataset = UdfDataSet(10)
            dataset = TorchLoaderIter(udf_dataset, collate_fn=collact_fn)
            class Model(nn.Module):
                def __init__(self):
                    super().__init__()
                    self.fc = nn.Linear(3, 1)
                def forward(self, x, y):
                    return {'loss':torch.pow(self.fc(x).squeeze(-1)-y, 2).sum()}
                def predict(self, x):
                    return {'pred':self.fc(x).squeeze(0)}
            model = Model()
            trainer = Trainer(train_data=dataset, model=model, loss=None, print_every=2, dev_data=dataset,
                              metrics=AccuracyMetric(target='y'), use_tqdm=False)
            trainer.train(load_best_model=False)
        import random
        from fastNLP import TorchLoaderIter
        import torch
        class UdfDataSet:
            def __init__(self, num_samples):
                self.num_samples = num_samples

            def __getitem__(self, idx):  # 必须实现的方法，输入参数是一个int，范围为[0, len(self))
                x = [random.random() for _ in range(3)]
                y = random.random()
                return x,y

            def __len__(self):  # 需要实现该方法返回值需要是一个int数据
                return self.num_samples

        # 需要实现collact_fn将数据转换为tensor
        def collact_fn(data_list):
            # [(x1,y1), (x2,y2), ...], 这里的输入实际上是将UdfDataSet的__getitem__输出结合为list
            xs, ys = [], []
            for l in data_list:
                x, y = l
                xs.append(x)
                ys.append(y)
            # 不需要转移到gpu，Trainer或Tester会将其转移到model所在的device
            x,y = torch.FloatTensor(xs), torch.FloatTensor(ys)
            return {'x':x, 'y':y}, {'y':y}

        udf_dataset = UdfDataSet(10)
        dataset = TorchLoaderIter(udf_dataset, collate_fn=collact_fn)
        class Model(nn.Module):
            def __init__(self):
                super().__init__()
                self.fc = nn.Linear(3, 1)
            def forward(self, x, y):
                return {'loss':torch.pow(self.fc(x).squeeze(-1)-y, 2).sum()}
            def predict(self, x):
                return {'pred':self.fc(x).squeeze(0)}
        model = Model()
        trainer = Trainer(train_data=dataset, model=model, loss=None, print_every=2, dev_data=dataset,
                          metrics=AccuracyMetric(target='y'), use_tqdm=False)
        trainer.train(load_best_model=False)

    除此之外，还可以通过该方法实现OnTheFly的训练，如下面的代码所示

@@ -321,7 +321,7 @@ class TorchLoaderIter(BatchIter):
                data.append(x + [y])
            with open(tmp_file_path, 'w') as f:
                for d in data:
                    f.write(' '.join(map(str, d)) + '\n')
                    f.write(' '.join(map(str, d)) + '\\n')

            class FileDataSet:
                def __init__(self, tmp_file):
@@ -382,6 +382,7 @@ class TorchLoaderIter(BatchIter):
            import os
            if os.path.exists(tmp_file_path):
                os.remove(tmp_file_path)
    
    """
    def __init__(self, dataset, batch_size=1, sampler=None,
                 num_workers=0, pin_memory=False, drop_last=False,
@@ -391,7 +392,6 @@ class TorchLoaderIter(BatchIter):
        :param dataset: :class:`~fastNLP.DataSet` 对象, 数据集
        :param int batch_size: 取出的batch大小
        :param sampler: 规定使用的 :class:`~fastNLP.Sampler` 方式. 若为 ``None`` , 使用 :class:`~fastNLP.SequentialSampler`.

            Default: ``None``
        :param int num_workers: 使用多少个进程来预处理数据
        :param bool pin_memory: 是否将产生的tensor使用pin memory, 可能会加快速度。
--- a/fastNLP/core/callback.py
+++ b/fastNLP/core/callback.py
@@ -806,7 +806,7 @@ class TensorboardCallback(Callback):
    
    .. warning::
        fastNLP 已停止对此功能的维护，请等待 fastNLP 兼容 PyTorch1.1 的下一个版本。
        或者使用和 fastNLP 高度配合的 fitlog（参见 :doc:`/tutorials/tutorial_11_fitlog` ）。
        或者使用和 fastNLP 高度配合的 fitlog（参见 :doc:`/tutorials/extend_2_fitlog` ）。
        
    """
    
--- a/fastNLP/core/dataset.py
+++ b/fastNLP/core/dataset.py
@@ -282,7 +282,7 @@
        dataset.set_pad_val('chars', -1)

 3.3 根据DataSet中多个field合成新的field
 --------------------------------------
 ------------------------------------------------------------

    DataSet支持在进行batch时，默认只能看到当前的field的值，但在某些训练中可能存在以下的情况: (1)需要两个field拼接成为一个field;
    (2)需要在batch中进行负采样。这时候就需要能够同时利用多个field进行batch的操作，DataSet中的add_collect_fn()函数支持添加
--- a/fastNLP/core/metrics.py
+++ b/fastNLP/core/metrics.py
@@ -282,21 +282,31 @@ class MetricBase(object):
 class ConfusionMatrixMetric(MetricBase):
    r"""
    分类问题计算混淆矩阵的Metric（其它的Metric参见 :mod:`fastNLP.core.metrics` ）
    最后返回结果为dict,{'confusion_matrix': ConfusionMatrix实例}
    最后返回结果为::
        
        dict,{'confusion_matrix': ConfusionMatrix实例}
        
    ConfusionMatrix实例的print()函数将输出矩阵字符串。
    pred_dict = {"pred": torch.Tensor([2,1,3])}
    target_dict = {'target': torch.Tensor([2,2,1])}
    metric = ConfusionMatrixMetric()
    metric(pred_dict=pred_dict, target_dict=target_dict, )
    print(metric.get_metric())
    {'confusion_matrix': 
     target  1.0     2.0     3.0     all
       pred
        1.0    0       1       0       1
        2.0    0       1       0       1
        3.0    1       0       0       1
        all    1       2       0       3
 }
    
    .. code ::
    
        pred_dict = {"pred": torch.Tensor([2,1,3])}
        target_dict = {'target': torch.Tensor([2,2,1])}
        metric = ConfusionMatrixMetric()
        metric(pred_dict=pred_dict, target_dict=target_dict, )
        print(metric.get_metric())
        
    .. code ::
        
        {'confusion_matrix':
         target  1.0     2.0     3.0     all
           pred
            1.0    0       1       0       1
            2.0    0       1       0       1
            3.0    1       0       0       1
            all    1       2       0       3
        }
        
    """
    def __init__(self,
                 vocab=None,
@@ -322,12 +332,12 @@ class ConfusionMatrixMetric(MetricBase):
    def evaluate(self, pred, target, seq_len=None):
        """
        evaluate函数将针对一个批次的预测结果做评价指标的累计
        
        :param torch.Tensor pred: 预测的tensor, tensor的形状可以是torch.Size([B,]), torch.Size([B, n_classes]),
                torch.Size([B, max_len]), 或者torch.Size([B, max_len, n_classes])
            torch.Size([B, max_len]), 或者torch.Size([B, max_len, n_classes])
        :param torch.Tensor target: 真实值的tensor, tensor的形状可以是torch.Size([B,]),
                torch.Size([B,]), torch.Size([B, max_len]), 或者torch.Size([B, max_len])
            torch.Size([B,]), torch.Size([B, max_len]), 或者torch.Size([B, max_len])
        :param torch.Tensor seq_len: 序列长度标记, 标记的形状可以是None, 或者torch.Size([B]).
                
        """
        if not isinstance(pred, torch.Tensor):
            raise TypeError(
@@ -489,11 +499,12 @@ class ClassifyFPreRecMetric(MetricBase):
            'rec-label':xxx,
            ...
        }
    
    """

    def __init__(self, tag_vocab=None, pred=None, target=None, seq_len=None, ignore_labels=None,
                 only_gross=True, f_type='micro', beta=1):
        """
        r"""

        :param tag_vocab: 标签的 :class:`~fastNLP.Vocabulary` . 默认值为None。若为None则使用数字来作为标签内容，否则使用vocab来作为标签内容。
        :param str pred: 用该key在evaluate()时从传入dict中取出prediction数据。 为None，则使用 `pred` 取数据
@@ -504,6 +515,7 @@ class ClassifyFPreRecMetric(MetricBase):
        :param str f_type: `micro` 或 `macro` . `micro` :通过先计算总体的TP，FN和FP的数量，再计算f, precision, recall; `macro` : 分别计算每个类别的f, precision, recall，然后做平均（各类别f的权重相同）
        :param float beta: f_beta分数， :math:`f_{beta} = \frac{(1 + {beta}^{2})*(pre*rec)}{({beta}^{2}*pre + rec)}` . 常用为 `beta=0.5, 1, 2` 若为0.5则精确率的权重高于召回率；若为1，则两者平等；若为2，则召回率权重高于精确率。
        """
        
        if tag_vocab:
            if not isinstance(tag_vocab, Vocabulary):
                raise TypeError("tag_vocab can only be fastNLP.Vocabulary, not {}.".format(type(tag_vocab)))
--- a/fastNLP/core/trainer.py
+++ b/fastNLP/core/trainer.py
@@ -355,6 +355,7 @@ from .utils import _move_model_to_device
 from ._parallel_utils import _model_contains_inner_module
 from ._logger import logger


 class Trainer(object):
    """
    Trainer在fastNLP中用于组织单任务的训练过程，可以避免用户在不同训练任务中重复撰写
@@ -373,9 +374,8 @@ class Trainer(object):
                 dev_data=None, metrics=None, metric_key=None,
                 validate_every=-1, save_path=None, use_tqdm=True, device=None,
                 callbacks=None, check_code_level=0, **kwargs):
        """
        
        :param train_data: 训练集， :class:`~fastNLP.DataSet` 类型或 :class:`~fastNLP.BatchIter`的子类
        r"""
        :param train_data: 训练集， :class:`~fastNLP.DataSet` 类型或 :class:`~fastNLP.BatchIter` 的子类
        :param nn.modules model: 待训练的模型
        :param optimizer: `torch.optim.Optimizer` 优化器。如果为None，则Trainer使用默认的Adam(model.parameters(), lr=4e-3)这个优化器
        :param int batch_size: 训练和验证的时候的batch大小。
--- a/fastNLP/embeddings/elmo_embedding.py
+++ b/fastNLP/embeddings/elmo_embedding.py
@@ -26,6 +26,9 @@ class ElmoEmbedding(ContextualEmbedding):
    """
    使用ELMo的embedding。初始化之后，只需要传入words就可以得到对应的embedding。
    当前支持的使用名称初始化的模型:
    
    .. code::
    
        en: 即en-medium hidden_size 1024; output_size 12
        en-medium: hidden_size 2048; output_size 256
        en-origial: hidden_size 4096; output_size 512
--- a/fastNLP/embeddings/static_embedding.py
+++ b/fastNLP/embeddings/static_embedding.py
@@ -27,6 +27,9 @@ class StaticEmbedding(TokenEmbedding):
    StaticEmbedding组件. 给定预训练embedding的名称或路径，根据vocab从embedding中抽取相应的数据(只会将出现在vocab中的词抽取出来，
    如果没有找到，则会随机初始化一个值(但如果该word是被标记为no_create_entry的话，则不会单独创建一个值，而是会被指向unk的index))。
    当前支持自动下载的预训练vector有:
    
    .. code::
    
        en: 实际为en-glove-840b-300d(常用)
        en-glove-6b-50d: glove官方的50d向量
        en-glove-6b-100d: glove官方的100d向量
@@ -88,8 +91,7 @@ class StaticEmbedding(TokenEmbedding):
        :param dict kwargs:
                bool only_train_min_freq: 仅对train中的词语使用min_freq筛选;
                bool only_norm_found_vector: 是否仅对在预训练中找到的词语使用normalize;
                bool only_use_pretrain_word: 仅使用出现在pretrain词表中的词，如果该词没有在预训练的词表中出现则为unk。如果
                    embedding不需要更新建议设置为True。
                bool only_use_pretrain_word: 仅使用出现在pretrain词表中的词，如果该词没有在预训练的词表中出现则为unk。如果embedding不需要更新建议设置为True。
        """
        super(StaticEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
        if embedding_dim > 0:
--- a/fastNLP/io/loader/conll.py
+++ b/fastNLP/io/loader/conll.py
@@ -227,7 +227,7 @@ class OntoNotesNERLoader(ConllLoader):
        :header: "raw_words", "target"

        "['Hi', 'everyone', '.']", "['O', 'O', 'O']"
        "['first', 'up', 'on', 'the', 'docket'], "['O', 'O', 'O', 'O', 'O']"
        "['first', 'up', 'on', 'the', 'docket']", "['O', 'O', 'O', 'O', 'O']"
        "[...]", "[...]"

    """
--- a/fastNLP/io/loader/qa.py
+++ b/fastNLP/io/loader/qa.py
@@ -29,10 +29,10 @@ class CMRC2018Loader(Loader):
    验证集DataSet将具备以下的内容，每个问题的答案可能有三个(有时候只是3个重复的答案)

    .. csv-table::
       :header:"title", "context", "question", "answers", "answer_starts", "id"
       :header: "title", "context", "question", "answers", "answer_starts", "id"

       "战国无双3", "《战国无双3》（）是由光荣和ω-force开发...", "《战国无双3》是由哪两个公司合作开发的？", ["光荣和ω-force", "光荣和ω-force", "光荣和ω-force"], ["30", "30", "30"], "DEV_0_QUERY_0"
       "战国无双3", "《战国无双3》（）是由光荣和ω-force开发...", "男女主角亦有专属声优这一模式是由谁改编的？", ["村雨城", "村雨城", "任天堂游戏谜之村雨城"], ["226", "226", "219"], "DEV_0_QUERY_1"
       "战国无双3", "《战国无双3》（）是由光荣和ω-force开发...", "《战国无双3》是由哪两个公司合作开发的？", "['光荣和ω-force', '光荣和ω-force', '光荣和ω-force']", "[30, 30, 30]", "DEV_0_QUERY_0"
       "战国无双3", "《战国无双3》（）是由光荣和ω-force开发...", "男女主角亦有专属声优这一模式是由谁改编的？", "['村雨城', '村雨城', '任天堂游戏谜之村雨城']", "[226, 226, 219]", "DEV_0_QUERY_1"
       "...", "...", "...","...", ".", "..."

    其中answer_starts是从0开始的index。例如"我来自a复旦大学？"，其中"复"的开始index为4。另外"Russell评价说"中的说的index为9, 因为
--- a/fastNLP/io/pipe/qa.py
+++ b/fastNLP/io/pipe/qa.py
@@ -83,15 +83,19 @@ class CMRC2018BertPipe(Pipe):

    .. csv-table::
        :header: "context_len", "raw_chars",  "target_start", "target_end", "chars"
        492, ['范', '廷', '颂... ], 30, 34, [21, 25, ...]
        491, ['范', '廷', '颂... ], 41, 61, [21, 25, ...]
        
        492, "['范', '廷', '颂... ]", 30, 34, "[21, 25, ...]"
        491, "['范', '廷', '颂... ]", 41, 61, "[21, 25, ...]"

       ".", "...", "...","...", "..."
        ".", "...", "...","...", "..."

    raw_chars列是context与question拼起来的结果(连接的地方加入了[SEP])，chars是转为index的值, target_start为答案start的index，target_end为答案end的index
    （闭区间）；context_len指示的是chars列中context的长度。

    其中各列的meta信息如下:
    
    .. code::
    
        +-------------+-------------+-----------+--------------+------------+-------+---------+
        | field_names | context_len | raw_chars | target_start | target_end | chars | answers |
        +-------------+-------------+-----------+--------------+------------+-------+---------+
@@ -100,7 +104,7 @@ class CMRC2018BertPipe(Pipe):
        | ignore_type |    False    |    True   |    False     |   False    | False |  True   |
        |  pad_value  |      0      |     0     |      0       |     0      |   0   |   0     |
        +-------------+-------------+-----------+--------------+------------+-------+---------+

    
    """
    def __init__(self, max_len=510):
        super().__init__()