fastNLP/io/pipe 文档调整

3 years ago · 348107978e
--- a/docs/source/fastNLP.io.loader.conll.rst
+++ b/docs/source/fastNLP.io.loader.conll.rst
@@ -1,7 +0,0 @@
 fastNLP.io.loader.conll module
 ==============================

 .. automodule:: fastNLP.io.loader.conll
   :members:
   :undoc-members:
   :show-inheritance:
--- a/docs/source/fastNLP.io.loader.coreference.rst
+++ b/docs/source/fastNLP.io.loader.coreference.rst
@@ -1,7 +0,0 @@
 fastNLP.io.loader.coreference module
 ====================================

 .. automodule:: fastNLP.io.loader.coreference
   :members:
   :undoc-members:
   :show-inheritance:
--- a/docs/source/fastNLP.transformers.torch.rst
+++ b/docs/source/fastNLP.transformers.torch.rst
@@ -1,5 +1,5 @@
 fastNLP.transformers.torch package
 =============================
 ==================================

 .. automodule:: fastNLP.transformers.torch
   :members:
--- a/fastNLP/core/callbacks/callback_event.py
+++ b/fastNLP/core/callbacks/callback_event.py
@@ -35,14 +35,14 @@ class Event:

    :param value: Trainer 的 callback 时机；
    :param every: 每触发多少次才真正运行一次；
    :param once: 在第一次运行后时候再次执行；
    :param once: 是否仅运行一次；
    :param filter_fn: 输入参数的应该为 ``(filter, trainer)``，其中 ``filter`` 对象中包含了 `filter.num_called` 和
        `filter.num_executed` 两个变量分别获取当前被调用了多少次，真正执行了多少次；``trainer`` 对象即为当前正在运行的 Trainer；
    """
    every: Optional[int]
    once: Optional[int]
    once: Optional[bool]

    def __init__(self, value: str, every: Optional[int] = None, once: Optional[int] = None,
    def __init__(self, value: str, every: Optional[int] = None, once: Optional[bool] = None,
                 filter_fn: Optional[Callable] = None):
        self.every = every
        self.once = once
@@ -68,7 +68,6 @@ class Event:
        return Event(value='on_after_trainer_initialized', every=every, once=once, filter_fn=filter_fn)

    @staticmethod
    
    def on_sanity_check_begin(every=None, once=None, filter_fn=None):
        """
        当 Trainer 运行到 :func:`on_sanity_check_begin` 时触发；
@@ -85,7 +84,6 @@ class Event:
        return Event(value='on_sanity_check_begin', every=every, once=once, filter_fn=filter_fn)

    @staticmethod
    
    def on_sanity_check_end(every=None, once=None, filter_fn=None):
        """
        当 Trainer 运行到 :func:`on_sanity_check_end` 时触发；
@@ -101,7 +99,6 @@ class Event:
        return Event(value='on_sanity_check_end', every=every, once=once, filter_fn=filter_fn)

    @staticmethod
    
    def on_train_begin(every=None, once=None, filter_fn=None):
        """
        当 Trainer 运行到 :func:`on_train_begin` 时触发；
@@ -117,7 +114,6 @@ class Event:
        return Event(value='on_train_begin', every=every, once=once, filter_fn=filter_fn)

    @staticmethod
    
    def on_train_end(every=None, once=None, filter_fn=None):
        """
        当 Trainer 运行到 :func:`on_train_end` 时触发；
@@ -133,7 +129,6 @@ class Event:
        return Event(value='on_train_end', every=every, once=once, filter_fn=filter_fn)

    @staticmethod
    
    def on_train_epoch_begin(every=None, once=None, filter_fn=None):
        """
        当 Trainer 运行到 :func:`on_train_epoch_begin` 时触发；
@@ -149,7 +144,6 @@ class Event:
        return Event(value='on_train_epoch_begin', every=every, once=once, filter_fn=filter_fn)

    @staticmethod
    
    def on_train_epoch_end(every=None, once=None, filter_fn=None):
        """
        当 Trainer 运行到 :func:`on_train_epoch_end` 时触发；
@@ -165,7 +159,6 @@ class Event:
        return Event(value='on_train_epoch_end', every=every, once=once, filter_fn=filter_fn)

    @staticmethod
    
    def on_fetch_data_begin(every=None, once=None, filter_fn=None):
        """
        当 Trainer 运行到 :func:`on_fetch_data_begin` 时触发；
@@ -181,7 +174,6 @@ class Event:
        return Event(value='on_fetch_data_begin', every=every, once=once, filter_fn=filter_fn)

    @staticmethod
    
    def on_fetch_data_end(every=None, once=None, filter_fn=None):
        """
        当 Trainer 运行到 :func:`on_fetch_data_end` 时触发；
@@ -197,7 +189,6 @@ class Event:
        return Event(value='on_fetch_data_end', every=every, once=once, filter_fn=filter_fn)

    @staticmethod
    
    def on_train_batch_begin(every=None, once=None, filter_fn=None):
        """
        当 Trainer 运行到 :func:`on_train_batch_begin` 时触发；
@@ -213,7 +204,6 @@ class Event:
        return Event(value='on_train_batch_begin', every=every, once=once, filter_fn=filter_fn)

    @staticmethod
    
    def on_train_batch_end(every=None, once=None, filter_fn=None):
        """
        当 Trainer 运行到 :func:`on_train_batch_end` 时触发；
@@ -229,7 +219,6 @@ class Event:
        return Event(value='on_train_batch_end', every=every, once=once, filter_fn=filter_fn)

    @staticmethod
    
    def on_exception(every=None, once=None, filter_fn=None):
        """
        当 Trainer 运行到 :func:`on_exception` 时触发；
@@ -245,7 +234,6 @@ class Event:
        return Event(value='on_exception', every=every, once=once, filter_fn=filter_fn)

    @staticmethod
    
    def on_save_model(every=None, once=None, filter_fn=None):
        """
        当 Trainer 运行到 :func:`on_save_model` 时触发；
@@ -261,7 +249,6 @@ class Event:
        return Event(value='on_save_model', every=every, once=once, filter_fn=filter_fn)

    @staticmethod
    
    def on_load_model(every=None, once=None, filter_fn=None):
        """
        当 Trainer 运行到 :func:`on_load_model` 时触发；
@@ -277,7 +264,6 @@ class Event:
        return Event(value='on_load_model', every=every, once=once, filter_fn=filter_fn)

    @staticmethod
    
    def on_save_checkpoint(every=None, once=None, filter_fn=None):
        """
        当 Trainer 运行到 :func:`on_save_checkpoint` 时触发；
@@ -293,7 +279,6 @@ class Event:
        return Event(value='on_save_checkpoint', every=every, once=once, filter_fn=filter_fn)

    @staticmethod
    
    def on_load_checkpoint(every=None, once=None, filter_fn=None):
        """
        当 Trainer 运行到 :func:`on_load_checkpoint` 时触发；
@@ -309,7 +294,6 @@ class Event:
        return Event(value='on_load_checkpoint', every=every, once=once, filter_fn=filter_fn)

    @staticmethod
    
    def on_load_checkpoint(every=None, once=None, filter_fn=None):
        """
        当 Trainer 运行到 :func:`on_load_checkpoint` 时触发；
@@ -325,7 +309,6 @@ class Event:
        return Event(value='on_load_checkpoint', every=every, once=once, filter_fn=filter_fn)

    @staticmethod
    
    def on_before_backward(every=None, once=None, filter_fn=None):
        """
        当 Trainer 运行到 :func:`on_before_backward` 时触发；
@@ -341,7 +324,6 @@ class Event:
        return Event(value='on_before_backward', every=every, once=once, filter_fn=filter_fn)

    @staticmethod
    
    def on_after_backward(every=None, once=None, filter_fn=None):
        """
        当 Trainer 运行到 :func:`on_after_backward` 时触发；
@@ -357,7 +339,6 @@ class Event:
        return Event(value='on_after_backward', every=every, once=once, filter_fn=filter_fn)

    @staticmethod
    
    def on_before_optimizers_step(every=None, once=None, filter_fn=None):
        """
        当 Trainer 运行到 :func:`on_before_optimizers_step` 时触发；
@@ -373,7 +354,6 @@ class Event:
        return Event(value='on_before_optimizers_step', every=every, once=once, filter_fn=filter_fn)

    @staticmethod
    
    def on_after_optimizers_step(every=None, once=None, filter_fn=None):
        """
        当 Trainer 运行到 :func:`on_after_optimizers_step` 时触发；
@@ -389,7 +369,6 @@ class Event:
        return Event(value='on_after_optimizers_step', every=every, once=once, filter_fn=filter_fn)

    @staticmethod
    
    def on_before_zero_grad(every=None, once=None, filter_fn=None):
        """
        当 Trainer 运行到 :func:`on_before_zero_grad` 时触发；
@@ -405,7 +384,6 @@ class Event:
        return Event(value='on_before_zero_grad', every=every, once=once, filter_fn=filter_fn)

    @staticmethod
    
    def on_after_zero_grad(every=None, once=None, filter_fn=None):
        """
        当 Trainer 运行到 :func:`on_after_zero_grad` 时触发；
@@ -421,7 +399,6 @@ class Event:
        return Event(value='on_after_zero_grad', every=every, once=once, filter_fn=filter_fn)

    @staticmethod
    
    def on_evaluate_begin(every=None, once=None, filter_fn=None):
        """
        当 Trainer 运行到 :func:`on_evaluate_begin` 时触发；
@@ -437,7 +414,6 @@ class Event:
        return Event(value='on_evaluate_begin', every=every, once=once, filter_fn=filter_fn)

    @staticmethod
    
    def on_evaluate_end(every=None, once=None, filter_fn=None):
        """
        当 Trainer 运行到 :func:`on_evaluate_end` 时触发；
--- a/fastNLP/io/init.py
+++ b/fastNLP/io/init.py
@@ -105,7 +105,7 @@ __all__ = [
    "BQCorpusPipe",
    "RenamePipe",
    "GranularizePipe",
    "MachingTruncatePipe",
    "TruncateBertPipe",

    "CMRC2018BertPipe",

--- a/fastNLP/io/pipe/init.py
+++ b/fastNLP/io/pipe/init.py
@@ -52,7 +52,7 @@ __all__ = [
    "BQCorpusPipe",
    "RenamePipe",
    "GranularizePipe",
    "MachingTruncatePipe",
    "TruncateBertPipe",


    "CMRC2018BertPipe",
@@ -74,7 +74,7 @@ from .conll import Conll2003Pipe, iob2, iob2bioes
 from .cws import CWSPipe
 from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, \
    MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, CNXNLIBertPipe, CNXNLIPipe, BQCorpusBertPipe, \
    LCQMCPipe, BQCorpusPipe, LCQMCBertPipe, RenamePipe, GranularizePipe, MachingTruncatePipe
    LCQMCPipe, BQCorpusPipe, LCQMCBertPipe, RenamePipe, GranularizePipe, TruncateBertPipe
 from .pipe import Pipe
 from .qa import CMRC2018BertPipe

--- a/fastNLP/io/pipe/classification.py
+++ b/fastNLP/io/pipe/classification.py
@@ -94,7 +94,7 @@ class CLSBasePipe(Pipe):

    def process_from_file(self, paths) -> DataBundle:
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -154,7 +154,7 @@ class YelpFullPipe(CLSBasePipe):

    def process_from_file(self, paths=None) -> DataBundle:
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -184,7 +184,7 @@ class YelpPolarityPipe(CLSBasePipe):

    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -214,7 +214,7 @@ class AGsNewsPipe(CLSBasePipe):

    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -244,7 +244,7 @@ class DBPediaPipe(CLSBasePipe):

    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -328,7 +328,7 @@ class SSTPipe(CLSBasePipe):

    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -358,7 +358,7 @@ class SST2Pipe(CLSBasePipe):

    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -421,7 +421,7 @@ class IMDBPipe(CLSBasePipe):

    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -518,7 +518,7 @@ class ChnSentiCorpPipe(Pipe):

    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -622,7 +622,7 @@ class THUCNewsPipe(CLSBasePipe):

    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -713,7 +713,7 @@ class WeiboSenti100kPipe(CLSBasePipe):
    
    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -737,7 +737,7 @@ class MRPipe(CLSBasePipe):

    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -763,7 +763,7 @@ class R8Pipe(CLSBasePipe):

    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -789,7 +789,7 @@ class R52Pipe(CLSBasePipe):

    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -815,7 +815,7 @@ class OhsumedPipe(CLSBasePipe):

    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -841,7 +841,7 @@ class NG20Pipe(CLSBasePipe):

    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
--- a/fastNLP/io/pipe/conll.py
+++ b/fastNLP/io/pipe/conll.py
@@ -104,7 +104,7 @@ class Conll2003NERPipe(_NERPipe):
    
    def process_from_file(self, paths) -> DataBundle:
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -193,7 +193,7 @@ class Conll2003Pipe(Pipe):
    
    def process_from_file(self, paths):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -221,7 +221,7 @@ class OntoNotesNERPipe(_NERPipe):
    
    def process_from_file(self, paths):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -343,7 +343,7 @@ class MsraNERPipe(_CNNERPipe):
    
    def process_from_file(self, paths=None) -> DataBundle:
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -379,7 +379,7 @@ class PeopleDailyPipe(_CNNERPipe):
    
    def process_from_file(self, paths=None) -> DataBundle:
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -415,7 +415,7 @@ class WeiboNERPipe(_CNNERPipe):
    
    def process_from_file(self, paths=None) -> DataBundle:
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
--- a/fastNLP/io/pipe/construct_graph.py
+++ b/fastNLP/io/pipe/construct_graph.py
@@ -164,7 +164,7 @@ class GraphBuilderBase:

    def build_graph_from_file(self, path: str):
        r"""
            传入文件路径，生成处理好的scipy_sparse_matrix对象。paths支持的路径形式可以参考 ：:meth:`fastNLP.io.Loader.load()`
            传入文件路径，生成处理好的scipy_sparse_matrix对象。paths支持的路径形式可以参考 ：:meth:`fastNLP.io.Loader.load`

            :param path:
            :return: scipy_sparse_matrix
@@ -185,7 +185,7 @@ class MRPmiGraphPipe(GraphBuilderBase):

    def build_graph(self, data_bundle: DataBundle):
        r"""
        :param data_bundle: 需要处理的 :class:`fastNLP.io.DataBundle` 对象。
        :param data_bundle: 需要处理的 :class:`~fastNLP.io.DataBundle` 对象。
        :return: 返回 ``csr`` 类型的稀疏矩阵图；包含训练集，验证集，测试集，在图中的 index 。
        """
        self._get_doc_edge(data_bundle)
@@ -219,7 +219,7 @@ class R8PmiGraphPipe(GraphBuilderBase):

    def build_graph(self, data_bundle: DataBundle):
        r"""
        :param data_bundle: 需要处理的 :class:`fastNLP.io.DataBundle` 对象。
        :param data_bundle: 需要处理的 :class:`~fastNLP.io.DataBundle` 对象。
        :return: 返回 ``csr`` 类型的稀疏矩阵图；包含训练集，验证集，测试集，在图中的 index 。
        """
        self._get_doc_edge(data_bundle)
@@ -253,7 +253,7 @@ class R52PmiGraphPipe(GraphBuilderBase):

    def build_graph(self, data_bundle: DataBundle):
        r"""
        :param data_bundle: 需要处理的 :class:`fastNLP.io.DataBundle` 对象。
        :param data_bundle: 需要处理的 :class:`~fastNLP.io.DataBundle` 对象。
        :return: 返回 ``csr`` 类型的稀疏矩阵图；包含训练集，验证集，测试集，在图中的 index 。
        """
        self._get_doc_edge(data_bundle)
@@ -287,7 +287,7 @@ class OhsumedPmiGraphPipe(GraphBuilderBase):

    def build_graph(self, data_bundle: DataBundle):
        r"""
        :param data_bundle: 需要处理的 :class:`fastNLP.io.DataBundle` 对象。
        :param data_bundle: 需要处理的 :class:`~fastNLP.io.DataBundle` 对象。
        :return: 返回 ``csr`` 类型的稀疏矩阵图；包含训练集，验证集，测试集，在图中的 index 。
        """
        self._get_doc_edge(data_bundle)
@@ -321,7 +321,7 @@ class NG20PmiGraphPipe(GraphBuilderBase):

    def build_graph(self, data_bundle: DataBundle):
        r"""
        :param data_bundle: 需要处理的 :class:`fastNLP.io.DataBundle` 对象。
        :param data_bundle: 需要处理的 :class:`~fastNLP.io.DataBundle` 对象。
        :return: 返回 ``csr`` 类型的稀疏矩阵图；包含训练集，验证集，测试集，在图中的 index 。
        """
        self._get_doc_edge(data_bundle)
--- a/fastNLP/io/pipe/cws.py
+++ b/fastNLP/io/pipe/cws.py
@@ -1,5 +1,3 @@
 r"""undocumented"""

 __all__ = [
    "CWSPipe"
 ]
@@ -135,7 +133,7 @@ def _find_and_replace_digit_spans(line):

 class CWSPipe(Pipe):
    r"""
    对CWS数据进行预处理, 处理之后的数据，具备以下的结构
    对 **CWS** 数据进行处理，处理之后 :class:`~fastNLP.core.DataSet` 中的内容如下：

    .. csv-table::
       :header: "raw_words", "chars", "target", "seq_len"
@@ -144,30 +142,21 @@ class CWSPipe(Pipe):
       "2001年  新年  钟声...", "[8, 9, 9, 7, ...]", "[0, 1, 1, 1, 2...]", 20
       "...", "[...]","[...]", .

    dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

        +-------------+-----------+-------+--------+---------+
        | field_names | raw_words | chars | target | seq_len |
        +-------------+-----------+-------+--------+---------+
        |   is_input  |   False   |  True |  True  |   True  |
        |  is_target  |   False   | False |  True  |   True  |
        | ignore_type |           | False | False  |  False  |
        |  pad_value  |           |   0   |   0    |    0    |
        +-------------+-----------+-------+--------+---------+

    :param dataset_name: data 的名称，支持 ``['pku', 'msra', 'cityu'(繁体), 'as'(繁体), None]``
    :param encoding_type: ``target`` 列使用什么类型的 encoding 方式，支持 ``['bmes', 'segapp']`` 两种。``"我 来自 复旦大学..."`` 这句话 ``bmes`` 的
        tag为 ``[S, B, E, B, M, M, E...]`` ； ``segapp`` 的 tag 为 ``[seg, app, seg, app, app, app, seg, ...]`` 。
    :param replace_num_alpha: 是否将数字和字母用特殊字符替换。
    :param bigrams: 是否增加一列 ``bigrams`` 。 ``bigrams`` 会对原文进行如下转化： ``['复', '旦', '大', '学', ...]->["复旦", "旦大", ...]`` 。如果
        设置为 ``True`` ，返回的 :class:`~fastNLP.core.DataSet` 将有一列名为 ``bigrams`` ，且已经转换为了 index 并设置为 input，对应的词表可以通过
        ``data_bundle.get_vocab('bigrams')`` 获取。
    :param trigrams: 是否增加一列 ``trigrams`` 。 ``trigrams`` 会对原文进行如下转化 ``['复', '旦', '大', '学', ...]->["复旦大", "旦大学", ...]`` 。
        如果设置为 ``True`` ，返回的 :class:`~fastNLP.core.DataSet` 将有一列名为 ``trigrams`` ，且已经转换为了 index 并设置为 input，对应的词表可以通过
        ``data_bundle.get_vocab('trigrams')`` 获取。
    :param num_proc: 处理数据时使用的进程数目。
    """
    
    def __init__(self, dataset_name=None, encoding_type='bmes', replace_num_alpha=True,
                 bigrams=False, trigrams=False, num_proc: int = 0):
        r"""
        
        :param str,None dataset_name: 支持'pku', 'msra', 'cityu', 'as', None
        :param str encoding_type: 可以选择'bmes', 'segapp'两种。"我 来自 复旦大学...", bmes的tag为[S, B, E, B, M, M, E...]; segapp
            的tag为[seg, app, seg, app, app, app, seg, ...]
        :param bool replace_num_alpha: 是否将数字和字母用特殊字符替换。
        :param bool bigrams: 是否增加一列bigram. bigram的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...]
        :param bool trigrams: 是否增加一列trigram. trigram的构成是 ['复', '旦', '大', '学', ...]->["复旦大", "旦大学", ...]
        """
    def __init__(self, dataset_name: str=None, encoding_type: str='bmes', replace_num_alpha: bool=True,
                 bigrams: bool=False, trigrams: bool=False, num_proc: int = 0):
        if encoding_type == 'bmes':
            self.word_lens_to_tags = _word_lens_to_bmes
        else:
@@ -220,7 +209,7 @@ class CWSPipe(Pipe):
    
    def process(self, data_bundle: DataBundle) -> DataBundle:
        r"""
        可以处理的DataSet需要包含raw_words列
        ``data_bundle`` 中的 :class:`~fastNLP.core.DataSet` 应该包含 ``raw_words`` ：

        .. csv-table::
           :header: "raw_words"
@@ -276,7 +265,7 @@ class CWSPipe(Pipe):
    
    def process_from_file(self, paths=None) -> DataBundle:
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
--- a/fastNLP/io/pipe/matching.py
+++ b/fastNLP/io/pipe/matching.py
@@ -1,5 +1,3 @@
 r"""undocumented"""

 __all__ = [
    "MatchingBertPipe",
    "RTEBertPipe",
@@ -21,7 +19,7 @@ __all__ = [
    "BQCorpusPipe",
    "RenamePipe",
    "GranularizePipe",
    "MachingTruncatePipe",
    "TruncateBertPipe",
 ]
 from functools import partial

@@ -31,14 +29,13 @@ from .utils import get_tokenizer
 from ..data_bundle import DataBundle
 from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader, BQCorpusLoader, CNXNLILoader, \
    LCQMCLoader
 # from ...core._logger import log
 # from ...core.const import Const
 from ...core.vocabulary import Vocabulary


 class MatchingBertPipe(Pipe):
    r"""
    Matching任务的Bert pipe，输出的DataSet将包含以下的field
    **Matching** 任务的 Bert pipe ，处理之后 :class:`~fastNLP.core.DataSet` 中的内容如下：

    .. csv-table::
       :header: "raw_words1", "raw_words2", "target", "words", "seq_len"
@@ -47,29 +44,17 @@ class MatchingBertPipe(Pipe):
       "This site includes a...", "The Government Executive...", 0, "[11, 12, 13,...]", 5
       "...", "...", ., "[...]", .

    words列是将raw_words1(即premise), raw_words2(即hypothesis)使用"[SEP]"链接起来转换为index的。
    words列被设置为input，target列被设置为target和input(设置为input以方便在forward函数中计算loss，
    如果不在forward函数中计算loss也不影响，fastNLP将根据forward函数的形参名进行传参).

    dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

        +-------------+------------+------------+--------+-------+---------+
        | field_names | raw_words1 | raw_words2 | target | words | seq_len |
        +-------------+------------+------------+--------+-------+---------+
        |   is_input  |   False    |   False    | False  |  True |   True  |
        |  is_target  |   False    |   False    |  True  | False |  False  |
        | ignore_type |            |            | False  | False |  False  |
        |  pad_value  |            |            |   0    |   0   |    0    |
        +-------------+------------+------------+--------+-------+---------+
    ``words`` 列是将 ``raw_words1`` （即 ``premise`` ）， ``raw_words2`` （即 ``hypothesis`` ）使用 ``[SEP]``
    链接起来转换为 index 的。``words`` 列被设置为 input， ``target`` 列被设置为 target 和 input （设置为 input 以
    方便在 :func:`forward` 函数中计算 loss，如果不在 :func:`forward` 函数中计算 loss 也不影响， **fastNLP** 将根据 :func:`forward` 函数的形参名进行
    传参）。

    :param lower: 是否对输入进行小写化。
    :param tokenizer: 使用哪种 tokenize 方式将数据切成单词。支持 ``['spacy', 'raw']`` 。``'raw'`` 表示使用空格作为切分，``'spacy'`` 则使用 :mod:`spacy` 库进行分词。
    :param num_proc: 处理数据时使用的进程数目。
    """
    
    def __init__(self, lower=False, tokenizer: str = 'raw', num_proc: int = 0):
        r"""
        
        :param bool lower: 是否将word小写化。
        :param str tokenizer: 使用什么tokenizer来将句子切分为words. 支持spacy, raw两种。raw即使用空格拆分。
        """
        super().__init__()
        
        self.lower = bool(lower)
@@ -89,9 +74,9 @@ class MatchingBertPipe(Pipe):
                dataset.apply_field(self.tokenizer, field_name=field_name, new_field_name=new_field_name, num_proc=self.num_proc)
        return data_bundle
    
    def process(self, data_bundle):
    def process(self, data_bundle: DataBundle):
        r"""
        输入的data_bundle中的dataset需要具有以下结构：
        ``data_bundle`` 中的 :class:`~fastNLP.core.DataSet` 应该具备以下结构：

        .. csv-table::
            :header: "raw_words1", "raw_words2", "target"
@@ -100,7 +85,7 @@ class MatchingBertPipe(Pipe):
            "...","..."

        :param data_bundle:
        :return:
        :return: 处理后的 ``data_bundle``
        """
        for dataset in data_bundle.datasets.values():
            if dataset.has_field('target'):
@@ -164,9 +149,16 @@ class MatchingBertPipe(Pipe):


 class RTEBertPipe(MatchingBertPipe):
    """
    处理 **RTE** 数据。

    :param lower: 是否对输入进行小写化。
    :param tokenizer: 使用哪种 tokenize 方式将数据切成单词。支持 ``['spacy', 'raw']`` 。``'raw'`` 表示使用空格作为切分，``'spacy'`` 则使用 :mod:`spacy` 库进行分词。
    :param num_proc: 处理数据时使用的进程数目。
    """
    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -176,9 +168,16 @@ class RTEBertPipe(MatchingBertPipe):


 class SNLIBertPipe(MatchingBertPipe):
    """
    处理 **SNLI** 数据。

    :param lower: 是否对输入进行小写化。
    :param tokenizer: 使用哪种 tokenize 方式将数据切成单词。支持 ``['spacy', 'raw']`` 。``'raw'`` 表示使用空格作为切分，``'spacy'`` 则使用 :mod:`spacy` 库进行分词。
    :param num_proc: 处理数据时使用的进程数目。
    """
    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -188,9 +187,16 @@ class SNLIBertPipe(MatchingBertPipe):


 class QuoraBertPipe(MatchingBertPipe):
    """
    处理 **Quora** 数据。

    :param lower: 是否对输入进行小写化。
    :param tokenizer: 使用哪种 tokenize 方式将数据切成单词。支持 ``['spacy', 'raw']`` 。``'raw'`` 表示使用空格作为切分，``'spacy'`` 则使用 :mod:`spacy` 库进行分词。
    :param num_proc: 处理数据时使用的进程数目。
    """
    def process_from_file(self, paths):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -200,9 +206,16 @@ class QuoraBertPipe(MatchingBertPipe):


 class QNLIBertPipe(MatchingBertPipe):
    """
    处理 **QNLI** 数据。

    :param lower: 是否对输入进行小写化。
    :param tokenizer: 使用哪种 tokenize 方式将数据切成单词。支持 ``['spacy', 'raw']`` 。``'raw'`` 表示使用空格作为切分，``'spacy'`` 则使用 :mod:`spacy` 库进行分词。
    :param num_proc: 处理数据时使用的进程数目。
    """
    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -212,9 +225,16 @@ class QNLIBertPipe(MatchingBertPipe):


 class MNLIBertPipe(MatchingBertPipe):
    """
    处理 **MNLI** 数据。

    :param lower: 是否对输入进行小写化。
    :param tokenizer: 使用哪种 tokenize 方式将数据切成单词。支持 ``['spacy', 'raw']`` 。``'raw'`` 表示使用空格作为切分，``'spacy'`` 则使用 :mod:`spacy` 库进行分词。
    :param num_proc: 处理数据时使用的进程数目。
    """
    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -225,7 +245,7 @@ class MNLIBertPipe(MatchingBertPipe):

 class MatchingPipe(Pipe):
    r"""
    Matching任务的Pipe。输出的DataSet将包含以下的field
    **Matching** 任务的 Pipe，处理之后 :class:`~fastNLP.core.DataSet` 中的内容如下：

    .. csv-table::
       :header: "raw_words1", "raw_words2", "target", "words1", "words2", "seq_len1", "seq_len2"
@@ -234,21 +254,14 @@ class MatchingPipe(Pipe):
       "This site includes a...", "The Government Executive...", 0, "[11, 12, 13,...]", "[2, 7, ...]", 6, 7
       "...", "...", ., "[...]", "[...]", ., .

    words1是premise，words2是hypothesis。其中words1,words2,seq_len1,seq_len2被设置为input；target被设置为target
    和input(设置为input以方便在forward函数中计算loss，如果不在forward函数中计算loss也不影响，fastNLP将根据forward函数
    的形参名进行传参)。

    dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

        +-------------+------------+------------+--------+--------+--------+----------+----------+
        | field_names | raw_words1 | raw_words2 | target | words1 | words2 | seq_len1 | seq_len2 |
        +-------------+------------+------------+--------+--------+--------+----------+----------+
        |   is_input  |   False    |   False    | False  |  True  |  True  |   True   |   True   |
        |  is_target  |   False    |   False    |  True  | False  | False  |  False   |  False   |
        | ignore_type |            |            | False  | False  | False  |  False   |  False   |
        |  pad_value  |            |            |   0    |   0    |   0    |    0     |    0     |
        +-------------+------------+------------+--------+--------+--------+----------+----------+
    ``words1`` 是 ``premise`` ，``words2`` 是 ``hypothesis`` 。其中 ``words1`` , ``words2`` , ``seq_len1``, ``seq_len2``
    被设置为 input； ``target`` 列被设置为 target 和 input （设置为 input 以
    方便在 :func:`forward` 函数中计算 loss，如果不在 :func:`forward` 函数中计算 loss 也不影响， **fastNLP** 将根据 :func:`forward` 函数的形参名进行
    传参）。

    :param lower: 是否对输入进行小写化。
    :param tokenizer: 使用哪种 tokenize 方式将数据切成单词。支持 ``['spacy', 'raw']`` 。``'raw'`` 表示使用空格作为切分，``'spacy'`` 则使用 :mod:`spacy` 库进行分词。
    :param num_proc: 处理数据时使用的进程数目。
    """
    
    def __init__(self, lower=False, tokenizer: str = 'raw', num_proc: int = 0):
@@ -276,9 +289,9 @@ class MatchingPipe(Pipe):
                dataset.apply_field(self.tokenizer, field_name=field_name, new_field_name=new_field_name, num_proc=self.num_proc)
        return data_bundle
    
    def process(self, data_bundle):
    def process(self, data_bundle: DataBundle):
        r"""
        接受的DataBundle中的DataSet应该具有以下的field, target列可以没有
        ``data_bundle`` 中的 :class:`~fastNLP.core.DataSet` 应该具备以下结构，可以没有 ``target`` 列：

        .. csv-table::
           :header: "raw_words1", "raw_words2", "target"
@@ -287,8 +300,8 @@ class MatchingPipe(Pipe):
           "This site includes a...", "The Government Executive...", "not_entailment"
           "...", "..."

        :param ~fastNLP.DataBundle data_bundle: 通过loader读取得到的data_bundle，里面包含了数据集的原始数据内容
        :return: data_bundle
        :param data_bundle:
        :return: 处理后的 ``data_bundle``
        """
        data_bundle = self._tokenize(data_bundle, ['raw_words1', 'raw_words2'],
                                     ['words1', 'words2'])
@@ -337,9 +350,16 @@ class MatchingPipe(Pipe):


 class RTEPipe(MatchingPipe):
    """
    处理 **RTE** 数据。

    :param lower: 是否对输入进行小写化。
    :param tokenizer: 使用哪种 tokenize 方式将数据切成单词。支持 ``['spacy', 'raw']`` 。``'raw'`` 表示使用空格作为切分，``'spacy'`` 则使用 :mod:`spacy` 库进行分词。
    :param num_proc: 处理数据时使用的进程数目。
    """
    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -349,9 +369,16 @@ class RTEPipe(MatchingPipe):


 class SNLIPipe(MatchingPipe):
    """
    处理 **SNLI** 数据。

    :param lower: 是否对输入进行小写化。
    :param tokenizer: 使用哪种 tokenize 方式将数据切成单词。支持 ``['spacy', 'raw']`` 。``'raw'`` 表示使用空格作为切分，``'spacy'`` 则使用 :mod:`spacy` 库进行分词。
    :param num_proc: 处理数据时使用的进程数目。
    """
    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -361,9 +388,16 @@ class SNLIPipe(MatchingPipe):


 class QuoraPipe(MatchingPipe):
    """
    处理 **Quora** 数据。

    :param lower: 是否对输入进行小写化。
    :param tokenizer: 使用哪种 tokenize 方式将数据切成单词。支持 ``['spacy', 'raw']`` 。``'raw'`` 表示使用空格作为切分，``'spacy'`` 则使用 :mod:`spacy` 库进行分词。
    :param num_proc: 处理数据时使用的进程数目。
    """
    def process_from_file(self, paths):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -373,9 +407,16 @@ class QuoraPipe(MatchingPipe):


 class QNLIPipe(MatchingPipe):
    """
    处理 **QNLI** 数据。

    :param lower: 是否对输入进行小写化。
    :param tokenizer: 使用哪种 tokenize 方式将数据切成单词。支持 ``['spacy', 'raw']`` 。``'raw'`` 表示使用空格作为切分，``'spacy'`` 则使用 :mod:`spacy` 库进行分词。
    :param num_proc: 处理数据时使用的进程数目。
    """
    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -385,9 +426,16 @@ class QNLIPipe(MatchingPipe):


 class MNLIPipe(MatchingPipe):
    """
    处理 **MNLI** 数据。

    :param lower: 是否对输入进行小写化。
    :param tokenizer: 使用哪种 tokenize 方式将数据切成单词。支持 ``['spacy', 'raw']`` 。``'raw'`` 表示使用空格作为切分，``'spacy'`` 则使用 :mod:`spacy` 库进行分词。
    :param num_proc: 处理数据时使用的进程数目。
    """
    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -397,12 +445,18 @@ class MNLIPipe(MatchingPipe):


 class LCQMCPipe(MatchingPipe):
    def __init__(self, tokenizer='cn=char', num_proc=0):
    """
    处理 **LCQMC** 数据。

    :param tokenizer: 使用哪种 tokenize 方式将数据切成单词。支持 ``['cn-char']`` ，按字分词。
    :param num_proc: 处理数据时使用的进程数目。
    """
    def __init__(self, tokenizer='cn-char', num_proc=0):
        super().__init__(tokenizer=tokenizer, num_proc=num_proc)

    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -415,12 +469,18 @@ class LCQMCPipe(MatchingPipe):


 class CNXNLIPipe(MatchingPipe):
    """
    处理 **XNLI Chinese** 数据。

    :param tokenizer: 使用哪种 tokenize 方式将数据切成单词。支持 ``['cn-char']`` ，按字分词。
    :param num_proc: 处理数据时使用的进程数目。
    """
    def __init__(self, tokenizer='cn-char', num_proc=0):
        super().__init__(tokenizer=tokenizer, num_proc=num_proc)

    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -434,12 +494,18 @@ class CNXNLIPipe(MatchingPipe):


 class BQCorpusPipe(MatchingPipe):
    """
    处理 **BQ Corpus** 数据。
    
    :param tokenizer: 使用哪种 tokenize 方式将数据切成单词。支持 ``['cn-char']`` ，按字分词。
    :param num_proc: 处理数据时使用的进程数目。
    """
    def __init__(self, tokenizer='cn-char', num_proc=0):
        super().__init__(tokenizer=tokenizer, num_proc=num_proc)

    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -452,6 +518,13 @@ class BQCorpusPipe(MatchingPipe):


 class RenamePipe(Pipe):
    """
    重命名数据集的 Pipe ，经过处理后会将数据集中的 ``chars``, ``raw_chars1`` 等列重命名为 ``words``, 
    ``raw_words1``，反之亦然。

    :param task: 任务类型，可选 ``['cn-nli', 'cn-nli-bert']`` 。
    :param num_proc: 处理数据时使用的进程数目。
    """
    def __init__(self, task='cn-nli', num_proc=0):
        super().__init__()
        self.task = task
@@ -459,7 +532,7 @@ class RenamePipe(Pipe):
    
    def process(self, data_bundle: DataBundle):  # rename field name for Chinese Matching dataset
        """

        :param data_bundle:
        :return: 处理后的 ``data_bundle``
        """
        if (self.task == 'cn-nli'):
@@ -497,6 +570,16 @@ class RenamePipe(Pipe):


 class GranularizePipe(Pipe):
    """
    将数据集中 ``target`` 列中的 tag 按照一定的映射进行重命名，并丢弃不在映射中的 tag。

    :param task: 任务类型，目前仅支持 ``['XNLI']``。

            - ``'XNLI'`` -- 将  ``neutral``, ``entailment``, ``contradictory``, ``contradiction`` 分别
              映射为 0, 1, 2, 3；

    :param num_proc: 处理数据时使用的进程数目。
    """
    def __init__(self, task=None, num_proc=0):
        super().__init__()
        self.task = task
@@ -520,7 +603,7 @@ class GranularizePipe(Pipe):
    
    def process(self, data_bundle: DataBundle):
        """

        :param data_bundle:
        :return: 处理后的 ``data_bundle``
        """
        task_tag_dict = {
@@ -532,28 +615,19 @@ class GranularizePipe(Pipe):
            raise RuntimeError(f"Only support {task_tag_dict.keys()} task_tag_map.")
        return data_bundle


 class MachingTruncatePipe(Pipe):  # truncate sentence for bert, modify seq_len
    def __init__(self):
        super().__init__()
    
    def process(self, data_bundle: DataBundle):
        """

        :return: None
        """
        for name, dataset in data_bundle.datasets.items():
            pass
        return None


 class LCQMCBertPipe(MatchingBertPipe):
    def __init__(self, tokenizer='cn=char', num_proc=0):
    """
    处理 **LCQMC** 数据。

    :param tokenizer: 使用哪种 tokenize 方式将数据切成单词。支持 ``['cn-char']`` ，按字分词。
    :param num_proc: 处理数据时使用的进程数目。
    """
    def __init__(self, tokenizer='cn-char', num_proc=0):
        super().__init__(tokenizer=tokenizer, num_proc=num_proc)

    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -567,12 +641,18 @@ class LCQMCBertPipe(MatchingBertPipe):


 class BQCorpusBertPipe(MatchingBertPipe):
    """
    处理 **BQ Corpus** 数据。

    :param tokenizer: 使用哪种 tokenize 方式将数据切成单词。支持 ``['cn-char']`` ，按字分词。
    :param num_proc: 处理数据时使用的进程数目。
    """
    def __init__(self, tokenizer='cn-char', num_proc=0):
        super().__init__(tokenizer=tokenizer, num_proc=num_proc)

    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -586,12 +666,18 @@ class BQCorpusBertPipe(MatchingBertPipe):


 class CNXNLIBertPipe(MatchingBertPipe):
    """
    处理 **XNLI Chinese** 数据。

    :param tokenizer: 使用哪种 tokenize 方式将数据切成单词。支持 ``['cn-char']`` ，按字分词。
    :param num_proc: 处理数据时使用的进程数目。
    """
    def __init__(self, tokenizer='cn-char', num_proc=0):
        super().__init__(tokenizer=tokenizer, num_proc=num_proc)

    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
@@ -606,6 +692,13 @@ class CNXNLIBertPipe(MatchingBertPipe):


 class TruncateBertPipe(Pipe):
    """
    对数据进行截断的 **Pipe** 。该 **Pipe** 将会寻找每条数据中的第一个分隔符 ``[SEP]`` ，对其前后的数据分别进行截断。
    对于中文任务会将前后的文本分别截断至长度 **250** ，对于英文任务会分别截断至 **215** 。

    :param task: 任务类型，可选 ``['cn', 'en']`` ，分别表示 **中文任务** 和 **英文任务** 。
    :param num_proc: 处理数据时使用的进程数目。
    """
    def __init__(self, task='cn', num_proc=0):
        super().__init__()
        self.task = task
@@ -631,7 +724,7 @@ class TruncateBertPipe(Pipe):

    def process(self, data_bundle: DataBundle) -> DataBundle:
        """

        :param data_bundle:
        :return: 处理后的 ``data_bundle``
        """
        for name in data_bundle.datasets.keys():
--- a/fastNLP/io/pipe/pipe.py
+++ b/fastNLP/io/pipe/pipe.py
@@ -1,5 +1,3 @@
 r"""undocumented"""

 __all__ = [
    "Pipe",
 ]
@@ -9,31 +7,36 @@ from fastNLP.io.data_bundle import DataBundle

 class Pipe:
    r"""
    Pipe是fastNLP中用于处理DataBundle的类，但实际是处理DataBundle中的DataSet。所有Pipe都会在其process()函数的文档中指出该Pipe可处理的DataSet应该具备怎样的格式；在Pipe
    文档中说明该Pipe返回后DataSet的格式以及其field的信息；以及新增的Vocabulary的信息。
    :class:`Pipe` 是 **fastNLP** 中用于处理 :class:`~fastNLP.io.DataBundle` 的类，但实际是处理其中的 :class:`~fastNLP.core.DataSet` 。
    所有 ``Pipe`` 都会在其 :meth:`process` 函数的文档中指出该 ``Pipe`` 可处理的 :class:`~fastNLP.core.DataSet` 应该具备怎样的格式；在
    ``Pipe`` 文档中说明该 ``Pipe`` 返回后 :class:`~fastNLP.core.DataSet` 的格式以及其 field 的信息；以及新增的 :class:`~fastNLP.core.Vocabulary` 
    的信息。

    一般情况下Pipe处理包含以下的几个过程，(1)将raw_words或raw_chars进行tokenize以切分成不同的词或字;
    (2) 再建立词或字的 :class:`~fastNLP.Vocabulary` , 并将词或字转换为index; (3)将target列建立词表并将target列转为index;
    一般情况下 **Pipe** 处理包含以下的几个过程：
    
        1. 将 ``raw_words`` 或 ``raw_chars`` 进行 tokenize 以切分成不同的词或字；
        2. 建立词或字的 :class:`~fastNLP.core.Vocabulary` ，并将词或字转换为 index；
        3. 将 ``target`` 列建立词表并将 ``target`` 列转为 index；

    Pipe中提供了两个方法
    **Pipe** 中提供了两个方法：

    -process()函数，输入为DataBundle
    -process_from_file()函数，输入为对应Loader的load函数可接受的类型。
        - :meth:`process` 函数，输入为 :class:`~fastNLP.io.DataBundle`
        - :meth:`process_from_file` 函数，输入为对应 :meth:`fastNLP.io.Loader.load` 函数可接受的类型。

    """
    
    def process(self, data_bundle: DataBundle) -> DataBundle:
        r"""
        对输入的DataBundle进行处理，然后返回该DataBundle。
        对输入的 ``data_bundle`` 进行处理，然后返回该 ``data_bundle``

        :param ~fastNLP.DataBundle data_bundle: 需要处理的DataBundle对象
        :param data_bundle:
        :return: 处理后的 ``data_bundle``
        """
        raise NotImplementedError

    def process_from_file(self, paths: str) -> DataBundle:
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
--- a/fastNLP/io/pipe/qa.py
+++ b/fastNLP/io/pipe/qa.py
@@ -1,5 +1,5 @@
 r"""
 本文件中的Pipe主要用于处理问答任务的数据。
 本文件中的 **Pipe** 主要用于处理问答任务的数据。

 """

@@ -78,32 +78,20 @@ def _concat_clip(data_bundle, max_len, concat_field_name='raw_chars'):

 class CMRC2018BertPipe(Pipe):
    r"""
    处理之后的DataSet将新增以下的field(传入的field仍然保留)
    处理 **CMRC2018** 的数据，处理之后 :class:`~fastNLP.core.DataSet` 中新增的内容如下（原有的 field 仍然保留）：

    .. csv-table::
        :header: "context_len", "raw_chars",  "target_start", "target_end", "chars"
        
        492, ['范', '廷', '颂... ], 30, 34, "[21, 25, ...]"
        491, ['范', '廷', '颂... ], 41, 61, "[21, 25, ...]"

        492, "['范', '廷', '颂... ]", 30, 34, "[21, 25, ...]"
        491, "['范', '廷', '颂... ]", 41, 61, "[21, 25, ...]"
        ".", "...", "...","...", "..."

    raw_words列是context与question拼起来的结果(连接的地方加入了[SEP])，words是转为index的值, target_start为答案start的index，target_end为答案end的index
    （闭区间）；context_len指示的是words列中context的长度。

    其中各列的meta信息如下:
    
    .. code::
    
        +-------------+-------------+-----------+--------------+------------+-------+---------+
        | field_names | context_len | raw_chars | target_start | target_end | chars | answers |
        +-------------+-------------+-----------+--------------+------------+-------+---------|
        |   is_input  |    False    |   False   |    False     |   False    |  True |  False  |
        |  is_target  |     True    |    True   |     True     |    True    | False |  True   |
        | ignore_type |    False    |    True   |    False     |   False    | False |  True   |
        |  pad_value  |      0      |     0     |      0       |     0      |   0   |   0     |
        +-------------+-------------+-----------+--------------+------------+-------+---------+
    
    ``raw_chars`` 列是 ``context`` 与 ``question`` 拼起来的结果（连接的地方加入了 ``[SEP]`` ）， ``chars`` 是转为
    index 的值， ``target_start`` 为答案开始的位置， ``target_end`` 为答案结束的位置（闭区间）； ``context_len``
    指示的是 ``chars`` 列中 context 的长度。

    :param max_len:
    """

    def __init__(self, max_len=510):
@@ -112,10 +100,10 @@ class CMRC2018BertPipe(Pipe):

    def process(self, data_bundle: DataBundle) -> DataBundle:
        r"""
        传入的DataSet应该具备以下的field
        ``data_bundle`` 中的 :class:`~fastNLP.core.DataSet` 应该具备以下的 field：

        .. csv-table::
           :header:"title", "context", "question", "answers", "answer_starts", "id"
           :header: "title", "context", "question", "answers", "answer_starts", "id"

           "范廷颂", "范廷颂枢机（，），圣名保禄·若瑟（）...", "范廷颂是什么时候被任为主教的？", ["1963年"], ["30"], "TRAIN_186_QUERY_0"
           "范廷颂", "范廷颂枢机（，），圣名保禄·若瑟（）...", "1990年，范廷颂担任什么职务？", ["1990年被擢升为天..."], ["41"],"TRAIN_186_QUERY_1"
@@ -139,7 +127,7 @@ class CMRC2018BertPipe(Pipe):

    def process_from_file(self, paths=None) -> DataBundle:
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
--- a/fastNLP/io/pipe/summarization.py
+++ b/fastNLP/io/pipe/summarization.py
@@ -1,4 +1,3 @@
 r"""undocumented"""
 import os
 import numpy as np
 from functools import partial
@@ -20,21 +19,23 @@ TAG_UNK = "X"

 class ExtCNNDMPipe(Pipe):
    r"""
    对CNN/Daily Mail数据进行适用于extractive summarization task的预处理，预处理之后的数据，具备以下结构：
    对 **CNN/Daily Mail** 数据进行适用于 ``extractive summarization task`` 的预处理，预处理之后的数据具备以下结构：
    
    .. csv-table::
       :header: "text", "summary", "label", "publication", "text_wd", "words", "seq_len", "target"
    

        "['I got new tires from them and... ','...']", "['The new tires...','...']", "[0, 1]", "cnndm", "[['I','got',...'.'],...,['...']]", "[[54,89,...,5],...,[9,43,..,0]]", "[1,1,...,0]", "[0,1,...,0]"
        "['Don't waste your time.  We had two...','...']", "['Time is precious','...']", "[1]", "cnndm", "[['Don't','waste',...,'.'],...,['...']]", "[[5234,653,...,5],...,[87,234,..,0]]", "[1,1,...,0]", "[1,1,...,0]"
        "['...']", "['...']", "[]", "cnndm", "[[''],...,['']]", "[[],...,[]]", "[]", "[]"

    :param vocab_size: 词表大小
    :param sent_max_len: 句子最大长度，不足的句子将 padding ，超出的将截断
    :param doc_max_timesteps: 文章最多句子个数，不足的将 padding，超出的将截断
    :param vocab_path: 外部词表路径
    :param domain: 是否需要建立 domain 词表
    :param num_proc: 处理数据时使用的进程数目。
    """
    def __init__(self, vocab_size, sent_max_len, doc_max_timesteps, vocab_path=None, domain=False, num_proc=0):
        r"""
        
        :param vocab_size: int, 词表大小
        :param sent_max_len: int, 句子最大长度，不足的句子将padding，超出的将截断
        :param doc_max_timesteps: int, 文章最多句子个数，不足的将padding，超出的将截断
        :param vocab_path: str, 外部词表路径
        :param domain:  bool, 是否需要建立domain词表
        """
    def __init__(self, vocab_size: int, sent_max_len: int, doc_max_timesteps: int, vocab_path=None, domain=False, num_proc=0):
        self.vocab_size = vocab_size
        self.vocab_path = vocab_path
        self.sent_max_len = sent_max_len
@@ -44,23 +45,24 @@ class ExtCNNDMPipe(Pipe):

    def process(self, data_bundle: DataBundle):
        r"""
        传入的DataSet应该具备如下的结构
        ``data_bundle`` 中的 :class:`~fastNLP.core.DataSet` 应该具备以下结构：

        .. csv-table::
           :header: "text", "summary", "label", "publication"

           ["I got new tires from them and... ","..."], ["The new tires...","..."], [0, 1], "cnndm"
           ["Don't waste your time.  We had two...","..."], ["Time is precious","..."], [1], "cnndm"
           ["..."], ["..."], [], "cnndm"
           "['I got new tires from them and... ','...']", "['The new tires...','...']", "[0, 1]", "cnndm"
           "['Don't waste your time.  We had two...','...']", "['Time is precious','...']", "[1]", "cnndm"
           "['...']", "['...']", "[]", "cnndm"

        :param data_bundle:
        :return: 处理后的 ``data_bundle``
        :return: 处理后的 ``data_bundle``，新增以下列：

        .. csv-table::
           :header: "text_wd", "words", "seq_len", "target"

           [["I","got",..."."],...,["..."]], [[54,89,...,5],...,[9,43,..,0]], [1,1,...,0], [0,1,...,0]
           [["Don't","waste",...,"."],...,["..."]], [[5234,653,...,5],...,[87,234,..,0]], [1,1,...,0], [1,1,...,0]
           [[""],...,[""]], [[],...,[]], [], []
           "[['I','got',...'.'],...,['...']]", "[[54,89,...,5],...,[9,43,..,0]]", "[1,1,...,0]", "[0,1,...,0]"
           "[['Don't','waste',...,'.'],...,['...']]", "[[5234,653,...,5],...,[87,234,..,0]]", "[1,1,...,0]", "[1,1,...,0]"
           "[[''],...,['']]", "[[],...,[]]", "[]", "[]"
        """

        if self.vocab_path is None:
@@ -117,7 +119,7 @@ class ExtCNNDMPipe(Pipe):

    def process_from_file(self, paths=None):
        r"""
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load()`
        传入文件路径，生成处理好的 :class:`~fastNLP.io.DataBundle` 对象。``paths`` 支持的路径形式可以参考 :meth:`fastNLP.io.Loader.load`

        :param paths:
        :return:
--- a/fastNLP/io/pipe/utils.py
+++ b/fastNLP/io/pipe/utils.py
@@ -1,5 +1,3 @@
 r"""undocumented"""

 __all__ = [
    "iob2",
    "iob2bioes",
@@ -17,10 +15,10 @@ from pkg_resources import parse_version

 def iob2(tags: List[str]) -> List[str]:
    r"""
    检查数据是否是合法的IOB数据，如果是IOB1会被自动转换为IOB2。两种格式的区别见
    检查数据是否是合法的 ``IOB`` 数据，如果是 ``IOB1`` 会被自动转换为 ``IOB2`` 。两种格式的区别见
    https://datascience.stackexchange.com/questions/37824/difference-between-iob-and-iob2-format

    :param tags: 需要转换的tags
    :param tags: 需要转换的 tags
    """
    for i, tag in enumerate(tags):
        if tag == "O":
@@ -41,8 +39,9 @@ def iob2(tags: List[str]) -> List[str]:

 def iob2bioes(tags: List[str]) -> List[str]:
    r"""
    将iob的tag转换为bioes编码
    :param tags:
    将 ``iob`` 的 tag 转换为 ``bioes`` 编码

    :param tags: 需要转换的 tags
    :return:
    """
    new_tags = []
@@ -69,9 +68,10 @@ def iob2bioes(tags: List[str]) -> List[str]:
 def get_tokenizer(tokenize_method: str, lang='en'):
    r"""

    :param str tokenize_method: 获取tokenzier方法
    :param str lang: 语言，当前仅支持en
    :return: tokenize函数
    :param tokenize_method: 获取 tokenizer 方法，支持 ``['spacy', 'raw', 'cn-char']`` 。``'raw'`` 表示使用空格作为切分， ``'cn-char'`` 表示
        按字符切分，``'spacy'`` 则使用 :mod:`spacy` 库进行分词。
    :param lang: :mod:`spacy` 使用的语言，当前仅支持 ``'en'`` 。
    :return: tokenize 函数
    """
    tokenizer_dict = {
        'spacy': None,