Merge branch 'dev0.8.0' of github.com:fastnlp/fastNLP into dev0.8.0

2 years ago · fb15afa2cc
--- a/docs/source/fastNLP.core.callbacks.fitlog_callback.rst
+++ b/docs/source/fastNLP.core.callbacks.fitlog_callback.rst
@@ -0,0 +1,7 @@
 fastNLP.core.callbacks.fitlog\_callback module
 ==============================================

 .. automodule:: fastNLP.core.callbacks.fitlog_callback
   :members:
   :undoc-members:
   :show-inheritance:
--- a/docs/source/fastNLP.core.callbacks.rst
+++ b/docs/source/fastNLP.core.callbacks.rst
@@ -25,6 +25,7 @@ Submodules
   fastNLP.core.callbacks.callback_manager
   fastNLP.core.callbacks.checkpoint_callback
   fastNLP.core.callbacks.early_stop_callback
   fastNLP.core.callbacks.fitlog_callback
   fastNLP.core.callbacks.has_monitor_callback
   fastNLP.core.callbacks.load_best_model_callback
   fastNLP.core.callbacks.lr_scheduler_callback
--- a/docs/source/fastNLP.modules.mix_modules.rst
+++ b/docs/source/fastNLP.modules.mix_modules.rst
@@ -0,0 +1,15 @@
 fastNLP.modules.mix\_modules package
 ====================================

 .. automodule:: fastNLP.modules.mix_modules
   :members:
   :undoc-members:
   :show-inheritance:

 Submodules
 ----------

 .. toctree::
   :maxdepth: 4

   fastNLP.modules.mix_modules.utils
--- a/docs/source/fastNLP.modules.mix_modules.utils.rst
+++ b/docs/source/fastNLP.modules.mix_modules.utils.rst
@@ -0,0 +1,7 @@
 fastNLP.modules.mix\_modules.utils module
 =========================================

 .. automodule:: fastNLP.modules.mix_modules.utils
   :members:
   :undoc-members:
   :show-inheritance:
--- a/docs/source/fastNLP.modules.rst
+++ b/docs/source/fastNLP.modules.rst
@@ -0,0 +1,15 @@
 fastNLP.modules package
 =======================

 .. automodule:: fastNLP.modules
   :members:
   :undoc-members:
   :show-inheritance:

 Subpackages
 -----------

 .. toctree::
   :maxdepth: 4

   fastNLP.modules.mix_modules
--- a/docs/source/fastNLP.rst
+++ b/docs/source/fastNLP.rst
@@ -15,3 +15,4 @@ Subpackages
   fastNLP.core
   fastNLP.envs
   fastNLP.io
   fastNLP.modules
--- a/fastNLP/core/callbacks/fitlog_callback.py
+++ b/fastNLP/core/callbacks/fitlog_callback.py
@@ -3,30 +3,31 @@ __all__ = [
 ]
 from .has_monitor_callback import HasMonitorCallback
 from ...envs import _module_available
 from ...envs import get_global_rank
 if _module_available('fitlog'):
    import fitlog


 class FitlogCallback(HasMonitorCallback):
    """
    自动记录 ``evaluation`` 结果到 ``fitlog`` 中。会自动记录每一次 ``evaluate`` 后的结果；同时会根据
     ``monitor`` 记录最好的结果。另外，会自动将非 ``rank 0`` 上的 ``fitlog`` 设置为 ``debug`` 状态。

    :param monitor: 监控的 metric 值。

        * 为 ``None``
         将尝试使用 :class:`~fastNLP.Trainer` 中设置 `monitor` 值（如果有设置）。
        * 为 ``str``
         尝试直接使用该名称从 ``evaluation`` 结果中寻找，如果在 ``evaluation`` 结果中没有找到完全一致的名称，将
         使用 最长公共字符串算法 从 ``evaluation`` 结果中找到最匹配的那个作为 ``monitor`` 。
        * 为 ``Callable``
         接受参数为 ``evaluation`` 的结果(字典类型)，返回一个 ``float`` 值作为 ``monitor`` 的结果，如果当前结果中没有相关
         的 ``monitor`` 值请返回 ``None`` 。
    :param larger_better: 是否是越大越好。
    :param log_exception: 是否记录 ``exception`` 。
    :param log_loss_every: 多少个 ``batch`` 记录一次 loss 到 ``fitlog`` 中。
    """
    def __init__(self, monitor=None, larger_better: bool = True, log_exception:bool=True, log_loss_every:int=0):
        """
        自动记录 ``evaluation`` 结果到 ``fitlog`` 中的 ``Callback`` 。会根据 ``monitor`` 记录最好的结果，以及每一次 ``evaluate`` 后的
        结果。

        :param monitor: 监控的 metric 值。

            * 为 ``None``
             将尝试使用 :class:`~fastNLP.Trainer` 中设置 `monitor` 值（如果有设置）。
            * 为 ``str``
             尝试直接使用该名称从 ``evaluation`` 结果中寻找，如果在 ``evaluation`` 结果中没有找到完全一致的名称，将
             使用 最长公共字符串算法 从 ``evaluation`` 结果中找到最匹配的那个作为 ``monitor`` 。
            * 为 ``Callable``
             接受参数为 ``evaluation`` 的结果(字典类型)，返回一个 ``float`` 值作为 ``monitor`` 的结果，如果当前结果中没有相关
             的 ``monitor`` 值请返回 ``None`` 。
        :param larger_better: 是否是越大越好。
        :param log_exception: 是否记录 ``exception`` 。
        :param log_loss_every: 多少个 ``batch`` 记录一次 loss 到 ``fitlog`` 中。
        """
        assert _module_available('fitlog'), "fitlog is not installed."

        super().__init__(monitor=monitor, larger_better=larger_better)
@@ -34,6 +35,10 @@ class FitlogCallback(HasMonitorCallback):
        self.log_loss_every = log_loss_every
        self.avg_loss = 0

    def on_after_trainer_initialized(self, trainer, driver):
        """
        Call ``fitlog.debug()`` on every process whose global rank is not 0.

        The original note on this hook says ``fitlog`` has to be switched off on the
        non-zero ranks.

        :param trainer: not used by this hook.
        :param driver: not used by this hook.
        """
        on_main_process = get_global_rank() == 0
        if on_main_process:
            return
        fitlog.debug()

    def on_evaluate_end(self, trainer, results):
        results = self.itemize_results(results)
        fitlog.add_metric(results, step=trainer.global_forward_batches, epoch=trainer.cur_epoch_idx)
--- a/fastNLP/core/utils/paddle_utils.py
+++ b/fastNLP/core/utils/paddle_utils.py
@@ -22,9 +22,9 @@ from .utils import apply_to_collection

 def _convert_data_device(device: Union[str, int]) -> str:
    """
    用于转换 ``driver`` 的 ``data_device`` 的函数。如果用户设置了 ``FASTNLP_BACKEND=paddle``，那么 ``fastNLP`` 会将
    用于转换 ``driver`` 的 ``data_device`` 的函数。如果用户设置了 ``FASTNLP_BACKEND=paddle``，那么 **fastNLP** 会将
    可见的设备保存在 ``USER_CUDA_VISIBLE_DEVICES`` 中，并且将 ``CUDA_VISIBLE_DEVICES`` 设置为可见的第一张显卡；这是为
    了顺利执行 ``paddle`` 的分布式训练而设置的。
    了顺利执行 **paddle** 的分布式训练而设置的。
    
    在这种情况下，单纯使用 ``driver.data_device`` 是无效的。比如在分布式训练中将设备设置为 ``[0,2,3]`` ，且用户设置了
    ``CUDA_VISIBLE_DEVICES=3,4,5,6`` ，那么在 ``rank1``的进程中有::
@@ -127,7 +127,7 @@ def get_paddle_device_id(device: Union[str, int]) -> int:

 def paddle_move_data_to_device(batch: Any, device: Optional[Union[str, int]]) -> Any:
    r"""
    将 ``paddle`` 的数据集合传输到给定设备。只有 :class:`paddle.Tensor` 对象会被传输到设备中，其余保持不变。
    将 **paddle** 的数据集合传输到给定设备。只有 :class:`paddle.Tensor` 对象会被传输到设备中，其余保持不变。

    :param batch: 需要进行迁移的数据集合；
    :param device: 目标设备。可以是显卡设备的编号，或是``cpu``, ``gpu`` 或 ``gpu:x`` 格式的字符串；当这个参数
@@ -145,20 +145,20 @@ def paddle_move_data_to_device(batch: Any, device: Optional[Union[str, int]]) ->

 def is_in_paddle_dist() -> bool:
    """
    Tell whether the current process runs inside a **paddle** distributed job.

    The answer is ``True`` only when both ``PADDLE_RANK_IN_NODE`` and
    ``FLAGS_selected_gpus`` are present in the environment.
    """
    required_variables = ('PADDLE_RANK_IN_NODE', 'FLAGS_selected_gpus')
    return all(name in os.environ for name in required_variables)


 def is_in_fnlp_paddle_dist() -> bool:
    """
    Tell whether the current process is a **paddle** distributed process launched by **fastNLP**.

    The check is whether the environment variable named by ``FASTNLP_DISTRIBUTED_CHECK`` is set.
    """
    launched_by_fastnlp = FASTNLP_DISTRIBUTED_CHECK in os.environ
    return launched_by_fastnlp


 def is_in_paddle_launch_dist() -> bool:
    """
    Tell whether the current process is a **paddle** distributed process started through
    ``python -m paddle.distributed.launch``.

    The check is whether the environment variable named by ``FASTNLP_BACKEND_LAUNCH`` is set.
    """
    started_by_launch_module = FASTNLP_BACKEND_LAUNCH in os.environ
    return started_by_launch_module
--- a/fastNLP/core/utils/rich_progress.py
+++ b/fastNLP/core/utils/rich_progress.py
@@ -1,5 +1,5 @@
 """
 该文件用于为 ``fastNLP`` 提供一个统一的 ``progress bar`` 管理，通过共用一个``Task`` 对象， :class:`~fastNLP.core.Trainer` 中
 该文件用于为 **fastNLP** 提供一个统一的 ``progress bar`` 管理，通过共用一个``Task`` 对象， :class:`~fastNLP.core.Trainer` 中
 的 ``progress bar`` 和 :class:`~fastNLP.core.Evaluator` 中的 ``progress bar`` 才能不冲突
 """
 import sys
--- a/fastNLP/core/utils/torch_utils.py
+++ b/fastNLP/core/utils/torch_utils.py
@@ -44,11 +44,11 @@ class TorchTransferableDataType(ABC):
 def torch_move_data_to_device(batch: Any, device: Optional[Union[str, "torch.device"]] = None,
                              non_blocking: Optional[bool] = True) -> Any:
    r"""
    在 ``pytorch`` 中将数据集合 ``batch`` 传输到给定设备。任何定义方法 ``to(device)`` 的对象都将被移动并且集合中的所有其他对象将保持不变；
    在 **pytorch** 中将数据集合 ``batch`` 传输到给定设备。任何定义方法 ``to(device)`` 的对象都将被移动并且集合中的所有其他对象将保持不变；

    :param batch: 需要迁移的数据；
    :param device: 数据应当迁移到的设备；当该参数的值为 ``None`` 时则不执行任何操作；
    :param non_blocking: ``pytorch`` 的数据迁移方法 ``to`` 的参数；
    :param non_blocking: **pytorch** 的数据迁移方法 ``to`` 的参数；
    :return: 迁移到新设备上的数据集合；
    """
    if device is None:
--- a/fastNLP/core/utils/utils.py
+++ b/fastNLP/core/utils/utils.py
@@ -55,7 +55,7 @@ def get_fn_arg_names(fn: Callable) -> List[str]:
 def auto_param_call(fn: Callable, *args, signature_fn: Optional[Callable] = None,
                    mapping: Optional[Dict[AnyStr, AnyStr]] = None) -> Any:
    r"""
    该函数会根据输入函数的形参名从 ``*args`` （均为 ``dict`` 类型）中找到匹配的值进行调用，如果传入的数据与 ``fn`` 的形参不匹配，可以通过
    该函数会根据输入函数的形参名从 ``*args`` （均为 **dict** 类型）中找到匹配的值进行调用，如果传入的数据与 ``fn`` 的形参不匹配，可以通过
    ``mapping`` 参数进行转换。``mapping`` 参数中的一对 ``(key, value)`` 表示在 ``*args`` 中找到 ``key`` 对应的值，并将这个值传递给形参中名为
    ``value`` 的参数。

@@ -259,21 +259,21 @@ def dataclass_to_dict(data: "dataclasses.dataclass") -> Dict:

 def match_and_substitute_params(mapping: Optional[Union[Callable, Dict]] = None, data: Optional[Any] = None) -> Any:
    r"""
    用来实现将输入的 ``batch`` 或者输出的 ``outputs`` 通过 ``mapping`` 将键值进行更换的功能；
    用来实现将输入的 **batch** 或者输出的 **outputs** 通过 ``mapping`` 将键值进行更换的功能；
    该函数应用于 ``input_mapping`` 和 ``output_mapping``；

    * 对于 ``input_mapping``，该函数会在 :class:`~fastNLP.core.controllers.TrainBatchLoop` 中取完数据后立刻被调用；
    * 对于 ``output_mapping``，该函数会在 :class:`~fastNLP.core.Trainer` 的 :meth:`~fastNLP.core.Trainer.train_step`
     以及 :class:`~fastNLP.core.Evaluator` 的 :meth:`~fastNLP.core.Evaluator.train_step` 中得到结果后立刻被调用；
      以及 :class:`~fastNLP.core.Evaluator` 的 :meth:`~fastNLP.core.Evaluator.train_step` 中得到结果后立刻被调用；

    转换的逻辑按优先级依次为：

    1. 如果 ``mapping`` 是一个函数，那么会直接返回 ``mapping(data)``；
    2. 如果 ``mapping`` 是一个 ``Dict``，那么 ``data`` 的类型只能为以下三种： ``[Dict, dataclass, Sequence]``；
    1. 如果 ``mapping`` 是一个函数，那么会直接返回 **mapping(data)**；
    2. 如果 ``mapping`` 是一个 **Dict**，那么 ``data`` 的类型只能为以下三种： ``[Dict, dataclass, Sequence]``；
        
        * 如果 ``data`` 是 ``Dict``，那么该函数会将 ``data`` 的 ``key`` 替换为 ``mapping[key]``；
        * 如果 ``data`` 是 ``dataclass``，那么该函数会先使用 :func:`dataclasses.asdict` 函数将其转换为 ``Dict``，然后进行转换；
        * 如果 ``data`` 是 ``Sequence``，那么该函数会先将其转换成一个对应的字典::
        * 如果 ``data`` 是 **Dict**，那么该函数会将 ``data`` 的 ``key`` 替换为 **mapping[key]**；
        * 如果 ``data`` 是 **dataclass**，那么该函数会先使用 :func:`dataclasses.asdict` 函数将其转换为 **Dict**，然后进行转换；
        * 如果 ``data`` 是 **Sequence**，那么该函数会先将其转换成一个对应的字典::
        
            {
                "_0": list[0],
@@ -281,7 +281,7 @@ def match_and_substitute_params(mapping: Optional[Union[Callable, Dict]] = None,
                ...
            }

          然后使用 ``mapping`` 对这个 ``Dict`` 进行转换，如果没有匹配上 ``mapping`` 中的 ``key`` 则保持 ``\'\_number\'`` 这个形式。
          然后使用 ``mapping`` 对这个字典进行转换，如果没有匹配上 ``mapping`` 中的 ``key`` 则保持 ``'_number'`` 这个形式。

    :param mapping: 用于转换的字典或者函数；当 ``mapping`` 是函数时，返回值必须为字典类型；
    :param data: 需要被转换的对象；
@@ -459,7 +459,7 @@ def _is_iterable(value):

 def pretty_table_printer(dataset_or_ins) -> PrettyTable:
    r"""
    用于在 ``fastNLP`` 中展示数据的函数::
    用于在 **fastNLP** 中展示数据的函数::

        >>> ins = Instance(field_1=[1, 1, 1], field_2=[2, 2, 2], field_3=["a", "b", "c"])
        +-----------+-----------+-----------------+
--- a/fastNLP/modules/mix_modules/utils.py
+++ b/fastNLP/modules/mix_modules/utils.py
@@ -0,0 +1,242 @@
 import warnings
 from typing import Any, Optional, Union

 import numpy as np

 from fastNLP.core.utils import paddle_to, apply_to_collection
 from fastNLP.core.log import logger
 from fastNLP.envs.imports import _NEED_IMPORT_JITTOR, _NEED_IMPORT_TORCH, _NEED_IMPORT_PADDLE

 if _NEED_IMPORT_PADDLE:
    import paddle

 if _NEED_IMPORT_JITTOR:
    import jittor

 if _NEED_IMPORT_TORCH:
    import torch

 __all__ = [
    "paddle2torch",
    "torch2paddle",
    "jittor2torch",
    "torch2jittor",
 ]

 def _paddle2torch(paddle_tensor: 'paddle.Tensor', device: Optional[Union[str, int]] = None, no_gradient: bool = None) -> 'torch.Tensor':
    """
    将 :class:`paddle.Tensor` 转换为 :class:`torch.Tensor` ，并且能够保留梯度进行反向传播

    :param paddle_tensor: 要转换的 **paddle** 张量；
    :param device: 是否将转换后的张量迁移到特定设备上，为 ``None``时，和输入的张量相同；
    :param no_gradient: 是否保留原张量的梯度。为 ``None`` 时，新的张量与输入张量保持一致；
        为 ``True`` 时，全部不保留梯度；为 ``False`` 时，全部保留梯度；
    :return: 转换后的 **torch** 张量；
    """
    no_gradient = paddle_tensor.stop_gradient if no_gradient is None else no_gradient
    paddle_numpy = paddle_tensor.numpy()
    if not np.issubdtype(paddle_numpy.dtype, np.inexact):
        no_gradient = True

    if device is None:
        if paddle_tensor.place.is_gpu_place():
            # paddlepaddle有两种Place，对应不同的device id获取方式
            if hasattr(paddle_tensor.place, "gpu_device_id"):
                # paddle.fluid.core_avx.Place
                # 在gpu环境下创建张量的话，张量的place是这一类型
                device = f"cuda:{paddle_tensor.place.gpu_device_id()}"
            else:
                # paddle.CUDAPlace
                device = f"cuda:{paddle_tensor.place.get_device_id()}"
        else:
            # TODO: 可能需要支持xpu等设备
            device = "cpu"

    if not no_gradient:
        # 保持梯度，并保持反向传播
        # torch.tensor会保留numpy数组的类型
        torch_tensor = torch.tensor(paddle_numpy, requires_grad=True, device=device)
        hook = torch_tensor.register_hook(
            lambda grad: paddle.autograd.backward(paddle_tensor, paddle.to_tensor(grad.cpu().numpy()))
        )
    else:
        # 不保留梯度
        torch_tensor = torch.tensor(paddle_numpy, requires_grad=False, device=device)

    return torch_tensor


 def _torch2paddle(torch_tensor: 'torch.Tensor', device: str = None, no_gradient: bool = None) -> 'paddle.Tensor':
    """
    Convert a :class:`torch.Tensor` into a :class:`paddle.Tensor`, optionally wiring the
    paddle gradient back into the **torch** graph.

    :param torch_tensor: the **torch** tensor to convert;
    :param device: device for the resulting tensor; when ``None`` it follows the input
        tensor (``gpu:<index>`` for a CUDA tensor, ``cpu`` otherwise);
    :param no_gradient: whether to drop the gradient. ``None`` follows the input's
        ``requires_grad`` flag; ``True`` never keeps it; ``False`` keeps it;
    :return: the converted **paddle** tensor;
    """
    no_gradient = not torch_tensor.requires_grad if no_gradient is None else no_gradient
    if device is None:
        if torch_tensor.is_cuda:
            device = f"gpu:{torch_tensor.device.index}"
        else:
            device = "cpu"

    # Tensor.numpy() only works for CPU tensors, so copy CUDA data to host memory first.
    torch_numpy = torch_tensor.detach().cpu().numpy()

    if not no_gradient:
        # Keep the gradient and keep back-propagation working.
        # paddle's stop_gradient is the inverse of torch's requires_grad.
        paddle_tensor = paddle.to_tensor(torch_numpy, stop_gradient=False)
        # The gradient handed to torch must live on the same device as the source tensor.
        paddle_tensor.register_hook(
            lambda grad: torch.autograd.backward(
                torch_tensor, torch.tensor(grad.numpy(), device=torch_tensor.device)
            )
        )
    else:
        paddle_tensor = paddle.to_tensor(torch_numpy, stop_gradient=True)

    paddle_tensor = paddle_to(paddle_tensor, device)

    return paddle_tensor


 def _jittor2torch(jittor_var: 'jittor.Var', device: Optional[Union[str, int]] = None, no_gradient: bool = None) -> 'torch.Tensor':
    """
    将 :class:`jittor.Var` 转换为 :class:`torch.Tensor` 。

    :param jittor_var: 要转换的 **jittor** 变量；
    :param device: 是否将转换后的张量迁移到特定设备上，输入为 ``None`` 时，根据 ``jittor.flags.use_cuda`` 决定；
    :param no_gradient: 是否保留原张量的梯度。为``None``时，新的张量与输入张量保持一致；
        为 ``True`` 时，全部不保留梯度；为 ``False`` 时，全部保留梯度；
    :return: 转换后的 **torch** 张量；
    """
    # TODO: warning：无法保留梯度
    # jittor的grad可以通过callback进行传递
    # 如果outputs有_grad键，可以实现求导
    no_gradient = not jittor_var.requires_grad if no_gradient is None else no_gradient
    if no_gradient == False:
        warnings.warn("The result tensor will not keep gradients due to differences between jittor and pytorch.")
    jittor_numpy = jittor_var.numpy()
    if not np.issubdtype(jittor_numpy.dtype, np.inexact):
        no_gradient = True

    if device is None:
        # jittor的设备分配是自动的
        # 根据use_cuda判断
        if jittor.flags.use_cuda:
            device = "cuda:0"
        else:
            device = "cpu"

    torch_tensor = torch.tensor(jittor_numpy, requires_grad=not no_gradient, device=device)

    return torch_tensor


 def _torch2jittor(torch_tensor: 'torch.Tensor', no_gradient: bool = None) -> 'jittor.Var':
    """
    Convert a :class:`torch.Tensor` into a :class:`jittor.Var`.

    :param torch_tensor: the **torch** tensor to convert;
    :param no_gradient: whether to drop the gradient. ``None`` follows the input's
        ``requires_grad`` flag; ``True`` never keeps it; ``False`` keeps it;
    :return: the converted **jittor** variable;
    """
    no_gradient = not torch_tensor.requires_grad if no_gradient is None else no_gradient

    # Tensor.numpy() only works for CPU tensors, so copy CUDA data to host memory first.
    jittor_var = jittor.Var(torch_tensor.detach().cpu().numpy())

    if not no_gradient:
        # Keep the gradient and keep back-propagation working.
        jittor_var.requires_grad = True
        # The gradient handed to torch must live on the same device as the source tensor.
        jittor_var.register_hook(
            lambda grad: torch.autograd.backward(
                torch_tensor, torch.tensor(grad.numpy(), device=torch_tensor.device)
            )
        )
    else:
        jittor_var.requires_grad = False

    return jittor_var


 def torch2paddle(batch: Any, device: str = None, no_gradient: bool = None) -> Any:
    """
    Recursively convert every :class:`torch.Tensor` found in ``batch`` into a :class:`paddle.Tensor`.

    :param batch: a collection containing :class:`torch.Tensor` objects;
    :param device: device for the converted tensors; ``None`` keeps them aligned with the inputs;
    :param no_gradient: whether to drop the gradient. ``None`` follows each input tensor;
        ``True`` drops it; ``False`` keeps it;
    :return: the converted data;
    """
    conversion_options = {"device": device, "no_gradient": no_gradient}
    return apply_to_collection(
        batch,
        dtype=torch.Tensor,
        function=_torch2paddle,
        **conversion_options,
    )


 def paddle2torch(batch: Any, device: str = None, no_gradient: bool = None) -> Any:
    """
    Recursively convert every :class:`paddle.Tensor` found in ``batch`` into a :class:`torch.Tensor`.

    :param batch: a collection containing :class:`paddle.Tensor` objects;
    :param device: device for the converted tensors; ``None`` keeps them aligned with the inputs;
    :param no_gradient: whether to drop the gradient. ``None`` follows each input tensor;
        ``True`` drops it; ``False`` keeps it;
    :return: the converted data;
    """
    conversion_options = {"device": device, "no_gradient": no_gradient}
    return apply_to_collection(
        batch,
        dtype=paddle.Tensor,
        function=_paddle2torch,
        **conversion_options,
    )


 def jittor2torch(batch: Any, device: str = None, no_gradient: bool = None) -> Any:
    """
    Recursively convert every :class:`jittor.Var` found in ``batch`` into a :class:`torch.Tensor`.

    .. note::

        Because of the differences between **pytorch** and **jittor**, the gradient of the
        original variable cannot be preserved when converting a :class:`jittor.Var` into a
        :class:`torch.Tensor`.

    :param batch: a collection containing :class:`jittor.Var` objects;
    :param device: device for the converted tensors; ``None`` keeps them aligned with the inputs;
    :param no_gradient: forwarded to the per-variable conversion; per the note above it
        cannot make the result keep the original gradient;
    :return: the converted data;
    """
    conversion_options = {"device": device, "no_gradient": no_gradient}
    return apply_to_collection(
        batch,
        dtype=jittor.Var,
        function=_jittor2torch,
        **conversion_options,
    )


 def torch2jittor(batch: Any, no_gradient: bool = None) -> Any:
    """
    Recursively convert every :class:`torch.Tensor` found in ``batch`` into a :class:`jittor.Var`.

    .. note::

        **jittor** assigns a device to the created variables automatically.

    :param batch: a collection containing :class:`torch.Tensor` objects;
    :param no_gradient: whether to drop the gradient. ``None`` follows each input tensor;
        ``True`` drops it; ``False`` keeps it;
    :return: the converted data;
    """
    conversion_options = {"no_gradient": no_gradient}
    return apply_to_collection(
        batch,
        dtype=torch.Tensor,
        function=_torch2jittor,
        **conversion_options,
    )
--- a/fastNLP/transformers/torch/file_utils.py
+++ b/fastNLP/transformers/torch/file_utils.py
@@ -17,7 +17,7 @@ from enum import Enum
 from functools import partial
 from hashlib import sha256
 from pathlib import Path
 from typing import Any, BinaryIO, Dict, Optional, Tuple, Union
 from typing import Any, BinaryIO, Dict, Optional, Tuple, Union, List
 from urllib.parse import urlparse
 from uuid import uuid4
 from zipfile import ZipFile, is_zipfile
@@ -750,6 +750,78 @@ def get_from_cache(

    return cache_path

 def get_list_of_files(
    path_or_repo: Union[str, os.PathLike],
    revision: Optional[str] = None,
    use_auth_token: Optional[Union[bool, str]] = None,
    local_files_only: bool = False,
 ) -> List[str]:
    """
    Gets the list of files inside :obj:`path_or_repo`.

    Args:
        path_or_repo (:obj:`str` or :obj:`os.PathLike`):
            Can be either the id of a repo on huggingface.co or a path to a `directory`.
        revision (:obj:`str`, `optional`, defaults to :obj:`"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
            identifier allowed by git.
        use_auth_token (:obj:`str` or `bool`, `optional`):
            The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
            stored in :obj:`~/.huggingface/token` (no token is sent when that file is missing or empty).
        local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to only rely on local files and not to attempt to download any files.

    Returns:
        :obj:`List[str]`: The list of files available in :obj:`path_or_repo`. Always a list: it is empty in
        offline / local-only mode and when the remote answer carries no ``siblings`` entry.

    Raises:
        requests.HTTPError: if the remote API answers with an error status.
    """
    path_or_repo = str(path_or_repo)
    # If path_or_repo is a folder, we just return what is inside (subdirectories included).
    if os.path.isdir(path_or_repo):
        return [
            os.path.join(dir_path, file_name)
            for dir_path, _, file_names in os.walk(path_or_repo)
            for file_name in file_names
        ]

    # Can't grab the files if we are on offline mode.
    if is_offline_mode() or local_files_only:
        return []

    # Otherwise resolve the token and query the model-info endpoint directly
    # (this stands in for the HfFolder.get_token() / HfApi.model_info() calls).
    token = None
    if isinstance(use_auth_token, str):
        token = use_auth_token
    elif use_auth_token is True:
        path_token = os.path.expanduser("~/.huggingface/token")
        try:
            with open(path_token, "r") as f:
                # A trailing newline would end up inside the Authorization header,
                # so strip it; an empty file means "no token".
                token = f.read().strip() or None
        except FileNotFoundError:
            token = None

    url = f"{HUGGINGFACE_CO_RESOLVE_ENDPOINT}/api/models/{path_or_repo}"
    if revision is not None:
        url = f"{url}/revision/{revision}"
    headers = {"authorization": f"Bearer {token}"} if token is not None else None

    r = requests.get(url, headers=headers, timeout=None, params=None)
    r.raise_for_status()
    siblings = r.json().get("siblings", None)
    if siblings is None:
        # Honour the List[str] contract so callers can iterate the result safely.
        return []
    return [x["rfilename"] for x in siblings]

 def is_torch_fx_available():
    """
    Combine the ``_TORCH_GREATER_EQUAL_1_8`` flag with a check that the installed
    ``torch`` version is strictly lower than ``1.9.0``.

    :return: the flag itself when it is falsy, otherwise the result of the version comparison.
    """
    if not _TORCH_GREATER_EQUAL_1_8:
        return _TORCH_GREATER_EQUAL_1_8
    return _compare_version("torch", operator.lt, "1.9.0")

--- a/fastNLP/transformers/torch/tokenization_utils_base.py
+++ b/fastNLP/transformers/torch/tokenization_utils_base.py
@@ -44,6 +44,8 @@ from .file_utils import (
    cached_path,
    is_offline_mode,
    is_remote_url,
    get_list_of_files,
    hf_bucket_url,
    is_tokenizers_available,
    to_py_obj,
 )
@@ -100,7 +102,7 @@ TOKENIZER_CONFIG_FILE = "tokenizer_config.json"

 # Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file
 FULL_TOKENIZER_FILE = "tokenizer.json"

 _re_tokenizer_file = re.compile(r"tokenizer\.(.*)\.json")

 class TruncationStrategy(ExplicitEnum):
    """
@@ -1607,8 +1609,41 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
            file_id = list(cls.vocab_files_names.keys())[0]
            vocab_files[file_id] = pretrained_model_name_or_path
        else:
            raise RuntimeError("At this point pretrained_model_name_or_path is either a directory or a model identifier name, ", 
                                "which is not supported in fastNLP now.")
            # raise RuntimeError("At this point pretrained_model_name_or_path is either a directory or a model identifier name, ", 
            #                     "which is not supported in fastNLP now.")
            # At this point pretrained_model_name_or_path is either a directory or a model identifier name
            fast_tokenizer_file = get_fast_tokenizer_file(
                pretrained_model_name_or_path,
                revision=revision,
                use_auth_token=use_auth_token,
                local_files_only=local_files_only,
            )
            additional_files_names = {
                "added_tokens_file": ADDED_TOKENS_FILE,
                "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
                "tokenizer_config_file": TOKENIZER_CONFIG_FILE,
                "tokenizer_file": fast_tokenizer_file,
            }
            # Look for the tokenizer files
            for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items():
                if os.path.isdir(pretrained_model_name_or_path):
                    if subfolder is not None:
                        full_file_name = os.path.join(pretrained_model_name_or_path, subfolder, file_name)
                    else:
                        full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
                    if not os.path.exists(full_file_name):
                        logger.info(f"Didn't find file {full_file_name}. We won't load it.")
                        full_file_name = None
                else:
                    full_file_name = hf_bucket_url(
                        pretrained_model_name_or_path,
                        filename=file_name,
                        subfolder=subfolder,
                        revision=revision,
                        mirror=None,
                    )

                vocab_files[file_id] = full_file_name

        # Get files from url, cache, or disk depending on the case
        resolved_vocab_files = {}
@@ -3349,3 +3384,52 @@ For a more complete example, see the implementation of `prepare_seq2seq_batch`.
            )
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

 def get_fast_tokenizer_file(
    path_or_repo: Union[str, os.PathLike],
    revision: Optional[str] = None,
    use_auth_token: Optional[Union[bool, str]] = None,
    local_files_only: bool = False,
 ) -> str:
    """
    Get the tokenizer file to use for this version of transformers.

    Files matching ``tokenizer.<version>.json`` are collected; the one with the highest ``<version>`` that is
    not newer than the running ``__version__`` wins, and ``FULL_TOKENIZER_FILE`` is used when none qualifies.

    Args:
        path_or_repo (:obj:`str` or :obj:`os.PathLike`):
            Can be either the id of a repo on huggingface.co or a path to a `directory`.
        revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
            identifier allowed by git.
        use_auth_token (:obj:`str` or `bool`, `optional`):
            The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
            generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
        local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to only rely on local files and not to attempt to download any files.

    Returns:
        :obj:`str`: The tokenizer file to use.
    """
    # Inspect all files from the repo/folder.
    all_files = get_list_of_files(
        path_or_repo, revision=revision, use_auth_token=use_auth_token, local_files_only=local_files_only
    )
    tokenizer_files_map = {}
    # Tolerate a missing listing (None) the same way as an empty one.
    for file_name in all_files or []:
        search = _re_tokenizer_file.search(file_name)
        if search is not None:
            tokenizer_files_map[search.group(1)] = file_name

    # Order by parsed version, not by string: as plain strings "4.10.0" sorts before "4.9.0",
    # which would make the early exit below skip compatible files.
    available_versions = sorted(tokenizer_files_map, key=version.parse)

    # Defaults to FULL_TOKENIZER_FILE and then try to look at some newer versions.
    tokenizer_file = FULL_TOKENIZER_FILE
    transformers_version = version.parse(__version__)
    for v in available_versions:
        if version.parse(v) <= transformers_version:
            tokenizer_file = tokenizer_files_map[v]
        else:
            # No point going further since the versions are sorted.
            break

    return tokenizer_file