
Merge branch 'dev0.8.0' of github.com:fastnlp/fastNLP into dev0.8.0

tags/v1.0.0alpha | yh_cc, 2 years ago | commit 2297fcb30a
29 changed files with 596 additions and 229 deletions
  1. fastNLP/core/collators/padders/paddle_padder.py (+1 -1)
  2. fastNLP/core/dataloaders/paddle_dataloader/fdl.py (+5 -1)
  3. fastNLP/core/drivers/paddle_driver/fleet.py (+5 -12)
  4. fastNLP/core/drivers/paddle_driver/fleet_launcher.py (+0 -1)
  5. fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py (+2 -1)
  6. fastNLP/core/drivers/paddle_driver/paddle_driver.py (+3 -2)
  7. fastNLP/core/drivers/paddle_driver/single_device.py (+7 -15)
  8. fastNLP/core/drivers/paddle_driver/utils.py (+3 -38)
  9. fastNLP/core/metrics/backend/paddle_backend/backend.py (+2 -4)
  10. fastNLP/core/utils/__init__.py (+2 -1)
  11. fastNLP/core/utils/paddle_utils.py (+35 -1)
  12. tests/core/collators/padders/test_get_padder.py (+2 -2)
  13. tests/core/controllers/_test_trainer_fleet.py (+1 -1)
  14. tests/core/controllers/_test_trainer_fleet_outside.py (+2 -2)
  15. tests/core/controllers/test_trainer_paddle.py (+1 -1)
  16. tests/core/dataloaders/paddle_dataloader/test_fdl.py (+0 -8)
  17. tests/core/dataset/test_dataset.py (+0 -18)
  18. tests/core/drivers/paddle_driver/test_dist_utils.py (+3 -4)
  19. tests/core/drivers/paddle_driver/test_fleet.py (+3 -3)
  20. tests/core/drivers/paddle_driver/test_single_device.py (+4 -9)
  21. tests/core/drivers/paddle_driver/test_utils.py (+0 -20)
  22. tests/core/drivers/torch_driver/test_single_device.py (+4 -9)
  23. tests/core/utils/test_paddle_utils.py (+32 -46)
  24. tests/pytest.ini (+1 -0)
  25. tutorials/data/test4dataset.csv (+7 -0)
  26. tutorials/data/test4dataset.tsv (+7 -0)
  27. tutorials/fastnlp_tutorial_1.ipynb (+423 -29)
  28. tutorials/fastnlp_tutorial_2.ipynb (+41 -0)
  29. tutorials/figures/T1-fig-dataset-and-vocabulary.png (BIN)

fastNLP/core/collators/padders/paddle_padder.py (+1 -1)

@@ -56,7 +56,7 @@ def is_paddle_dtype_str(dtype):


def _get_dtype(ele_dtype, dtype, class_name):
if not (ele_dtype is not None or is_number_or_numpy_number(ele_dtype) or is_paddle_tensor(ele_dtype) or is_paddle_dtype_str(ele_dtype)):
if not (ele_dtype is None or is_number_or_numpy_number(ele_dtype) or is_paddle_tensor(ele_dtype) or is_paddle_dtype_str(ele_dtype)):
raise EleDtypeUnsupportedError(f"`{class_name}` only supports padding python numbers "
f"or numpy numbers or paddle.Tensor but get `{ele_dtype}`.")



fastNLP/core/dataloaders/paddle_dataloader/fdl.py (+5 -1)

@@ -8,11 +8,12 @@ from typing import Callable, List, Optional, Union, Dict, Sequence
from fastNLP.envs.imports import _NEED_IMPORT_PADDLE

if _NEED_IMPORT_PADDLE:
from paddle.io import DataLoader, Dataset
from paddle.io import DataLoader, Dataset, Sampler
from paddle.fluid.dataloader.collate import default_collate_fn
else:
from fastNLP.core.utils.dummy_class import DummyClass as Dataset
from fastNLP.core.utils.dummy_class import DummyClass as DataLoader
from fastNLP.core.utils.dummy_class import DummyClass as Sampler

from fastNLP.core.collators.collator import Collator
from fastNLP.core.dataloaders.utils import indice_collate_wrapper
@@ -58,6 +59,9 @@ class PaddleDataLoader(DataLoader):
if batch_sampler is None:
batch_sampler = RandomBatchSampler(dataset, batch_size=batch_size, shuffle=shuffle,
drop_last=drop_last)
batch_size = 1
shuffle = False
drop_last = False

super(PaddleDataLoader, self).__init__(dataset=dataset, feed_list=feed_list, places=places,
return_list=return_list, batch_sampler=batch_sampler,
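
For context: the reset above is needed because `paddle.io.DataLoader` rejects a `batch_sampler` combined with non-default `batch_size`/`shuffle`/`drop_last`, so once `PaddleDataLoader` builds its own `RandomBatchSampler` those arguments must fall back to their defaults. A minimal sketch of that constraint (the `ToyDataset` below is hypothetical and assumes Paddle's dynamic-graph mode):

import numpy as np
from paddle.io import BatchSampler, DataLoader, Dataset, RandomSampler

class ToyDataset(Dataset):
    # tiny dataset of 8 scalar floats, only to exercise the constructor
    def __init__(self, n=8):
        self.data = np.arange(n, dtype="float32")
    def __getitem__(self, idx):
        return self.data[idx]
    def __len__(self):
        return len(self.data)

ds = ToyDataset()
sampler = BatchSampler(sampler=RandomSampler(ds), batch_size=4, drop_last=False)
# With a batch_sampler given, DataLoader only accepts the default values below,
# which is exactly what the added lines restore before calling super().__init__.
loader = DataLoader(ds, batch_sampler=sampler, return_list=True,
                    batch_size=1, shuffle=False, drop_last=False)
print(len(loader))  # 2 batches of 4 samples each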


fastNLP/core/drivers/paddle_driver/fleet.py (+5 -12)

@@ -1,12 +1,12 @@
import os
import shutil
from typing import List, Union, Optional, Dict, Tuple, Callable

from fastNLP.core.utils.paddle_utils import get_device_from_visible

from .paddle_driver import PaddleDriver
from .fleet_launcher import FleetLauncher
from .utils import (
_FleetWrappingModel,
get_device_from_visible,
reset_seed,
replace_sampler,
replace_batch_sampler,
@@ -17,8 +17,8 @@ from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
from fastNLP.core.utils import (
auto_param_call,
check_user_specific_params,
paddle_move_data_to_device,
is_in_paddle_dist
is_in_paddle_dist,
is_in_paddle_dist,
)
from fastNLP.envs.distributed import rank_zero_rm
from fastNLP.core.samplers import (
@@ -609,12 +609,6 @@ class PaddleFleetDriver(PaddleDriver):
def is_distributed(self):
return True

def move_data_to_device(self, batch: 'paddle.Tensor'):
device = self.data_device
# CUDA_VISIBLE_DEVICES has been set, which may cause errors
device = get_device_from_visible(device)
return paddle_move_data_to_device(batch, device)

@staticmethod
def _check_optimizer_legality(optimizers):
# paddle provides a function to set up distributed optimizers; it returns fleet.meta_optimizers.HybridParallelOptimizer
@@ -637,9 +631,8 @@ class PaddleFleetDriver(PaddleDriver):
:return: If the current driver is not distributed, the input obj is returned directly. If the current rank is a receiver (its global rank is contained in dst), the
received parameters are returned; if it is the source, the broadcast content is returned; if it is neither sender nor receiver, None is returned.
"""
device = self.data_device
# CUDA_VISIBLE_DEVICES has been set, which may cause errors
device = get_device_from_visible(device)
device = get_device_from_visible(self.data_device)
return fastnlp_paddle_broadcast_object(obj, src, device=device, group=group)

def all_gather(self, obj, group=None) -> List:


fastNLP/core/drivers/paddle_driver/fleet_launcher.py (+0 -1)

@@ -10,7 +10,6 @@ from fastNLP.envs.env import (
FASTNLP_DISTRIBUTED_CHECK,
FASTNLP_LOG_LEVEL,
FASTNLP_GLOBAL_SEED,
USER_CUDA_VISIBLE_DEVICES,
)
from .utils import (
find_free_ports,


fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py (+2 -1)

@@ -42,7 +42,8 @@ def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[

user_visible_devices = os.getenv("USER_CUDA_VISIBLE_DEVICES")
if user_visible_devices is None:
raise RuntimeError("This situation cannot happen, please report a bug to us.")
raise RuntimeError("`USER_CUDA_VISIBLE_DEVICES` cannot be None, please check if you have set "
"`FASTNLP_BACKEND` to 'paddle' before using FastNLP.")
_could_use_device_num = len(user_visible_devices.split(","))
if isinstance(device, int):
if device < 0 and device != -1:
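
The rewritten error message points at the required setup: the backend has to be chosen before fastNLP is imported, so that fastNLP can record `USER_CUDA_VISIBLE_DEVICES` and rewrite `CUDA_VISIBLE_DEVICES`. A hedged sketch of that setup in Python, equivalent to exporting the variable in the shell (as the updated test docstrings below do with `FASTNLP_BACKEND=paddle python -m paddle.distributed.launch ...`):

import os

# Select the paddle backend before fastNLP is imported; otherwise
# USER_CUDA_VISIBLE_DEVICES is never populated and initialize_paddle_driver
# raises the RuntimeError shown above.
os.environ["FASTNLP_BACKEND"] = "paddle"

import fastNLP  # noqa: E402  (deliberately imported after the variable is set)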


fastNLP/core/drivers/paddle_driver/paddle_driver.py (+3 -2)

@@ -10,7 +10,7 @@ import numpy as np
from .utils import _build_fp16_env, optimizer_state_to_device, DummyGradScaler
from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
from fastNLP.core.drivers.driver import Driver
from fastNLP.core.utils import apply_to_collection, paddle_move_data_to_device
from fastNLP.core.utils import apply_to_collection, paddle_move_data_to_device, get_device_from_visible
from fastNLP.envs import (
FASTNLP_SEED_WORKERS,
FASTNLP_MODEL_FILENAME,
@@ -394,7 +394,8 @@ class PaddleDriver(Driver):

:return: the batch object after being moved to the specified device;
"""
return paddle_move_data_to_device(batch, self.data_device)
device = get_device_from_visible(self.data_device)
return paddle_move_data_to_device(batch, device)

@staticmethod
def worker_init_function(worker_id: int, rank: Optional[int] = None) -> None: # pragma: no cover


fastNLP/core/drivers/paddle_driver/single_device.py (+7 -15)

@@ -2,14 +2,14 @@ import os
from typing import Optional, Dict, Union, Callable, Tuple

from .paddle_driver import PaddleDriver
from .utils import replace_batch_sampler, replace_sampler, get_device_from_visible
from .utils import replace_batch_sampler, replace_sampler
from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
from fastNLP.envs.env import USER_CUDA_VISIBLE_DEVICES
from fastNLP.core.utils import (
auto_param_call,
get_device_from_visible,
get_paddle_gpu_str,
get_paddle_device_id,
paddle_move_data_to_device,
)
from fastNLP.core.utils.utils import _get_fun_msg
from fastNLP.core.samplers import (
@@ -39,6 +39,9 @@ class PaddleSingleDriver(PaddleDriver):
raise ValueError("`paddle.DataParallel` is not supported in `PaddleSingleDriver`")

cuda_visible_devices = os.environ.get(USER_CUDA_VISIBLE_DEVICES, None)
if cuda_visible_devices is None:
raise RuntimeError("`USER_CUDA_VISIBLE_DEVICES` cannot be None, please check if you have set "
"`FASTNLP_BACKEND` to 'paddle' before using FastNLP.")
if cuda_visible_devices == "":
device = "cpu"
logger.info("You have set `CUDA_VISIBLE_DEVICES` to '' in system environment variable, and we are gonna to"
@@ -54,7 +57,7 @@ class PaddleSingleDriver(PaddleDriver):
device_id = device
else:
device_id = get_paddle_device_id(device)
os.environ["CUDA_VISIBLE_DEVICES"] = os.environ[USER_CUDA_VISIBLE_DEVICES].split(",")[device_id]
os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices.split(",")[device_id]
self.model_device = get_paddle_gpu_str(device)

self.local_rank = 0
@@ -65,8 +68,7 @@ class PaddleSingleDriver(PaddleDriver):
r"""
This function initializes the training environment: it sets the device used for the current training run and moves the model onto that device.
"""
device = self.model_device
device = get_device_from_visible(device, output_type=str)
device = get_device_from_visible(self.model_device, output_type=str)
paddle.device.set_device(device)
self.model.to(device)

@@ -121,16 +123,6 @@ class PaddleSingleDriver(PaddleDriver):
else:
raise RuntimeError(f"There is no `{fn}` method in your {type(self.model)}.")

def move_data_to_device(self, batch: 'paddle.Tensor'):
r"""
Move the data to the specified device; batch may be a list or a dict, or a nested structure of them.
Note that using this in Paddle may cause problems due to inconsistency with the device that has been set.

:return: the batch object after being moved to the specified device;
"""
device = get_device_from_visible(self.data_device)
return paddle_move_data_to_device(batch, device)

def set_dist_repro_dataloader(self, dataloader, dist: Union[str, ReproducibleBatchSampler, ReproducibleSampler]=None,
reproducible: bool = False):
r"""


fastNLP/core/drivers/paddle_driver/utils.py (+3 -38)

@@ -6,12 +6,11 @@ import inspect
import numpy as np
from copy import deepcopy
from contextlib import ExitStack, closing
from enum import IntEnum
from typing import Dict, Optional, Union
from typing import Dict, Optional

from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
from fastNLP.core.utils import get_paddle_device_id, auto_param_call, paddle_to
from fastNLP.envs.env import FASTNLP_GLOBAL_SEED, FASTNLP_SEED_WORKERS, USER_CUDA_VISIBLE_DEVICES
from fastNLP.core.utils import auto_param_call, paddle_to
from fastNLP.envs.env import FASTNLP_GLOBAL_SEED, FASTNLP_SEED_WORKERS
from fastNLP.core.log import logger


@@ -173,40 +172,6 @@ def find_free_ports(num):

return None

def get_device_from_visible(device: Union[str, int], output_type=int):
"""
Get the corresponding device index when CUDA_VISIBLE_DEVICES is set.
For example, with CUDA_VISIBLE_DEVICES=2,3 and device=3, 1 is returned.

:param device: the device name before conversion
:param output_type: the type of the return value
:return: the converted device id
"""
if output_type not in [int, str]:
raise ValueError("Parameter `output_type` should be one of these types: [int, str]")
if device == "cpu":
return device
cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
idx = get_paddle_device_id(device)
if cuda_visible_devices is None or cuda_visible_devices == "":
# this branch should normally never be taken, because fastnlp forcibly injects CUDA_VISIBLE_DEVICES for paddle
raise RuntimeError("This situation should not happen, please report us this bug.")
else:
# use USER_CUDA_VISIBLE_DEVICES to obtain the devices the user expects
user_visible_devices = os.getenv(USER_CUDA_VISIBLE_DEVICES)
if user_visible_devices is None:
raise RuntimeError("This situation cannot happen, please report a bug to us.")
idx = user_visible_devices.split(",")[idx]

cuda_visible_devices_list = cuda_visible_devices.split(',')
if idx not in cuda_visible_devices_list:
raise ValueError(f"Can't find your devices {idx} in CUDA_VISIBLE_DEVICES[{cuda_visible_devices}].")
res = cuda_visible_devices_list.index(idx)
if output_type == int:
return res
else:
return f"gpu:{res}"

def replace_batch_sampler(dataloader: "DataLoader", batch_sampler: "BatchSampler"):
"""
Rebuild a DataLoader from `batch_sampler`, replacing the `batch_sampler` without affecting the original `dataloader`.


fastNLP/core/metrics/backend/paddle_backend/backend.py (+2 -4)

@@ -1,11 +1,10 @@
from typing import List, Optional, Any
from typing import List, Any

import numpy as np

from fastNLP.core.metrics.backend import Backend
from fastNLP.core.utils.paddle_utils import paddle_to
from fastNLP.core.utils.paddle_utils import paddle_to, get_device_from_visible
from fastNLP.core.metrics.utils import AggregateMethodError
from fastNLP.core.drivers.paddle_driver.utils import get_device_from_visible
from fastNLP.core.drivers.paddle_driver.dist_utils import fastnlp_paddle_all_gather
from fastNLP.envs.imports import _NEED_IMPORT_PADDLE

@@ -80,7 +79,6 @@ class PaddleBackend(Backend):
raise ValueError(f"tensor: {tensor} can not convert to ndarray!")

def move_tensor_to_device(self, tensor, device):
# TODO if we handle it here, will it cause bugs elsewhere?
device = get_device_from_visible(device)
return paddle_to(tensor, device)



fastNLP/core/utils/__init__.py (+2 -1)

@@ -2,6 +2,7 @@ __all__ = [
'cache_results',
'is_jittor_dataset',
'jittor_collate_wraps',
'get_device_from_visible',
'paddle_to',
'paddle_move_data_to_device',
'get_paddle_device_id',
@@ -27,7 +28,7 @@ __all__ = [

from .cache_results import cache_results
from .jittor_utils import is_jittor_dataset, jittor_collate_wraps
from .paddle_utils import paddle_to, paddle_move_data_to_device, get_paddle_device_id, get_paddle_gpu_str, is_in_paddle_dist, \
from .paddle_utils import get_device_from_visible, paddle_to, paddle_move_data_to_device, get_paddle_device_id, get_paddle_gpu_str, is_in_paddle_dist, \
is_in_fnlp_paddle_dist, is_in_paddle_launch_dist
from .rich_progress import f_rich_progress
from .torch_paddle_utils import torch_paddle_move_data_to_device


fastNLP/core/utils/paddle_utils.py (+35 -1)

@@ -1,4 +1,5 @@
__all__ = [
"get_device_from_visible",
"paddle_to",
"paddle_move_data_to_device",
"get_paddle_gpu_str",
@@ -13,13 +14,45 @@ import re
from typing import Any, Optional, Union

from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
from fastNLP.envs import FASTNLP_DISTRIBUTED_CHECK, FASTNLP_BACKEND_LAUNCH
from fastNLP.envs import FASTNLP_DISTRIBUTED_CHECK, FASTNLP_BACKEND_LAUNCH, USER_CUDA_VISIBLE_DEVICES

if _NEED_IMPORT_PADDLE:
import paddle

from .utils import apply_to_collection

def get_device_from_visible(device: Union[str, int], output_type=int):
"""
Get the corresponding device index when CUDA_VISIBLE_DEVICES is set.
For example, with CUDA_VISIBLE_DEVICES=2,3 and device=3, 1 is returned.

:param device: the device name before conversion
:param output_type: the type of the return value
:return: the converted device id
"""
if output_type not in [int, str]:
raise ValueError("Parameter `output_type` should be one of these types: [int, str]")
if device == "cpu":
return device
cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
user_visible_devices = os.getenv(USER_CUDA_VISIBLE_DEVICES)
if user_visible_devices is None:
raise RuntimeError("`USER_CUDA_VISIBLE_DEVICES` cannot be None, please check if you have set "
"`FASTNLP_BACKEND` to 'paddle' before using FastNLP.")
idx = get_paddle_device_id(device)
# use USER_CUDA_VISIBLE_DEVICES to obtain the devices the user expects
if user_visible_devices is None:
raise RuntimeError("This situation cannot happen, please report a bug to us.")
idx = user_visible_devices.split(",")[idx]

cuda_visible_devices_list = cuda_visible_devices.split(',')
if idx not in cuda_visible_devices_list:
raise ValueError(f"Can't find your devices {idx} in CUDA_VISIBLE_DEVICES[{cuda_visible_devices}]. ")
res = cuda_visible_devices_list.index(idx)
if output_type == int:
return res
else:
return f"gpu:{res}"

def paddle_to(data, device: Union[str, int]):
"""
@@ -33,6 +66,7 @@ def paddle_to(data, device: Union[str, int]):
if device == "cpu":
return data.cpu()
else:
# device = get_device_from_visible(device, output_type=int)
return data.cuda(get_paddle_device_id(device))
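
For reference, the relocated `get_device_from_visible` maps a user-facing device to its index inside the current `CUDA_VISIBLE_DEVICES`, using `USER_CUDA_VISIBLE_DEVICES` to recover the ids the user originally exported. A small usage sketch; the environment values mirror the parametrized cases added to tests/core/utils/test_paddle_utils.py below:

import os
from fastNLP.core.utils import get_device_from_visible

# The user exported eight GPUs; fastNLP later narrowed CUDA_VISIBLE_DEVICES to 3,4,5.
os.environ["USER_CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7"
os.environ["CUDA_VISIBLE_DEVICES"] = "3,4,5"

print(get_device_from_visible("gpu:4", output_type=int))  # 1
print(get_device_from_visible("gpu:5", output_type=str))  # "gpu:2"
print(get_device_from_visible("cpu"))                     # "cpu"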




tests/core/collators/padders/test_get_padder.py (+2 -2)

@@ -14,10 +14,10 @@ def test_get_element_shape_dtype():
catalog = _get_element_shape_dtype([np.zeros(3), np.zeros((2, 1))])


@pytest.mark.parametrize('backend', ['raw', None, 'numpy', 'torch', 'jittor', 'paddle'])
# @pytest.mark.parametrize('backend', ['raw', None, 'numpy', 'torch', 'jittor', 'paddle'])
@pytest.mark.parametrize('backend', ['raw', None, 'numpy', 'torch', 'paddle'])
@pytest.mark.torch
@pytest.mark.paddle
@pytest.mark.jittor
def test_get_padder_run(backend):
if not _NEED_IMPORT_TORCH and backend == 'torch':
pytest.skip("No torch")


tests/core/controllers/_test_trainer_fleet.py (+1 -1)

@@ -1,7 +1,7 @@
"""
This file tests the case where the user launches with python -m paddle.distributed.launch,
to see whether there is any chance of running it through pytest
python -m paddle.distributed.launch --gpus=0,2,3 test_trainer_fleet.py
FASTNLP_BACKEND=paddle python -m paddle.distributed.launch --gpus=0,2,3 _test_trainer_fleet.py
"""
import os
import sys


tests/core/controllers/_test_trainer_fleet_outside.py (+2 -2)

@@ -1,7 +1,7 @@
"""
This file tests the case where the user launches with python -m paddle.distributed.launch
and initializes fleet by themselves
python -m paddle.distributed.launch --gpus=0,2,3 test_trainer_fleet_outside.py
FASTNLP_BACKEND=paddle python -m paddle.distributed.launch --gpus=0,2,3 _test_trainer_fleet_outside.py
"""
import os
import sys
@@ -93,5 +93,5 @@ if __name__ == "__main__":
driver=driver,
device=device,
callbacks=callbacks,
n_epochs=30,
n_epochs=5,
)

tests/core/controllers/test_trainer_paddle.py (+1 -1)

@@ -27,7 +27,7 @@ class TrainPaddleConfig:
@pytest.mark.parametrize("driver,device", [("paddle", "cpu"), ("paddle", 1), ("fleet", [0, 1])])
# @pytest.mark.parametrize("driver,device", [("fleet", [0, 1])])
@pytest.mark.parametrize("callbacks", [[RichCallback(5)]])
@pytest.mark.paddle
@pytest.mark.paddledist
@magic_argv_env_context
def test_trainer_paddle(
driver,


tests/core/dataloaders/paddle_dataloader/test_fdl.py (+0 -8)

@@ -58,11 +58,3 @@ class TestPaddle:
for batch in fdl1:
assert batch['image'].shape == [4, 10, 5]
print(batch)

def test_v2(self):
from fastNLP.core.collators import Collator
logger.setLevel("DEBUG")
data = [paddle.Tensor(np.random.random((10, 5)).astype('float32')), paddle.Tensor(np.random.random((10, 5)).astype('float32'))]
col = Collator(backend="jittor")
res = col(data)
print(res)

tests/core/dataset/test_dataset.py (+0 -18)

@@ -370,29 +370,11 @@ class TestDataSetMethods:
assert os.path.exists("1.csv") == True
os.remove("1.csv")

def test_add_collate_fn(self):
ds = DataSet({'x': [1, 2, 3], 'y': [4, 5, 6]})

def collate_fn(item):
return item

ds.add_collate_fn(collate_fn)

def test_get_collator(self):
from typing import Callable
ds = DataSet({'x': [1, 2, 3], 'y': [4, 5, 6]})
collate_fn = ds.get_collator()
assert isinstance(collate_fn, Callable) == True

def test_add_seq_len(self):
ds = DataSet({'x': [[1, 2], [2, 3, 4], [3]], 'y': [4, 5, 6]})
ds.add_seq_len('x')
print(ds)

def test_set_target(self):
ds = DataSet({'x': [[1, 2], [2, 3, 4], [3]], 'y': [4, 5, 6]})
ds.set_target('x')


class TestFieldArrayInit:
"""


tests/core/drivers/paddle_driver/test_dist_utils.py (+3 -4)

@@ -19,7 +19,7 @@ if _NEED_IMPORT_PADDLE:
import paddle
import paddle.distributed as dist

@pytest.mark.paddle
@pytest.mark.paddledist
class TestDistUtilsTools:
"""
Test some utility functions
@@ -79,14 +79,13 @@ class TestDistUtilsTools:
assert res["int"] == paddle_dict["int"]
assert res["string"] == paddle_dict["string"]


@pytest.mark.paddle
@pytest.mark.paddledist
class TestAllGatherAndBroadCast:

@classmethod
def setup_class(cls):
devices = [0,1,2]
output_from_new_proc = "only_error"
output_from_new_proc = "all"

launcher = FleetLauncher(devices=devices, output_from_new_proc=output_from_new_proc)
cls.local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", "0"))


tests/core/drivers/paddle_driver/test_fleet.py (+3 -3)

@@ -39,7 +39,7 @@ def generate_driver(num_labels, feature_dimension, device=[0,1], fp16=False, out
#
############################################################################

@pytest.mark.paddle
@pytest.mark.paddledist
class TestFleetDriverFunction:
"""
Test class for some simple PaddleFleetDriver functions; it mostly checks that they run and that there are no import errors, etc.
@@ -147,7 +147,7 @@ class TestFleetDriverFunction:
#
############################################################################

@pytest.mark.paddle
@pytest.mark.paddledist
class TestSetDistReproDataloader:

@classmethod
@@ -521,7 +521,7 @@ class TestSetDistReproDataloader:
#
############################################################################

@pytest.mark.paddle
@pytest.mark.paddledist
class TestSaveLoad:
"""
Test the behaviour of the save- and load-related functions in the multi-GPU case


tests/core/drivers/paddle_driver/test_single_device.py (+4 -9)

@@ -552,22 +552,17 @@ def generate_random_driver(features, labels, fp16=False, device="cpu"):

return driver

@pytest.fixture
def prepare_test_save_load():
dataset = PaddleRandomMaxDataset(40, 10)
dataloader = DataLoader(dataset, batch_size=4)
driver1, driver2 = generate_random_driver(10, 10), generate_random_driver(10, 10)
return driver1, driver2, dataloader

@pytest.mark.paddle
@pytest.mark.parametrize("only_state_dict", ([True, False]))
def test_save_and_load_model(prepare_test_save_load, only_state_dict):
def test_save_and_load_model(only_state_dict):
"""
Test the save_model and load_model functions
"""
try:
path = "model"
driver1, driver2, dataloader = prepare_test_save_load
dataset = PaddleRandomMaxDataset(40, 10)
dataloader = DataLoader(dataset, batch_size=4)
driver1, driver2 = generate_random_driver(10, 10, device="gpu"), generate_random_driver(10, 10, device="gpu")

if only_state_dict:
driver1.save_model(path, only_state_dict)


tests/core/drivers/paddle_driver/test_utils.py (+0 -20)

@@ -1,8 +1,6 @@
import os
import pytest

from fastNLP.core.drivers.paddle_driver.utils import (
get_device_from_visible,
replace_batch_sampler,
replace_sampler,
)
@@ -14,24 +12,6 @@ if _NEED_IMPORT_PADDLE:

from tests.helpers.datasets.paddle_data import PaddleNormalDataset

@pytest.mark.parametrize(
("user_visible_devices, cuda_visible_devices, device, output_type, correct"),
(
("0,1,2,3,4,5,6,7", "0", "cpu", str, "cpu"),
("0,1,2,3,4,5,6,7", "0", "cpu", int, "cpu"),
("0,1,2,3,4,5,6,7", "3,4,5", "gpu:4", int, 1),
("0,1,2,3,4,5,6,7", "3,4,5", "gpu:5", str, "gpu:2"),
("3,4,5,6", "3,5", 0, int, 0),
("3,6,7,8", "6,7,8", "gpu:2", str, "gpu:1"),
)
)
@pytest.mark.paddle
def test_get_device_from_visible_str(user_visible_devices, cuda_visible_devices, device, output_type, correct):
os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
os.environ["USER_CUDA_VISIBLE_DEVICES"] = user_visible_devices
res = get_device_from_visible(device, output_type)
assert res == correct

@pytest.mark.paddle
def test_replace_batch_sampler():
dataset = PaddleNormalDataset(10)


tests/core/drivers/torch_driver/test_single_device.py (+4 -9)

@@ -545,22 +545,17 @@ def generate_random_driver(features, labels, fp16=False, device="cpu"):

return driver

@pytest.fixture
def prepare_test_save_load():
dataset = TorchArgMaxDataset(10, 40)
dataloader = DataLoader(dataset, batch_size=4)
driver1, driver2 = generate_random_driver(10, 10), generate_random_driver(10, 10)
return driver1, driver2, dataloader

@pytest.mark.torch
@pytest.mark.parametrize("only_state_dict", ([True, False]))
def test_save_and_load_model(prepare_test_save_load, only_state_dict):
def test_save_and_load_model(only_state_dict):
"""
Test the save_model and load_model functions
"""
try:
path = "model"
driver1, driver2, dataloader = prepare_test_save_load
dataset = TorchArgMaxDataset(10, 40)
dataloader = DataLoader(dataset, batch_size=4)
driver1, driver2 = generate_random_driver(10, 10), generate_random_driver(10, 10)

driver1.save_model(path, only_state_dict)
driver2.load_model(path, only_state_dict)


tests/core/utils/test_paddle_utils.py (+32 -46)

@@ -1,10 +1,40 @@
import os

import pytest

from fastNLP.core.utils.paddle_utils import paddle_to, paddle_move_data_to_device
from fastNLP.core.utils.paddle_utils import get_device_from_visible, paddle_to, paddle_move_data_to_device
from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
if _NEED_IMPORT_PADDLE:
import paddle

@pytest.mark.parametrize(
("user_visible_devices, cuda_visible_devices, device, output_type, correct"),
(
("0,1,2,3,4,5,6,7", "0", "cpu", str, "cpu"),
("0,1,2,3,4,5,6,7", "0", "cpu", int, "cpu"),
("0,1,2,3,4,5,6,7", "3,4,5", "gpu:4", int, 1),
("0,1,2,3,4,5,6,7", "3,4,5", "gpu:5", str, "gpu:2"),
("3,4,5,6", "3,5", 0, int, 0),
("3,6,7,8", "6,7,8", "gpu:2", str, "gpu:1"),
)
)
@pytest.mark.paddle
def test_get_device_from_visible(user_visible_devices, cuda_visible_devices, device, output_type, correct):
_cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
_user_visible_devices = os.getenv("USER_CUDA_VISIBLE_DEVICES")
os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
os.environ["USER_CUDA_VISIBLE_DEVICES"] = user_visible_devices
res = get_device_from_visible(device, output_type)
assert res == correct

# restore the environment variables
if _cuda_visible_devices is None:
del os.environ["CUDA_VISIBLE_DEVICES"]
else:
os.environ["CUDA_VISIBLE_DEVICES"] = _cuda_visible_devices
if _user_visible_devices is None:
del os.environ["USER_CUDA_VISIBLE_DEVICES"]
else:
os.environ["USER_CUDA_VISIBLE_DEVICES"] = _user_visible_devices

############################################################################
#
@@ -22,12 +52,6 @@ class TestPaddleToDevice:
assert res.place.gpu_device_id() == 0
res = paddle_to(tensor, "cpu")
assert res.place.is_cpu_place()
res = paddle_to(tensor, "gpu:2")
assert res.place.is_gpu_place()
assert res.place.gpu_device_id() == 2
res = paddle_to(tensor, "gpu:1")
assert res.place.is_gpu_place()
assert res.place.gpu_device_id() == 1

############################################################################
#
@@ -64,28 +88,18 @@ class TestPaddleMoveDataToDevice:
res = paddle_move_data_to_device(paddle_tensor, device="gpu:0", data_device=None)
self.check_gpu(res, 0)

res = paddle_move_data_to_device(paddle_tensor, device="gpu:1", data_device=None)
self.check_gpu(res, 1)

res = paddle_move_data_to_device(paddle_tensor, device="gpu:0", data_device="cpu")
self.check_gpu(res, 0)

res = paddle_move_data_to_device(paddle_tensor, device=None, data_device="gpu:0")
self.check_gpu(res, 0)

res = paddle_move_data_to_device(paddle_tensor, device=None, data_device="gpu:1")
self.check_gpu(res, 1)

def test_list_transfer(self):
"""
Test moving a list of tensors
"""

paddle_list = [paddle.rand((6, 4, 2)) for i in range(10)]
res = paddle_move_data_to_device(paddle_list, device=None, data_device="gpu:1")
assert isinstance(res, list)
for r in res:
self.check_gpu(r, 1)

res = paddle_move_data_to_device(paddle_list, device="cpu", data_device="gpu:1")
assert isinstance(res, list)
@@ -97,11 +111,6 @@ class TestPaddleMoveDataToDevice:
for r in res:
self.check_gpu(r, 0)

res = paddle_move_data_to_device(paddle_list, device="gpu:1", data_device="cpu")
assert isinstance(res, list)
for r in res:
self.check_gpu(r, 1)

def test_tensor_tuple_transfer(self):
"""
Test moving a tuple of tensors
@@ -109,10 +118,6 @@ class TestPaddleMoveDataToDevice:

paddle_list = [paddle.rand((6, 4, 2)) for i in range(10)]
paddle_tuple = tuple(paddle_list)
res = paddle_move_data_to_device(paddle_tuple, device=None, data_device="gpu:1")
assert isinstance(res, tuple)
for r in res:
self.check_gpu(r, 1)

res = paddle_move_data_to_device(paddle_tuple, device="cpu", data_device="gpu:1")
assert isinstance(res, tuple)
@@ -124,11 +129,6 @@ class TestPaddleMoveDataToDevice:
for r in res:
self.check_gpu(r, 0)

res = paddle_move_data_to_device(paddle_tuple, device="gpu:1", data_device="cpu")
assert isinstance(res, tuple)
for r in res:
self.check_gpu(r, 1)

def test_dict_transfer(self):
"""
Test moving a dict structure
@@ -173,20 +173,6 @@ class TestPaddleMoveDataToDevice:
self.check_gpu(t, 0)
self.check_gpu(res["dict"]["tensor"], 0)

res = paddle_move_data_to_device(paddle_dict, device=None, data_device="gpu:1")
assert isinstance(res, dict)
self.check_gpu(res["tensor"], 1)
assert isinstance(res["list"], list)
for t in res["list"]:
self.check_gpu(t, 1)
assert isinstance(res["int"], int)
assert isinstance(res["string"], str)
assert isinstance(res["dict"], dict)
assert isinstance(res["dict"]["list"], list)
for t in res["dict"]["list"]:
self.check_gpu(t, 1)
self.check_gpu(res["dict"]["tensor"], 1)

res = paddle_move_data_to_device(paddle_dict, device="cpu", data_device="gpu:0")
assert isinstance(res, dict)
self.check_cpu(res["tensor"])


tests/pytest.ini (+1 -0)

@@ -2,5 +2,6 @@
markers =
torch
paddle
paddledist
jittor
torchpaddle
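
The new `paddledist` marker splits the multi-GPU fleet tests (see the `@pytest.mark.paddledist` changes above) from the plain `paddle` ones, so they can be selected or skipped explicitly. A hedged example of invoking them programmatically; the path is only illustrative:

import pytest

# Equivalent to `pytest -m paddledist tests/core/drivers/paddle_driver` on the command
# line; running these tests still needs several visible GPUs and FASTNLP_BACKEND=paddle.
pytest.main(["-m", "paddledist", "tests/core/drivers/paddle_driver"])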

tutorials/data/test4dataset.csv (+7 -0)

@@ -0,0 +1,7 @@
,SentenceId,Sentence,Sentiment
0,1,"['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', ',', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'a', 'story', '.']",negative
1,2,"['this', 'quiet', ',', 'introspective', 'and', 'entertaining', 'independent', 'is', 'worth', 'seeking', '.']",positive
2,3,"['even', 'fans', 'of', 'ismail', 'merchant', ""'s"", 'work', ',', 'i', 'suspect', ',', 'would', 'have', 'a', 'hard', 'time', 'sitting', 'through', 'this', 'one', '.']",negative
3,4,"['a', 'positively', 'thrilling', 'combination', 'of', 'ethnography', 'and', 'all', 'the', 'intrigue', ',', 'betrayal', ',', 'deceit', 'and', 'murder', 'of', 'a', 'shakespearean', 'tragedy', 'or', 'a', 'juicy', 'soap', 'opera', '.']",neutral
4,5,"['a', 'comedy-drama', 'of', 'nearly', 'epic', 'proportions', 'rooted', 'in', 'a', 'sincere', 'performance', 'by', 'the', 'title', 'character', 'undergoing', 'midlife', 'crisis', '.']",positive
5,6,"['the', 'importance', 'of', 'being', 'earnest', ',', 'so', 'thick', 'with', 'wit', 'it', 'plays', 'like', 'a', 'reading', 'from', 'bartlett', ""'s"", 'familiar', 'quotations']",neutral

tutorials/data/test4dataset.tsv (+7 -0)

@@ -0,0 +1,7 @@
SentenceId Sentence Sentiment
1 A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . negative
2 This quiet , introspective and entertaining independent is worth seeking . positive
3 Even fans of Ismail Merchant 's work , I suspect , would have a hard time sitting through this one . negative
4 A positively thrilling combination of ethnography and all the intrigue , betrayal , deceit and murder of a Shakespearean tragedy or a juicy soap opera . neutral
5 A comedy-drama of nearly epic proportions rooted in a sincere performance by the title character undergoing midlife crisis . positive
6 The Importance of Being Earnest , so thick with wit it plays like a reading from Bartlett 's Familiar Quotations neutral

tutorials/fastnlp_tutorial_1.ipynb (+423 -29)

@@ -153,7 +153,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"1969418794120 1971237588872\n",
"2438703969992 2438374526920\n",
"+-----+------------------------+------------------------+-----+\n",
"| idx | sentence | words | num |\n",
"+-----+------------------------+------------------------+-----+\n",
@@ -198,7 +198,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"1971237588872 1971237588872\n",
"2438374526920 2438374526920\n",
"+-----+------------------------+------------------------+-----+\n",
"| idx | sentence | words | num |\n",
"+-----+------------------------+------------------------+-----+\n",
@@ -774,9 +774,9 @@
{
"data": {
"text/plain": [
"{'sentence': <fastNLP.core.dataset.field.FieldArray at 0x1ca8a879d08>,\n",
" 'words': <fastNLP.core.dataset.field.FieldArray at 0x1ca8a879d88>,\n",
" 'num': <fastNLP.core.dataset.field.FieldArray at 0x1ca8a879e08>}"
"{'sentence': <fastNLP.core.dataset.field.FieldArray at 0x237ce26d388>,\n",
" 'words': <fastNLP.core.dataset.field.FieldArray at 0x237ce26d408>,\n",
" 'num': <fastNLP.core.dataset.field.FieldArray at 0x237ce26d488>}"
]
},
"execution_count": 15,
@@ -923,7 +923,8 @@
"output_type": "stream",
"text": [
"5 Counter({'生活': 1, '就像': 1, '海洋': 1})\n",
"6 Counter({'生活': 1, '就像': 1, '海洋': 1, '只有': 1})\n"
"6 Counter({'生活': 1, '就像': 1, '海洋': 1, '只有': 1})\n",
"6 {'<pad>': 0, '<unk>': 1, '生活': 2, '就像': 3, '海洋': 4, '只有': 5}\n"
]
}
],
@@ -931,7 +932,8 @@
"vocab.add_word_lst(['生活', '就像', '海洋'])\n",
"print(len(vocab), vocab.word_count)\n",
"vocab.add_word('只有')\n",
"print(len(vocab), vocab.word_count)"
"print(len(vocab), vocab.word_count)\n",
"print(len(vocab), vocab.word2idx)"
]
},
{
@@ -959,7 +961,6 @@
"<pad> 0\n",
"<unk> 1\n",
"生活 2\n",
"只有 5\n",
"彼岸 1 False\n"
]
}
@@ -968,7 +969,6 @@
"print(vocab.to_word(0), vocab.to_index('<pad>'))\n",
"print(vocab.to_word(1), vocab.to_index('<unk>'))\n",
"print(vocab.to_word(2), vocab.to_index('生活'))\n",
"print(vocab.to_word(5), vocab.to_index('只有'))\n",
"print('彼岸', vocab.to_index('彼岸'), vocab.has_word('彼岸'))"
]
},
@@ -979,7 +979,9 @@
"source": [
"**`vocabulary`允许反复添加相同单词**,**可以通过`word_count`方法看到相应单词被添加的次数**\n",
"\n",
"&emsp; 但其中没有`<unk>`和`<pad>`,`vocabulary`的全部变量与函数可以通过`dir(vocabulary)`查询"
"&emsp; 但其中没有`<unk>`和`<pad>`,`vocabulary`的全部变量与函数可以通过`dir(vocabulary)`查询\n",
"\n",
"&emsp; 注:**使用`add_word_lst`添加单词**,**单词对应序号不会动态调整**,**使用`dataset`添加单词的情况不同**"
]
},
{
@@ -992,15 +994,19 @@
"name": "stdout",
"output_type": "stream",
"text": [
"13 Counter({'生活': 2, '就像': 2, '海洋': 2, '只有': 2, '意志': 1, '坚强的': 1, '人': 1, '才': 1, '能': 1, '到达': 1, '彼岸': 1})\n",
"彼岸 12 True\n"
"生活 2\n",
"彼岸 12 True\n",
"13 Counter({'人': 4, '生活': 2, '就像': 2, '海洋': 2, '只有': 2, '意志': 1, '坚强的': 1, '才': 1, '能': 1, '到达': 1, '彼岸': 1})\n",
"13 {'<pad>': 0, '<unk>': 1, '生活': 2, '就像': 3, '海洋': 4, '只有': 5, '人': 6, '意志': 7, '坚强的': 8, '才': 9, '能': 10, '到达': 11, '彼岸': 12}\n"
]
}
],
"source": [
"vocab.add_word_lst(['生活', '就像', '海洋', '只有', '意志', '坚强的', '人', '才', '能', '到达', '彼岸'])\n",
"vocab.add_word_lst(['生活', '就像', '海洋', '只有', '意志', '坚强的', '人', '人', '人', '人', '才', '能', '到达', '彼岸'])\n",
"print(vocab.to_word(2), vocab.to_index('生活'))\n",
"print('彼岸', vocab.to_index('彼岸'), vocab.has_word('彼岸'))\n",
"print(len(vocab), vocab.word_count)\n",
"print('彼岸', vocab.to_index('彼岸'), vocab.has_word('彼岸'))"
"print(len(vocab), vocab.word2idx)"
]
},
{
@@ -1082,52 +1088,440 @@
"## 3 dataset 和 vocabulary 的组合使用\n",
" \n",
"### 3.1 从 dataframe 中加载 dataset\n",
"\n"
"\n",
"以下通过 [NLP-beginner](https://github.com/FudanNLP/nlp-beginner) 实践一中 [Rotten Tomatoes 影评数据集](https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews) 的部分训练数据组成`test4dataset.tsv`文件\n",
"\n",
"&emsp; 介绍如何使用`dataset`、`vocabulary`简单加载并处理数据集,首先使用`pandas`模块,读取原始数据的`dataframe`"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "3dbd985d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>SentenceId</th>\n",
" <th>Sentence</th>\n",
" <th>Sentiment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>A series of escapades demonstrating the adage ...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>This quiet , introspective and entertaining in...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Even fans of Ismail Merchant 's work , I suspe...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>A positively thrilling combination of ethnogra...</td>\n",
" <td>neutral</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>A comedy-drama of nearly epic proportions root...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>6</td>\n",
" <td>The Importance of Being Earnest , so thick wit...</td>\n",
" <td>neutral</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" SentenceId Sentence Sentiment\n",
"0 1 A series of escapades demonstrating the adage ... negative\n",
"1 2 This quiet , introspective and entertaining in... positive\n",
"2 3 Even fans of Ismail Merchant 's work , I suspe... negative\n",
"3 4 A positively thrilling combination of ethnogra... neutral\n",
"4 5 A comedy-drama of nearly epic proportions root... positive\n",
"5 6 The Importance of Being Earnest , so thick wit... neutral"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv('./data/test4dataset.tsv', sep='\\t')\n",
"df"
]
},
{
"cell_type": "markdown",
"id": "89059713",
"id": "919ab350",
"metadata": {},
"source": []
"source": [
"接着,通过`dataset`中的`from_pandas`方法填充数据集,并使用`apply_more`方法对文本进行分词操作"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3dbd985d",
"execution_count": 25,
"id": "4f634586",
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
"</pre>\n"
],
"text/plain": [
"\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------+------------------------------+-----------+\n",
"| SentenceId | Sentence | Sentiment |\n",
"+------------+------------------------------+-----------+\n",
"| 1 | ['a', 'series', 'of', 'es... | negative |\n",
"| 2 | ['this', 'quiet', ',', 'i... | positive |\n",
"| 3 | ['even', 'fans', 'of', 'i... | negative |\n",
"| 4 | ['a', 'positively', 'thri... | neutral |\n",
"| 5 | ['a', 'comedy-drama', 'of... | positive |\n",
"| 6 | ['the', 'importance', 'of... | neutral |\n",
"+------------+------------------------------+-----------+\n"
]
}
],
"source": [
"from fastNLP.core.dataset import DataSet\n",
"\n",
"dataset = DataSet()\n",
"dataset = dataset.from_pandas(df)\n",
"dataset.apply_more(lambda ins:{'SentenceId': ins['SentenceId'], \n",
" 'Sentence': ins['Sentence'].lower().split(), 'Sentiment': ins['Sentiment']})\n",
"print(dataset)"
]
},
{
"cell_type": "markdown",
"id": "5c1ae192",
"metadata": {},
"source": [
"&emsp; 如果需要保存中间结果,也可以使用`dataset`的`to_csv`方法,生成`.csv`或`.tsv`文件"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4f634586",
"execution_count": 26,
"id": "46722efc",
"metadata": {},
"outputs": [],
"source": []
"source": [
"dataset.to_csv('./data/test4dataset.csv')"
]
},
{
"cell_type": "markdown",
"id": "5ba13989",
"metadata": {},
"source": [
"### 3.2 从 dataset 中获取 vocabulary"
"### 3.2 从 dataset 中获取 vocabulary\n",
"\n",
"然后,初始化`vocabulary`,使用`vocabulary`中的`from_dataset`方法,从`dataset`的指定字段中\n",
"\n",
"&emsp; 获取字段中的所有元素,然后编号;如果指定字段是个列表,则针对字段中所有列表包含的元素编号\n",
"\n",
"&emsp; 注:**使用`dataset`添加单词**,**不同于`add_word_list`**,**单词被添加次数越多**,**序号越靠前**,例如案例中的`a`"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 27,
"id": "a2de615b",
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
"</pre>\n"
],
"text/plain": [
"\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Counter({'a': 9, 'of': 9, ',': 7, 'the': 6, '.': 5, 'is': 3, 'and': 3, 'good': 2, 'for': 2, 'which': 2, 'this': 2, \"'s\": 2, 'series': 1, 'escapades': 1, 'demonstrating': 1, 'adage': 1, 'that': 1, 'what': 1, 'goose': 1, 'also': 1, 'gander': 1, 'some': 1, 'occasionally': 1, 'amuses': 1, 'but': 1, 'none': 1, 'amounts': 1, 'to': 1, 'much': 1, 'story': 1, 'quiet': 1, 'introspective': 1, 'entertaining': 1, 'independent': 1, 'worth': 1, 'seeking': 1, 'even': 1, 'fans': 1, 'ismail': 1, 'merchant': 1, 'work': 1, 'i': 1, 'suspect': 1, 'would': 1, 'have': 1, 'hard': 1, 'time': 1, 'sitting': 1, 'through': 1, 'one': 1, 'positively': 1, 'thrilling': 1, 'combination': 1, 'ethnography': 1, 'all': 1, 'intrigue': 1, 'betrayal': 1, 'deceit': 1, 'murder': 1, 'shakespearean': 1, 'tragedy': 1, 'or': 1, 'juicy': 1, 'soap': 1, 'opera': 1, 'comedy-drama': 1, 'nearly': 1, 'epic': 1, 'proportions': 1, 'rooted': 1, 'in': 1, 'sincere': 1, 'performance': 1, 'by': 1, 'title': 1, 'character': 1, 'undergoing': 1, 'midlife': 1, 'crisis': 1, 'importance': 1, 'being': 1, 'earnest': 1, 'so': 1, 'thick': 1, 'with': 1, 'wit': 1, 'it': 1, 'plays': 1, 'like': 1, 'reading': 1, 'from': 1, 'bartlett': 1, 'familiar': 1, 'quotations': 1}) \n",
"\n",
"{'<pad>': 0, '<unk>': 1, 'a': 2, 'of': 3, ',': 4, 'the': 5, '.': 6, 'is': 7, 'and': 8, 'good': 9, 'for': 10, 'which': 11, 'this': 12, \"'s\": 13, 'series': 14, 'escapades': 15, 'demonstrating': 16, 'adage': 17, 'that': 18, 'what': 19, 'goose': 20, 'also': 21, 'gander': 22, 'some': 23, 'occasionally': 24, 'amuses': 25, 'but': 26, 'none': 27, 'amounts': 28, 'to': 29, 'much': 30, 'story': 31, 'quiet': 32, 'introspective': 33, 'entertaining': 34, 'independent': 35, 'worth': 36, 'seeking': 37, 'even': 38, 'fans': 39, 'ismail': 40, 'merchant': 41, 'work': 42, 'i': 43, 'suspect': 44, 'would': 45, 'have': 46, 'hard': 47, 'time': 48, 'sitting': 49, 'through': 50, 'one': 51, 'positively': 52, 'thrilling': 53, 'combination': 54, 'ethnography': 55, 'all': 56, 'intrigue': 57, 'betrayal': 58, 'deceit': 59, 'murder': 60, 'shakespearean': 61, 'tragedy': 62, 'or': 63, 'juicy': 64, 'soap': 65, 'opera': 66, 'comedy-drama': 67, 'nearly': 68, 'epic': 69, 'proportions': 70, 'rooted': 71, 'in': 72, 'sincere': 73, 'performance': 74, 'by': 75, 'title': 76, 'character': 77, 'undergoing': 78, 'midlife': 79, 'crisis': 80, 'importance': 81, 'being': 82, 'earnest': 83, 'so': 84, 'thick': 85, 'with': 86, 'wit': 87, 'it': 88, 'plays': 89, 'like': 90, 'reading': 91, 'from': 92, 'bartlett': 93, 'familiar': 94, 'quotations': 95} \n",
"\n",
"Vocabulary(['a', 'series', 'of', 'escapades', 'demonstrating']...)\n"
]
}
],
"source": [
"from fastNLP.core.vocabulary import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab = vocab.from_dataset(dataset, field_name='Sentence')\n",
"print(vocab.word_count, '\\n')\n",
"print(vocab.word2idx, '\\n')\n",
"print(vocab)"
]
},
{
"cell_type": "markdown",
"id": "f0857ccb",
"metadata": {},
"source": [
"之后,**通过`vocabulary`的`index_dataset`方法**,**调整`dataset`中指定字段的元素**,**使用编号将之代替**\n",
"\n",
"&emsp; 使用上述方法,可以将影评数据集中的单词序列转化为词编号序列,为接下来转化为词嵌入序列做准备"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 28,
"id": "2f9a04b2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
"</pre>\n"
],
"text/plain": [
"\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------+------------------------------+-----------+\n",
"| SentenceId | Sentence | Sentiment |\n",
"+------------+------------------------------+-----------+\n",
"| 1 | [2, 14, 3, 15, 16, 5, 17,... | negative |\n",
"| 2 | [12, 32, 4, 33, 8, 34, 35... | positive |\n",
"| 3 | [38, 39, 3, 40, 41, 13, 4... | negative |\n",
"| 4 | [2, 52, 53, 54, 3, 55, 8,... | neutral |\n",
"| 5 | [2, 67, 3, 68, 69, 70, 71... | positive |\n",
"| 6 | [5, 81, 3, 82, 83, 4, 84,... | neutral |\n",
"+------------+------------------------------+-----------+\n"
]
}
],
"source": [
"vocab.index_dataset(dataset, field_name='Sentence')\n",
"print(dataset)"
]
},
{
"cell_type": "markdown",
"id": "6b26b707",
"metadata": {},
"source": [
"最后,使用相同方法,再将`dataset`中`Sentiment`字段中的`negative`、`neutral`、`positive`转化为数字编号"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "5f5eed18",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'negative': 0, 'positive': 1, 'neutral': 2}\n"
]
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
"</pre>\n"
],
"text/plain": [
"\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------+------------------------------+-----------+\n",
"| SentenceId | Sentence | Sentiment |\n",
"+------------+------------------------------+-----------+\n",
"| 1 | [2, 14, 3, 15, 16, 5, 17,... | 0 |\n",
"| 2 | [12, 32, 4, 33, 8, 34, 35... | 1 |\n",
"| 3 | [38, 39, 3, 40, 41, 13, 4... | 0 |\n",
"| 4 | [2, 52, 53, 54, 3, 55, 8,... | 2 |\n",
"| 5 | [2, 67, 3, 68, 69, 70, 71... | 1 |\n",
"| 6 | [5, 81, 3, 82, 83, 4, 84,... | 2 |\n",
"+------------+------------------------------+-----------+\n"
]
}
],
"source": [
"target_vocab = Vocabulary(padding=None, unknown=None)\n",
"\n",
"target_vocab.from_dataset(dataset, field_name='Sentiment')\n",
"print(target_vocab.word2idx)\n",
"target_vocab.index_dataset(dataset, field_name='Sentiment')\n",
"print(dataset)"
]
},
{
"cell_type": "markdown",
"id": "eed7ea64",
"metadata": {},
"source": [
"在最后的最后,通过以下的一张图,来总结本章关于`dataset`和`vocabulary`主要知识点的讲解,以及两者的联系\n",
"\n",
"<img src=\"./figures/T1-fig-dataset-and-vocabulary.png\" width=\"80%\" height=\"80%\" align=\"center\"></img>"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "35b4f0f7",
"metadata": {},
"outputs": [],
"source": []
}


tutorials/fastnlp_tutorial_2.ipynb (+41 -0)

@@ -0,0 +1,41 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

tutorials/figures/T1-fig-dataset-and-vocabulary.png (BIN)

Width: 1326  |  Height: 701  |  Size: 139 kB
