| @@ -56,7 +56,7 @@ def is_paddle_dtype_str(dtype): | |||
| def _get_dtype(ele_dtype, dtype, class_name): | |||
| if not (ele_dtype is not None or is_number_or_numpy_number(ele_dtype) or is_paddle_tensor(ele_dtype) or is_paddle_dtype_str(ele_dtype)): | |||
| if not (ele_dtype is None or is_number_or_numpy_number(ele_dtype) or is_paddle_tensor(ele_dtype) or is_paddle_dtype_str(ele_dtype)): | |||
| raise EleDtypeUnsupportedError(f"`{class_name}` only supports padding python numbers " | |||
| f"or numpy numbers or paddle.Tensor but get `{ele_dtype}`.") | |||
| @@ -8,11 +8,12 @@ from typing import Callable, List, Optional, Union, Dict, Sequence | |||
| from fastNLP.envs.imports import _NEED_IMPORT_PADDLE | |||
| if _NEED_IMPORT_PADDLE: | |||
| from paddle.io import DataLoader, Dataset | |||
| from paddle.io import DataLoader, Dataset, Sampler | |||
| from paddle.fluid.dataloader.collate import default_collate_fn | |||
| else: | |||
| from fastNLP.core.utils.dummy_class import DummyClass as Dataset | |||
| from fastNLP.core.utils.dummy_class import DummyClass as DataLoader | |||
| from fastNLP.core.utils.dummy_class import DummyClass as Sampler | |||
| from fastNLP.core.collators.collator import Collator | |||
| from fastNLP.core.dataloaders.utils import indice_collate_wrapper | |||
| @@ -58,6 +59,9 @@ class PaddleDataLoader(DataLoader): | |||
| if batch_sampler is None: | |||
| batch_sampler = RandomBatchSampler(dataset, batch_size=batch_size, shuffle=shuffle, | |||
| drop_last=drop_last) | |||
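| # paddle's DataLoader rejects explicit batch_size/shuffle/drop_last when a batch_sampler is given, so reset them to their defaults before calling the parent constructor | |||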
| batch_size = 1 | |||
| shuffle = False | |||
| drop_last = False | |||
| super(PaddleDataLoader, self).__init__(dataset=dataset, feed_list=feed_list, places=places, | |||
| return_list=return_list, batch_sampler=batch_sampler, | |||
| @@ -1,12 +1,12 @@ | |||
| import os | |||
| import shutil | |||
| from typing import List, Union, Optional, Dict, Tuple, Callable | |||
| from fastNLP.core.utils.paddle_utils import get_device_from_visible | |||
| from .paddle_driver import PaddleDriver | |||
| from .fleet_launcher import FleetLauncher | |||
| from .utils import ( | |||
| _FleetWrappingModel, | |||
| get_device_from_visible, | |||
| reset_seed, | |||
| replace_sampler, | |||
| replace_batch_sampler, | |||
| @@ -17,8 +17,8 @@ from fastNLP.envs.imports import _NEED_IMPORT_PADDLE | |||
| from fastNLP.core.utils import ( | |||
| auto_param_call, | |||
| check_user_specific_params, | |||
| paddle_move_data_to_device, | |||
| is_in_paddle_dist | |||
| is_in_paddle_dist, | |||
| ) | |||
| from fastNLP.envs.distributed import rank_zero_rm | |||
| from fastNLP.core.samplers import ( | |||
| @@ -609,12 +609,6 @@ class PaddleFleetDriver(PaddleDriver): | |||
| def is_distributed(self): | |||
| return True | |||
| def move_data_to_device(self, batch: 'paddle.Tensor'): | |||
| device = self.data_device | |||
| # 因为设置了CUDA_VISIBLE_DEVICES,可能会引起错误 | |||
| device = get_device_from_visible(device) | |||
| return paddle_move_data_to_device(batch, device) | |||
| @staticmethod | |||
| def _check_optimizer_legality(optimizers): | |||
| # paddle has a function for configuring distributed optimizers; it returns fleet.meta_optimizers.HybridParallelOptimizer | |||
| @@ -637,9 +631,8 @@ class PaddleFleetDriver(PaddleDriver): | |||
| :return: if the current driver is not distributed, the input obj is returned directly; if the current rank is a receiver (its global rank is included in dst), the | |||
| received object is returned; if it is the source, the broadcast content is returned; if it is neither a sender nor a receiver, None is returned. | |||
| """ | |||
| device = self.data_device | |||
| # 因为设置了CUDA_VISIBLE_DEVICES,可能会引起错误 | |||
| device = get_device_from_visible(device) | |||
| device = get_device_from_visible(self.data_device) | |||
| return fastnlp_paddle_broadcast_object(obj, src, device=device, group=group) | |||
| def all_gather(self, obj, group=None) -> List: | |||
| @@ -10,7 +10,6 @@ from fastNLP.envs.env import ( | |||
| FASTNLP_DISTRIBUTED_CHECK, | |||
| FASTNLP_LOG_LEVEL, | |||
| FASTNLP_GLOBAL_SEED, | |||
| USER_CUDA_VISIBLE_DEVICES, | |||
| ) | |||
| from .utils import ( | |||
| find_free_ports, | |||
| @@ -42,7 +42,8 @@ def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[ | |||
| user_visible_devices = os.getenv("USER_CUDA_VISIBLE_DEVICES") | |||
| if user_visible_devices is None: | |||
| raise RuntimeError("This situation cannot happen, please report a bug to us.") | |||
| raise RuntimeError("`USER_CUDA_VISIBLE_DEVICES` cannot be None, please check if you have set " | |||
| "`FASTNLP_BACKEND` to 'paddle' before using FastNLP.") | |||
| _could_use_device_num = len(user_visible_devices.split(",")) | |||
| if isinstance(device, int): | |||
| if device < 0 and device != -1: | |||
| @@ -10,7 +10,7 @@ import numpy as np | |||
| from .utils import _build_fp16_env, optimizer_state_to_device, DummyGradScaler | |||
| from fastNLP.envs.imports import _NEED_IMPORT_PADDLE | |||
| from fastNLP.core.drivers.driver import Driver | |||
| from fastNLP.core.utils import apply_to_collection, paddle_move_data_to_device | |||
| from fastNLP.core.utils import apply_to_collection, paddle_move_data_to_device, get_device_from_visible | |||
| from fastNLP.envs import ( | |||
| FASTNLP_SEED_WORKERS, | |||
| FASTNLP_MODEL_FILENAME, | |||
| @@ -394,7 +394,8 @@ class PaddleDriver(Driver): | |||
| :return: the batch object after it has been moved to the target device; | |||
| """ | |||
| return paddle_move_data_to_device(batch, self.data_device) | |||
| device = get_device_from_visible(self.data_device) | |||
| return paddle_move_data_to_device(batch, device) | |||
| @staticmethod | |||
| def worker_init_function(worker_id: int, rank: Optional[int] = None) -> None: # pragma: no cover | |||
| @@ -2,14 +2,14 @@ import os | |||
| from typing import Optional, Dict, Union, Callable, Tuple | |||
| from .paddle_driver import PaddleDriver | |||
| from .utils import replace_batch_sampler, replace_sampler, get_device_from_visible | |||
| from .utils import replace_batch_sampler, replace_sampler | |||
| from fastNLP.envs.imports import _NEED_IMPORT_PADDLE | |||
| from fastNLP.envs.env import USER_CUDA_VISIBLE_DEVICES | |||
| from fastNLP.core.utils import ( | |||
| auto_param_call, | |||
| get_device_from_visible, | |||
| get_paddle_gpu_str, | |||
| get_paddle_device_id, | |||
| paddle_move_data_to_device, | |||
| ) | |||
| from fastNLP.core.utils.utils import _get_fun_msg | |||
| from fastNLP.core.samplers import ( | |||
| @@ -39,6 +39,9 @@ class PaddleSingleDriver(PaddleDriver): | |||
| raise ValueError("`paddle.DataParallel` is not supported in `PaddleSingleDriver`") | |||
| cuda_visible_devices = os.environ.get(USER_CUDA_VISIBLE_DEVICES, None) | |||
| if cuda_visible_devices is None: | |||
| raise RuntimeError("`USER_CUDA_VISIBLE_DEVICES` cannot be None, please check if you have set " | |||
| "`FASTNLP_BACKEND` to 'paddle' before using FastNLP.") | |||
| if cuda_visible_devices == "": | |||
| device = "cpu" | |||
| logger.info("You have set `CUDA_VISIBLE_DEVICES` to '' in system environment variable, and we are gonna to" | |||
| @@ -54,7 +57,7 @@ class PaddleSingleDriver(PaddleDriver): | |||
| device_id = device | |||
| else: | |||
| device_id = get_paddle_device_id(device) | |||
| os.environ["CUDA_VISIBLE_DEVICES"] = os.environ[USER_CUDA_VISIBLE_DEVICES].split(",")[device_id] | |||
| os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices.split(",")[device_id] | |||
| self.model_device = get_paddle_gpu_str(device) | |||
| self.local_rank = 0 | |||
| @@ -65,8 +68,7 @@ class PaddleSingleDriver(PaddleDriver): | |||
| r""" | |||
| This function initializes the training environment: it sets the device for the current training run and moves the model onto it. | |||
| """ | |||
| device = self.model_device | |||
| device = get_device_from_visible(device, output_type=str) | |||
| device = get_device_from_visible(self.model_device, output_type=str) | |||
| paddle.device.set_device(device) | |||
| self.model.to(device) | |||
| @@ -121,16 +123,6 @@ class PaddleSingleDriver(PaddleDriver): | |||
| else: | |||
| raise RuntimeError(f"There is no `{fn}` method in your {type(self.model)}.") | |||
| def move_data_to_device(self, batch: 'paddle.Tensor'): | |||
| r""" | |||
| 将数据迁移到指定的机器上;batch 可能是 list 也可能 dict ,或其嵌套结构。 | |||
| 在 Paddle 中使用可能会引起因与设置的设备不一致而产生的问题,请注意。 | |||
| :return: 将移动到指定机器上的 batch 对象返回; | |||
| """ | |||
| device = get_device_from_visible(self.data_device) | |||
| return paddle_move_data_to_device(batch, device) | |||
| def set_dist_repro_dataloader(self, dataloader, dist: Union[str, ReproducibleBatchSampler, ReproducibleSampler]=None, | |||
| reproducible: bool = False): | |||
| r""" | |||
| @@ -6,12 +6,11 @@ import inspect | |||
| import numpy as np | |||
| from copy import deepcopy | |||
| from contextlib import ExitStack, closing | |||
| from enum import IntEnum | |||
| from typing import Dict, Optional, Union | |||
| from typing import Dict, Optional | |||
| from fastNLP.envs.imports import _NEED_IMPORT_PADDLE | |||
| from fastNLP.core.utils import get_paddle_device_id, auto_param_call, paddle_to | |||
| from fastNLP.envs.env import FASTNLP_GLOBAL_SEED, FASTNLP_SEED_WORKERS, USER_CUDA_VISIBLE_DEVICES | |||
| from fastNLP.core.utils import auto_param_call, paddle_to | |||
| from fastNLP.envs.env import FASTNLP_GLOBAL_SEED, FASTNLP_SEED_WORKERS | |||
| from fastNLP.core.log import logger | |||
| @@ -173,40 +172,6 @@ def find_free_ports(num): | |||
| return None | |||
| def get_device_from_visible(device: Union[str, int], output_type=int): | |||
| """ | |||
| 在有 CUDA_VISIBLE_DEVICES 的情况下,获取对应的设备。 | |||
| 如 CUDA_VISIBLE_DEVICES=2,3 ,device=3 ,则返回1。 | |||
| :param device: 未转化的设备名 | |||
| :param output_type: 返回值的类型 | |||
| :return: 转化后的设备id | |||
| """ | |||
| if output_type not in [int, str]: | |||
| raise ValueError("Parameter `output_type` should be one of these types: [int, str]") | |||
| if device == "cpu": | |||
| return device | |||
| cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") | |||
| idx = get_paddle_device_id(device) | |||
| if cuda_visible_devices is None or cuda_visible_devices == "": | |||
| # 这个判断一般不会发生,因为 fastnlp 会为 paddle 强行注入 CUDA_VISIBLE_DEVICES | |||
| raise RuntimeError("This situation should not happen, please report us this bug.") | |||
| else: | |||
| # 利用 USER_CUDA_VISIBLDE_DEVICES 获取用户期望的设备 | |||
| user_visible_devices = os.getenv(USER_CUDA_VISIBLE_DEVICES) | |||
| if user_visible_devices is None: | |||
| raise RuntimeError("This situation cannot happen, please report a bug to us.") | |||
| idx = user_visible_devices.split(",")[idx] | |||
| cuda_visible_devices_list = cuda_visible_devices.split(',') | |||
| if idx not in cuda_visible_devices_list: | |||
| raise ValueError(f"Can't find your devices {idx} in CUDA_VISIBLE_DEVICES[{cuda_visible_devices}].") | |||
| res = cuda_visible_devices_list.index(idx) | |||
| if output_type == int: | |||
| return res | |||
| else: | |||
| return f"gpu:{res}" | |||
| def replace_batch_sampler(dataloader: "DataLoader", batch_sampler: "BatchSampler"): | |||
| """ | |||
| Rebuild a DataLoader from `batch_sampler`, replacing the `batch_sampler` without affecting the original `dataloader`. | |||
| @@ -1,11 +1,10 @@ | |||
| from typing import List, Optional, Any | |||
| from typing import List, Any | |||
| import numpy as np | |||
| from fastNLP.core.metrics.backend import Backend | |||
| from fastNLP.core.utils.paddle_utils import paddle_to | |||
| from fastNLP.core.utils.paddle_utils import paddle_to, get_device_from_visible | |||
| from fastNLP.core.metrics.utils import AggregateMethodError | |||
| from fastNLP.core.drivers.paddle_driver.utils import get_device_from_visible | |||
| from fastNLP.core.drivers.paddle_driver.dist_utils import fastnlp_paddle_all_gather | |||
| from fastNLP.envs.imports import _NEED_IMPORT_PADDLE | |||
| @@ -80,7 +79,6 @@ class PaddleBackend(Backend): | |||
| raise ValueError(f"tensor: {tensor} can not convert to ndarray!") | |||
| def move_tensor_to_device(self, tensor, device): | |||
| # TODO if we handle this here, could it cause bugs elsewhere? | |||
| device = get_device_from_visible(device) | |||
| return paddle_to(tensor, device) | |||
| @@ -2,6 +2,7 @@ __all__ = [ | |||
| 'cache_results', | |||
| 'is_jittor_dataset', | |||
| 'jittor_collate_wraps', | |||
| 'get_device_from_visible', | |||
| 'paddle_to', | |||
| 'paddle_move_data_to_device', | |||
| 'get_paddle_device_id', | |||
| @@ -27,7 +28,7 @@ __all__ = [ | |||
| from .cache_results import cache_results | |||
| from .jittor_utils import is_jittor_dataset, jittor_collate_wraps | |||
| from .paddle_utils import paddle_to, paddle_move_data_to_device, get_paddle_device_id, get_paddle_gpu_str, is_in_paddle_dist, \ | |||
| from .paddle_utils import get_device_from_visible, paddle_to, paddle_move_data_to_device, get_paddle_device_id, get_paddle_gpu_str, is_in_paddle_dist, \ | |||
| is_in_fnlp_paddle_dist, is_in_paddle_launch_dist | |||
| from .rich_progress import f_rich_progress | |||
| from .torch_paddle_utils import torch_paddle_move_data_to_device | |||
| @@ -1,4 +1,5 @@ | |||
| __all__ = [ | |||
| "get_device_from_visible", | |||
| "paddle_to", | |||
| "paddle_move_data_to_device", | |||
| "get_paddle_gpu_str", | |||
| @@ -13,13 +14,45 @@ import re | |||
| from typing import Any, Optional, Union | |||
| from fastNLP.envs.imports import _NEED_IMPORT_PADDLE | |||
| from fastNLP.envs import FASTNLP_DISTRIBUTED_CHECK, FASTNLP_BACKEND_LAUNCH | |||
| from fastNLP.envs import FASTNLP_DISTRIBUTED_CHECK, FASTNLP_BACKEND_LAUNCH, USER_CUDA_VISIBLE_DEVICES | |||
| if _NEED_IMPORT_PADDLE: | |||
| import paddle | |||
| from .utils import apply_to_collection | |||
| def get_device_from_visible(device: Union[str, int], output_type=int): | |||
| """ | |||
| Get the device corresponding to `device` when CUDA_VISIBLE_DEVICES is set. | |||
| For example, with CUDA_VISIBLE_DEVICES=2,3 and device=3, 1 is returned. | |||
| :param device: the device before conversion | |||
| :param output_type: the type of the return value | |||
| :return: the converted device id | |||
| """ | |||
| if output_type not in [int, str]: | |||
| raise ValueError("Parameter `output_type` should be one of these types: [int, str]") | |||
| if device == "cpu": | |||
| return device | |||
| cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") | |||
| user_visible_devices = os.getenv(USER_CUDA_VISIBLE_DEVICES) | |||
| if user_visible_devices is None: | |||
| raise RuntimeError("`USER_CUDA_VISIBLE_DEVICES` cannot be None, please check if you have set " | |||
| "`FASTNLP_BACKEND` to 'paddle' before using FastNLP.") | |||
| idx = get_paddle_device_id(device) | |||
| # map `device` through USER_CUDA_VISIBLE_DEVICES to the physical device the user refers to | |||
| idx = user_visible_devices.split(",")[idx] | |||
| cuda_visible_devices_list = cuda_visible_devices.split(',') | |||
| if idx not in cuda_visible_devices_list: | |||
| raise ValueError(f"Can't find your devices {idx} in CUDA_VISIBLE_DEVICES[{cuda_visible_devices}]. ") | |||
| res = cuda_visible_devices_list.index(idx) | |||
| if output_type == int: | |||
| return res | |||
| else: | |||
| return f"gpu:{res}" | |||
| def paddle_to(data, device: Union[str, int]): | |||
| """ | |||
| @@ -33,6 +66,7 @@ def paddle_to(data, device: Union[str, int]): | |||
| if device == "cpu": | |||
| return data.cpu() | |||
| else: | |||
| # device = get_device_from_visible(device, output_type=int) | |||
| return data.cuda(get_paddle_device_id(device)) | |||
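| A minimal usage sketch (not part of this patch) for the relocated `get_device_from_visible`, based on the parametrized test cases added later in this diff: | |||
| import os | |||
| from fastNLP.core.utils.paddle_utils import get_device_from_visible | |||
| os.environ["USER_CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7"  # the devices the user originally exposed | |||
| os.environ["CUDA_VISIBLE_DEVICES"] = "3,4,5"                  # the devices this process can actually see | |||
| # "gpu:4" refers to physical GPU 4, which sits at index 1 of CUDA_VISIBLE_DEVICES | |||
| assert get_device_from_visible("gpu:4", output_type=int) == 1 | |||
| assert get_device_from_visible("gpu:5", output_type=str) == "gpu:2" | |||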
| @@ -14,10 +14,10 @@ def test_get_element_shape_dtype(): | |||
| catalog = _get_element_shape_dtype([np.zeros(3), np.zeros((2, 1))]) | |||
| @pytest.mark.parametrize('backend', ['raw', None, 'numpy', 'torch', 'jittor', 'paddle']) | |||
| # @pytest.mark.parametrize('backend', ['raw', None, 'numpy', 'torch', 'jittor', 'paddle']) | |||
| @pytest.mark.parametrize('backend', ['raw', None, 'numpy', 'torch', 'paddle']) | |||
| @pytest.mark.torch | |||
| @pytest.mark.paddle | |||
| @pytest.mark.jittor | |||
| def test_get_padder_run(backend): | |||
| if not _NEED_IMPORT_TORCH and backend == 'torch': | |||
| pytest.skip("No torch") | |||
| @@ -1,7 +1,7 @@ | |||
| """ | |||
| This file tests the case where the user launches training with python -m paddle.distributed.launch, | |||
| and checks whether there is a way to run it through pytest as well | |||
| python -m paddle.distributed.launch --gpus=0,2,3 test_trainer_fleet.py | |||
| FASTNLP_BACKEND=paddle python -m paddle.distributed.launch --gpus=0,2,3 _test_trainer_fleet.py | |||
| """ | |||
| import os | |||
| import sys | |||
| @@ -1,7 +1,7 @@ | |||
| """ | |||
| This file tests the case where the user launches training with python -m paddle.distributed.launch | |||
| and initializes fleet by themselves | |||
| python -m paddle.distributed.launch --gpus=0,2,3 test_trainer_fleet_outside.py | |||
| FASTNLP_BACKEND=paddle python -m paddle.distributed.launch --gpus=0,2,3 _test_trainer_fleet_outside.py | |||
| """ | |||
| import os | |||
| import sys | |||
| @@ -93,5 +93,5 @@ if __name__ == "__main__": | |||
| driver=driver, | |||
| device=device, | |||
| callbacks=callbacks, | |||
| n_epochs=30, | |||
| n_epochs=5, | |||
| ) | |||
| @@ -27,7 +27,7 @@ class TrainPaddleConfig: | |||
| @pytest.mark.parametrize("driver,device", [("paddle", "cpu"), ("paddle", 1), ("fleet", [0, 1])]) | |||
| # @pytest.mark.parametrize("driver,device", [("fleet", [0, 1])]) | |||
| @pytest.mark.parametrize("callbacks", [[RichCallback(5)]]) | |||
| @pytest.mark.paddle | |||
| @pytest.mark.paddledist | |||
| @magic_argv_env_context | |||
| def test_trainer_paddle( | |||
| driver, | |||
| @@ -58,11 +58,3 @@ class TestPaddle: | |||
| for batch in fdl1: | |||
| assert batch['image'].shape == [4, 10, 5] | |||
| print(batch) | |||
| def test_v2(self): | |||
| from fastNLP.core.collators import Collator | |||
| logger.setLevel("DEBUG") | |||
| data = [paddle.Tensor(np.random.random((10, 5)).astype('float32')), paddle.Tensor(np.random.random((10, 5)).astype('float32'))] | |||
| col = Collator(backend="jittor") | |||
| res = col(data) | |||
| print(res) | |||
| @@ -370,29 +370,11 @@ class TestDataSetMethods: | |||
| assert os.path.exists("1.csv") == True | |||
| os.remove("1.csv") | |||
| def test_add_collate_fn(self): | |||
| ds = DataSet({'x': [1, 2, 3], 'y': [4, 5, 6]}) | |||
| def collate_fn(item): | |||
| return item | |||
| ds.add_collate_fn(collate_fn) | |||
| def test_get_collator(self): | |||
| from typing import Callable | |||
| ds = DataSet({'x': [1, 2, 3], 'y': [4, 5, 6]}) | |||
| collate_fn = ds.get_collator() | |||
| assert isinstance(collate_fn, Callable) == True | |||
| def test_add_seq_len(self): | |||
| ds = DataSet({'x': [[1, 2], [2, 3, 4], [3]], 'y': [4, 5, 6]}) | |||
| ds.add_seq_len('x') | |||
| print(ds) | |||
| def test_set_target(self): | |||
| ds = DataSet({'x': [[1, 2], [2, 3, 4], [3]], 'y': [4, 5, 6]}) | |||
| ds.set_target('x') | |||
| class TestFieldArrayInit: | |||
| """ | |||
| @@ -19,7 +19,7 @@ if _NEED_IMPORT_PADDLE: | |||
| import paddle | |||
| import paddle.distributed as dist | |||
| @pytest.mark.paddle | |||
| @pytest.mark.paddledist | |||
| class TestDistUtilsTools: | |||
| """ | |||
| Tests for a few utility functions | |||
| @@ -79,14 +79,13 @@ class TestDistUtilsTools: | |||
| assert res["int"] == paddle_dict["int"] | |||
| assert res["string"] == paddle_dict["string"] | |||
| @pytest.mark.paddle | |||
| @pytest.mark.paddledist | |||
| class TestAllGatherAndBroadCast: | |||
| @classmethod | |||
| def setup_class(cls): | |||
| devices = [0,1,2] | |||
| output_from_new_proc = "only_error" | |||
| output_from_new_proc = "all" | |||
| launcher = FleetLauncher(devices=devices, output_from_new_proc=output_from_new_proc) | |||
| cls.local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", "0")) | |||
| @@ -39,7 +39,7 @@ def generate_driver(num_labels, feature_dimension, device=[0,1], fp16=False, out | |||
| # | |||
| ############################################################################ | |||
| @pytest.mark.paddle | |||
| @pytest.mark.paddledist | |||
| class TestFleetDriverFunction: | |||
| """ | |||
| Test class for some simple PaddleFleetDriver functions; these mostly check that they can run and that there are no import errors. | |||
| @@ -147,7 +147,7 @@ class TestFleetDriverFunction: | |||
| # | |||
| ############################################################################ | |||
| @pytest.mark.paddle | |||
| @pytest.mark.paddledist | |||
| class TestSetDistReproDataloader: | |||
| @classmethod | |||
| @@ -521,7 +521,7 @@ class TestSetDistReproDataloader: | |||
| # | |||
| ############################################################################ | |||
| @pytest.mark.paddle | |||
| @pytest.mark.paddledist | |||
| class TestSaveLoad: | |||
| """ | |||
| Tests the behavior of the save and load related functions in the multi-card setting | |||
| @@ -552,22 +552,17 @@ def generate_random_driver(features, labels, fp16=False, device="cpu"): | |||
| return driver | |||
| @pytest.fixture | |||
| def prepare_test_save_load(): | |||
| dataset = PaddleRandomMaxDataset(40, 10) | |||
| dataloader = DataLoader(dataset, batch_size=4) | |||
| driver1, driver2 = generate_random_driver(10, 10), generate_random_driver(10, 10) | |||
| return driver1, driver2, dataloader | |||
| @pytest.mark.paddle | |||
| @pytest.mark.parametrize("only_state_dict", ([True, False])) | |||
| def test_save_and_load_model(prepare_test_save_load, only_state_dict): | |||
| def test_save_and_load_model(only_state_dict): | |||
| """ | |||
| Test the save_model and load_model functions | |||
| """ | |||
| try: | |||
| path = "model" | |||
| driver1, driver2, dataloader = prepare_test_save_load | |||
| dataset = PaddleRandomMaxDataset(40, 10) | |||
| dataloader = DataLoader(dataset, batch_size=4) | |||
| driver1, driver2 = generate_random_driver(10, 10, device="gpu"), generate_random_driver(10, 10, device="gpu") | |||
| if only_state_dict: | |||
| driver1.save_model(path, only_state_dict) | |||
| @@ -1,8 +1,6 @@ | |||
| import os | |||
| import pytest | |||
| from fastNLP.core.drivers.paddle_driver.utils import ( | |||
| get_device_from_visible, | |||
| replace_batch_sampler, | |||
| replace_sampler, | |||
| ) | |||
| @@ -14,24 +12,6 @@ if _NEED_IMPORT_PADDLE: | |||
| from tests.helpers.datasets.paddle_data import PaddleNormalDataset | |||
| @pytest.mark.parametrize( | |||
| ("user_visible_devices, cuda_visible_devices, device, output_type, correct"), | |||
| ( | |||
| ("0,1,2,3,4,5,6,7", "0", "cpu", str, "cpu"), | |||
| ("0,1,2,3,4,5,6,7", "0", "cpu", int, "cpu"), | |||
| ("0,1,2,3,4,5,6,7", "3,4,5", "gpu:4", int, 1), | |||
| ("0,1,2,3,4,5,6,7", "3,4,5", "gpu:5", str, "gpu:2"), | |||
| ("3,4,5,6", "3,5", 0, int, 0), | |||
| ("3,6,7,8", "6,7,8", "gpu:2", str, "gpu:1"), | |||
| ) | |||
| ) | |||
| @pytest.mark.paddle | |||
| def test_get_device_from_visible_str(user_visible_devices, cuda_visible_devices, device, output_type, correct): | |||
| os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices | |||
| os.environ["USER_CUDA_VISIBLE_DEVICES"] = user_visible_devices | |||
| res = get_device_from_visible(device, output_type) | |||
| assert res == correct | |||
| @pytest.mark.paddle | |||
| def test_replace_batch_sampler(): | |||
| dataset = PaddleNormalDataset(10) | |||
| @@ -545,22 +545,17 @@ def generate_random_driver(features, labels, fp16=False, device="cpu"): | |||
| return driver | |||
| @pytest.fixture | |||
| def prepare_test_save_load(): | |||
| dataset = TorchArgMaxDataset(10, 40) | |||
| dataloader = DataLoader(dataset, batch_size=4) | |||
| driver1, driver2 = generate_random_driver(10, 10), generate_random_driver(10, 10) | |||
| return driver1, driver2, dataloader | |||
| @pytest.mark.torch | |||
| @pytest.mark.parametrize("only_state_dict", ([True, False])) | |||
| def test_save_and_load_model(prepare_test_save_load, only_state_dict): | |||
| def test_save_and_load_model(only_state_dict): | |||
| """ | |||
| Test the save_model and load_model functions | |||
| """ | |||
| try: | |||
| path = "model" | |||
| driver1, driver2, dataloader = prepare_test_save_load | |||
| dataset = TorchArgMaxDataset(10, 40) | |||
| dataloader = DataLoader(dataset, batch_size=4) | |||
| driver1, driver2 = generate_random_driver(10, 10), generate_random_driver(10, 10) | |||
| driver1.save_model(path, only_state_dict) | |||
| driver2.load_model(path, only_state_dict) | |||
| @@ -1,10 +1,40 @@ | |||
| import os | |||
| import pytest | |||
| from fastNLP.core.utils.paddle_utils import paddle_to, paddle_move_data_to_device | |||
| from fastNLP.core.utils.paddle_utils import get_device_from_visible, paddle_to, paddle_move_data_to_device | |||
| from fastNLP.envs.imports import _NEED_IMPORT_PADDLE | |||
| if _NEED_IMPORT_PADDLE: | |||
| import paddle | |||
| @pytest.mark.parametrize( | |||
| ("user_visible_devices, cuda_visible_devices, device, output_type, correct"), | |||
| ( | |||
| ("0,1,2,3,4,5,6,7", "0", "cpu", str, "cpu"), | |||
| ("0,1,2,3,4,5,6,7", "0", "cpu", int, "cpu"), | |||
| ("0,1,2,3,4,5,6,7", "3,4,5", "gpu:4", int, 1), | |||
| ("0,1,2,3,4,5,6,7", "3,4,5", "gpu:5", str, "gpu:2"), | |||
| ("3,4,5,6", "3,5", 0, int, 0), | |||
| ("3,6,7,8", "6,7,8", "gpu:2", str, "gpu:1"), | |||
| ) | |||
| ) | |||
| @pytest.mark.paddle | |||
| def test_get_device_from_visible(user_visible_devices, cuda_visible_devices, device, output_type, correct): | |||
| _cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") | |||
| _user_visible_devices = os.getenv("USER_CUDA_VISIBLE_DEVICES") | |||
| os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices | |||
| os.environ["USER_CUDA_VISIBLE_DEVICES"] = user_visible_devices | |||
| res = get_device_from_visible(device, output_type) | |||
| assert res == correct | |||
| # restore the original environment variables | |||
| if _cuda_visible_devices is None: | |||
| del os.environ["CUDA_VISIBLE_DEVICES"] | |||
| else: | |||
| os.environ["CUDA_VISIBLE_DEVICES"] = _cuda_visible_devices | |||
| if _user_visible_devices is None: | |||
| del os.environ["USER_CUDA_VISIBLE_DEVICES"] | |||
| else: | |||
| os.environ["USER_CUDA_VISIBLE_DEVICES"] = _user_visible_devices | |||
| ############################################################################ | |||
| # | |||
| @@ -22,12 +52,6 @@ class TestPaddleToDevice: | |||
| assert res.place.gpu_device_id() == 0 | |||
| res = paddle_to(tensor, "cpu") | |||
| assert res.place.is_cpu_place() | |||
| res = paddle_to(tensor, "gpu:2") | |||
| assert res.place.is_gpu_place() | |||
| assert res.place.gpu_device_id() == 2 | |||
| res = paddle_to(tensor, "gpu:1") | |||
| assert res.place.is_gpu_place() | |||
| assert res.place.gpu_device_id() == 1 | |||
| ############################################################################ | |||
| # | |||
| @@ -64,28 +88,18 @@ class TestPaddleMoveDataToDevice: | |||
| res = paddle_move_data_to_device(paddle_tensor, device="gpu:0", data_device=None) | |||
| self.check_gpu(res, 0) | |||
| res = paddle_move_data_to_device(paddle_tensor, device="gpu:1", data_device=None) | |||
| self.check_gpu(res, 1) | |||
| res = paddle_move_data_to_device(paddle_tensor, device="gpu:0", data_device="cpu") | |||
| self.check_gpu(res, 0) | |||
| res = paddle_move_data_to_device(paddle_tensor, device=None, data_device="gpu:0") | |||
| self.check_gpu(res, 0) | |||
| res = paddle_move_data_to_device(paddle_tensor, device=None, data_device="gpu:1") | |||
| self.check_gpu(res, 1) | |||
| def test_list_transfer(self): | |||
| """ | |||
| Test moving a list of tensors between devices | |||
| """ | |||
| paddle_list = [paddle.rand((6, 4, 2)) for i in range(10)] | |||
| res = paddle_move_data_to_device(paddle_list, device=None, data_device="gpu:1") | |||
| assert isinstance(res, list) | |||
| for r in res: | |||
| self.check_gpu(r, 1) | |||
| res = paddle_move_data_to_device(paddle_list, device="cpu", data_device="gpu:1") | |||
| assert isinstance(res, list) | |||
| @@ -97,11 +111,6 @@ class TestPaddleMoveDataToDevice: | |||
| for r in res: | |||
| self.check_gpu(r, 0) | |||
| res = paddle_move_data_to_device(paddle_list, device="gpu:1", data_device="cpu") | |||
| assert isinstance(res, list) | |||
| for r in res: | |||
| self.check_gpu(r, 1) | |||
| def test_tensor_tuple_transfer(self): | |||
| """ | |||
| Test moving a tuple of tensors between devices | |||
| @@ -109,10 +118,6 @@ class TestPaddleMoveDataToDevice: | |||
| paddle_list = [paddle.rand((6, 4, 2)) for i in range(10)] | |||
| paddle_tuple = tuple(paddle_list) | |||
| res = paddle_move_data_to_device(paddle_tuple, device=None, data_device="gpu:1") | |||
| assert isinstance(res, tuple) | |||
| for r in res: | |||
| self.check_gpu(r, 1) | |||
| res = paddle_move_data_to_device(paddle_tuple, device="cpu", data_device="gpu:1") | |||
| assert isinstance(res, tuple) | |||
| @@ -124,11 +129,6 @@ class TestPaddleMoveDataToDevice: | |||
| for r in res: | |||
| self.check_gpu(r, 0) | |||
| res = paddle_move_data_to_device(paddle_tuple, device="gpu:1", data_device="cpu") | |||
| assert isinstance(res, tuple) | |||
| for r in res: | |||
| self.check_gpu(r, 1) | |||
| def test_dict_transfer(self): | |||
| """ | |||
| Test moving a dict structure between devices | |||
| @@ -173,20 +173,6 @@ class TestPaddleMoveDataToDevice: | |||
| self.check_gpu(t, 0) | |||
| self.check_gpu(res["dict"]["tensor"], 0) | |||
| res = paddle_move_data_to_device(paddle_dict, device=None, data_device="gpu:1") | |||
| assert isinstance(res, dict) | |||
| self.check_gpu(res["tensor"], 1) | |||
| assert isinstance(res["list"], list) | |||
| for t in res["list"]: | |||
| self.check_gpu(t, 1) | |||
| assert isinstance(res["int"], int) | |||
| assert isinstance(res["string"], str) | |||
| assert isinstance(res["dict"], dict) | |||
| assert isinstance(res["dict"]["list"], list) | |||
| for t in res["dict"]["list"]: | |||
| self.check_gpu(t, 1) | |||
| self.check_gpu(res["dict"]["tensor"], 1) | |||
| res = paddle_move_data_to_device(paddle_dict, device="cpu", data_device="gpu:0") | |||
| assert isinstance(res, dict) | |||
| self.check_cpu(res["tensor"]) | |||
| @@ -2,5 +2,6 @@ | |||
| markers = | |||
| torch | |||
| paddle | |||
| paddledist | |||
| jittor | |||
| torchpaddle | |||
| @@ -0,0 +1,7 @@ | |||
| ,SentenceId,Sentence,Sentiment | |||
| 0,1,"['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', ',', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'a', 'story', '.']",negative | |||
| 1,2,"['this', 'quiet', ',', 'introspective', 'and', 'entertaining', 'independent', 'is', 'worth', 'seeking', '.']",positive | |||
| 2,3,"['even', 'fans', 'of', 'ismail', 'merchant', ""'s"", 'work', ',', 'i', 'suspect', ',', 'would', 'have', 'a', 'hard', 'time', 'sitting', 'through', 'this', 'one', '.']",negative | |||
| 3,4,"['a', 'positively', 'thrilling', 'combination', 'of', 'ethnography', 'and', 'all', 'the', 'intrigue', ',', 'betrayal', ',', 'deceit', 'and', 'murder', 'of', 'a', 'shakespearean', 'tragedy', 'or', 'a', 'juicy', 'soap', 'opera', '.']",neutral | |||
| 4,5,"['a', 'comedy-drama', 'of', 'nearly', 'epic', 'proportions', 'rooted', 'in', 'a', 'sincere', 'performance', 'by', 'the', 'title', 'character', 'undergoing', 'midlife', 'crisis', '.']",positive | |||
| 5,6,"['the', 'importance', 'of', 'being', 'earnest', ',', 'so', 'thick', 'with', 'wit', 'it', 'plays', 'like', 'a', 'reading', 'from', 'bartlett', ""'s"", 'familiar', 'quotations']",neutral | |||
| @@ -0,0 +1,7 @@ | |||
| SentenceId Sentence Sentiment | |||
| 1 A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . negative | |||
| 2 This quiet , introspective and entertaining independent is worth seeking . positive | |||
| 3 Even fans of Ismail Merchant 's work , I suspect , would have a hard time sitting through this one . negative | |||
| 4 A positively thrilling combination of ethnography and all the intrigue , betrayal , deceit and murder of a Shakespearean tragedy or a juicy soap opera . neutral | |||
| 5 A comedy-drama of nearly epic proportions rooted in a sincere performance by the title character undergoing midlife crisis . positive | |||
| 6 The Importance of Being Earnest , so thick with wit it plays like a reading from Bartlett 's Familiar Quotations neutral | |||
| @@ -153,7 +153,7 @@ | |||
| "name": "stdout", | |||
| "output_type": "stream", | |||
| "text": [ | |||
| "1969418794120 1971237588872\n", | |||
| "2438703969992 2438374526920\n", | |||
| "+-----+------------------------+------------------------+-----+\n", | |||
| "| idx | sentence | words | num |\n", | |||
| "+-----+------------------------+------------------------+-----+\n", | |||
| @@ -198,7 +198,7 @@ | |||
| "name": "stdout", | |||
| "output_type": "stream", | |||
| "text": [ | |||
| "1971237588872 1971237588872\n", | |||
| "2438374526920 2438374526920\n", | |||
| "+-----+------------------------+------------------------+-----+\n", | |||
| "| idx | sentence | words | num |\n", | |||
| "+-----+------------------------+------------------------+-----+\n", | |||
| @@ -774,9 +774,9 @@ | |||
| { | |||
| "data": { | |||
| "text/plain": [ | |||
| "{'sentence': <fastNLP.core.dataset.field.FieldArray at 0x1ca8a879d08>,\n", | |||
| " 'words': <fastNLP.core.dataset.field.FieldArray at 0x1ca8a879d88>,\n", | |||
| " 'num': <fastNLP.core.dataset.field.FieldArray at 0x1ca8a879e08>}" | |||
| "{'sentence': <fastNLP.core.dataset.field.FieldArray at 0x237ce26d388>,\n", | |||
| " 'words': <fastNLP.core.dataset.field.FieldArray at 0x237ce26d408>,\n", | |||
| " 'num': <fastNLP.core.dataset.field.FieldArray at 0x237ce26d488>}" | |||
| ] | |||
| }, | |||
| "execution_count": 15, | |||
| @@ -923,7 +923,8 @@ | |||
| "output_type": "stream", | |||
| "text": [ | |||
| "5 Counter({'生活': 1, '就像': 1, '海洋': 1})\n", | |||
| "6 Counter({'生活': 1, '就像': 1, '海洋': 1, '只有': 1})\n" | |||
| "6 Counter({'生活': 1, '就像': 1, '海洋': 1, '只有': 1})\n", | |||
| "6 {'<pad>': 0, '<unk>': 1, '生活': 2, '就像': 3, '海洋': 4, '只有': 5}\n" | |||
| ] | |||
| } | |||
| ], | |||
| @@ -931,7 +932,8 @@ | |||
| "vocab.add_word_lst(['生活', '就像', '海洋'])\n", | |||
| "print(len(vocab), vocab.word_count)\n", | |||
| "vocab.add_word('只有')\n", | |||
| "print(len(vocab), vocab.word_count)" | |||
| "print(len(vocab), vocab.word_count)\n", | |||
| "print(len(vocab), vocab.word2idx)" | |||
| ] | |||
| }, | |||
| { | |||
| @@ -959,7 +961,6 @@ | |||
| "<pad> 0\n", | |||
| "<unk> 1\n", | |||
| "生活 2\n", | |||
| "只有 5\n", | |||
| "彼岸 1 False\n" | |||
| ] | |||
| } | |||
| @@ -968,7 +969,6 @@ | |||
| "print(vocab.to_word(0), vocab.to_index('<pad>'))\n", | |||
| "print(vocab.to_word(1), vocab.to_index('<unk>'))\n", | |||
| "print(vocab.to_word(2), vocab.to_index('生活'))\n", | |||
| "print(vocab.to_word(5), vocab.to_index('只有'))\n", | |||
| "print('彼岸', vocab.to_index('彼岸'), vocab.has_word('彼岸'))" | |||
| ] | |||
| }, | |||
| @@ -979,7 +979,9 @@ | |||
| "source": [ | |||
| "**`vocabulary`允许反复添加相同单词**,**可以通过`word_count`方法看到相应单词被添加的次数**\n", | |||
| "\n", | |||
| "  但其中没有`<unk>`和`<pad>`,`vocabulary`的全部变量与函数可以通过`dir(vocabulary)`查询" | |||
| "  但其中没有`<unk>`和`<pad>`,`vocabulary`的全部变量与函数可以通过`dir(vocabulary)`查询\n", | |||
| "\n", | |||
| "  注:**使用`add_word_lst`添加单词**,**单词对应序号不会动态调整**,**使用`dataset`添加单词的情况不同**" | |||
| ] | |||
| }, | |||
| { | |||
| @@ -992,15 +994,19 @@ | |||
| "name": "stdout", | |||
| "output_type": "stream", | |||
| "text": [ | |||
| "13 Counter({'生活': 2, '就像': 2, '海洋': 2, '只有': 2, '意志': 1, '坚强的': 1, '人': 1, '才': 1, '能': 1, '到达': 1, '彼岸': 1})\n", | |||
| "彼岸 12 True\n" | |||
| "生活 2\n", | |||
| "彼岸 12 True\n", | |||
| "13 Counter({'人': 4, '生活': 2, '就像': 2, '海洋': 2, '只有': 2, '意志': 1, '坚强的': 1, '才': 1, '能': 1, '到达': 1, '彼岸': 1})\n", | |||
| "13 {'<pad>': 0, '<unk>': 1, '生活': 2, '就像': 3, '海洋': 4, '只有': 5, '人': 6, '意志': 7, '坚强的': 8, '才': 9, '能': 10, '到达': 11, '彼岸': 12}\n" | |||
| ] | |||
| } | |||
| ], | |||
| "source": [ | |||
| "vocab.add_word_lst(['生活', '就像', '海洋', '只有', '意志', '坚强的', '人', '才', '能', '到达', '彼岸'])\n", | |||
| "vocab.add_word_lst(['生活', '就像', '海洋', '只有', '意志', '坚强的', '人', '人', '人', '人', '才', '能', '到达', '彼岸'])\n", | |||
| "print(vocab.to_word(2), vocab.to_index('生活'))\n", | |||
| "print('彼岸', vocab.to_index('彼岸'), vocab.has_word('彼岸'))\n", | |||
| "print(len(vocab), vocab.word_count)\n", | |||
| "print('彼岸', vocab.to_index('彼岸'), vocab.has_word('彼岸'))" | |||
| "print(len(vocab), vocab.word2idx)" | |||
| ] | |||
| }, | |||
| { | |||
| @@ -1082,52 +1088,440 @@ | |||
| "## 3 dataset 和 vocabulary 的组合使用\n", | |||
| " \n", | |||
| "### 3.1 从 dataframe 中加载 dataset\n", | |||
| "\n" | |||
| "\n", | |||
| "以下通过 [NLP-beginner](https://github.com/FudanNLP/nlp-beginner) 实践一中 [Rotten Tomatoes 影评数据集](https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews) 的部分训练数据组成`test4dataset.tsv`文件\n", | |||
| "\n", | |||
| "  介绍如何使用`dataset`、`vocabulary`简单加载并处理数据集,首先使用`pandas`模块,读取原始数据的`dataframe`" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": 24, | |||
| "id": "3dbd985d", | |||
| "metadata": {}, | |||
| "outputs": [ | |||
| { | |||
| "data": { | |||
| "text/html": [ | |||
| "<div>\n", | |||
| "<style scoped>\n", | |||
| " .dataframe tbody tr th:only-of-type {\n", | |||
| " vertical-align: middle;\n", | |||
| " }\n", | |||
| "\n", | |||
| " .dataframe tbody tr th {\n", | |||
| " vertical-align: top;\n", | |||
| " }\n", | |||
| "\n", | |||
| " .dataframe thead th {\n", | |||
| " text-align: right;\n", | |||
| " }\n", | |||
| "</style>\n", | |||
| "<table border=\"1\" class=\"dataframe\">\n", | |||
| " <thead>\n", | |||
| " <tr style=\"text-align: right;\">\n", | |||
| " <th></th>\n", | |||
| " <th>SentenceId</th>\n", | |||
| " <th>Sentence</th>\n", | |||
| " <th>Sentiment</th>\n", | |||
| " </tr>\n", | |||
| " </thead>\n", | |||
| " <tbody>\n", | |||
| " <tr>\n", | |||
| " <th>0</th>\n", | |||
| " <td>1</td>\n", | |||
| " <td>A series of escapades demonstrating the adage ...</td>\n", | |||
| " <td>negative</td>\n", | |||
| " </tr>\n", | |||
| " <tr>\n", | |||
| " <th>1</th>\n", | |||
| " <td>2</td>\n", | |||
| " <td>This quiet , introspective and entertaining in...</td>\n", | |||
| " <td>positive</td>\n", | |||
| " </tr>\n", | |||
| " <tr>\n", | |||
| " <th>2</th>\n", | |||
| " <td>3</td>\n", | |||
| " <td>Even fans of Ismail Merchant 's work , I suspe...</td>\n", | |||
| " <td>negative</td>\n", | |||
| " </tr>\n", | |||
| " <tr>\n", | |||
| " <th>3</th>\n", | |||
| " <td>4</td>\n", | |||
| " <td>A positively thrilling combination of ethnogra...</td>\n", | |||
| " <td>neutral</td>\n", | |||
| " </tr>\n", | |||
| " <tr>\n", | |||
| " <th>4</th>\n", | |||
| " <td>5</td>\n", | |||
| " <td>A comedy-drama of nearly epic proportions root...</td>\n", | |||
| " <td>positive</td>\n", | |||
| " </tr>\n", | |||
| " <tr>\n", | |||
| " <th>5</th>\n", | |||
| " <td>6</td>\n", | |||
| " <td>The Importance of Being Earnest , so thick wit...</td>\n", | |||
| " <td>neutral</td>\n", | |||
| " </tr>\n", | |||
| " </tbody>\n", | |||
| "</table>\n", | |||
| "</div>" | |||
| ], | |||
| "text/plain": [ | |||
| " SentenceId Sentence Sentiment\n", | |||
| "0 1 A series of escapades demonstrating the adage ... negative\n", | |||
| "1 2 This quiet , introspective and entertaining in... positive\n", | |||
| "2 3 Even fans of Ismail Merchant 's work , I suspe... negative\n", | |||
| "3 4 A positively thrilling combination of ethnogra... neutral\n", | |||
| "4 5 A comedy-drama of nearly epic proportions root... positive\n", | |||
| "5 6 The Importance of Being Earnest , so thick wit... neutral" | |||
| ] | |||
| }, | |||
| "execution_count": 24, | |||
| "metadata": {}, | |||
| "output_type": "execute_result" | |||
| } | |||
| ], | |||
| "source": [ | |||
| "import pandas as pd\n", | |||
| "\n", | |||
| "df = pd.read_csv('./data/test4dataset.tsv', sep='\\t')\n", | |||
| "df" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "markdown", | |||
| "id": "89059713", | |||
| "id": "919ab350", | |||
| "metadata": {}, | |||
| "source": [] | |||
| "source": [ | |||
| "接着,通过`dataset`中的`from_pandas`方法填充数据集,并使用`apply_more`方法对文本进行分词操作" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": null, | |||
| "id": "3dbd985d", | |||
| "execution_count": 25, | |||
| "id": "4f634586", | |||
| "metadata": {}, | |||
| "outputs": [], | |||
| "source": [] | |||
| "outputs": [ | |||
| { | |||
| "data": { | |||
| "text/html": [ | |||
| "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n" | |||
| ], | |||
| "text/plain": [] | |||
| }, | |||
| "metadata": {}, | |||
| "output_type": "display_data" | |||
| }, | |||
| { | |||
| "data": { | |||
| "text/html": [ | |||
| "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n" | |||
| ], | |||
| "text/plain": [] | |||
| }, | |||
| "metadata": {}, | |||
| "output_type": "display_data" | |||
| }, | |||
| { | |||
| "data": { | |||
| "text/html": [ | |||
| "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n", | |||
| "</pre>\n" | |||
| ], | |||
| "text/plain": [ | |||
| "\n" | |||
| ] | |||
| }, | |||
| "metadata": {}, | |||
| "output_type": "display_data" | |||
| }, | |||
| { | |||
| "name": "stdout", | |||
| "output_type": "stream", | |||
| "text": [ | |||
| "+------------+------------------------------+-----------+\n", | |||
| "| SentenceId | Sentence | Sentiment |\n", | |||
| "+------------+------------------------------+-----------+\n", | |||
| "| 1 | ['a', 'series', 'of', 'es... | negative |\n", | |||
| "| 2 | ['this', 'quiet', ',', 'i... | positive |\n", | |||
| "| 3 | ['even', 'fans', 'of', 'i... | negative |\n", | |||
| "| 4 | ['a', 'positively', 'thri... | neutral |\n", | |||
| "| 5 | ['a', 'comedy-drama', 'of... | positive |\n", | |||
| "| 6 | ['the', 'importance', 'of... | neutral |\n", | |||
| "+------------+------------------------------+-----------+\n" | |||
| ] | |||
| } | |||
| ], | |||
| "source": [ | |||
| "from fastNLP.core.dataset import DataSet\n", | |||
| "\n", | |||
| "dataset = DataSet()\n", | |||
| "dataset = dataset.from_pandas(df)\n", | |||
| "dataset.apply_more(lambda ins:{'SentenceId': ins['SentenceId'], \n", | |||
| " 'Sentence': ins['Sentence'].lower().split(), 'Sentiment': ins['Sentiment']})\n", | |||
| "print(dataset)" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "markdown", | |||
| "id": "5c1ae192", | |||
| "metadata": {}, | |||
| "source": [ | |||
| "  如果需要保存中间结果,也可以使用`dataset`的`to_csv`方法,生成`.csv`或`.tsv`文件" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": null, | |||
| "id": "4f634586", | |||
| "execution_count": 26, | |||
| "id": "46722efc", | |||
| "metadata": {}, | |||
| "outputs": [], | |||
| "source": [] | |||
| "source": [ | |||
| "dataset.to_csv('./data/test4dataset.csv')" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "markdown", | |||
| "id": "5ba13989", | |||
| "metadata": {}, | |||
| "source": [ | |||
| "### 3.2 从 dataset 中获取 vocabulary" | |||
| "### 3.2 从 dataset 中获取 vocabulary\n", | |||
| "\n", | |||
| "然后,初始化`vocabulary`,使用`vocabulary`中的`from_dataset`方法,从`dataset`的指定字段中\n", | |||
| "\n", | |||
| "  获取字段中的所有元素,然后编号;如果指定字段是个列表,则针对字段中所有列表包含的元素编号\n", | |||
| "\n", | |||
| "  注:**使用`dataset`添加单词**,**不同于`add_word_list`**,**单词被添加次数越多**,**序号越靠前**,例如案例中的`a`" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": null, | |||
| "execution_count": 27, | |||
| "id": "a2de615b", | |||
| "metadata": {}, | |||
| "outputs": [], | |||
| "source": [] | |||
| "outputs": [ | |||
| { | |||
| "data": { | |||
| "text/html": [ | |||
| "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n" | |||
| ], | |||
| "text/plain": [] | |||
| }, | |||
| "metadata": {}, | |||
| "output_type": "display_data" | |||
| }, | |||
| { | |||
| "data": { | |||
| "text/html": [ | |||
| "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n" | |||
| ], | |||
| "text/plain": [] | |||
| }, | |||
| "metadata": {}, | |||
| "output_type": "display_data" | |||
| }, | |||
| { | |||
| "data": { | |||
| "text/html": [ | |||
| "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n", | |||
| "</pre>\n" | |||
| ], | |||
| "text/plain": [ | |||
| "\n" | |||
| ] | |||
| }, | |||
| "metadata": {}, | |||
| "output_type": "display_data" | |||
| }, | |||
| { | |||
| "name": "stdout", | |||
| "output_type": "stream", | |||
| "text": [ | |||
| "Counter({'a': 9, 'of': 9, ',': 7, 'the': 6, '.': 5, 'is': 3, 'and': 3, 'good': 2, 'for': 2, 'which': 2, 'this': 2, \"'s\": 2, 'series': 1, 'escapades': 1, 'demonstrating': 1, 'adage': 1, 'that': 1, 'what': 1, 'goose': 1, 'also': 1, 'gander': 1, 'some': 1, 'occasionally': 1, 'amuses': 1, 'but': 1, 'none': 1, 'amounts': 1, 'to': 1, 'much': 1, 'story': 1, 'quiet': 1, 'introspective': 1, 'entertaining': 1, 'independent': 1, 'worth': 1, 'seeking': 1, 'even': 1, 'fans': 1, 'ismail': 1, 'merchant': 1, 'work': 1, 'i': 1, 'suspect': 1, 'would': 1, 'have': 1, 'hard': 1, 'time': 1, 'sitting': 1, 'through': 1, 'one': 1, 'positively': 1, 'thrilling': 1, 'combination': 1, 'ethnography': 1, 'all': 1, 'intrigue': 1, 'betrayal': 1, 'deceit': 1, 'murder': 1, 'shakespearean': 1, 'tragedy': 1, 'or': 1, 'juicy': 1, 'soap': 1, 'opera': 1, 'comedy-drama': 1, 'nearly': 1, 'epic': 1, 'proportions': 1, 'rooted': 1, 'in': 1, 'sincere': 1, 'performance': 1, 'by': 1, 'title': 1, 'character': 1, 'undergoing': 1, 'midlife': 1, 'crisis': 1, 'importance': 1, 'being': 1, 'earnest': 1, 'so': 1, 'thick': 1, 'with': 1, 'wit': 1, 'it': 1, 'plays': 1, 'like': 1, 'reading': 1, 'from': 1, 'bartlett': 1, 'familiar': 1, 'quotations': 1}) \n", | |||
| "\n", | |||
| "{'<pad>': 0, '<unk>': 1, 'a': 2, 'of': 3, ',': 4, 'the': 5, '.': 6, 'is': 7, 'and': 8, 'good': 9, 'for': 10, 'which': 11, 'this': 12, \"'s\": 13, 'series': 14, 'escapades': 15, 'demonstrating': 16, 'adage': 17, 'that': 18, 'what': 19, 'goose': 20, 'also': 21, 'gander': 22, 'some': 23, 'occasionally': 24, 'amuses': 25, 'but': 26, 'none': 27, 'amounts': 28, 'to': 29, 'much': 30, 'story': 31, 'quiet': 32, 'introspective': 33, 'entertaining': 34, 'independent': 35, 'worth': 36, 'seeking': 37, 'even': 38, 'fans': 39, 'ismail': 40, 'merchant': 41, 'work': 42, 'i': 43, 'suspect': 44, 'would': 45, 'have': 46, 'hard': 47, 'time': 48, 'sitting': 49, 'through': 50, 'one': 51, 'positively': 52, 'thrilling': 53, 'combination': 54, 'ethnography': 55, 'all': 56, 'intrigue': 57, 'betrayal': 58, 'deceit': 59, 'murder': 60, 'shakespearean': 61, 'tragedy': 62, 'or': 63, 'juicy': 64, 'soap': 65, 'opera': 66, 'comedy-drama': 67, 'nearly': 68, 'epic': 69, 'proportions': 70, 'rooted': 71, 'in': 72, 'sincere': 73, 'performance': 74, 'by': 75, 'title': 76, 'character': 77, 'undergoing': 78, 'midlife': 79, 'crisis': 80, 'importance': 81, 'being': 82, 'earnest': 83, 'so': 84, 'thick': 85, 'with': 86, 'wit': 87, 'it': 88, 'plays': 89, 'like': 90, 'reading': 91, 'from': 92, 'bartlett': 93, 'familiar': 94, 'quotations': 95} \n", | |||
| "\n", | |||
| "Vocabulary(['a', 'series', 'of', 'escapades', 'demonstrating']...)\n" | |||
| ] | |||
| } | |||
| ], | |||
| "source": [ | |||
| "from fastNLP.core.vocabulary import Vocabulary\n", | |||
| "\n", | |||
| "vocab = Vocabulary()\n", | |||
| "vocab = vocab.from_dataset(dataset, field_name='Sentence')\n", | |||
| "print(vocab.word_count, '\\n')\n", | |||
| "print(vocab.word2idx, '\\n')\n", | |||
| "print(vocab)" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "markdown", | |||
| "id": "f0857ccb", | |||
| "metadata": {}, | |||
| "source": [ | |||
| "之后,**通过`vocabulary`的`index_dataset`方法**,**调整`dataset`中指定字段的元素**,**使用编号将之代替**\n", | |||
| "\n", | |||
| "  使用上述方法,可以将影评数据集中的单词序列转化为词编号序列,为接下来转化为词嵌入序列做准备" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": null, | |||
| "execution_count": 28, | |||
| "id": "2f9a04b2", | |||
| "metadata": {}, | |||
| "outputs": [ | |||
| { | |||
| "data": { | |||
| "text/html": [ | |||
| "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n" | |||
| ], | |||
| "text/plain": [] | |||
| }, | |||
| "metadata": {}, | |||
| "output_type": "display_data" | |||
| }, | |||
| { | |||
| "data": { | |||
| "text/html": [ | |||
| "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n" | |||
| ], | |||
| "text/plain": [] | |||
| }, | |||
| "metadata": {}, | |||
| "output_type": "display_data" | |||
| }, | |||
| { | |||
| "data": { | |||
| "text/html": [ | |||
| "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n", | |||
| "</pre>\n" | |||
| ], | |||
| "text/plain": [ | |||
| "\n" | |||
| ] | |||
| }, | |||
| "metadata": {}, | |||
| "output_type": "display_data" | |||
| }, | |||
| { | |||
| "name": "stdout", | |||
| "output_type": "stream", | |||
| "text": [ | |||
| "+------------+------------------------------+-----------+\n", | |||
| "| SentenceId | Sentence | Sentiment |\n", | |||
| "+------------+------------------------------+-----------+\n", | |||
| "| 1 | [2, 14, 3, 15, 16, 5, 17,... | negative |\n", | |||
| "| 2 | [12, 32, 4, 33, 8, 34, 35... | positive |\n", | |||
| "| 3 | [38, 39, 3, 40, 41, 13, 4... | negative |\n", | |||
| "| 4 | [2, 52, 53, 54, 3, 55, 8,... | neutral |\n", | |||
| "| 5 | [2, 67, 3, 68, 69, 70, 71... | positive |\n", | |||
| "| 6 | [5, 81, 3, 82, 83, 4, 84,... | neutral |\n", | |||
| "+------------+------------------------------+-----------+\n" | |||
| ] | |||
| } | |||
| ], | |||
| "source": [ | |||
| "vocab.index_dataset(dataset, field_name='Sentence')\n", | |||
| "print(dataset)" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "markdown", | |||
| "id": "6b26b707", | |||
| "metadata": {}, | |||
| "source": [ | |||
| "最后,使用相同方法,再将`dataset`中`Sentiment`字段中的`negative`、`neutral`、`positive`转化为数字编号" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": 29, | |||
| "id": "5f5eed18", | |||
| "metadata": {}, | |||
| "outputs": [ | |||
| { | |||
| "data": { | |||
| "text/html": [ | |||
| "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n" | |||
| ], | |||
| "text/plain": [] | |||
| }, | |||
| "metadata": {}, | |||
| "output_type": "display_data" | |||
| }, | |||
| { | |||
| "name": "stdout", | |||
| "output_type": "stream", | |||
| "text": [ | |||
| "{'negative': 0, 'positive': 1, 'neutral': 2}\n" | |||
| ] | |||
| }, | |||
| { | |||
| "data": { | |||
| "text/html": [ | |||
| "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n" | |||
| ], | |||
| "text/plain": [] | |||
| }, | |||
| "metadata": {}, | |||
| "output_type": "display_data" | |||
| }, | |||
| { | |||
| "data": { | |||
| "text/html": [ | |||
| "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n", | |||
| "</pre>\n" | |||
| ], | |||
| "text/plain": [ | |||
| "\n" | |||
| ] | |||
| }, | |||
| "metadata": {}, | |||
| "output_type": "display_data" | |||
| }, | |||
| { | |||
| "name": "stdout", | |||
| "output_type": "stream", | |||
| "text": [ | |||
| "+------------+------------------------------+-----------+\n", | |||
| "| SentenceId | Sentence | Sentiment |\n", | |||
| "+------------+------------------------------+-----------+\n", | |||
| "| 1 | [2, 14, 3, 15, 16, 5, 17,... | 0 |\n", | |||
| "| 2 | [12, 32, 4, 33, 8, 34, 35... | 1 |\n", | |||
| "| 3 | [38, 39, 3, 40, 41, 13, 4... | 0 |\n", | |||
| "| 4 | [2, 52, 53, 54, 3, 55, 8,... | 2 |\n", | |||
| "| 5 | [2, 67, 3, 68, 69, 70, 71... | 1 |\n", | |||
| "| 6 | [5, 81, 3, 82, 83, 4, 84,... | 2 |\n", | |||
| "+------------+------------------------------+-----------+\n" | |||
| ] | |||
| } | |||
| ], | |||
| "source": [ | |||
| "target_vocab = Vocabulary(padding=None, unknown=None)\n", | |||
| "\n", | |||
| "target_vocab.from_dataset(dataset, field_name='Sentiment')\n", | |||
| "print(target_vocab.word2idx)\n", | |||
| "target_vocab.index_dataset(dataset, field_name='Sentiment')\n", | |||
| "print(dataset)" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "markdown", | |||
| "id": "eed7ea64", | |||
| "metadata": {}, | |||
| "source": [ | |||
| "在最后的最后,通过以下的一张图,来总结本章关于`dataset`和`vocabulary`主要知识点的讲解,以及两者的联系\n", | |||
| "\n", | |||
| "<img src=\"./figures/T1-fig-dataset-and-vocabulary.png\" width=\"80%\" height=\"80%\" align=\"center\"></img>" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": null, | |||
| "id": "35b4f0f7", | |||
| "metadata": {}, | |||
| "outputs": [], | |||
| "source": [] | |||
| } | |||
| @@ -0,0 +1,41 @@ | |||
| { | |||
| "cells": [ | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": null, | |||
| "metadata": { | |||
| "collapsed": true | |||
| }, | |||
| "outputs": [], | |||
| "source": [] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": null, | |||
| "metadata": {}, | |||
| "outputs": [], | |||
| "source": [] | |||
| } | |||
| ], | |||
| "metadata": { | |||
| "kernelspec": { | |||
| "display_name": "Python 3 (ipykernel)", | |||
| "language": "python", | |||
| "name": "python3" | |||
| }, | |||
| "language_info": { | |||
| "codemirror_mode": { | |||
| "name": "ipython", | |||
| "version": 3 | |||
| }, | |||
| "file_extension": ".py", | |||
| "mimetype": "text/x-python", | |||
| "name": "python", | |||
| "nbconvert_exporter": "python", | |||
| "pygments_lexer": "ipython3", | |||
| "version": "3.7.4" | |||
| } | |||
| }, | |||
| "nbformat": 4, | |||
| "nbformat_minor": 1 | |||
| } | |||