diff --git a/fastNLP/core/__init__.py b/fastNLP/core/__init__.py index 439f5886..d07382e4 100644 --- a/fastNLP/core/__init__.py +++ b/fastNLP/core/__init__.py @@ -14,6 +14,8 @@ __all__ = [ 'MoreEvaluateCallback', "TorchWarmupCallback", "TorchGradClipCallback", + "MonitorUtility", + 'HasMonitorCallback', # collators 'Collator', @@ -40,6 +42,12 @@ __all__ = [ 'Trainer', # dataloaders TODO 需要把 mix_dataloader 的搞定 + 'TorchDataLoader', + 'PaddleDataLoader', + 'JittorDataLoader', + 'prepare_jittor_dataloader', + 'prepare_paddle_dataloader', + 'prepare_torch_dataloader', # dataset 'DataSet', diff --git a/fastNLP/core/callbacks/__init__.py b/fastNLP/core/callbacks/__init__.py index cfda1763..6f859183 100644 --- a/fastNLP/core/callbacks/__init__.py +++ b/fastNLP/core/callbacks/__init__.py @@ -15,6 +15,9 @@ __all__ = [ "TorchWarmupCallback", "TorchGradClipCallback", + + "MonitorUtility", + 'HasMonitorCallback' ] @@ -28,4 +31,5 @@ from .load_best_model_callback import LoadBestModelCallback from .early_stop_callback import EarlyStopCallback from .torch_callbacks import * from .more_evaluate_callback import MoreEvaluateCallback +from .has_monitor_callback import MonitorUtility, HasMonitorCallback diff --git a/fastNLP/core/callbacks/more_evaluate_callback.py b/fastNLP/core/callbacks/more_evaluate_callback.py index b5800134..713ffc09 100644 --- a/fastNLP/core/callbacks/more_evaluate_callback.py +++ b/fastNLP/core/callbacks/more_evaluate_callback.py @@ -66,7 +66,6 @@ class MoreEvaluateCallback(HasMonitorCallback): raise RuntimeError("`evaluate_every` and `watch_monitor` cannot be None at the same time.") if watch_monitor is not None and evaluate_every is not None: raise RuntimeError("`evaluate_every` and `watch_monitor` cannot be set at the same time.") - self.watch_monitor = watch_monitor if topk_monitor is not None and topk == 0: raise RuntimeError("`topk_monitor` is set, but `topk` is 0.") @@ -93,8 +92,8 @@ class MoreEvaluateCallback(HasMonitorCallback): def on_after_trainer_initialized(self, trainer, driver): # 如果是需要 watch 的,不能没有 evaluator - if self.watch_monitor is not None: - assert trainer.evaluator is not None, f"You set `watch_monitor={self.watch_monitor}`, but no " \ + if self.monitor is not None: + assert trainer.evaluator is not None, f"You set `watch_monitor={self.monitor}`, but no " \ f"evaluate_dataloaders is provided in Trainer." 
if trainer.evaluate_fn is self.evaluate_fn: @@ -134,7 +133,7 @@ class MoreEvaluateCallback(HasMonitorCallback): self.topk_saver.save_topk(trainer, results) def on_train_epoch_end(self, trainer): - if self.watch_monitor is not None: + if self.monitor is not None: return if isinstance(self.evaluate_every, int) and self.evaluate_every < 0: evaluate_every = -self.evaluate_every @@ -143,7 +142,7 @@ class MoreEvaluateCallback(HasMonitorCallback): self.topk_saver.save_topk(trainer, results) def on_train_batch_end(self, trainer): - if self.watch_monitor is not None: + if self.monitor is not None: return if callable(self.evaluate_every): if self.evaluate_every(trainer): diff --git a/fastNLP/core/collators/padders/paddle_padder.py b/fastNLP/core/collators/padders/paddle_padder.py index f7db6534..f4ae0300 100644 --- a/fastNLP/core/collators/padders/paddle_padder.py +++ b/fastNLP/core/collators/padders/paddle_padder.py @@ -56,7 +56,7 @@ def is_paddle_dtype_str(dtype): def _get_dtype(ele_dtype, dtype, class_name): - if not (ele_dtype is not None or is_number_or_numpy_number(ele_dtype) or is_paddle_tensor(ele_dtype) or is_paddle_dtype_str(ele_dtype)): + if not (ele_dtype is None or is_number_or_numpy_number(ele_dtype) or is_paddle_tensor(ele_dtype) or is_paddle_dtype_str(ele_dtype)): raise EleDtypeUnsupportedError(f"`{class_name}` only supports padding python numbers " f"or numpy numbers or paddle.Tensor but get `{ele_dtype}`.") diff --git a/fastNLP/core/controllers/trainer.py b/fastNLP/core/controllers/trainer.py index e0cf4b0d..f720fe5b 100644 --- a/fastNLP/core/controllers/trainer.py +++ b/fastNLP/core/controllers/trainer.py @@ -117,6 +117,7 @@ class Trainer(TrainerEventTrigger): :param monitor: 当存在 evaluate_dataloaders 时,默认的 monitor metric 的名字。传入的 callback 如果有 monitor 参数且没有 在 callback 初始化设定的,将采取这个值。如果在 evaluation 结果中没有找到完全一致的名称,将使用 最短公共字符串算法 找到最匹配 的那个作为 monitor 。也可以传入一个函数,接受参数为 evaluation 的结果(字典类型),返回一个 float 值作为 monitor 的结果。 + 如果 evaluate_dataloaders 与 metrics 没有提供,该参数无意义。 :param larger_better: monitor 的值是否是越大越好。 :param marker: 用于标记一个 Trainer 实例,从而在用户调用 `Trainer.on` 函数时,标记该 callback 函数属于哪一个具体的 'trainer' 实例;默认为 None; :param kwargs: 一些其它的可能需要的参数; @@ -231,7 +232,6 @@ class Trainer(TrainerEventTrigger): total_batches=None ) - """ 设置内部的 Evaluator """ if metrics is None and evaluate_dataloaders is not None: raise ValueError("You have set 'evaluate_dataloaders' but forget to set 'metrics'.") @@ -760,8 +760,6 @@ class Trainer(TrainerEventTrigger): self.on_before_backward(outputs) loss = self.extract_loss_from_outputs(outputs) loss = loss / self.accumulation_steps - # with self.get_no_sync_context(): - # self.driver.backward(loss) self.driver.backward(loss) self.on_after_backward() diff --git a/fastNLP/core/dataloaders/paddle_dataloader/fdl.py b/fastNLP/core/dataloaders/paddle_dataloader/fdl.py index fa99be22..952759f7 100644 --- a/fastNLP/core/dataloaders/paddle_dataloader/fdl.py +++ b/fastNLP/core/dataloaders/paddle_dataloader/fdl.py @@ -8,11 +8,12 @@ from typing import Callable, List, Optional, Union, Dict, Sequence from fastNLP.envs.imports import _NEED_IMPORT_PADDLE if _NEED_IMPORT_PADDLE: - from paddle.io import DataLoader, Dataset + from paddle.io import DataLoader, Dataset, Sampler from paddle.fluid.dataloader.collate import default_collate_fn else: from fastNLP.core.utils.dummy_class import DummyClass as Dataset from fastNLP.core.utils.dummy_class import DummyClass as DataLoader + from fastNLP.core.utils.dummy_class import DummyClass as Sampler from fastNLP.core.collators.collator import Collator 
from fastNLP.core.dataloaders.utils import indice_collate_wrapper

@@ -58,6 +59,9 @@ class PaddleDataLoader(DataLoader):
 
         if batch_sampler is None:
             batch_sampler = RandomBatchSampler(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)
+            batch_size = 1
+            shuffle = False
+            drop_last = False
 
         super(PaddleDataLoader, self).__init__(dataset=dataset, feed_list=feed_list, places=places,
                                                return_list=return_list, batch_sampler=batch_sampler,
diff --git a/fastNLP/core/dataloaders/torch_dataloader/fdl.py b/fastNLP/core/dataloaders/torch_dataloader/fdl.py
index d008d4ad..ff2d1e65 100644
--- a/fastNLP/core/dataloaders/torch_dataloader/fdl.py
+++ b/fastNLP/core/dataloaders/torch_dataloader/fdl.py
@@ -165,8 +165,8 @@ class TorchDataLoader(DataLoader):
 
 def prepare_torch_dataloader(ds_or_db: Union[DataSet, DataBundle, Sequence[DataSet], Mapping[str, DataSet]],
-                             batch_size: int = 1,
-                             shuffle: bool = False, sampler: Union["Sampler[int]", ReproducibleSampler, UnrepeatedSampler] = None,
+                             batch_size: int = 16,
+                             shuffle: bool = True, sampler: Union["Sampler[int]", ReproducibleSampler, UnrepeatedSampler] = None,
                              batch_sampler: Union["Sampler[Sequence[int]]", ReproducibleBatchSampler] = None,
                              num_workers: int = 0, collate_fn: Union[str, Callable, None] = None,
                              pin_memory: bool = False, drop_last: bool = False,
diff --git a/fastNLP/core/drivers/paddle_driver/fleet.py b/fastNLP/core/drivers/paddle_driver/fleet.py
index f3a739f0..01b61afa 100644
--- a/fastNLP/core/drivers/paddle_driver/fleet.py
+++ b/fastNLP/core/drivers/paddle_driver/fleet.py
@@ -1,12 +1,12 @@
 import os
-import shutil
 from typing import List, Union, Optional, Dict, Tuple, Callable
 
+from fastNLP.core.utils.paddle_utils import get_device_from_visible
+
 from .paddle_driver import PaddleDriver
 from .fleet_launcher import FleetLauncher
 from .utils import (
     _FleetWrappingModel,
-    get_device_from_visible,
     reset_seed,
     replace_sampler,
     replace_batch_sampler,
@@ -17,8 +17,7 @@ from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
 from fastNLP.core.utils import (
     auto_param_call,
     check_user_specific_params,
-    paddle_move_data_to_device,
-    is_in_paddle_dist
+    is_in_paddle_dist,
 )
 from fastNLP.envs.distributed import rank_zero_rm
 from fastNLP.core.samplers import (
@@ -609,12 +609,6 @@ class PaddleFleetDriver(PaddleDriver):
     def is_distributed(self):
         return True
 
-    def move_data_to_device(self, batch: 'paddle.Tensor'):
-        device = self.data_device
-        # 因为设置了CUDA_VISIBLE_DEVICES,可能会引起错误
-        device = get_device_from_visible(device)
-        return paddle_move_data_to_device(batch, device)
-
     @staticmethod
     def _check_optimizer_legality(optimizers):
         # paddle 存在设置分布式 optimizers 的函数,返回值为 fleet.meta_optimizers.HybridParallelOptimizer
@@ -637,9 +631,8 @@ class PaddleFleetDriver(PaddleDriver):
         :return: 如果当前不是分布式 driver 直接返回输入的 obj 。如果当前 rank 是接收端(其 global rank 包含在了 dst 中),则返回
             接收到的参数;如果是 source 端则返回发射的内容;既不是发送端、又不是接收端,则返回 None 。
         """
-        device = self.data_device
         # 因为设置了CUDA_VISIBLE_DEVICES,可能会引起错误
-        device = get_device_from_visible(device)
+        device = get_device_from_visible(self.data_device)
         return fastnlp_paddle_broadcast_object(obj, src, device=device, group=group)
 
     def all_gather(self, obj, group=None) -> List:
diff --git a/fastNLP/core/drivers/paddle_driver/fleet_launcher.py b/fastNLP/core/drivers/paddle_driver/fleet_launcher.py
index 471679a7..ca341db5 100644
--- a/fastNLP/core/drivers/paddle_driver/fleet_launcher.py
+++ b/fastNLP/core/drivers/paddle_driver/fleet_launcher.py
@@ -10,7 +10,6 @@ from fastNLP.envs.env import (
FASTNLP_DISTRIBUTED_CHECK, FASTNLP_LOG_LEVEL, FASTNLP_GLOBAL_SEED, - USER_CUDA_VISIBLE_DEVICES, ) from .utils import ( find_free_ports, diff --git a/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py b/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py index c0489e6e..46f51b9c 100644 --- a/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py +++ b/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py @@ -42,7 +42,8 @@ def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[ user_visible_devices = os.getenv("USER_CUDA_VISIBLE_DEVICES") if user_visible_devices is None: - raise RuntimeError("This situation cannot happen, please report a bug to us.") + raise RuntimeError("`USER_CUDA_VISIBLE_DEVICES` cannot be None, please check if you have set " + "`FASTNLP_BACKEND` to 'paddle' before using FastNLP.") _could_use_device_num = len(user_visible_devices.split(",")) if isinstance(device, int): if device < 0 and device != -1: diff --git a/fastNLP/core/drivers/paddle_driver/paddle_driver.py b/fastNLP/core/drivers/paddle_driver/paddle_driver.py index f65efd3d..48ff9de1 100644 --- a/fastNLP/core/drivers/paddle_driver/paddle_driver.py +++ b/fastNLP/core/drivers/paddle_driver/paddle_driver.py @@ -10,7 +10,7 @@ import numpy as np from .utils import _build_fp16_env, optimizer_state_to_device, DummyGradScaler from fastNLP.envs.imports import _NEED_IMPORT_PADDLE from fastNLP.core.drivers.driver import Driver -from fastNLP.core.utils import apply_to_collection, paddle_move_data_to_device +from fastNLP.core.utils import apply_to_collection, paddle_move_data_to_device, get_device_from_visible from fastNLP.envs import ( FASTNLP_SEED_WORKERS, FASTNLP_MODEL_FILENAME, @@ -394,7 +394,8 @@ class PaddleDriver(Driver): :return: 将移动到指定机器上的 batch 对象返回; """ - return paddle_move_data_to_device(batch, self.data_device) + device = get_device_from_visible(self.data_device) + return paddle_move_data_to_device(batch, device) @staticmethod def worker_init_function(worker_id: int, rank: Optional[int] = None) -> None: # pragma: no cover diff --git a/fastNLP/core/drivers/paddle_driver/single_device.py b/fastNLP/core/drivers/paddle_driver/single_device.py index 52805a97..84572914 100644 --- a/fastNLP/core/drivers/paddle_driver/single_device.py +++ b/fastNLP/core/drivers/paddle_driver/single_device.py @@ -2,14 +2,14 @@ import os from typing import Optional, Dict, Union, Callable, Tuple from .paddle_driver import PaddleDriver -from .utils import replace_batch_sampler, replace_sampler, get_device_from_visible +from .utils import replace_batch_sampler, replace_sampler from fastNLP.envs.imports import _NEED_IMPORT_PADDLE from fastNLP.envs.env import USER_CUDA_VISIBLE_DEVICES from fastNLP.core.utils import ( auto_param_call, + get_device_from_visible, get_paddle_gpu_str, get_paddle_device_id, - paddle_move_data_to_device, ) from fastNLP.core.utils.utils import _get_fun_msg from fastNLP.core.samplers import ( @@ -39,6 +39,9 @@ class PaddleSingleDriver(PaddleDriver): raise ValueError("`paddle.DataParallel` is not supported in `PaddleSingleDriver`") cuda_visible_devices = os.environ.get(USER_CUDA_VISIBLE_DEVICES, None) + if cuda_visible_devices is None: + raise RuntimeError("`USER_CUDA_VISIBLE_DEVICES` cannot be None, please check if you have set " + "`FASTNLP_BACKEND` to 'paddle' before using FastNLP.") if cuda_visible_devices == "": device = "cpu" logger.info("You have set `CUDA_VISIBLE_DEVICES` to '' in system environment variable, and we are gonna to" @@ -54,7 +57,7 @@ 
class PaddleSingleDriver(PaddleDriver): device_id = device else: device_id = get_paddle_device_id(device) - os.environ["CUDA_VISIBLE_DEVICES"] = os.environ[USER_CUDA_VISIBLE_DEVICES].split(",")[device_id] + os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices.split(",")[device_id] self.model_device = get_paddle_gpu_str(device) self.local_rank = 0 @@ -65,8 +68,7 @@ class PaddleSingleDriver(PaddleDriver): r""" 该函数用来初始化训练环境,用于设置当前训练的设备,并将模型迁移到对应设备上。 """ - device = self.model_device - device = get_device_from_visible(device, output_type=str) + device = get_device_from_visible(self.model_device, output_type=str) paddle.device.set_device(device) self.model.to(device) @@ -121,16 +123,6 @@ class PaddleSingleDriver(PaddleDriver): else: raise RuntimeError(f"There is no `{fn}` method in your {type(self.model)}.") - def move_data_to_device(self, batch: 'paddle.Tensor'): - r""" - 将数据迁移到指定的机器上;batch 可能是 list 也可能 dict ,或其嵌套结构。 - 在 Paddle 中使用可能会引起因与设置的设备不一致而产生的问题,请注意。 - - :return: 将移动到指定机器上的 batch 对象返回; - """ - device = get_device_from_visible(self.data_device) - return paddle_move_data_to_device(batch, device) - def set_dist_repro_dataloader(self, dataloader, dist: Union[str, ReproducibleBatchSampler, ReproducibleSampler]=None, reproducible: bool = False): r""" diff --git a/fastNLP/core/drivers/paddle_driver/utils.py b/fastNLP/core/drivers/paddle_driver/utils.py index 6cd7b252..60d243e7 100644 --- a/fastNLP/core/drivers/paddle_driver/utils.py +++ b/fastNLP/core/drivers/paddle_driver/utils.py @@ -6,12 +6,11 @@ import inspect import numpy as np from copy import deepcopy from contextlib import ExitStack, closing -from enum import IntEnum -from typing import Dict, Optional, Union +from typing import Dict, Optional from fastNLP.envs.imports import _NEED_IMPORT_PADDLE -from fastNLP.core.utils import get_paddle_device_id, auto_param_call, paddle_to -from fastNLP.envs.env import FASTNLP_GLOBAL_SEED, FASTNLP_SEED_WORKERS, USER_CUDA_VISIBLE_DEVICES +from fastNLP.core.utils import auto_param_call, paddle_to +from fastNLP.envs.env import FASTNLP_GLOBAL_SEED, FASTNLP_SEED_WORKERS from fastNLP.core.log import logger @@ -173,40 +172,6 @@ def find_free_ports(num): return None -def get_device_from_visible(device: Union[str, int], output_type=int): - """ - 在有 CUDA_VISIBLE_DEVICES 的情况下,获取对应的设备。 - 如 CUDA_VISIBLE_DEVICES=2,3 ,device=3 ,则返回1。 - - :param device: 未转化的设备名 - :param output_type: 返回值的类型 - :return: 转化后的设备id - """ - if output_type not in [int, str]: - raise ValueError("Parameter `output_type` should be one of these types: [int, str]") - if device == "cpu": - return device - cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") - idx = get_paddle_device_id(device) - if cuda_visible_devices is None or cuda_visible_devices == "": - # 这个判断一般不会发生,因为 fastnlp 会为 paddle 强行注入 CUDA_VISIBLE_DEVICES - raise RuntimeError("This situation should not happen, please report us this bug.") - else: - # 利用 USER_CUDA_VISIBLDE_DEVICES 获取用户期望的设备 - user_visible_devices = os.getenv(USER_CUDA_VISIBLE_DEVICES) - if user_visible_devices is None: - raise RuntimeError("This situation cannot happen, please report a bug to us.") - idx = user_visible_devices.split(",")[idx] - - cuda_visible_devices_list = cuda_visible_devices.split(',') - if idx not in cuda_visible_devices_list: - raise ValueError(f"Can't find your devices {idx} in CUDA_VISIBLE_DEVICES[{cuda_visible_devices}].") - res = cuda_visible_devices_list.index(idx) - if output_type == int: - return res - else: - return f"gpu:{res}" - def replace_batch_sampler(dataloader: "DataLoader", 
batch_sampler: "BatchSampler"):
     """
     利用 `batch_sampler` 重新构建一个 DataLoader,起到替换 `batch_sampler` 又不影响原 `dataloader` 的作用。
diff --git a/fastNLP/core/metrics/backend/paddle_backend/backend.py b/fastNLP/core/metrics/backend/paddle_backend/backend.py
index 243c5aac..aa57bbc2 100644
--- a/fastNLP/core/metrics/backend/paddle_backend/backend.py
+++ b/fastNLP/core/metrics/backend/paddle_backend/backend.py
@@ -1,11 +1,10 @@
-from typing import List, Optional, Any
+from typing import List, Any
 
 import numpy as np
 
 from fastNLP.core.metrics.backend import Backend
-from fastNLP.core.utils.paddle_utils import paddle_to
+from fastNLP.core.utils.paddle_utils import paddle_to, get_device_from_visible
 from fastNLP.core.metrics.utils import AggregateMethodError
-from fastNLP.core.drivers.paddle_driver.utils import get_device_from_visible
 from fastNLP.core.drivers.paddle_driver.dist_utils import fastnlp_paddle_all_gather
 from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
 
@@ -80,7 +79,6 @@ class PaddleBackend(Backend):
             raise ValueError(f"tensor: {tensor} can not convert to ndarray!")
 
     def move_tensor_to_device(self, tensor, device):
-        # TODO 如果在这里处理的话,会不会在别的地方引起bug?
         device = get_device_from_visible(device)
         return paddle_to(tensor, device)
 
diff --git a/fastNLP/core/utils/__init__.py b/fastNLP/core/utils/__init__.py
index 4af6a24a..ea716fe8 100644
--- a/fastNLP/core/utils/__init__.py
+++ b/fastNLP/core/utils/__init__.py
@@ -2,6 +2,7 @@ __all__ = [
     'cache_results',
     'is_jittor_dataset',
     'jittor_collate_wraps',
+    'get_device_from_visible',
     'paddle_to',
     'paddle_move_data_to_device',
     'get_paddle_device_id',
@@ -27,7 +28,7 @@ __all__ = [
 
 from .cache_results import cache_results
 from .jittor_utils import is_jittor_dataset, jittor_collate_wraps
-from .paddle_utils import paddle_to, paddle_move_data_to_device, get_paddle_device_id, get_paddle_gpu_str, is_in_paddle_dist, \
+from .paddle_utils import get_device_from_visible, paddle_to, paddle_move_data_to_device, get_paddle_device_id, get_paddle_gpu_str, is_in_paddle_dist, \
     is_in_fnlp_paddle_dist, is_in_paddle_launch_dist
 from .rich_progress import f_rich_progress
 from .torch_paddle_utils import torch_paddle_move_data_to_device
diff --git a/fastNLP/core/utils/cache_results.py b/fastNLP/core/utils/cache_results.py
index ff253f3e..f8d34bc9 100644
--- a/fastNLP/core/utils/cache_results.py
+++ b/fastNLP/core/utils/cache_results.py
@@ -3,6 +3,7 @@ import hashlib
 import _pickle
 import functools
 import os
+import re
 from typing import Callable, List, Any, Optional
 import inspect
 import ast
@@ -126,7 +127,10 @@ def _get_func_and_its_called_func_source_code(func) -> List[str]:
             # some failure
             pass
     del last_frame  #
-    sources.append(inspect.getsource(func))
+    func_source_code = inspect.getsource(func)  # 将这个函数中的 cache_results 装饰器删除掉。
+    for match in list(re.finditer(r'@cache_results\(.*\)\n', func_source_code))[::-1]:
+        func_source_code = func_source_code[:match.start()] + func_source_code[match.end():]
+    sources.append(func_source_code)
     return sources
 
@@ -163,11 +167,12 @@ def cal_fn_hash_code(fn: Optional[Callable] = None, fn_kwargs: Optional[dict] =
     if fn_kwargs is None:
         fn_kwargs = {}
     hasher = Hasher()
-    try:
-        sources = _get_func_and_its_called_func_source_code(fn)
-        hasher.update(sources)
-    except:
-        return "can't be hashed"
+    if fn is not None:
+        try:
+            sources = _get_func_and_its_called_func_source_code(fn)
+            hasher.update(sources)
+        except:
+            return "can't be hashed"
     for key in sorted(fn_kwargs):
         hasher.update(key)
         try:
@@ -177,7 +182,7 @@ def cal_fn_hash_code(fn:
Optional[Callable] = None, fn_kwargs: Optional[dict] =
     return hasher.hexdigest()
 
 
-def cache_results(_cache_fp, _refresh=False, _verbose=1, _check_hash=True):
+def cache_results(_cache_fp, _hash_param=True, _refresh=False, _verbose=1, _check_hash=True):
     r"""
     cache_results是fastNLP中用于cache数据的装饰器。通过下面的例子看一下如何使用::
 
@@ -186,9 +191,9 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1, _check_hash=True):
         from fastNLP import cache_results
 
         @cache_results('cache.pkl')
-        def process_data():
+        def process_data(second=1):
             # 一些比较耗时的工作,比如读取数据,预处理数据等,这里用time.sleep()代替耗时
-            time.sleep(1)
+            time.sleep(second)
             return np.random.randint(10, size=(5,))
 
         start_time = time.time()
@@ -199,49 +204,45 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1, _check_hash=True):
         print("res =",process_data())
         print(time.time() - start_time)
 
-        # 输出内容如下,可以看到两次结果相同,且第二次几乎没有花费时间
-        # Save cache to cache.pkl.
+        start_time = time.time()
+        print("res =",process_data(second=2))
+        print(time.time() - start_time)
+
+        # 输出内容如下,可以看到前两次结果相同,且第二次几乎没有花费时间。第三次由于参数变化了,所以cache的结果也就自然变化了。
+        # Save cache to 2d145aeb_cache.pkl.
         # res = [5 4 9 1 8]
-        # 1.0042750835418701
-        # Read cache from cache.pkl.
+        # 1.0134737491607666
+        # Read cache from 2d145aeb_cache.pkl (Saved on xxxx).
         # res = [5 4 9 1 8]
         # 0.0040721893310546875
+        # Save cache to 0ead3093_cache.pkl.
+        # res = [1 8 2 5 1]
+        # 2.0086121559143066
 
-    可以看到第二次运行的时候,只用了0.0001s左右,是由于第二次运行将直接从cache.pkl这个文件读取数据,而不会经过再次预处理::
-
-    # 还是以上面的例子为例,如果需要重新生成另一个cache,比如另一个数据集的内容,通过如下的方式调用即可
-    process_data(_cache_fp='cache2.pkl')  # 完全不影响之前的‘cache.pkl'
-
-    上面的_cache_fp是cache_results会识别的参数,它将从'cache2.pkl'这里缓存/读取数据,即这里的'cache2.pkl'覆盖默认的
-    'cache.pkl'。如果在你的函数前面加上了@cache_results()则你的函数会增加三个参数[_cache_fp, _refresh, _verbose]。
-    上面的例子即为使用_cache_fp的情况,这三个参数不会传入到你的函数中,当然你写的函数参数名也不可能包含这三个名称::
-
-        process_data(_cache_fp='cache2.pkl', _refresh=True)  # 这里强制重新生成一份对预处理的cache。
-        #  _verbose是用于控制输出信息的,如果为0,则不输出任何内容;如果为1,则会提醒当前步骤是读取的cache还是生成了新的cache
+    可以看到第二次运行的时候,只用了 0.004s 左右,是由于第二次运行将直接从 cache 文件读取数据,而不会经过再次预处理。
+    如果在函数前加上了装饰器 @cache_results(),则函数会增加五个参数 [_cache_fp, _hash_param, _refresh, _verbose,
+    _check_hash]。上面的例子即为使用 _cache_fp 的情况,这五个参数不会传入到被装饰函数中,当然被装饰函数的参数名也不能包含这五个名称。
 
     :param str _cache_fp: 将返回结果缓存到什么位置;或从什么位置读取缓存。如果为None,cache_results没有任何效用,除非在
-        函数调用的时候传入_cache_fp这个参数。
-    :param bool _refresh: 是否重新生成cache。
+        函数调用的时候传入 _cache_fp 这个参数。保存文件的名称会受到 _hash_param 的影响。
+    :param bool _hash_param: 是否将传入给被装饰函数的 parameter 进行 str 之后的 hash 结果加入到 _cache_fp 中,这样每次函数的
+        parameter 改变的时候,cache 文件就自动改变了。
+    :param bool _refresh: 强制重新生成新的 cache。
     :param int _verbose: 是否打印cache的信息。
     :param bool _check_hash: 如果为 True 将尝试对比修饰的函数的源码以及该函数内部调用的函数的源码的hash值。如果发现保存时的hash值
         与当前的hash值有差异,会报warning。但该warning可能出现实质上并不影响结果的误报(例如增删空白行);且在修改不涉及源码时,虽然
         该修改对结果有影响,但无法做出warning。
-    :return:
     """
     def wrapper_(func):
         signature = inspect.signature(func)
         for key, _ in signature.parameters.items():
-            if key in ('_cache_fp', '_refresh', '_verbose', '_check_hash'):
+            if key in ('_cache_fp', '_hash_param', '_refresh', '_verbose', '_check_hash'):
                 raise RuntimeError("The function decorated by cache_results cannot have keyword `{}`.".format(key))
 
         @functools.wraps(func)
         def wrapper(*args, **kwargs):
-            fn_param = kwargs.copy()
-            if args:
-                params = [p.name for p in inspect.signature(func).parameters.values()]
-                fn_param.update(zip(params, args))
             if '_cache_fp' in kwargs:
                 cache_filepath = kwargs.pop('_cache_fp')
                 assert
isinstance(cache_filepath, str), "_cache_fp can only be str."
@@ -263,10 +268,31 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1, _check_hash=True):
             else:
                 check_hash = _check_hash
 
+            if '_hash_param' in kwargs:
+                hash_param = kwargs.pop('_hash_param')
+                assert isinstance(hash_param, bool), "_hash_param can only be bool."
+            else:
+                hash_param = _hash_param
+
+            if hash_param and cache_filepath is not None:  # 尝试将parameter给hash一下
+                try:
+                    params = dict(inspect.getcallargs(func, *args, **kwargs))
+                    if inspect.ismethod(func):  # 如果是 method 的话第一个参数(一般就是 self )就不考虑了
+                        first_key = next(iter(params))
+                        params.pop(first_key)
+                    if len(params):
+                        # sort 一下防止顺序改变
+                        params = {k: str(v) for k, v in sorted(params.items(), key=lambda item: item[0])}
+                        param_hash = cal_fn_hash_code(None, params)[:8]
+                        head, tail = os.path.split(cache_filepath)
+                        cache_filepath = os.path.join(head, param_hash + '_' + tail)
+                except BaseException as e:
+                    logger.debug(f"Fail to add parameter hash to cache path, because of Exception:{e}")
+
             refresh_flag = True
             new_hash_code = None
             if check_hash:
-                new_hash_code = cal_fn_hash_code(func, fn_param)
+                new_hash_code = cal_fn_hash_code(func, None)
 
             if cache_filepath is not None and refresh is False:
                 # load data
@@ -281,13 +307,13 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1, _check_hash=True):
                         logger.info("Read cache from {} (Saved on {}).".format(cache_filepath, save_time))
                     if check_hash and old_hash_code != new_hash_code:
                         logger.warning(f"The function `{func.__name__}` is different from its last cache (Save on {save_time}). The "
-                                       f"difference may caused by the sourcecode change of the functions by this function.",
+                                       f"difference may be caused by the sourcecode change.",
                                        extra={'highlighter': ColorHighlighter('red')})
                 refresh_flag = False
 
             if refresh_flag:
                 if new_hash_code is None:
-                    new_hash_code = cal_fn_hash_code(func, fn_param)
+                    new_hash_code = cal_fn_hash_code(func, None)
                 results = func(*args, **kwargs)
                 if cache_filepath is not None:
                     if results is None:
diff --git a/fastNLP/core/utils/paddle_utils.py b/fastNLP/core/utils/paddle_utils.py
index e4c0a8a9..db68879f 100644
--- a/fastNLP/core/utils/paddle_utils.py
+++ b/fastNLP/core/utils/paddle_utils.py
@@ -1,4 +1,5 @@
 __all__ = [
+    "get_device_from_visible",
     "paddle_to",
     "paddle_move_data_to_device",
     "get_paddle_gpu_str",
@@ -13,13 +14,43 @@ import re
 from typing import Any, Optional, Union
 
 from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
-from fastNLP.envs import FASTNLP_DISTRIBUTED_CHECK, FASTNLP_BACKEND_LAUNCH
+from fastNLP.envs import FASTNLP_DISTRIBUTED_CHECK, FASTNLP_BACKEND_LAUNCH, USER_CUDA_VISIBLE_DEVICES
 
 if _NEED_IMPORT_PADDLE:
     import paddle
 
 from .utils import apply_to_collection
 
+
+def get_device_from_visible(device: Union[str, int], output_type=int):
+    """
+    在有 CUDA_VISIBLE_DEVICES 的情况下,获取对应的设备。
+    如 CUDA_VISIBLE_DEVICES=2,3 ,device=3 ,则返回1。
+
+    :param device: 未转化的设备名
+    :param output_type: 返回值的类型
+    :return: 转化后的设备id
+    """
+    if output_type not in [int, str]:
+        raise ValueError("Parameter `output_type` should be one of these types: [int, str]")
+    if device == "cpu":
+        return device
+    cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
+    user_visible_devices = os.getenv(USER_CUDA_VISIBLE_DEVICES)
+    if user_visible_devices is None:
+        raise RuntimeError("`USER_CUDA_VISIBLE_DEVICES` cannot be None, please check if you have set "
+                           "`FASTNLP_BACKEND` to 'paddle' before using FastNLP.")
+    idx = get_paddle_device_id(device)
+    # 利用 USER_CUDA_VISIBLE_DEVICES 获取用户期望的设备
+    idx = user_visible_devices.split(",")[idx]
+
+    cuda_visible_devices_list = cuda_visible_devices.split(',')
+    if idx not in cuda_visible_devices_list:
+        raise ValueError(f"Can't find your devices {idx} in CUDA_VISIBLE_DEVICES[{cuda_visible_devices}].")
+    res = cuda_visible_devices_list.index(idx)
+    if output_type == int:
+        return res
+    else:
+        return f"gpu:{res}"
+
+
diff --git a/tests/core/collators/padders/test_get_padder.py b/tests/core/collators/padders/test_get_padder.py
index a07a943e..5996f023 100644
--- a/tests/core/collators/padders/test_get_padder.py
+++ b/tests/core/collators/padders/test_get_padder.py
@@ -14,10 +14,10 @@ def test_get_element_shape_dtype():
     catalog = _get_element_shape_dtype([np.zeros(3), np.zeros((2, 1))])
 
-@pytest.mark.parametrize('backend', ['raw', None, 'numpy', 'torch', 'jittor', 'paddle'])
+# @pytest.mark.parametrize('backend', ['raw', None, 'numpy', 'torch', 'jittor', 'paddle'])
+@pytest.mark.parametrize('backend', ['raw', None, 'numpy', 'torch', 'paddle'])
 @pytest.mark.torch
 @pytest.mark.paddle
-@pytest.mark.jittor
 def test_get_padder_run(backend):
     if not _NEED_IMPORT_TORCH and backend == 'torch':
         pytest.skip("No torch")
diff --git a/tests/core/controllers/_test_trainer_fleet.py b/tests/core/controllers/_test_trainer_fleet.py
index f438b6de..309e6eb4 100644
--- a/tests/core/controllers/_test_trainer_fleet.py
+++ b/tests/core/controllers/_test_trainer_fleet.py
@@ -1,7 +1,7 @@
 """
 这个文件测试用户以python -m paddle.distributed.launch 启动的情况
 看看有没有用pytest执行的机会
-python -m paddle.distributed.launch --gpus=0,2,3 test_trainer_fleet.py
+FASTNLP_BACKEND=paddle python -m paddle.distributed.launch --gpus=0,2,3 _test_trainer_fleet.py
 """
 import os
 import sys
diff --git a/tests/core/controllers/_test_trainer_fleet_outside.py b/tests/core/controllers/_test_trainer_fleet_outside.py
index e8c9a244..d2bcbc41 100644
--- a/tests/core/controllers/_test_trainer_fleet_outside.py
+++ b/tests/core/controllers/_test_trainer_fleet_outside.py
@@ -1,7 +1,7 @@
 """
 这个文件测试用户以python -m paddle.distributed.launch 启动的情况
 并且自己初始化了 fleet
-python -m paddle.distributed.launch --gpus=0,2,3 test_trainer_fleet_outside.py
+FASTNLP_BACKEND=paddle python -m paddle.distributed.launch --gpus=0,2,3 _test_trainer_fleet_outside.py
 """
 import os
 import sys
@@ -93,5 +93,5 @@ if __name__ == "__main__":
         driver=driver,
         device=device,
         callbacks=callbacks,
-        n_epochs=30,
+        n_epochs=5,
     )
\ No newline at end of file
diff --git a/tests/core/controllers/test_trainer_paddle.py b/tests/core/controllers/test_trainer_paddle.py
index 3cf850c3..895e8517 100644
--- a/tests/core/controllers/test_trainer_paddle.py
+++ b/tests/core/controllers/test_trainer_paddle.py
@@ -27,7 +27,7 @@ class TrainPaddleConfig:
 @pytest.mark.parametrize("driver,device", [("paddle", "cpu"), ("paddle", 1), ("fleet", [0, 1])])
 # @pytest.mark.parametrize("driver,device", [("fleet", [0, 1])])
 @pytest.mark.parametrize("callbacks", [[RichCallback(5)]])
-@pytest.mark.paddle
+@pytest.mark.paddledist
 @magic_argv_env_context
 def test_trainer_paddle(
     driver,
diff --git a/tests/core/dataloaders/paddle_dataloader/test_fdl.py b/tests/core/dataloaders/paddle_dataloader/test_fdl.py
index d8ba521b..6632ad17 100644
---
a/tests/core/dataloaders/paddle_dataloader/test_fdl.py +++ b/tests/core/dataloaders/paddle_dataloader/test_fdl.py @@ -58,11 +58,3 @@ class TestPaddle: for batch in fdl1: assert batch['image'].shape == [4, 10, 5] print(batch) - - def test_v2(self): - from fastNLP.core.collators import Collator - logger.setLevel("DEBUG") - data = [paddle.Tensor(np.random.random((10, 5)).astype('float32')), paddle.Tensor(np.random.random((10, 5)).astype('float32'))] - col = Collator(backend="jittor") - res = col(data) - print(res) \ No newline at end of file diff --git a/tests/core/dataset/test_dataset.py b/tests/core/dataset/test_dataset.py index 8ff64d04..a2540ecf 100644 --- a/tests/core/dataset/test_dataset.py +++ b/tests/core/dataset/test_dataset.py @@ -370,29 +370,11 @@ class TestDataSetMethods: assert os.path.exists("1.csv") == True os.remove("1.csv") - def test_add_collate_fn(self): - ds = DataSet({'x': [1, 2, 3], 'y': [4, 5, 6]}) - - def collate_fn(item): - return item - - ds.add_collate_fn(collate_fn) - - def test_get_collator(self): - from typing import Callable - ds = DataSet({'x': [1, 2, 3], 'y': [4, 5, 6]}) - collate_fn = ds.get_collator() - assert isinstance(collate_fn, Callable) == True - def test_add_seq_len(self): ds = DataSet({'x': [[1, 2], [2, 3, 4], [3]], 'y': [4, 5, 6]}) ds.add_seq_len('x') print(ds) - def test_set_target(self): - ds = DataSet({'x': [[1, 2], [2, 3, 4], [3]], 'y': [4, 5, 6]}) - ds.set_target('x') - class TestFieldArrayInit: """ diff --git a/tests/core/drivers/paddle_driver/test_dist_utils.py b/tests/core/drivers/paddle_driver/test_dist_utils.py index da40ad78..e3a3eb5d 100644 --- a/tests/core/drivers/paddle_driver/test_dist_utils.py +++ b/tests/core/drivers/paddle_driver/test_dist_utils.py @@ -19,7 +19,7 @@ if _NEED_IMPORT_PADDLE: import paddle import paddle.distributed as dist -@pytest.mark.paddle +@pytest.mark.paddledist class TestDistUtilsTools: """ 测试一些工具函数 @@ -79,14 +79,13 @@ class TestDistUtilsTools: assert res["int"] == paddle_dict["int"] assert res["string"] == paddle_dict["string"] - -@pytest.mark.paddle +@pytest.mark.paddledist class TestAllGatherAndBroadCast: @classmethod def setup_class(cls): devices = [0,1,2] - output_from_new_proc = "only_error" + output_from_new_proc = "all" launcher = FleetLauncher(devices=devices, output_from_new_proc=output_from_new_proc) cls.local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", "0")) diff --git a/tests/core/drivers/paddle_driver/test_fleet.py b/tests/core/drivers/paddle_driver/test_fleet.py index 3b3f15ec..453af92a 100644 --- a/tests/core/drivers/paddle_driver/test_fleet.py +++ b/tests/core/drivers/paddle_driver/test_fleet.py @@ -39,7 +39,7 @@ def generate_driver(num_labels, feature_dimension, device=[0,1], fp16=False, out # ############################################################################ -@pytest.mark.paddle +@pytest.mark.paddledist class TestFleetDriverFunction: """ 测试 PaddleFleetDriver 一些简单函数的测试类,基本都是测试能否运行、是否存在 import 错误等问题 @@ -147,7 +147,7 @@ class TestFleetDriverFunction: # ############################################################################ -@pytest.mark.paddle +@pytest.mark.paddledist class TestSetDistReproDataloader: @classmethod @@ -521,7 +521,7 @@ class TestSetDistReproDataloader: # ############################################################################ -@pytest.mark.paddle +@pytest.mark.paddledist class TestSaveLoad: """ 测试多卡情况下 save 和 load 相关函数的表现 diff --git a/tests/core/drivers/paddle_driver/test_single_device.py b/tests/core/drivers/paddle_driver/test_single_device.py index 
ba243106..ffcb35e7 100644 --- a/tests/core/drivers/paddle_driver/test_single_device.py +++ b/tests/core/drivers/paddle_driver/test_single_device.py @@ -552,22 +552,17 @@ def generate_random_driver(features, labels, fp16=False, device="cpu"): return driver -@pytest.fixture -def prepare_test_save_load(): - dataset = PaddleRandomMaxDataset(40, 10) - dataloader = DataLoader(dataset, batch_size=4) - driver1, driver2 = generate_random_driver(10, 10), generate_random_driver(10, 10) - return driver1, driver2, dataloader - @pytest.mark.paddle @pytest.mark.parametrize("only_state_dict", ([True, False])) -def test_save_and_load_model(prepare_test_save_load, only_state_dict): +def test_save_and_load_model(only_state_dict): """ 测试 save_model 和 load_model 函数 """ try: path = "model" - driver1, driver2, dataloader = prepare_test_save_load + dataset = PaddleRandomMaxDataset(40, 10) + dataloader = DataLoader(dataset, batch_size=4) + driver1, driver2 = generate_random_driver(10, 10, device="gpu"), generate_random_driver(10, 10, device="gpu") if only_state_dict: driver1.save_model(path, only_state_dict) diff --git a/tests/core/drivers/paddle_driver/test_utils.py b/tests/core/drivers/paddle_driver/test_utils.py index 3b0fb9e0..66dc23c4 100644 --- a/tests/core/drivers/paddle_driver/test_utils.py +++ b/tests/core/drivers/paddle_driver/test_utils.py @@ -1,8 +1,6 @@ -import os import pytest from fastNLP.core.drivers.paddle_driver.utils import ( - get_device_from_visible, replace_batch_sampler, replace_sampler, ) @@ -14,24 +12,6 @@ if _NEED_IMPORT_PADDLE: from tests.helpers.datasets.paddle_data import PaddleNormalDataset -@pytest.mark.parametrize( - ("user_visible_devices, cuda_visible_devices, device, output_type, correct"), - ( - ("0,1,2,3,4,5,6,7", "0", "cpu", str, "cpu"), - ("0,1,2,3,4,5,6,7", "0", "cpu", int, "cpu"), - ("0,1,2,3,4,5,6,7", "3,4,5", "gpu:4", int, 1), - ("0,1,2,3,4,5,6,7", "3,4,5", "gpu:5", str, "gpu:2"), - ("3,4,5,6", "3,5", 0, int, 0), - ("3,6,7,8", "6,7,8", "gpu:2", str, "gpu:1"), - ) -) -@pytest.mark.paddle -def test_get_device_from_visible_str(user_visible_devices, cuda_visible_devices, device, output_type, correct): - os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices - os.environ["USER_CUDA_VISIBLE_DEVICES"] = user_visible_devices - res = get_device_from_visible(device, output_type) - assert res == correct - @pytest.mark.paddle def test_replace_batch_sampler(): dataset = PaddleNormalDataset(10) diff --git a/tests/core/drivers/torch_driver/test_single_device.py b/tests/core/drivers/torch_driver/test_single_device.py index 9115ed19..086f4251 100644 --- a/tests/core/drivers/torch_driver/test_single_device.py +++ b/tests/core/drivers/torch_driver/test_single_device.py @@ -545,22 +545,17 @@ def generate_random_driver(features, labels, fp16=False, device="cpu"): return driver -@pytest.fixture -def prepare_test_save_load(): - dataset = TorchArgMaxDataset(10, 40) - dataloader = DataLoader(dataset, batch_size=4) - driver1, driver2 = generate_random_driver(10, 10), generate_random_driver(10, 10) - return driver1, driver2, dataloader - @pytest.mark.torch @pytest.mark.parametrize("only_state_dict", ([True, False])) -def test_save_and_load_model(prepare_test_save_load, only_state_dict): +def test_save_and_load_model(only_state_dict): """ 测试 save_model 和 load_model 函数 """ try: path = "model" - driver1, driver2, dataloader = prepare_test_save_load + dataset = TorchArgMaxDataset(10, 40) + dataloader = DataLoader(dataset, batch_size=4) + driver1, driver2 = generate_random_driver(10, 10), 
generate_random_driver(10, 10)
 
         driver1.save_model(path, only_state_dict)
         driver2.load_model(path, only_state_dict)
diff --git a/tests/core/utils/test_cache_results.py b/tests/core/utils/test_cache_results.py
index efef9f10..726ad74e 100644
--- a/tests/core/utils/test_cache_results.py
+++ b/tests/core/utils/test_cache_results.py
@@ -246,6 +246,106 @@ class TestCacheResults:
             rank_zero_rm('demo.pkl')
 
 
+def remove_postfix(folder='.', post_fix='.pkl'):
+    import os
+    for f in os.listdir(folder):
+        if os.path.isfile(os.path.join(folder, f)) and f.endswith(post_fix):
+            os.remove(os.path.join(folder, f))
+
+
+class TestCacheResultsWithParam:
+    @pytest.mark.parametrize('_refresh', [True, False])
+    @pytest.mark.parametrize('_hash_param', [True, False])
+    @pytest.mark.parametrize('_verbose', [0, 1])
+    @pytest.mark.parametrize('_check_hash', [True, False])
+    def test_cache_save(self, _refresh, _hash_param, _verbose, _check_hash):
+        cache_fp = 'demo.pkl'
+        try:
+            @cache_results(cache_fp, _refresh=_refresh, _hash_param=_hash_param, _verbose=_verbose,
+                           _check_hash=_check_hash)
+            def demo(a=1):
+                print("¥")
+                return 1
+            res = demo()
+
+            with Capturing() as output:
+                res = demo(a=1)
+            if not _refresh:
+                assert '¥' not in output[0]
+            if _verbose == 0:
+                assert 'read' not in output[0]
+
+            with Capturing() as output:
+                res = demo(1)
+            if not _refresh:
+                assert '¥' not in output[0]
+
+            with Capturing() as output:
+                res = demo(a=2)
+            if _hash_param:  # 一定对不上,需要重新生成
+                assert '¥' in output[0]
+
+        finally:
+            remove_postfix('.')
+
+    def test_cache_complex_param(self):
+        cache_fp = 'demo.pkl'
+        try:
+            @cache_results(cache_fp, _refresh=False)
+            def demo(*args, s=1, **kwargs):
+                print("¥")
+                return 1
+
+            res = demo(1,2,3, s=4, d=4)
+            with Capturing() as output:
+                res = demo(1,2,3,d=4, s=4)
+            assert '¥' not in output[0]
+        finally:
+            remove_postfix('.')
+
+    def test_wrapper_change(self):
+        cache_fp = 'demo.pkl'
+        test_type = 'wrapper_change'
+        try:
+            cmd = f'python {__file__} --cache_fp {cache_fp} --test_type {test_type} --turn 0'
+            res = get_subprocess_results(cmd)
+            assert "¥" in res
+            cmd = f'python {__file__} --cache_fp {cache_fp} --test_type {test_type} --turn 1'
+            res = get_subprocess_results(cmd)
+            assert "¥" not in res
+            assert 'Read' in res
+            assert 'different' not in res
+
+        finally:
+            remove_postfix('.')
+
+    def test_param_change(self):
+        cache_fp = 'demo.pkl'
+        test_type = 'param_change'
+        try:
+            cmd = f'python {__file__} --cache_fp {cache_fp} --test_type {test_type} --turn 0'
+            res = get_subprocess_results(cmd)
+            assert "¥" in res
+            cmd = f'python {__file__} --cache_fp {cache_fp} --test_type {test_type} --turn 1'
+            res = get_subprocess_results(cmd)
+            assert "¥" in res
+            assert 'Read' not in res
+        finally:
+            remove_postfix('.')
+
+    def test_create_cache_dir(self):
+        @cache_results('demo/demo.pkl')
+        def cache(s):
+            return 1, 2
+
+        try:
+            results = cache(s=1)
+            assert (1, 2) == results
+        finally:
+            import shutil
+            shutil.rmtree('demo/')
+
+
 if __name__ == '__main__':
     import argparse
     parser = argparse.ArgumentParser()
@@ -294,3 +394,31 @@ if __name__ == '__main__':
 
         res = demo_func()
 
+    if test_type == 'wrapper_change':
+        if turn == 0:
+            @cache_results(cache_fp, _refresh=True)
+            def demo_wrapper_change():
+                print("¥")
+                return 1
+        else:
+            @cache_results(cache_fp, _refresh=False)
+            def demo_wrapper_change():
+                print("¥")
+                return 1
+
+        res = demo_wrapper_change()
+
+    if test_type == 'param_change':
+        if turn == 0:
+            @cache_results(cache_fp, _refresh=False)
+            def demo_param_change():
+                print("¥")
+                return 1
+        else:
+            
@cache_results(cache_fp, _refresh=False) + def demo_param_change(a=1): + print("¥") + return 1 + + res = demo_param_change() + diff --git a/tests/core/utils/test_paddle_utils.py b/tests/core/utils/test_paddle_utils.py index ba9dcf79..d86d215f 100644 --- a/tests/core/utils/test_paddle_utils.py +++ b/tests/core/utils/test_paddle_utils.py @@ -1,10 +1,40 @@ +import os + import pytest -from fastNLP.core.utils.paddle_utils import paddle_to, paddle_move_data_to_device +from fastNLP.core.utils.paddle_utils import get_device_from_visible, paddle_to, paddle_move_data_to_device from fastNLP.envs.imports import _NEED_IMPORT_PADDLE if _NEED_IMPORT_PADDLE: import paddle - +@pytest.mark.parametrize( + ("user_visible_devices, cuda_visible_devices, device, output_type, correct"), + ( + ("0,1,2,3,4,5,6,7", "0", "cpu", str, "cpu"), + ("0,1,2,3,4,5,6,7", "0", "cpu", int, "cpu"), + ("0,1,2,3,4,5,6,7", "3,4,5", "gpu:4", int, 1), + ("0,1,2,3,4,5,6,7", "3,4,5", "gpu:5", str, "gpu:2"), + ("3,4,5,6", "3,5", 0, int, 0), + ("3,6,7,8", "6,7,8", "gpu:2", str, "gpu:1"), + ) +) +@pytest.mark.paddle +def test_get_device_from_visible(user_visible_devices, cuda_visible_devices, device, output_type, correct): + _cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") + _user_visible_devices = os.getenv("USER_CUDA_VISIBLE_DEVICES") + os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices + os.environ["USER_CUDA_VISIBLE_DEVICES"] = user_visible_devices + res = get_device_from_visible(device, output_type) + assert res == correct + + # 还原环境变量 + if _cuda_visible_devices is None: + del os.environ["CUDA_VISIBLE_DEVICES"] + else: + os.environ["CUDA_VISIBLE_DEVICES"] = _cuda_visible_devices + if _user_visible_devices is None: + del os.environ["USER_CUDA_VISIBLE_DEVICES"] + else: + os.environ["USER_CUDA_VISIBLE_DEVICES"] = _user_visible_devices ############################################################################ # @@ -22,12 +52,6 @@ class TestPaddleToDevice: assert res.place.gpu_device_id() == 0 res = paddle_to(tensor, "cpu") assert res.place.is_cpu_place() - res = paddle_to(tensor, "gpu:2") - assert res.place.is_gpu_place() - assert res.place.gpu_device_id() == 2 - res = paddle_to(tensor, "gpu:1") - assert res.place.is_gpu_place() - assert res.place.gpu_device_id() == 1 ############################################################################ # @@ -64,28 +88,18 @@ class TestPaddleMoveDataToDevice: res = paddle_move_data_to_device(paddle_tensor, device="gpu:0", data_device=None) self.check_gpu(res, 0) - res = paddle_move_data_to_device(paddle_tensor, device="gpu:1", data_device=None) - self.check_gpu(res, 1) - res = paddle_move_data_to_device(paddle_tensor, device="gpu:0", data_device="cpu") self.check_gpu(res, 0) res = paddle_move_data_to_device(paddle_tensor, device=None, data_device="gpu:0") self.check_gpu(res, 0) - res = paddle_move_data_to_device(paddle_tensor, device=None, data_device="gpu:1") - self.check_gpu(res, 1) - def test_list_transfer(self): """ 测试张量列表的迁移 """ paddle_list = [paddle.rand((6, 4, 2)) for i in range(10)] - res = paddle_move_data_to_device(paddle_list, device=None, data_device="gpu:1") - assert isinstance(res, list) - for r in res: - self.check_gpu(r, 1) res = paddle_move_data_to_device(paddle_list, device="cpu", data_device="gpu:1") assert isinstance(res, list) @@ -97,11 +111,6 @@ class TestPaddleMoveDataToDevice: for r in res: self.check_gpu(r, 0) - res = paddle_move_data_to_device(paddle_list, device="gpu:1", data_device="cpu") - assert isinstance(res, list) - for r in res: - 
self.check_gpu(r, 1) - def test_tensor_tuple_transfer(self): """ 测试张量元组的迁移 @@ -109,10 +118,6 @@ class TestPaddleMoveDataToDevice: paddle_list = [paddle.rand((6, 4, 2)) for i in range(10)] paddle_tuple = tuple(paddle_list) - res = paddle_move_data_to_device(paddle_tuple, device=None, data_device="gpu:1") - assert isinstance(res, tuple) - for r in res: - self.check_gpu(r, 1) res = paddle_move_data_to_device(paddle_tuple, device="cpu", data_device="gpu:1") assert isinstance(res, tuple) @@ -124,11 +129,6 @@ class TestPaddleMoveDataToDevice: for r in res: self.check_gpu(r, 0) - res = paddle_move_data_to_device(paddle_tuple, device="gpu:1", data_device="cpu") - assert isinstance(res, tuple) - for r in res: - self.check_gpu(r, 1) - def test_dict_transfer(self): """ 测试字典结构的迁移 @@ -173,20 +173,6 @@ class TestPaddleMoveDataToDevice: self.check_gpu(t, 0) self.check_gpu(res["dict"]["tensor"], 0) - res = paddle_move_data_to_device(paddle_dict, device=None, data_device="gpu:1") - assert isinstance(res, dict) - self.check_gpu(res["tensor"], 1) - assert isinstance(res["list"], list) - for t in res["list"]: - self.check_gpu(t, 1) - assert isinstance(res["int"], int) - assert isinstance(res["string"], str) - assert isinstance(res["dict"], dict) - assert isinstance(res["dict"]["list"], list) - for t in res["dict"]["list"]: - self.check_gpu(t, 1) - self.check_gpu(res["dict"]["tensor"], 1) - res = paddle_move_data_to_device(paddle_dict, device="cpu", data_device="gpu:0") assert isinstance(res, dict) self.check_cpu(res["tensor"]) diff --git a/tests/pytest.ini b/tests/pytest.ini index d6a33a94..5015a002 100644 --- a/tests/pytest.ini +++ b/tests/pytest.ini @@ -2,5 +2,6 @@ markers = torch paddle + paddledist jittor torchpaddle \ No newline at end of file diff --git a/tutorials/data/test4dataset.csv b/tutorials/data/test4dataset.csv new file mode 100644 index 00000000..fdf87c08 --- /dev/null +++ b/tutorials/data/test4dataset.csv @@ -0,0 +1,7 @@ +,SentenceId,Sentence,Sentiment +0,1,"['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', ',', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'a', 'story', '.']",negative +1,2,"['this', 'quiet', ',', 'introspective', 'and', 'entertaining', 'independent', 'is', 'worth', 'seeking', '.']",positive +2,3,"['even', 'fans', 'of', 'ismail', 'merchant', ""'s"", 'work', ',', 'i', 'suspect', ',', 'would', 'have', 'a', 'hard', 'time', 'sitting', 'through', 'this', 'one', '.']",negative +3,4,"['a', 'positively', 'thrilling', 'combination', 'of', 'ethnography', 'and', 'all', 'the', 'intrigue', ',', 'betrayal', ',', 'deceit', 'and', 'murder', 'of', 'a', 'shakespearean', 'tragedy', 'or', 'a', 'juicy', 'soap', 'opera', '.']",neutral +4,5,"['a', 'comedy-drama', 'of', 'nearly', 'epic', 'proportions', 'rooted', 'in', 'a', 'sincere', 'performance', 'by', 'the', 'title', 'character', 'undergoing', 'midlife', 'crisis', '.']",positive +5,6,"['the', 'importance', 'of', 'being', 'earnest', ',', 'so', 'thick', 'with', 'wit', 'it', 'plays', 'like', 'a', 'reading', 'from', 'bartlett', ""'s"", 'familiar', 'quotations']",neutral diff --git a/tutorials/data/test4dataset.tsv b/tutorials/data/test4dataset.tsv new file mode 100644 index 00000000..ea2f6f37 --- /dev/null +++ b/tutorials/data/test4dataset.tsv @@ -0,0 +1,7 @@ +SentenceId Sentence Sentiment +1 A series of escapades demonstrating the adage that what is good for the goose is 
also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .	negative
+2	This quiet , introspective and entertaining independent is worth seeking .	positive
+3	Even fans of Ismail Merchant 's work , I suspect , would have a hard time sitting through this one .	negative
+4	A positively thrilling combination of ethnography and all the intrigue , betrayal , deceit and murder of a Shakespearean tragedy or a juicy soap opera .	neutral
+5	A comedy-drama of nearly epic proportions rooted in a sincere performance by the title character undergoing midlife crisis .	positive
+6	The Importance of Being Earnest , so thick with wit it plays like a reading from Bartlett 's Familiar Quotations	neutral
diff --git a/tutorials/fastnlp_tutorial_1.ipynb b/tutorials/fastnlp_tutorial_1.ipynb
index 11bd2219..93e7a664 100644
--- a/tutorials/fastnlp_tutorial_1.ipynb
+++ b/tutorials/fastnlp_tutorial_1.ipynb
@@ -153,7 +153,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "1969418794120 1971237588872\n",
+     "2438703969992 2438374526920\n",
      "+-----+------------------------+------------------------+-----+\n",
      "| idx | sentence               | words                  | num |\n",
      "+-----+------------------------+------------------------+-----+\n",
@@ -198,7 +198,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "1971237588872 1971237588872\n",
+     "2438374526920 2438374526920\n",
      "+-----+------------------------+------------------------+-----+\n",
      "| idx | sentence               | words                  | num |\n",
      "+-----+------------------------+------------------------+-----+\n",
@@ -774,9 +774,9 @@
    {
     "data": {
      "text/plain": [
-       "{'sentence': <fastNLP.core.dataset.field.FieldArray object at 0x...>,\n",
-       " 'words': <fastNLP.core.dataset.field.FieldArray object at 0x...>,\n",
-       " 'num': <fastNLP.core.dataset.field.FieldArray object at 0x...>}"
+       "{'sentence': <fastNLP.core.dataset.field.FieldArray object at 0x...>,\n",
+       " 'words': <fastNLP.core.dataset.field.FieldArray object at 0x...>,\n",
+       " 'num': <fastNLP.core.dataset.field.FieldArray object at 0x...>}"
      ]
     },
     "execution_count": 15,
@@ -923,7 +923,8 @@
    "output_type": "stream",
    "text": [
     "5 Counter({'生活': 1, '就像': 1, '海洋': 1})\n",
-    "6 Counter({'生活': 1, '就像': 1, '海洋': 1, '只有': 1})\n"
+    "6 Counter({'生活': 1, '就像': 1, '海洋': 1, '只有': 1})\n",
+    "6 {'<pad>': 0, '<unk>': 1, '生活': 2, '就像': 3, '海洋': 4, '只有': 5}\n"
    ]
   }
  ],
 "source": [
  "vocab.add_word_lst(['生活', '就像', '海洋'])\n",
  "print(len(vocab), vocab.word_count)\n",
  "vocab.add_word('只有')\n",
- "print(len(vocab), vocab.word_count)"
+ "print(len(vocab), vocab.word_count)\n",
+ "print(len(vocab), vocab.word2idx)"
 ]
},
{
@@ -959,7 +961,6 @@
    "<pad> 0\n",
    "<unk> 1\n",
    "生活 2\n",
-   "只有 5\n",
    "彼岸 1 False\n"
   ]
  }
 ],
 "source": [
  "print(vocab.to_word(0), vocab.to_index('<pad>'))\n",
  "print(vocab.to_word(1), vocab.to_index('<unk>'))\n",
  "print(vocab.to_word(2), vocab.to_index('生活'))\n",
- "print(vocab.to_word(5), vocab.to_index('只有'))\n",
  "print('彼岸', vocab.to_index('彼岸'), vocab.has_word('彼岸'))"
 ]
},
{
@@ -979,7 +979,9 @@
 "source": [
  "**`vocabulary`允许反复添加相同单词**,**可以通过`word_count`方法看到相应单词被添加的次数**\n",
  "\n",
- "  但其中没有`<pad>`和`<unk>`,`vocabulary`的全部变量与函数可以通过`dir(vocabulary)`查询"
+ "  但其中没有`<pad>`和`<unk>`,`vocabulary`的全部变量与函数可以通过`dir(vocabulary)`查询\n",
+ "\n",
+ "  注:**使用`add_word_lst`添加单词**,**单词对应序号不会动态调整**,**使用`dataset`添加单词的情况不同**"
 ]
},
{
@@ -992,15 +994,19 @@
   "name": "stdout",
   "output_type": "stream",
   "text": [
-    "13 Counter({'生活': 2, '就像': 2, '海洋': 2, '只有': 2, '意志': 1, '坚强的': 1, '人': 1, '才': 1, '能': 1, '到达': 1, '彼岸': 1})\n",
-    "彼岸 12 True\n"
+    "生活 2\n",
+    "彼岸 12 True\n",
+    "13 Counter({'人': 4, '生活': 2, '就像': 2, '海洋': 2, '只有': 2, '意志': 1, '坚强的': 1, '才': 1, '能': 1, '到达': 1, '彼岸': 1})\n",
+    "13 {'<pad>': 0, '<unk>': 1, '生活': 2, '就像': 3, '海洋': 4, '只有': 5, '人': 6, '意志': 7, '坚强的': 8, '才': 9, '能': 10, '到达': 11, '彼岸': 12}\n"
   ]
  }
 ],
 "source": [
- "vocab.add_word_lst(['生活', '就像', '海洋', '只有', '意志', '坚强的', '人', '才', '能', 
'到达', '彼岸'])\n", + "vocab.add_word_lst(['生活', '就像', '海洋', '只有', '意志', '坚强的', '人', '人', '人', '人', '才', '能', '到达', '彼岸'])\n", + "print(vocab.to_word(2), vocab.to_index('生活'))\n", + "print('彼岸', vocab.to_index('彼岸'), vocab.has_word('彼岸'))\n", "print(len(vocab), vocab.word_count)\n", - "print('彼岸', vocab.to_index('彼岸'), vocab.has_word('彼岸'))" + "print(len(vocab), vocab.word2idx)" ] }, { @@ -1082,52 +1088,440 @@ "## 3 dataset 和 vocabulary 的组合使用\n", " \n", "### 3.1 从 dataframe 中加载 dataset\n", - "\n" + "\n", + "以下通过 [NLP-beginner](https://github.com/FudanNLP/nlp-beginner) 实践一中 [Rotten Tomatoes 影评数据集](https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews) 的部分训练数据组成`test4dataset.tsv`文件\n", + "\n", + "  介绍如何使用`dataset`、`vocabulary`简单加载并处理数据集,首先使用`pandas`模块,读取原始数据的`dataframe`" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "3dbd985d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SentenceIdSentenceSentiment
01A series of escapades demonstrating the adage ...negative
12This quiet , introspective and entertaining in...positive
23Even fans of Ismail Merchant 's work , I suspe...negative
34A positively thrilling combination of ethnogra...neutral
45A comedy-drama of nearly epic proportions root...positive
56The Importance of Being Earnest , so thick wit...neutral
\n", + "
" + ], + "text/plain": [ + " SentenceId Sentence Sentiment\n", + "0 1 A series of escapades demonstrating the adage ... negative\n", + "1 2 This quiet , introspective and entertaining in... positive\n", + "2 3 Even fans of Ismail Merchant 's work , I suspe... negative\n", + "3 4 A positively thrilling combination of ethnogra... neutral\n", + "4 5 A comedy-drama of nearly epic proportions root... positive\n", + "5 6 The Importance of Being Earnest , so thick wit... neutral" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv('./data/test4dataset.tsv', sep='\\t')\n", + "df" ] }, { "cell_type": "markdown", - "id": "89059713", + "id": "919ab350", "metadata": {}, - "source": [] + "source": [ + "接着,通过`dataset`中的`from_pandas`方法填充数据集,并使用`apply_more`方法对文本进行分词操作" + ] }, { "cell_type": "code", - "execution_count": null, - "id": "3dbd985d", + "execution_count": 25, + "id": "4f634586", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------+------------------------------+-----------+\n", + "| SentenceId | Sentence | Sentiment |\n", + "+------------+------------------------------+-----------+\n", + "| 1 | ['a', 'series', 'of', 'es... | negative |\n", + "| 2 | ['this', 'quiet', ',', 'i... | positive |\n", + "| 3 | ['even', 'fans', 'of', 'i... | negative |\n", + "| 4 | ['a', 'positively', 'thri... | neutral |\n", + "| 5 | ['a', 'comedy-drama', 'of... | positive |\n", + "| 6 | ['the', 'importance', 'of... | neutral |\n", + "+------------+------------------------------+-----------+\n" + ] + } + ], + "source": [ + "from fastNLP.core.dataset import DataSet\n", + "\n", + "dataset = DataSet()\n", + "dataset = dataset.from_pandas(df)\n", + "dataset.apply_more(lambda ins:{'SentenceId': ins['SentenceId'], \n", + " 'Sentence': ins['Sentence'].lower().split(), 'Sentiment': ins['Sentiment']})\n", + "print(dataset)" + ] + }, + { + "cell_type": "markdown", + "id": "5c1ae192", + "metadata": {}, + "source": [ + "  如果需要保存中间结果,也可以使用`dataset`的`to_csv`方法,生成`.csv`或`.tsv`文件" + ] }, { "cell_type": "code", - "execution_count": null, - "id": "4f634586", + "execution_count": 26, + "id": "46722efc", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "dataset.to_csv('./data/test4dataset.csv')" + ] }, { "cell_type": "markdown", "id": "5ba13989", "metadata": {}, "source": [ - "### 3.2 从 dataset 中获取 vocabulary" + "### 3.2 从 dataset 中获取 vocabulary\n", + "\n", + "然后,初始化`vocabulary`,使用`vocabulary`中的`from_dataset`方法,从`dataset`的指定字段中\n", + "\n", + "  获取字段中的所有元素,然后编号;如果指定字段是个列表,则针对字段中所有列表包含的元素编号\n", + "\n", + "  注:**使用`dataset`添加单词**,**不同于`add_word_list`**,**单词被添加次数越多**,**序号越靠前**,例如案例中的`a`" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "id": "a2de615b", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Counter({'a': 9, 'of': 9, ',': 7, 'the': 6, '.': 5, 'is': 3, 'and': 3, 'good': 2, 'for': 2, 'which': 2, 'this': 2, \"'s\": 2, 'series': 1, 'escapades': 1, 'demonstrating': 1, 'adage': 1, 'that': 1, 'what': 1, 'goose': 1, 'also': 1, 'gander': 1, 'some': 1, 'occasionally': 1, 'amuses': 1, 'but': 1, 'none': 1, 'amounts': 1, 'to': 1, 'much': 1, 'story': 1, 'quiet': 1, 'introspective': 1, 'entertaining': 1, 'independent': 1, 'worth': 1, 'seeking': 1, 'even': 1, 'fans': 1, 'ismail': 1, 'merchant': 1, 'work': 1, 'i': 1, 'suspect': 1, 'would': 1, 'have': 1, 'hard': 1, 'time': 1, 'sitting': 1, 'through': 1, 'one': 1, 'positively': 1, 'thrilling': 1, 'combination': 1, 'ethnography': 1, 'all': 1, 'intrigue': 1, 'betrayal': 1, 'deceit': 1, 'murder': 1, 'shakespearean': 1, 'tragedy': 1, 'or': 1, 'juicy': 1, 'soap': 1, 'opera': 1, 'comedy-drama': 1, 'nearly': 1, 'epic': 1, 'proportions': 1, 'rooted': 1, 'in': 1, 'sincere': 1, 'performance': 1, 'by': 1, 'title': 1, 'character': 1, 'undergoing': 1, 'midlife': 1, 'crisis': 1, 'importance': 1, 'being': 1, 'earnest': 1, 'so': 1, 'thick': 1, 'with': 1, 'wit': 1, 'it': 1, 'plays': 1, 'like': 1, 'reading': 1, 'from': 1, 'bartlett': 1, 'familiar': 1, 'quotations': 1}) \n", + "\n", + "{'': 0, '': 1, 'a': 2, 'of': 3, ',': 4, 'the': 5, '.': 6, 'is': 7, 'and': 8, 'good': 9, 'for': 10, 'which': 11, 'this': 12, \"'s\": 13, 'series': 14, 'escapades': 15, 'demonstrating': 16, 'adage': 17, 'that': 18, 'what': 19, 'goose': 20, 'also': 21, 'gander': 22, 'some': 23, 'occasionally': 24, 'amuses': 25, 'but': 26, 'none': 27, 'amounts': 28, 'to': 29, 'much': 30, 'story': 31, 'quiet': 32, 'introspective': 33, 'entertaining': 34, 'independent': 35, 'worth': 36, 'seeking': 37, 'even': 38, 'fans': 39, 'ismail': 40, 'merchant': 41, 'work': 42, 'i': 43, 'suspect': 44, 'would': 45, 'have': 46, 'hard': 47, 'time': 48, 'sitting': 49, 'through': 50, 'one': 51, 'positively': 52, 'thrilling': 53, 'combination': 54, 'ethnography': 55, 'all': 56, 'intrigue': 57, 'betrayal': 58, 'deceit': 59, 'murder': 60, 'shakespearean': 61, 'tragedy': 62, 'or': 63, 'juicy': 64, 'soap': 65, 'opera': 66, 'comedy-drama': 67, 'nearly': 68, 'epic': 69, 'proportions': 70, 'rooted': 71, 'in': 72, 'sincere': 73, 'performance': 74, 'by': 75, 'title': 76, 'character': 77, 'undergoing': 78, 'midlife': 79, 'crisis': 80, 'importance': 81, 'being': 82, 'earnest': 83, 'so': 84, 'thick': 85, 'with': 86, 'wit': 87, 'it': 88, 'plays': 89, 'like': 90, 'reading': 91, 'from': 92, 'bartlett': 93, 'familiar': 94, 'quotations': 95} \n", + "\n", + "Vocabulary(['a', 'series', 'of', 'escapades', 'demonstrating']...)\n" + ] + } + ], + "source": [ + "from fastNLP.core.vocabulary import Vocabulary\n", + "\n", + "vocab = Vocabulary()\n", + "vocab = vocab.from_dataset(dataset, field_name='Sentence')\n", + "print(vocab.word_count, '\\n')\n", + "print(vocab.word2idx, '\\n')\n", + "print(vocab)" + ] + }, + { + "cell_type": "markdown", + "id": "f0857ccb", + "metadata": {}, + "source": [ + "之后,**通过`vocabulary`的`index_dataset`方法**,**调整`dataset`中指定字段的元素**,**使用编号将之代替**\n", + "\n", + "  使用上述方法,可以将影评数据集中的单词序列转化为词编号序列,为接下来转化为词嵌入序列做准备" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, + "id": "2f9a04b2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------+------------------------------+-----------+\n", + "| SentenceId | Sentence | Sentiment |\n", + "+------------+------------------------------+-----------+\n", + "| 1 | [2, 14, 3, 15, 16, 5, 17,... | negative |\n", + "| 2 | [12, 32, 4, 33, 8, 34, 35... | positive |\n", + "| 3 | [38, 39, 3, 40, 41, 13, 4... | negative |\n", + "| 4 | [2, 52, 53, 54, 3, 55, 8,... | neutral |\n", + "| 5 | [2, 67, 3, 68, 69, 70, 71... | positive |\n", + "| 6 | [5, 81, 3, 82, 83, 4, 84,... | neutral |\n", + "+------------+------------------------------+-----------+\n" + ] + } + ], + "source": [ + "vocab.index_dataset(dataset, field_name='Sentence')\n", + "print(dataset)" + ] + }, + { + "cell_type": "markdown", + "id": "6b26b707", + "metadata": {}, + "source": [ + "最后,使用相同方法,再将`dataset`中`Sentiment`字段中的`negative`、`neutral`、`positive`转化为数字编号" + ] + }, + { + "cell_type": "code", + "execution_count": 29, "id": "5f5eed18", "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'negative': 0, 'positive': 1, 'neutral': 2}\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------+------------------------------+-----------+\n", + "| SentenceId | Sentence | Sentiment |\n", + "+------------+------------------------------+-----------+\n", + "| 1 | [2, 14, 3, 15, 16, 5, 17,... | 0 |\n", + "| 2 | [12, 32, 4, 33, 8, 34, 35... | 1 |\n", + "| 3 | [38, 39, 3, 40, 41, 13, 4... | 0 |\n", + "| 4 | [2, 52, 53, 54, 3, 55, 8,... | 2 |\n", + "| 5 | [2, 67, 3, 68, 69, 70, 71... | 1 |\n", + "| 6 | [5, 81, 3, 82, 83, 4, 84,... | 2 |\n", + "+------------+------------------------------+-----------+\n" + ] + } + ], + "source": [ + "target_vocab = Vocabulary(padding=None, unknown=None)\n", + "\n", + "target_vocab.from_dataset(dataset, field_name='Sentiment')\n", + "print(target_vocab.word2idx)\n", + "target_vocab.index_dataset(dataset, field_name='Sentiment')\n", + "print(dataset)" + ] + }, + { + "cell_type": "markdown", + "id": "eed7ea64", + "metadata": {}, + "source": [ + "在最后的最后,通过以下的一张图,来总结本章关于`dataset`和`vocabulary`主要知识点的讲解,以及两者的联系\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35b4f0f7", + "metadata": {}, "outputs": [], "source": [] } diff --git a/tutorials/fastnlp_tutorial_2.ipynb b/tutorials/fastnlp_tutorial_2.ipynb new file mode 100644 index 00000000..260d5bf4 --- /dev/null +++ b/tutorials/fastnlp_tutorial_2.ipynb @@ -0,0 +1,41 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/tutorials/figures/T1-fig-dataset-and-vocabulary.png b/tutorials/figures/T1-fig-dataset-and-vocabulary.png new file mode 100644 index 00000000..803cf34a Binary files /dev/null and b/tutorials/figures/T1-fig-dataset-and-vocabulary.png differ