diff --git a/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py b/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py index 2cba6388..eac2d4a4 100644 --- a/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py +++ b/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py @@ -47,9 +47,7 @@ def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[ raise ValueError("Parameter `device` can only be '-1' when it is smaller than 0.") if device >= _could_use_device_num: raise ValueError("The gpu device that parameter `device` specifies is not existed.") - if device != -1: - device = f"gpu:{device}" - else: + if device == -1: device = list(range(_could_use_device_num)) elif isinstance(device, Sequence) and not isinstance(device, str): device = list(set(device)) @@ -61,9 +59,6 @@ def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[ elif each >= _could_use_device_num: raise ValueError("When parameter `device` is 'Sequence' type, the value in it should not be bigger than" " the available gpu number.") - if len(device) == 1: - # 传入了 [1] 这样的,视为单卡。 - device = device[0] elif device is not None and not isinstance(device, str): raise ValueError("Parameter `device` is wrong type, please check our documentation for the right use.") @@ -82,6 +77,6 @@ def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[ logger.warning("Notice you are using `fleet` driver, but your chosen `device` is only one gpu, we will" "still use `PaddleFleetDriver` for you, but if you mean using `PaddleSingleDriver`, you should " "choose `paddle` driver.") - return PaddleFleetDriver(model, device, **kwargs) + return PaddleFleetDriver(model, [device], **kwargs) else: return PaddleFleetDriver(model, device, **kwargs) diff --git a/fastNLP/core/drivers/paddle_driver/paddle_driver.py b/fastNLP/core/drivers/paddle_driver/paddle_driver.py index fe8bf404..ed1aad73 100644 --- a/fastNLP/core/drivers/paddle_driver/paddle_driver.py +++ b/fastNLP/core/drivers/paddle_driver/paddle_driver.py @@ -19,7 +19,12 @@ from fastNLP.envs import ( rank_zero_call, ) from fastNLP.core.log import logger -from fastNLP.core.samplers import ReproducibleBatchSampler, ReproducibleSampler, RandomBatchSampler +from fastNLP.core.samplers import ( + ReproducibleBatchSampler, + ReproducibleSampler, + RandomBatchSampler, + RandomSampler, +) if _NEED_IMPORT_PADDLE: import paddle @@ -29,7 +34,7 @@ if _NEED_IMPORT_PADDLE: Dataset, Sampler, BatchSampler, - RandomSampler, + RandomSampler as PaddleRandomSampler, ) from paddle.optimizer import Optimizer @@ -333,6 +338,9 @@ class PaddleDriver(Driver): sampler = dataloader_args.batch_sampler elif isinstance(dataloader_args.sampler, ReproducibleSampler): sampler = dataloader_args.sampler + elif isinstance(dataloader_args.sampler, PaddleRandomSampler): + sampler = RandomSampler(dataloader_args.sampler.data_source) + logger.debug("Replace paddle RandomSampler into fastNLP RandomSampler.") elif self.is_distributed(): raise RuntimeError("It is not allowed to use checkpoint retraining when you do not use our or " "`ReproducibleSampler`.") @@ -464,7 +472,7 @@ class PaddleDriver(Driver): res.sampler = dataloader.batch_sampler.sampler if hasattr(dataloader.batch_sampler.sampler, "shuffle"): res.shuffle = dataloader.batch_sampler.sampler.shuffle - elif isinstance(dataloader.batch_sampler.sampler, RandomSampler): + elif isinstance(dataloader.batch_sampler.sampler, PaddleRandomSampler): res.shuffle = True else: res.shuffle = False @@ -474,7 +482,7 @@ class PaddleDriver(Driver): res.sampler = batch_sampler.sampler if hasattr(batch_sampler.sampler, "shuffle"): res.shuffle = dataloader.batch_sampler.sampler.shuffle - elif isinstance(batch_sampler.sampler, RandomSampler): + elif isinstance(batch_sampler.sampler, PaddleRandomSampler): res.shuffle = True else: res.shuffle = False diff --git a/tests/core/drivers/paddle_driver/test_initialize_paddle_driver.py b/tests/core/drivers/paddle_driver/test_initialize_paddle_driver.py index 54ef22b6..df96d746 100644 --- a/tests/core/drivers/paddle_driver/test_initialize_paddle_driver.py +++ b/tests/core/drivers/paddle_driver/test_initialize_paddle_driver.py @@ -19,7 +19,7 @@ def test_incorrect_driver(): @pytest.mark.parametrize( "device", - ["cpu", "gpu:0", 0, [1]] + ["cpu", "gpu:0", 0] ) @pytest.mark.parametrize( "driver", @@ -27,7 +27,7 @@ def test_incorrect_driver(): ) def test_get_single_device(driver, device): """ - 测试正常情况下初始化PaddleSingleDriver的情况 + 测试正常情况下初始化 PaddleSingleDriver 的情况 """ model = PaddleNormalModel_Classification_1(2, 100) @@ -36,7 +36,7 @@ def test_get_single_device(driver, device): @pytest.mark.parametrize( "device", - [0, 1] + [0, 1, [1]] ) @pytest.mark.parametrize( "driver", @@ -45,7 +45,7 @@ def test_get_single_device(driver, device): @magic_argv_env_context def test_get_fleet_2(driver, device): """ - 测试 fleet 多卡的初始化情况 + 测试 fleet 多卡的初始化情况,但传入了单个 gpu """ model = PaddleNormalModel_Classification_1(64, 10) diff --git a/tests/core/drivers/paddle_driver/test_single_device.py b/tests/core/drivers/paddle_driver/test_single_device.py index c80bd609..2aa4e0e6 100644 --- a/tests/core/drivers/paddle_driver/test_single_device.py +++ b/tests/core/drivers/paddle_driver/test_single_device.py @@ -34,7 +34,7 @@ class TestPaddleDriverFunctions: def test_check_single_optimizer_legality(self): """ - 测试传入单个optimizer时的表现 + 测试传入单个 optimizer 时的表现 """ optimizer = paddle.optimizer.Adam( parameters=self.driver.model.parameters(), @@ -50,7 +50,7 @@ class TestPaddleDriverFunctions: def test_check_optimizers_legality(self): """ - 测试传入optimizer list的表现 + 测试传入 optimizer list 的表现 """ optimizers = [ paddle.optimizer.Adam( @@ -70,13 +70,13 @@ class TestPaddleDriverFunctions: def test_check_dataloader_legality_in_train(self): """ - 测试is_train参数为True时,_check_dataloader_legality函数的表现 + 测试 `is_train` 参数为 True 时,_check_dataloader_legality 函数的表现 """ - dataloader = paddle.io.DataLoader(PaddleNormalDataset()) + dataloader = DataLoader(PaddleNormalDataset()) PaddleSingleDriver.check_dataloader_legality(dataloader, "dataloader", True) # batch_size 和 batch_sampler 均为 None 的情形 - dataloader = paddle.io.DataLoader(PaddleNormalDataset(), batch_size=None) + dataloader = DataLoader(PaddleNormalDataset(), batch_size=None) with pytest.raises(ValueError): PaddleSingleDriver.check_dataloader_legality(dataloader, "dataloader", True) @@ -90,29 +90,29 @@ class TestPaddleDriverFunctions: def test_check_dataloader_legality_in_test(self): """ - 测试is_train参数为False时,_check_dataloader_legality函数的表现 + 测试 `is_train` 参数为 False 时,_check_dataloader_legality 函数的表现 """ # 此时传入的应该是dict dataloader = { - "train": paddle.io.DataLoader(PaddleNormalDataset()), - "test":paddle.io.DataLoader(PaddleNormalDataset()) + "train": DataLoader(PaddleNormalDataset()), + "test":DataLoader(PaddleNormalDataset()) } PaddleSingleDriver.check_dataloader_legality(dataloader, "dataloader", False) # batch_size 和 batch_sampler 均为 None 的情形 dataloader = { - "train": paddle.io.DataLoader(PaddleNormalDataset()), - "test":paddle.io.DataLoader(PaddleNormalDataset(), batch_size=None) + "train": DataLoader(PaddleNormalDataset()), + "test":DataLoader(PaddleNormalDataset(), batch_size=None) } with pytest.raises(ValueError): PaddleSingleDriver.check_dataloader_legality(dataloader, "dataloader", False) - # 传入的不是dict,应该报错 - dataloader = paddle.io.DataLoader(PaddleNormalDataset()) + # 传入的不是 dict ,应该报错 + dataloader = DataLoader(PaddleNormalDataset()) with pytest.raises(ValueError): PaddleSingleDriver.check_dataloader_legality(dataloader, "dataloader", False) - # 创建torch的dataloader + # 创建 torch 的 dataloader train_loader = torch.utils.data.DataLoader( TorchNormalDataset(), batch_size=32, shuffle=True @@ -127,7 +127,7 @@ class TestPaddleDriverFunctions: def test_tensor_to_numeric(self): """ - 测试tensor_to_numeric函数 + 测试 tensor_to_numeric 函数 """ # 单个张量 tensor = paddle.to_tensor(3) @@ -180,7 +180,7 @@ class TestPaddleDriverFunctions: def test_set_model_mode(self): """ - 测试set_model_mode函数 + 测试 set_model_mode 函数 """ self.driver.set_model_mode("train") assert self.driver.model.training @@ -192,14 +192,14 @@ class TestPaddleDriverFunctions: def test_move_model_to_device_cpu(self): """ - 测试move_model_to_device函数 + 测试 move_model_to_device 函数 """ PaddleSingleDriver.move_model_to_device(self.driver.model, "cpu") assert self.driver.model.linear1.weight.place.is_cpu_place() def test_move_model_to_device_gpu(self): """ - 测试move_model_to_device函数 + 测试 move_model_to_device 函数 """ PaddleSingleDriver.move_model_to_device(self.driver.model, "gpu") assert self.driver.model.linear1.weight.place.is_gpu_place() @@ -207,7 +207,7 @@ class TestPaddleDriverFunctions: def test_worker_init_function(self): """ - 测试worker_init_function + 测试 worker_init_function """ # 先确保不影响运行 # TODO:正确性 @@ -215,7 +215,7 @@ class TestPaddleDriverFunctions: def test_set_deterministic_dataloader(self): """ - 测试set_deterministic_dataloader + 测试 set_deterministic_dataloader """ # 先确保不影响运行 # TODO:正确性 @@ -224,7 +224,7 @@ class TestPaddleDriverFunctions: def test_set_sampler_epoch(self): """ - 测试set_sampler_epoch + 测试 set_sampler_epoch """ # 先确保不影响运行 # TODO:正确性 @@ -336,7 +336,7 @@ class TestSingleDeviceFunction: def test_move_data_to_device(self): """ - 这个函数仅调用了paddle_move_data_to_device,测试例在tests/core/utils/test_paddle_utils.py中 + 这个函数仅调用了 paddle_move_data_to_device ,测试例在 tests/core/utils/test_paddle_utils.py 中 就不重复测试了 """ self.driver.move_data_to_device(paddle.rand((32, 64))) @@ -490,9 +490,6 @@ class TestSetDistReproDataloader: else: sampler_states = replaced_loader.batch_sampler.sampler.state_dict() - # 加载 num_consumed_samples_array,设置正确取出的 batch 数目 - num_consumed_samples_array = sampler_states.pop('num_consumed_samples_array', None) - # 重新加载,应该可以输出剩下的内容,且对于 PaddleNormalDataset 来说,排序后应该是一个 range left_idxes = set() if isinstance(replaced_loader.batch_sampler, RandomBatchSampler): @@ -510,7 +507,6 @@ class TestSetDistReproDataloader: new_loader.batch_sampler.load_state_dict(sampler_states) else: batch_size = replaced_loader.batch_sampler.batch_size - num_consumed_samples = num_consumed_batches * batch_size sampler_states["num_consumed_samples"] = num_consumed_batches * batch_size # 重新构造 dataloader batch_sampler = BatchSampler(replaced_loader.dataset, shuffle=shuffle, batch_size=batch_size) diff --git a/tests/core/drivers/torch_driver/test_initialize_torch_driver.py b/tests/core/drivers/torch_driver/test_initialize_torch_driver.py new file mode 100644 index 00000000..6c47e30e --- /dev/null +++ b/tests/core/drivers/torch_driver/test_initialize_torch_driver.py @@ -0,0 +1,103 @@ +import os +import pytest + +os.environ["FASTNLP_BACKEND"] = "torch" + +from fastNLP.core.drivers import TorchSingleDriver, TorchDDPDriver +from fastNLP.core.drivers.torch_driver.initialize_torch_driver import initialize_torch_driver +from fastNLP.envs import get_gpu_count +from tests.helpers.models.torch_model import TorchNormalModel_Classification_1 +from tests.helpers.utils import magic_argv_env_context + +import torch + +def test_incorrect_driver(): + + model = TorchNormalModel_Classification_1(2, 100) + with pytest.raises(ValueError): + driver = initialize_torch_driver("paddle", 0, model) + +@pytest.mark.parametrize( + "device", + ["cpu", "cuda:0", 0, torch.device("cuda:0")] +) +@pytest.mark.parametrize( + "driver", + ["torch"] +) +def test_get_single_device(driver, device): + """ + 测试正常情况下初始化TorchSingleDriver的情况 + """ + + model = TorchNormalModel_Classification_1(2, 100) + driver = initialize_torch_driver(driver, device, model) + assert isinstance(driver, TorchSingleDriver) + +@pytest.mark.parametrize( + "device", + [0, 1] +) +@pytest.mark.parametrize( + "driver", + ["torch_ddp"] +) +@magic_argv_env_context +def test_get_ddp_2(driver, device): + """ + 测试 ddp 多卡的初始化情况,但传入了单个 gpu + """ + + model = TorchNormalModel_Classification_1(64, 10) + driver = initialize_torch_driver(driver, device, model) + + assert isinstance(driver, TorchDDPDriver) + +@pytest.mark.parametrize( + "device", + [[0, 2, 3], -1] +) +@pytest.mark.parametrize( + "driver", + ["torch", "torch_ddp"] +) +@magic_argv_env_context +def test_get_ddp(driver, device): + """ + 测试 ddp 多卡的初始化情况 + """ + + model = TorchNormalModel_Classification_1(64, 10) + driver = initialize_torch_driver(driver, device, model) + + assert isinstance(driver, TorchDDPDriver) + +@pytest.mark.parametrize( + ("driver", "device"), + [("torch_ddp", "cpu")] +) +@magic_argv_env_context +def test_get_ddp_cpu(driver, device): + """ + 测试试图在 cpu 上初始化分布式训练的情况 + """ + model = TorchNormalModel_Classification_1(64, 10) + with pytest.raises(ValueError): + driver = initialize_torch_driver(driver, device, model) + +@pytest.mark.parametrize( + "device", + [-2, [0, torch.cuda.device_count() + 1, 3], [-2], torch.cuda.device_count() + 1] +) +@pytest.mark.parametrize( + "driver", + ["torch", "torch_ddp"] +) +@magic_argv_env_context +def test_device_out_of_range(driver, device): + """ + 测试传入的device超过范围的情况 + """ + model = TorchNormalModel_Classification_1(2, 100) + with pytest.raises(ValueError): + driver = initialize_torch_driver(driver, device, model) \ No newline at end of file