From c4af9f21c67ba65367a19ace869ebe472f000ce2 Mon Sep 17 00:00:00 2001 From: x54-729 <17307130121@fudan.edu.cn> Date: Sat, 14 May 2022 11:13:26 +0000 Subject: [PATCH] =?UTF-8?q?paddle=20=E5=92=8C=E6=B5=8B=E8=AF=95=E4=BE=8B?= =?UTF-8?q?=E8=B7=9F=E8=BF=9B=20set=5Fdist=5Frepro=5Fdataloader=20?= =?UTF-8?q?=E5=87=BD=E6=95=B0=EF=BC=9B=E4=BF=AE=E6=94=B9test=5Ftrainer=5Fw?= =?UTF-8?q?o=5Fevaluator=5Ftorch.py=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../drivers/paddle_driver/single_device.py | 27 ++++++++++++------- .../test_trainer_wo_evaluator_torch.py | 2 +- .../paddle_driver/test_single_device.py | 16 ++++------- .../torch_driver/test_single_device.py | 19 ++++++------- 4 files changed, 31 insertions(+), 33 deletions(-) diff --git a/fastNLP/core/drivers/paddle_driver/single_device.py b/fastNLP/core/drivers/paddle_driver/single_device.py index 9d362938..ba404814 100644 --- a/fastNLP/core/drivers/paddle_driver/single_device.py +++ b/fastNLP/core/drivers/paddle_driver/single_device.py @@ -123,17 +123,24 @@ class PaddleSingleDriver(PaddleDriver): if reproducible: if isinstance(args.sampler, paddle.io.RandomSampler): - # 如果本来就是随机的,直接替换 - sampler = RandomSampler(args.sampler.data_source) - logger.debug("Replace paddle RandomSampler into fastNLP RandomSampler.") + if getattr(args.sampler, '_num_samples', None) is None \ + and getattr(args.sampler, 'replacements', False) is False \ + and getattr(args.sampler, 'generator', None) is None: + # 如果本来就是随机的,并且没有定制,直接替换掉。 + sampler = RandomSampler(args.sampler.data_source, shuffle=True) + logger.debug("Replace paddle RandomSampler into fastNLP RandomSampler.") + return replace_sampler(dataloader, sampler) + elif isinstance(args.sampler, paddle.io.SequenceSampler): + # 需要替换为不要 shuffle 的。 + sampler = RandomSampler(args.sampler.data_source, shuffle=False) + logger.debug("Replace paddle SequentialSampler into fastNLP RandomSampler.") return replace_sampler(dataloader, sampler) - else: - batch_sampler = ReproduceBatchSampler( - batch_sampler=args.batch_sampler, - batch_size=args.batch_size, - drop_last=args.drop_last - ) - return replace_batch_sampler(dataloader, batch_sampler) + batch_sampler = ReproduceBatchSampler( + batch_sampler=args.batch_sampler, + batch_size=args.batch_size, + drop_last=args.drop_last + ) + return replace_batch_sampler(dataloader, batch_sampler) else: return dataloader diff --git a/tests/core/controllers/test_trainer_wo_evaluator_torch.py b/tests/core/controllers/test_trainer_wo_evaluator_torch.py index 5b794459..ad7bf97d 100644 --- a/tests/core/controllers/test_trainer_wo_evaluator_torch.py +++ b/tests/core/controllers/test_trainer_wo_evaluator_torch.py @@ -250,7 +250,7 @@ def test_trainer_output_from_new_proc( @pytest.mark.torch -@pytest.mark.parametrize("driver,device", [("torch", [1, 2])]) +@pytest.mark.parametrize("driver,device", [("torch", [0, 1])]) @pytest.mark.parametrize("cur_rank", [0]) # 依次测试如果是当前进程出现错误,是否能够正确地 kill 掉其他进程; , 1, 2, 3 @magic_argv_env_context def test_trainer_on_exception( diff --git a/tests/core/drivers/paddle_driver/test_single_device.py b/tests/core/drivers/paddle_driver/test_single_device.py index e7d6707a..67ea1b42 100644 --- a/tests/core/drivers/paddle_driver/test_single_device.py +++ b/tests/core/drivers/paddle_driver/test_single_device.py @@ -386,22 +386,16 @@ class TestSetDistReproDataloader: def test_with_reproducible_true(self, shuffle): """ 测试 set_dist_repro_dataloader 参数 `reproducible` 为 True 时的表现 - 当dist为字符串时,此时应该返回新的 dataloader,且如果原 sampler 为 paddle.io.RandomSampler(shuffle=True), - 只会替换 Sampler 为 RandomSampler;否则会替换 batch_sampler 为 ReproduceBatchSampler + 当dist为字符串时,此时应该返回新的 dataloader,会替换 sampler 为 RandomSampler """ dataloader = DataLoader(self.dataset, batch_size=2, shuffle=shuffle) replaced_loader = self.driver.set_dist_repro_dataloader(dataloader, dist="dist", reproducible=True) assert not (replaced_loader is dataloader) - if shuffle: - # 此时会替换 sampler - assert isinstance(replaced_loader.batch_sampler, paddle.io.BatchSampler) - assert not (replaced_loader.batch_sampler is dataloader.batch_sampler) - assert isinstance(replaced_loader.batch_sampler.sampler, RandomSampler) - else: - # 此时会替换 batch_sampler - assert isinstance(replaced_loader.batch_sampler, ReproduceBatchSampler) - assert isinstance(replaced_loader.batch_sampler.batch_sampler, BatchSampler) + assert isinstance(replaced_loader.batch_sampler, paddle.io.BatchSampler) + assert not (replaced_loader.batch_sampler is dataloader.batch_sampler) + assert isinstance(replaced_loader.batch_sampler.sampler, RandomSampler) + assert replaced_loader.batch_sampler.sampler.shuffle == shuffle assert replaced_loader.batch_sampler.batch_size == dataloader.batch_sampler.batch_size assert replaced_loader.drop_last == dataloader.drop_last diff --git a/tests/core/drivers/torch_driver/test_single_device.py b/tests/core/drivers/torch_driver/test_single_device.py index 1fbc9d82..51555918 100644 --- a/tests/core/drivers/torch_driver/test_single_device.py +++ b/tests/core/drivers/torch_driver/test_single_device.py @@ -400,22 +400,19 @@ class TestSetDistReproDataloader: def test_with_reproducible_true(self, shuffle): """ 测试 set_dist_repro_dataloader 参数 `reproducible` 为 True 时的表现 - 当dist为字符串时,此时应该返回新的 dataloader,且如果原 sampler 为 torch.utils.data.RandomSampler(shuffle=True), - 只会替换 Sampler 为 RandomSampler;否则会替换 batch_sampler 为 ReproduceBatchSampler + 当dist为字符串时,此时应该返回新的 dataloader,会替换 sampler 为 RandomSampler; + TODO: + 在 Sampler 的参数不是默认的情况下会替换 batch_sampler """ dataloader = DataLoader(self.dataset, batch_size=2, shuffle=shuffle) replaced_loader = self.driver.set_dist_repro_dataloader(dataloader, dist="dist", reproducible=True) assert not (replaced_loader is dataloader) - if shuffle: - # 此时会替换 sampler - assert isinstance(replaced_loader.batch_sampler, torch.utils.data.BatchSampler) - assert not (replaced_loader.batch_sampler is dataloader.batch_sampler) - assert isinstance(replaced_loader.batch_sampler.sampler, RandomSampler) - else: - # 此时会替换 batch_sampler - assert isinstance(replaced_loader.batch_sampler, ReproduceBatchSampler) - assert isinstance(replaced_loader.batch_sampler.batch_sampler, BatchSampler) + # 替换 sampler + assert isinstance(replaced_loader.batch_sampler, torch.utils.data.BatchSampler) + assert not (replaced_loader.batch_sampler is dataloader.batch_sampler) + assert isinstance(replaced_loader.batch_sampler.sampler, RandomSampler) + assert replaced_loader.batch_sampler.sampler.shuffle == shuffle assert replaced_loader.batch_sampler.batch_size == dataloader.batch_sampler.batch_size assert replaced_loader.drop_last == dataloader.drop_last