| @@ -182,7 +182,13 @@ def replace_batch_sampler(dataloader: "DataLoader", batch_sampler: "BatchSampler | |||
| if key not in init_params and key != 'self': | |||
| init_params[key] = value | |||
| reconstruct_args = {k: v for k, v in instance_attrs.items() if k in init_params} | |||
| # 如果初始化dataloader所使用的参数不是默认值,那么我们需要将其记录下来用于重新初始化时设置; | |||
| non_default_params = {name for name, p in init_params.items() if | |||
| name in instance_attrs and p.default != instance_attrs[name]} | |||
| # add `dataset` as it might have been replaced with `*args` | |||
| non_default_params.add("dataset") | |||
| reconstruct_args = {k: v for k, v in instance_attrs.items() if k in non_default_params} | |||
| reconstruct_args.update({ | |||
| "batch_sampler": batch_sampler, "shuffle": False, "drop_last": False, "batch_size": 1, | |||
| "persistent_workers": dataloader._persistent_workers, | |||
| @@ -13,6 +13,8 @@ from tests.helpers.models.paddle_model import PaddleNormalModel_Classification_1 | |||
| from tests.helpers.datasets.paddle_data import PaddleNormalDataset, PaddleNormalXYDataset | |||
| from tests.helpers.utils import magic_argv_env_context | |||
| from fastNLP.envs.distributed import rank_zero_rm | |||
| from fastNLP import prepare_paddle_dataloader | |||
| from fastNLP.core.drivers.paddle_driver.dist_utils import fastnlp_paddle_all_gather | |||
| from fastNLP.envs.imports import _NEED_IMPORT_PADDLE | |||
| if _NEED_IMPORT_PADDLE: | |||
| import paddle | |||
| @@ -814,4 +816,112 @@ class TestSaveLoad: | |||
| assert len(left_y_batches | already_seen_y_set) == len(self.dataset) / num_replicas | |||
| finally: | |||
| rank_zero_rm(path) | |||
| rank_zero_rm(path) | |||
# NOTE(review): this test was marked `torch` although it only exercises the
# Paddle Fleet driver; confirm that `paddledist` is the marker this project
# registers for distributed Paddle tests.
@pytest.mark.paddledist
@magic_argv_env_context
@pytest.mark.parametrize("shuffle", ([True, False]))
@pytest.mark.parametrize("batch_size", ([1, 3, 16, 17]))
@pytest.mark.parametrize("drop_last", ([True, False]))
def test_shuffle_dataloader(shuffle, batch_size, drop_last, reproducible=True):
    """Check that ``set_dist_repro_dataloader`` keeps the dataloader settings.

    A dataloader is built with the given ``shuffle`` / ``batch_size`` /
    ``drop_last`` values, passed through
    ``PaddleFleetDriver.set_dist_repro_dataloader`` with ``dist='dist'`` on
    two devices, and then iterated. The assertions check the per-rank sample
    count, the size of the last batch, the ordering of the values and the
    number of distinct values gathered from both ranks.
    """
    try:
        # Verify that set_dist_repro_dataloader does not modify the parameters.
        num_samples = 200
        dataset = PaddleNormalXYDataset(num_samples)
        dl = prepare_paddle_dataloader(dataset, shuffle=shuffle, batch_size=batch_size, drop_last=drop_last)
        model = PaddleNormalModel_Classification_1(10, 32)
        device = [0, 1]
        driver = PaddleFleetDriver(model, parallel_device=device)
        driver.setup()
        dl = driver.set_dist_repro_dataloader(dataloader=dl, dist='dist', reproducible=reproducible)

        data = []
        flags = []  # one entry per batch: True when the batch is full-sized
        for batch in dl:
            flags.append(batch['x'].shape[0] == batch_size)
            data.extend(batch['x'].reshape(-1).tolist())

        # Two devices are used, so each rank is expected to see half the data.
        _num_samples = num_samples // 2
        if drop_last and _num_samples % batch_size != 0:
            assert len(data) != _num_samples
            assert all(flags)
        elif _num_samples % batch_size != 0:
            assert flags[-1] is False
        else:
            assert len(data) == _num_samples

        # NOTE(review): the final adjacent pair is not compared (the bound is
        # len(data) - 1); confirm that this is intentional.
        if not shuffle:
            for i in range(1, len(data) - 1):
                assert data[i] > data[i - 1]
        else:
            flags = []
            for i in range(1, len(data) - 1):
                flags.append(data[i] > data[i - 1])
            assert not all(flags)

        datas = fastnlp_paddle_all_gather(data)
        if drop_last:
            assert len(set(datas[0] + datas[1])) == num_samples - _num_samples % batch_size * 2
        else:
            assert len(set(datas[0] + datas[1])) == num_samples
    finally:
        if dist.is_initialized():
            dist.barrier()
            dist.destroy_process_group()
# NOTE(review): this test was marked `torch` although it only exercises the
# Paddle Fleet driver; confirm that `paddledist` is the marker this project
# registers for distributed Paddle tests.
@pytest.mark.paddledist
@magic_argv_env_context
@pytest.mark.parametrize("shuffle", ([True, False]))
@pytest.mark.parametrize("batch_size", ([1, 3, 16, 17]))
@pytest.mark.parametrize("drop_last", ([True, False]))
def test_batch_sampler_dataloader(shuffle, batch_size, drop_last, reproducible=True):
    """Check ``set_dist_repro_dataloader`` with a ``BucketedBatchSampler``.

    A dataloader driven by a ``BucketedBatchSampler`` is passed through
    ``PaddleFleetDriver.set_dist_repro_dataloader`` with ``dist='dist'`` on
    two devices and iterated. The assertions check the value spread inside
    each batch, the per-rank sample count, the size of the last batch, the
    ordering of the values and the number of distinct values gathered from
    both ranks.
    """
    try:
        # Verify that set_dist_repro_dataloader does not modify the parameters.
        num_samples = 200
        num_device = 2
        dataset = PaddleNormalXYDataset(num_samples)
        sampler = BucketedBatchSampler(dataset, length=dataset._data, batch_size=batch_size, drop_last=drop_last,
                                       shuffle=shuffle, num_batch_per_bucket=2)
        dl = prepare_paddle_dataloader(dataset, batch_sampler=sampler)
        model = PaddleNormalModel_Classification_1(10, 32)
        device = [0, 1]
        driver = PaddleFleetDriver(model, parallel_device=device)
        driver.setup()
        dl = driver.set_dist_repro_dataloader(dataloader=dl, dist='dist', reproducible=reproducible)

        data = []
        flags = []  # one entry per batch: True when the batch is full-sized
        for batch in dl:
            d = batch['x'].reshape(-1).tolist()
            # Values that share a batch must stay within a bounded range.
            diff = max(d) - min(d)
            assert diff < batch_size * 2 * 2 * 2
            data.extend(d)
            flags.append(len(d) == batch_size)

        _num_samples = num_samples // num_device
        if drop_last and _num_samples % batch_size != 0:
            assert len(data) != _num_samples
            assert all(flags)
        elif _num_samples % batch_size != 0:
            assert flags[-1] is False
        else:
            assert len(data) == _num_samples

        # NOTE(review): the final adjacent pair is not compared (the bound is
        # len(data) - 1); confirm that this is intentional.
        if not shuffle:
            for i in range(1, len(data) - 1):
                assert data[i] < data[i - 1]
        else:
            flags = []
            for i in range(1, len(data) - 1):
                flags.append(data[i] < data[i - 1])
            assert not all(flags)

        datas = fastnlp_paddle_all_gather(data)
        if drop_last:
            assert len(set(datas[0] + datas[1])) == num_samples - _num_samples % batch_size * 2
        else:
            assert len(set(datas[0] + datas[1])) == num_samples
    finally:
        if dist.is_initialized():
            dist.barrier()
            dist.destroy_process_group()
| @@ -9,6 +9,7 @@ from tests.helpers.datasets.torch_data import TorchNormalDataset | |||
| from tests.helpers.models.torch_model import TorchNormalModel_Classification_1 | |||
| from fastNLP.envs.distributed import rank_zero_rm | |||
| from fastNLP.envs.imports import _NEED_IMPORT_PADDLE, _NEED_IMPORT_TORCH | |||
| from fastNLP import prepare_paddle_dataloader, BucketedBatchSampler | |||
| if _NEED_IMPORT_PADDLE: | |||
| import paddle | |||
| @@ -738,3 +739,85 @@ def test_save_and_load_with_randomsampler(only_state_dict, fp16): | |||
| assert len(left_y_batches | already_seen_y_set) == len(dataset) | |||
| finally: | |||
| rank_zero_rm(path) | |||
# NOTE(review): this test was marked `torch` although it only exercises the
# Paddle single-device driver; confirm that `paddle` is the marker this
# project registers for Paddle tests.
@pytest.mark.paddle
@pytest.mark.parametrize("shuffle", ([True, False]))
@pytest.mark.parametrize("batch_size", ([1, 3, 16, 17]))
@pytest.mark.parametrize("drop_last", ([True, False]))
@pytest.mark.parametrize("reproducible", ([True, False]))
def test_shuffle_dataloader(shuffle, batch_size, drop_last, reproducible):
    """Check that ``set_dist_repro_dataloader`` keeps the dataloader settings.

    A dataloader is built with the given ``shuffle`` / ``batch_size`` /
    ``drop_last`` values, passed through
    ``PaddleSingleDriver.set_dist_repro_dataloader`` and iterated. The
    assertions check the total sample count, the size of the last batch and
    the ordering of the values.
    """
    # Verify that set_dist_repro_dataloader does not modify the parameters.
    num_samples = 200
    dataset = PaddleNormalXYDataset(num_samples)
    dl = prepare_paddle_dataloader(dataset, shuffle=shuffle, batch_size=batch_size, drop_last=drop_last)
    model = PaddleNormalModel_Classification_1(1, 2)
    driver = PaddleSingleDriver(model, device="cpu")
    dl = driver.set_dist_repro_dataloader(dataloader=dl, reproducible=reproducible)

    data = []
    flags = []  # one entry per batch: True when the batch is full-sized
    for batch in dl:
        flags.append(batch['x'].shape[0] == batch_size)
        data.extend(batch['x'].reshape(-1).tolist())

    if drop_last and num_samples % batch_size != 0:
        assert len(data) != num_samples
        assert all(flags)
    elif num_samples % batch_size != 0:
        assert flags[-1] is False
    else:
        assert len(data) == num_samples

    if not shuffle:
        # Without shuffling the values must be strictly increasing.
        for i in range(1, len(data)):
            assert data[i] > data[i - 1]
    else:
        # With shuffling at least one adjacent pair must be out of order.
        flags = []
        for i in range(1, len(data)):
            flags.append(data[i] > data[i - 1])
        assert not all(flags)
# NOTE(review): this test was marked `torch` although it only exercises the
# Paddle single-device driver; confirm that `paddle` is the marker this
# project registers for Paddle tests.
@pytest.mark.paddle
@pytest.mark.parametrize("shuffle", ([True, False]))
@pytest.mark.parametrize("batch_size", ([1, 3, 16, 17]))
@pytest.mark.parametrize("drop_last", ([True, False]))
@pytest.mark.parametrize("reproducible", ([True, False]))
def test_batch_sampler_dataloader(shuffle, batch_size, drop_last, reproducible):
    """Check ``set_dist_repro_dataloader`` with a ``BucketedBatchSampler``.

    A dataloader driven by a ``BucketedBatchSampler`` is passed through
    ``PaddleSingleDriver.set_dist_repro_dataloader`` and iterated. The
    assertions check the value spread inside each batch, the total sample
    count, the size of the last batch and the ordering of the values.
    """
    # Verify that set_dist_repro_dataloader does not modify the parameters.
    num_samples = 200
    dataset = PaddleNormalXYDataset(num_samples)
    sampler = BucketedBatchSampler(dataset, length=dataset._data, batch_size=batch_size, drop_last=drop_last,
                                   shuffle=shuffle, num_batch_per_bucket=2)
    dl = prepare_paddle_dataloader(dataset, batch_sampler=sampler)
    model = PaddleNormalModel_Classification_1(1, 2)
    driver = PaddleSingleDriver(model, device="cpu")
    dl = driver.set_dist_repro_dataloader(dataloader=dl, reproducible=reproducible)

    data = []
    flags = []  # one entry per batch: True when the batch is full-sized
    for batch in dl:
        d = batch['x'].reshape(-1).tolist()
        # Values that share a batch must stay within a bounded range.
        diff = max(d) - min(d)
        assert diff < batch_size * 2
        data.extend(d)
        flags.append(len(d) == batch_size)

    if drop_last and num_samples % batch_size != 0:
        assert len(data) != num_samples
        assert all(flags)
    elif num_samples % batch_size != 0:
        assert flags[-1] is False
    else:
        assert len(data) == num_samples

    if not shuffle:
        # Without shuffling the values must be strictly decreasing.
        for i in range(1, len(data)):
            assert data[i] < data[i - 1]
    else:
        # With shuffling at least one adjacent pair must break that order.
        flags = []
        for i in range(1, len(data)):
            flags.append(data[i] < data[i - 1])
        assert not all(flags)
| @@ -2,6 +2,7 @@ import pytest | |||
| from pathlib import Path | |||
| from fastNLP.core.drivers.torch_driver.ddp import TorchDDPDriver | |||
| from fastNLP import prepare_torch_dataloader | |||
| from fastNLP.core.samplers import ( | |||
| RandomSampler, | |||
| UnrepeatedSampler, | |||
| @@ -13,6 +14,7 @@ from tests.helpers.models.torch_model import TorchNormalModel_Classification_1 | |||
| from tests.helpers.datasets.torch_data import TorchNormalDataset, TorchNormalXYDataset | |||
| from tests.helpers.utils import magic_argv_env_context | |||
| from fastNLP.envs.distributed import rank_zero_rm | |||
| from fastNLP.core.drivers.torch_driver.dist_utils import fastnlp_torch_all_gather | |||
| from fastNLP.envs.imports import _NEED_IMPORT_TORCH | |||
| if _NEED_IMPORT_TORCH: | |||
| import torch | |||
| @@ -834,3 +836,112 @@ class TestSaveLoad: | |||
| if dist.is_initialized(): | |||
| dist.destroy_process_group() | |||
@pytest.mark.torch
@magic_argv_env_context
@pytest.mark.parametrize("shuffle", ([True, False]))
@pytest.mark.parametrize("batch_size", ([1, 3, 16, 17]))
@pytest.mark.parametrize("drop_last", ([True, False]))
def test_shuffle_dataloader(shuffle, batch_size, drop_last, reproducible=True):
    """Ensure the DDP driver preserves the dataloader's configuration.

    Builds a dataloader from the parametrized ``shuffle`` / ``batch_size`` /
    ``drop_last`` values, hands it to
    ``TorchDDPDriver.set_dist_repro_dataloader`` (``dist='dist'``, two
    devices) and validates what each rank iterates over: sample count,
    last-batch size, value ordering, and the number of distinct values
    gathered across both ranks.
    """
    try:
        # Verify that set_dist_repro_dataloader does not modify the parameters.
        num_samples = 200
        dataset = TorchNormalXYDataset(num_samples)
        loader = prepare_torch_dataloader(
            dataset, shuffle=shuffle, batch_size=batch_size, drop_last=drop_last
        )
        net = TorchNormalModel_Classification_1(10, 32)
        devices = [torch.device(i) for i in [0, 1]]
        driver = TorchDDPDriver(net, parallel_device=devices)
        driver.setup()
        loader = driver.set_dist_repro_dataloader(
            dataloader=loader, dist='dist', reproducible=reproducible
        )

        seen = []
        is_full_batch = []
        for batch in loader:
            is_full_batch.append(batch['x'].size(0) == batch_size)
            seen.extend(batch['x'].reshape(-1).tolist())

        # Each of the two ranks is expected to receive half of the samples.
        per_rank = num_samples // 2
        leftover = per_rank % batch_size
        if leftover == 0:
            assert len(seen) == per_rank
        elif drop_last:
            assert len(seen) != per_rank
            assert all(is_full_batch)
        else:
            assert is_full_batch[-1] is False

        # Adjacent-pair comparison; the index range leaves out the final pair,
        # exactly as range(1, len(seen) - 1) dictates.
        ascending = [seen[i] > seen[i - 1] for i in range(1, len(seen) - 1)]
        if shuffle:
            assert not all(ascending)
        else:
            assert all(ascending)

        gathered = fastnlp_torch_all_gather(seen)
        expected_unique = (num_samples - leftover * 2) if drop_last else num_samples
        assert len(set(gathered[0] + gathered[1])) == expected_unique
    finally:
        if dist.is_initialized():
            dist.barrier()
            dist.destroy_process_group()
@pytest.mark.torch
@magic_argv_env_context
@pytest.mark.parametrize("shuffle", ([True, False]))
@pytest.mark.parametrize("batch_size", ([1, 3, 16, 17]))
@pytest.mark.parametrize("drop_last", ([True, False]))
def test_batch_sampler_dataloader(shuffle, batch_size, drop_last, reproducible=True):
    """Check ``set_dist_repro_dataloader`` with a ``BucketedBatchSampler``.

    A dataloader driven by a ``BucketedBatchSampler`` is passed through
    ``TorchDDPDriver.set_dist_repro_dataloader`` with ``dist='dist'`` on two
    devices and iterated. The assertions check the value spread inside each
    batch, the per-rank sample count, the size of the last batch, the
    ordering of the values and the number of distinct values gathered from
    both ranks.
    """
    try:
        # Verify that set_dist_repro_dataloader does not modify the parameters.
        num_samples = 200
        num_device = 2
        dataset = TorchNormalXYDataset(num_samples)
        sampler = BucketedBatchSampler(dataset, length=dataset._data, batch_size=batch_size, drop_last=drop_last,
                                       shuffle=shuffle, num_batch_per_bucket=2)
        dl = prepare_torch_dataloader(dataset, batch_sampler=sampler)
        model = TorchNormalModel_Classification_1(10, 32)
        device = [torch.device(i) for i in [0, 1]]
        driver = TorchDDPDriver(model, parallel_device=device)
        driver.setup()
        dl = driver.set_dist_repro_dataloader(dataloader=dl, dist='dist', reproducible=reproducible)

        data = []
        flags = []  # one entry per batch: True when the batch is full-sized
        for batch in dl:
            d = batch['x'].reshape(-1).tolist()
            # Values that share a batch must stay within a bounded range.
            diff = max(d) - min(d)
            assert diff < batch_size * 2 * 2 * 2
            data.extend(d)
            flags.append(len(d) == batch_size)

        _num_samples = num_samples // num_device
        if drop_last and _num_samples % batch_size != 0:
            # Compare against the per-rank count: a rank never holds all
            # `num_samples` items, so comparing with the global count would
            # make this assertion always true.
            assert len(data) != _num_samples
            assert all(flags)
        elif _num_samples % batch_size != 0:
            assert flags[-1] is False
        else:
            assert len(data) == _num_samples

        # NOTE(review): the final adjacent pair is not compared (the bound is
        # len(data) - 1); confirm that this is intentional.
        if not shuffle:
            for i in range(1, len(data) - 1):
                assert data[i] < data[i - 1]
        else:
            flags = []
            for i in range(1, len(data) - 1):
                flags.append(data[i] < data[i - 1])
            assert not all(flags)

        datas = fastnlp_torch_all_gather(data)
        if drop_last:
            assert len(set(datas[0] + datas[1])) == num_samples - _num_samples % batch_size * 2
        else:
            assert len(set(datas[0] + datas[1])) == num_samples
    finally:
        if dist.is_initialized():
            dist.barrier()
            dist.destroy_process_group()
| @@ -11,6 +11,7 @@ from tests.helpers.datasets.paddle_data import PaddleNormalDataset | |||
| from tests.helpers.models.paddle_model import PaddleNormalModel_Classification_1 | |||
| from fastNLP.envs.distributed import rank_zero_rm | |||
| from fastNLP.envs.imports import _NEED_IMPORT_PADDLE, _NEED_IMPORT_TORCH | |||
| from fastNLP import prepare_torch_dataloader, BucketedBatchSampler | |||
| if _NEED_IMPORT_TORCH: | |||
| import torch | |||
| @@ -710,3 +711,85 @@ def test_save_and_load_with_randomsampler(only_state_dict, fp16): | |||
| assert len(left_y_batches | already_seen_y_set) == len(dataset) | |||
| finally: | |||
| rank_zero_rm(path) | |||
@pytest.mark.torch
@pytest.mark.parametrize("shuffle", ([True, False]))
@pytest.mark.parametrize("batch_size", ([1, 3, 16, 17]))
@pytest.mark.parametrize("drop_last", ([True, False]))
@pytest.mark.parametrize("reproducible", ([True, False]))
def test_shuffle_dataloader(shuffle, batch_size, drop_last, reproducible):
    """Ensure the single-device driver preserves the dataloader's configuration.

    Builds a dataloader from the parametrized ``shuffle`` / ``batch_size`` /
    ``drop_last`` values, hands it to
    ``TorchSingleDriver.set_dist_repro_dataloader`` and validates what comes
    out: total sample count, last-batch size and value ordering.
    """
    # Verify that set_dist_repro_dataloader does not modify the parameters.
    num_samples = 100
    dataset = TorchNormalXYDataset(num_samples)
    loader = prepare_torch_dataloader(
        dataset, shuffle=shuffle, batch_size=batch_size, drop_last=drop_last
    )
    net = TorchNormalModel_Classification_1(10, 32)
    driver = TorchSingleDriver(net, device="cpu")
    loader = driver.set_dist_repro_dataloader(dataloader=loader, reproducible=reproducible)

    seen = []
    is_full_batch = []
    for batch in loader:
        is_full_batch.append(batch['x'].size(0) == batch_size)
        seen.extend(batch['x'].reshape(-1).tolist())

    leftover = num_samples % batch_size
    if leftover == 0:
        assert len(seen) == num_samples
    elif drop_last:
        assert len(seen) != num_samples
        assert all(is_full_batch)
    else:
        assert is_full_batch[-1] is False

    # Compare every value with its predecessor.
    ascending = [cur > prev for prev, cur in zip(seen, seen[1:])]
    if shuffle:
        assert not all(ascending)
    else:
        assert all(ascending)
@pytest.mark.torch
@pytest.mark.parametrize("shuffle", ([True, False]))
@pytest.mark.parametrize("batch_size", ([1, 3, 16, 17]))
@pytest.mark.parametrize("drop_last", ([True, False]))
@pytest.mark.parametrize("reproducible", ([True, False]))
def test_batch_sampler_dataloader(shuffle, batch_size, drop_last, reproducible):
    """Ensure the single-device driver handles a ``BucketedBatchSampler``.

    Builds a dataloader around a ``BucketedBatchSampler``, hands it to
    ``TorchSingleDriver.set_dist_repro_dataloader`` and validates what comes
    out: value spread inside each batch, total sample count, last-batch size
    and value ordering.
    """
    # Verify that set_dist_repro_dataloader does not modify the parameters.
    num_samples = 100
    dataset = TorchNormalXYDataset(num_samples)
    bucket_sampler = BucketedBatchSampler(
        dataset,
        length=dataset._data,
        batch_size=batch_size,
        drop_last=drop_last,
        shuffle=shuffle,
        num_batch_per_bucket=2,
    )
    loader = prepare_torch_dataloader(dataset, batch_sampler=bucket_sampler)
    net = TorchNormalModel_Classification_1(10, 32)
    driver = TorchSingleDriver(net, device="cpu")
    loader = driver.set_dist_repro_dataloader(dataloader=loader, reproducible=reproducible)

    seen = []
    is_full_batch = []
    for batch in loader:
        values = batch['x'].reshape(-1).tolist()
        # Values that share a batch must stay within a bounded range.
        assert max(values) - min(values) < batch_size * 2
        seen.extend(values)
        is_full_batch.append(len(values) == batch_size)

    leftover = num_samples % batch_size
    if leftover == 0:
        assert len(seen) == num_samples
    elif drop_last:
        assert len(seen) != num_samples
        assert all(is_full_batch)
    else:
        assert is_full_batch[-1] is False

    # Compare every value with its predecessor.
    descending = [cur < prev for prev, cur in zip(seen, seen[1:])]
    if shuffle:
        assert not all(descending)
    else:
        assert all(descending)