From 4be26c56202f25f3b3efd25f6f817e1a256c6ce4 Mon Sep 17 00:00:00 2001 From: MorningForest <2297662686@qq.com> Date: Thu, 14 Apr 2022 19:36:50 +0800 Subject: [PATCH 1/7] =?UTF-8?q?=E4=BF=AE=E6=94=B9torch=20fdl?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/dataloaders/torch_dataloader/fdl.py | 2 ++ tests/core/dataloaders/torch_dataloader/test_fdl.py | 8 +++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/fastNLP/core/dataloaders/torch_dataloader/fdl.py b/fastNLP/core/dataloaders/torch_dataloader/fdl.py index d56dbac9..7f0d1bb6 100644 --- a/fastNLP/core/dataloaders/torch_dataloader/fdl.py +++ b/fastNLP/core/dataloaders/torch_dataloader/fdl.py @@ -98,6 +98,7 @@ class TorchDataLoader(DataLoader): def __getattr__(self, item): """ 为FDataLoader提供dataset的方法和属性,实现该方法后,用户可以在FDataLoader实例化后使用apply等dataset的方法 + :param item: :return: """ @@ -119,6 +120,7 @@ class TorchDataLoader(DataLoader): """ 设置每个field_name的padding值,默认为0,只有当autocollate存在时该方法有效, 若没有则会添加auto_collator函数 当val=None时,意味着给定的field_names都不需要尝试padding + :param field_names: :param val: padding值,默认为0 :return: diff --git a/tests/core/dataloaders/torch_dataloader/test_fdl.py b/tests/core/dataloaders/torch_dataloader/test_fdl.py index baa3781a..7c1352aa 100644 --- a/tests/core/dataloaders/torch_dataloader/test_fdl.py +++ b/tests/core/dataloaders/torch_dataloader/test_fdl.py @@ -21,11 +21,12 @@ class TestFdl: ds.set_pad_val("x", val=-1) fdl = TorchDataLoader(ds, batch_size=3) fdl.set_input("x", "y") + fdl.set_pad_val("x", val=None) for batch in fdl: print(batch) - fdl.set_pad_val("x", val=-2) - for batch in fdl: - print(batch) + # fdl.set_pad_val("x", val=-2) + # for batch in fdl: + # print(batch) def test_add_collator(self): ds = DataSet({"x": [[1, 2], [2, 3, 4], [4, 5, 6, 7]] * 10, "y": [1, 0, 1] * 10}) @@ -38,6 +39,7 @@ class TestFdl: fdl = TorchDataLoader(ds, batch_size=3, as_numpy=True) fdl.set_input("x", "y") + # fdl.set_pad_val("x", val=None) fdl.add_collator(collate_fn) for batch in fdl: print(batch) From be24572b114a151caea83468aa32d068def7b915 Mon Sep 17 00:00:00 2001 From: x54-729 <17307130121@fudan.edu.cn> Date: Fri, 15 Apr 2022 10:44:25 +0000 Subject: [PATCH 2/7] =?UTF-8?q?=E4=BF=AE=E6=94=B9paddle.distributed?= =?UTF-8?q?=E7=9A=84import=E5=90=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/drivers/paddle_driver/fleet.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fastNLP/core/drivers/paddle_driver/fleet.py b/fastNLP/core/drivers/paddle_driver/fleet.py index a083e42c..762b3114 100644 --- a/fastNLP/core/drivers/paddle_driver/fleet.py +++ b/fastNLP/core/drivers/paddle_driver/fleet.py @@ -37,7 +37,7 @@ if _NEED_IMPORT_PADDLE: import paddle from paddle import DataParallel import paddle.distributed.fleet as fleet - import paddle.distributed as dist + import paddle.distributed as paddledist from paddle.io import BatchSampler from paddle.optimizer import Optimizer from paddle.fluid.reader import _DatasetKind @@ -185,8 +185,8 @@ class PaddleFleetDriver(PaddleDriver): if sorted(pre_gpus) != sorted(self.parallel_device): raise RuntimeError("Notice you are using `PaddleFleetDriver` after one instantiated `PaddleFleetDriver`, it is not" "allowed that your second `PaddleFleetDriver` has a new setting of parameters `parallel_device`.") - self.world_size = dist.get_world_size() - self.global_rank = dist.get_rank() + self.world_size = paddledist.get_world_size() + self.global_rank = paddledist.get_rank() if not self.outside_fleet: # self.model.to(self.model_device) @@ -197,12 +197,12 @@ class PaddleFleetDriver(PaddleDriver): # 初始化 self._pids,从而使得每一个进程都能接受到 rank0 的 send 操作; # TODO 不用.to会怎么样? self._pids = [] - dist.all_gather(self._pids, paddle.to_tensor(os.getpid(), dtype="int32")) + paddledist.all_gather(self._pids, paddle.to_tensor(os.getpid(), dtype="int32")) # TODO LOCAL_WORLD_SIZE local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE")) if "LOCAL_WORLD_SIZE" in os.environ else None if local_world_size is None: local_world_size = paddle.to_tensor(self.local_rank, dtype="int32") - dist.all_reduce(local_world_size, op=dist.ReduceOp.MAX) + paddledist.all_reduce(local_world_size, op=paddledist.ReduceOp.MAX) local_world_size = local_world_size.item() + 1 node_rank = self.global_rank // local_world_size @@ -232,11 +232,11 @@ class PaddleFleetDriver(PaddleDriver): 当用户使用了 `python -m paddle.distributed.launch xxx.py` 启动时,我们需要 根据 paddle 设置的环境变量来获得各种属性 """ - self.world_size = dist.get_world_size() - self.global_rank = dist.get_rank() + self.world_size = paddledist.get_world_size() + self.global_rank = paddledist.get_rank() def barrier(self): - dist.barrier() + paddledist.barrier() def configure_fleet(self): if not self._has_fleetwrapped and not isinstance(self.model, DataParallel): From 9e97155312eed884645e1d04284c57fe343b601b Mon Sep 17 00:00:00 2001 From: x54-729 <17307130121@fudan.edu.cn> Date: Fri, 15 Apr 2022 10:44:49 +0000 Subject: [PATCH 3/7] =?UTF-8?q?PaddleSingleDriver=E7=9A=84save=20load?= =?UTF-8?q?=E5=87=BD=E6=95=B0=E6=B5=8B=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../paddle_driver/test_single_device.py | 846 +++++++++--------- 1 file changed, 435 insertions(+), 411 deletions(-) diff --git a/tests/core/drivers/paddle_driver/test_single_device.py b/tests/core/drivers/paddle_driver/test_single_device.py index ebd4721b..79527f39 100644 --- a/tests/core/drivers/paddle_driver/test_single_device.py +++ b/tests/core/drivers/paddle_driver/test_single_device.py @@ -1,3 +1,4 @@ +from dataclasses import replace import os from re import S os.environ["FASTNLP_BACKEND"] = "paddle" @@ -16,203 +17,303 @@ import paddle from paddle.io import DataLoader, BatchSampler import torch - ############################################################################ # -# 测试save和load相关的功能 +# 测试基类 PaddleDrvier 中的一些简单函数 # ############################################################################ -def generate_random_driver(features, labels): - """ - 生成driver - """ - model = PaddleNormalModel_Classification_1(labels, features) - opt = paddle.optimizer.Adam(parameters=model.parameters(), learning_rate=0.01) - driver = PaddleSingleDriver(model, device="cpu") - driver.set_optimizers(opt) - driver.setup() - - return driver - -@pytest.fixture -def prepare_test_save_load(): - dataset = PaddleRandomMaxDataset(320, 10) - dataloader = DataLoader(dataset, batch_size=32) - driver1, driver2 = generate_random_driver(10, 10), generate_random_driver(10, 10) - return driver1, driver2, dataloader - -@pytest.mark.parametrize("only_state_dict", ([True, False])) -def test_save_and_load_with_randombatchsampler(only_state_dict): +class TestPaddleDriverFunctions: """ - 测试save和load函数,主要测试 dataloader 被替换了 sampler 之后的情况 + 使用 PaddleSingleDriver 测试基类的函数 """ - try: - path = "model.ckp" + @classmethod + def setup_class(self): + model = PaddleNormalModel_Classification_1(10, 32) + self.driver = PaddleSingleDriver(model, device="cpu") - driver1, driver2 = generate_random_driver(10, 10), generate_random_driver(10, 10) - dataset = PaddleRandomMaxDataset(80, 10) - dataloader = DataLoader( - dataset=dataset, - batch_sampler=RandomBatchSampler(BatchSampler(dataset, batch_size=4), 4, False) + def test_check_single_optimizer_legality(self): + """ + 测试传入单个optimizer时的表现 + """ + optimizer = paddle.optimizer.Adam( + parameters=self.driver.model.parameters(), + learning_rate=0.01 ) - num_consumed_batches = 2 - # TODO 断点重训完善后在这里迭代几次 - already_seen_set = set() - for idx, batch in enumerate(dataloader): - if idx >= num_consumed_batches: - break - already_seen_set.update(batch) + self.driver.set_optimizers(optimizer) - sampler_states = dataloader.batch_sampler.state_dict() - save_states = {"num_consumed_batches": num_consumed_batches} - if only_state_dict: - driver1.save(Path(path), save_states, dataloader, only_state_dict, should_save_model=True) - else: - driver1.save(Path(path), save_states, dataloader, only_state_dict, should_save_model=True, input_spec=[paddle.ones((16, 10))]) - - # 加载 - # 更改 batch_size - dataloader = DataLoader( - dataset=dataset, - batch_sampler=RandomBatchSampler(BatchSampler(dataset, batch_size=2), 2, False) - ) - load_states = driver2.load(Path(path), dataloader, only_state_dict, should_load_model=True) - replaced_loader = load_states.pop("dataloader") + optimizer = torch.optim.Adam(TorchNormalModel_Classification_1(10, 32).parameters(), 0.01) + # 传入torch的optimizer时,应该报错ValueError + with pytest.raises(ValueError): + self.driver.set_optimizers(optimizer) - # 1. 检查 optimizer 的状态 - # TODO optimizer 的 state_dict 总是为空 + def test_check_optimizers_legality(self): + """ + 测试传入optimizer list的表现 + """ + optimizers = [ + paddle.optimizer.Adam( + parameters=self.driver.model.parameters(), + learning_rate=0.01 + ) for i in range(10) + ] - # 2. 检查 batch_sampler 是否被正确地加载和替换 - assert isinstance(replaced_loader.batch_sampler, RandomBatchSampler) - assert replaced_loader.batch_sampler.index_list == sampler_states["index_list"] - assert replaced_loader.batch_sampler.data_idx == sampler_states["data_idx"] + self.driver.set_optimizers(optimizers) - # 3. 检查 model 的参数是否被正确加载 - for batch in dataloader: - res1 = driver1.model.evaluate_step(**batch) - res2 = driver2.model.evaluate_step(**batch) + optimizers += [ + torch.optim.Adam(TorchNormalModel_Classification_1(10, 32).parameters(), 0.01) + ] - assert paddle.equal_all(res1["pred"], res2["pred"]) + with pytest.raises(ValueError): + self.driver.set_optimizers(optimizers) - # 4. 检查 batch_idx - start_batch = load_states.pop('batch_idx_in_epoch') - assert start_batch == 2 * num_consumed_batches - left_batches = set() - for idx, batch in enumerate(replaced_loader): - left_batches.update(batch) + def test_check_dataloader_legality_in_train(self): + """ + 测试is_train参数为True时,_check_dataloader_legality函数的表现 + """ + dataloader = paddle.io.DataLoader(PaddleNormalDataset()) + PaddleSingleDriver.check_dataloader_legality(dataloader, "dataloader", True) - assert len(left_batches) + len(already_seen_set) == len(dataset) - assert len(left_batches | already_seen_set) == len(dataset) + # batch_size 和 batch_sampler 均为 None 的情形 + dataloader = paddle.io.DataLoader(PaddleNormalDataset(), batch_size=None) + with pytest.raises(ValueError): + PaddleSingleDriver.check_dataloader_legality(dataloader, "dataloader", True) + # 创建torch的dataloader + dataloader = torch.utils.data.DataLoader( + TorchNormalDataset(), + batch_size=32, shuffle=True + ) + with pytest.raises(ValueError): + PaddleSingleDriver.check_dataloader_legality(dataloader, "dataloader", True) - finally: - synchronize_safe_rm(path) + def test_check_dataloader_legality_in_test(self): + """ + 测试is_train参数为False时,_check_dataloader_legality函数的表现 + """ + # 此时传入的应该是dict + dataloader = { + "train": paddle.io.DataLoader(PaddleNormalDataset()), + "test":paddle.io.DataLoader(PaddleNormalDataset()) + } + PaddleSingleDriver.check_dataloader_legality(dataloader, "dataloader", False) -@pytest.mark.parametrize("only_state_dict", ([True, False])) -def test_save_and_load_with_randomsampler(only_state_dict): - """ - 测试save和load函数,主要测试 dataloader 被替换了 batch_sampler 的情况 - """ + # batch_size 和 batch_sampler 均为 None 的情形 + dataloader = { + "train": paddle.io.DataLoader(PaddleNormalDataset()), + "test":paddle.io.DataLoader(PaddleNormalDataset(), batch_size=None) + } + with pytest.raises(ValueError): + PaddleSingleDriver.check_dataloader_legality(dataloader, "dataloader", False) - try: - path = "model.ckp" + # 传入的不是dict,应该报错 + dataloader = paddle.io.DataLoader(PaddleNormalDataset()) + with pytest.raises(ValueError): + PaddleSingleDriver.check_dataloader_legality(dataloader, "dataloader", False) - driver1, driver2 = generate_random_driver(10, 10), generate_random_driver(10, 10) - dataset = PaddleRandomMaxDataset(80, 10) - batch_sampler = BatchSampler(dataset=dataset, batch_size=2) - batch_sampler.sampler = RandomSampler(dataset, True) - dataloader = DataLoader( - dataset, - batch_sampler=batch_sampler + # 创建torch的dataloader + train_loader = torch.utils.data.DataLoader( + TorchNormalDataset(), + batch_size=32, shuffle=True ) - num_consumed_batches = 2 - - # TODO 断点重训完善后在这里迭代几次 - already_seen_set = set() - for idx, batch in enumerate(dataloader): - if idx >= num_consumed_batches: - break - already_seen_set.update(batch) - - sampler_states = dataloader.batch_sampler.sampler.state_dict() - save_states = {"num_consumed_batches": num_consumed_batches} - if only_state_dict: - driver1.save(Path(path), save_states, dataloader, only_state_dict, should_save_model=True) - else: - driver1.save(Path(path), save_states, dataloader, only_state_dict, should_save_model=True, input_spec=[paddle.ones((16, 10))]) - - # 加载 - # 更改 batch_size - dataloader = DataLoader( - dataset=dataset, - batch_sampler=RandomBatchSampler(BatchSampler(dataset, batch_size=2), 2, False) + test_loader = torch.utils.data.DataLoader( + TorchNormalDataset(), + batch_size=32, shuffle=True ) - load_states = driver2.load(Path(path), dataloader, only_state_dict, should_load_model=True) - replaced_loader = load_states.pop("dataloader") + dataloader = {"train": train_loader, "test": test_loader} + with pytest.raises(ValueError): + PaddleSingleDriver.check_dataloader_legality(dataloader, "dataloader", False) - # 1. 检查 optimizer 的状态 - # TODO optimizer 的 state_dict 总是为空 + def test_tensor_to_numeric(self): + """ + 测试tensor_to_numeric函数 + """ + # 单个张量 + tensor = paddle.to_tensor(3) + res = PaddleSingleDriver.tensor_to_numeric(tensor) + assert res == 3 - # 2. 检查 sampler 是否被正确地加载和替换 - replaced_loader = load_states["dataloader"] + tensor = paddle.rand((3, 4)) + res = PaddleSingleDriver.tensor_to_numeric(tensor) + assert res == tensor.tolist() - assert isinstance(replaced_loader.batch_sampler.sampler, RandomSampler) - assert replaced_loader.batch_sampler.sampler.seed == sampler_states["seed"] - assert replaced_loader.batch_sampler.sampler.epoch == sampler_states["epoch"] - assert replaced_loader.batch_sampler.sampler.num_consumed_samples == sampler_states["num_consumed_samples"] - assert len(replaced_loader.batch_sampler.sampler.dataset) == sampler_states["length"] - assert replaced_loader.batch_sampler.sampler.shuffle == sampler_states["shuffle"] + # 张量list + tensor_list = [paddle.rand((6, 4, 2)) for i in range(10)] + res = PaddleSingleDriver.tensor_to_numeric(tensor_list) + assert isinstance(res, list) + tensor_list = [t.tolist() for t in tensor_list] + assert res == tensor_list - # 3. 检查 model 的参数是否被正确加载 - for batch in dataloader: - res1 = driver1.model.evaluate_step(**batch) - res2 = driver2.model.evaluate_step(**batch) + # 张量tuple + tensor_tuple = tuple([paddle.rand((6, 4, 2)) for i in range(10)]) + res = PaddleSingleDriver.tensor_to_numeric(tensor_tuple) + assert isinstance(res, tuple) + tensor_tuple = tuple([t.tolist() for t in tensor_tuple]) + assert res == tensor_tuple - assert paddle.equal_all(res1["pred"], res2["pred"]) + # 张量dict + tensor_dict = { + "tensor": paddle.rand((3, 4)), + "list": [paddle.rand((6, 4, 2)) for i in range(10)], + "dict":{ + "list": [paddle.rand((6, 4, 2)) for i in range(10)], + "tensor": paddle.rand((3, 4)) + }, + "int": 2, + "string": "test string" + } - # 4. 检查 batch_idx - start_batch = load_states.pop('batch_idx_in_epoch') - assert start_batch == 2 * num_consumed_batches - left_batches = set() - for idx, batch in enumerate(replaced_loader): - left_batches.update(batch) + res = PaddleSingleDriver.tensor_to_numeric(tensor_dict) + assert isinstance(res, dict) + assert res["tensor"] == tensor_dict["tensor"].tolist() + assert isinstance(res["list"], list) + for r, d in zip(res["list"], tensor_dict["list"]): + assert r == d.tolist() + assert isinstance(res["int"], int) + assert isinstance(res["string"], str) + assert isinstance(res["dict"], dict) + assert isinstance(res["dict"]["list"], list) + for r, d in zip(res["dict"]["list"], tensor_dict["dict"]["list"]): + assert r == d.tolist() + assert res["dict"]["tensor"] == tensor_dict["dict"]["tensor"].tolist() - assert len(left_batches) + len(already_seen_set) == len(dataset) - assert len(left_batches | already_seen_set) == len(dataset) - finally: - synchronize_safe_rm(path) + def test_set_model_mode(self): + """ + 测试set_model_mode函数 + """ + self.driver.set_model_mode("train") + assert self.driver.model.training + self.driver.set_model_mode("eval") + assert not self.driver.model.training + # 应该报错 + with pytest.raises(AssertionError): + self.driver.set_model_mode("test") -@pytest.mark.parametrize("only_state_dict", ([True, False])) -def test_save_and_load_model(prepare_test_save_load, only_state_dict): - """ - 测试 save_model 和 load_model 函数 - """ - try: - path = "model" - driver1, driver2, dataloader = prepare_test_save_load + def test_move_model_to_device_cpu(self): + """ + 测试move_model_to_device函数 + """ + PaddleSingleDriver.move_model_to_device(self.driver.model, "cpu") + assert self.driver.model.linear1.weight.place.is_cpu_place() - if only_state_dict: - driver1.save_model(path, only_state_dict) + def test_move_model_to_device_gpu(self): + """ + 测试move_model_to_device函数 + """ + PaddleSingleDriver.move_model_to_device(self.driver.model, "gpu") + assert self.driver.model.linear1.weight.place.is_gpu_place() + assert self.driver.model.linear1.weight.place.gpu_device_id() == 0 + + def test_worker_init_function(self): + """ + 测试worker_init_function + """ + # 先确保不影响运行 + # TODO:正确性 + PaddleSingleDriver.worker_init_function(0) + + def test_set_deterministic_dataloader(self): + """ + 测试set_deterministic_dataloader + """ + # 先确保不影响运行 + # TODO:正确性 + dataloader = DataLoader(PaddleNormalDataset()) + self.driver.set_deterministic_dataloader(dataloader) + + def test_set_sampler_epoch(self): + """ + 测试set_sampler_epoch + """ + # 先确保不影响运行 + # TODO:正确性 + dataloader = DataLoader(PaddleNormalDataset()) + self.driver.set_sampler_epoch(dataloader, 0) + + @pytest.mark.parametrize("batch_size", [16]) + @pytest.mark.parametrize("shuffle", [True, False]) + @pytest.mark.parametrize("drop_last", [True, False]) + def test_get_dataloader_args(self, batch_size, shuffle, drop_last): + """ + 测试正常情况下 get_dataloader_args 的表现 + """ + dataloader = DataLoader( + PaddleNormalDataset(), + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last, + ) + res = PaddleSingleDriver.get_dataloader_args(dataloader) + + assert isinstance(res.dataset, PaddleNormalDataset) + assert isinstance(res.batch_sampler, BatchSampler) + if shuffle: + assert isinstance(res.sampler, paddle.io.RandomSampler) else: - driver1.save_model(path, only_state_dict, input_spec=[paddle.ones((32, 10))]) - driver2.load_model(path, only_state_dict) + assert isinstance(res.sampler, paddle.io.SequenceSampler) + assert res.shuffle == shuffle + assert res.batch_size == batch_size + assert res.drop_last == drop_last - for batch in dataloader: - batch = driver1.move_data_to_device(batch) - res1 = driver1.model.evaluate_step(**batch) - res2 = driver2.model.evaluate_step(**batch) + @pytest.mark.parametrize("batch_size", [16]) + @pytest.mark.parametrize("shuffle", [True, False]) + @pytest.mark.parametrize("drop_last", [True, False]) + def test_get_dataloader_args_with_randombatchsampler(self, batch_size, shuffle, drop_last): + """ + 测试替换了 batch_sampler 后 get_dataloader_args 的表现 + """ + dataset = PaddleNormalDataset() + dataloader = DataLoader( + dataset, + batch_sampler=RandomBatchSampler( + BatchSampler(dataset, batch_size=batch_size, shuffle=shuffle), + batch_size, + drop_last, + ) + ) + res = PaddleSingleDriver.get_dataloader_args(dataloader) - assert paddle.equal_all(res1["pred"], res2["pred"]) - finally: - if only_state_dict: - synchronize_safe_rm(path) + assert isinstance(res.dataset, PaddleNormalDataset) + assert isinstance(res.batch_sampler, RandomBatchSampler) + if shuffle: + assert isinstance(res.sampler, paddle.io.RandomSampler) else: - synchronize_safe_rm(path + ".pdiparams") - synchronize_safe_rm(path + ".pdiparams.info") - synchronize_safe_rm(path + ".pdmodel") + assert isinstance(res.sampler, paddle.io.SequenceSampler) + assert res.shuffle == shuffle + assert res.batch_size == batch_size + assert res.drop_last == drop_last + + @pytest.mark.parametrize("batch_size", [16]) + @pytest.mark.parametrize("shuffle", [True, False]) + @pytest.mark.parametrize("drop_last", [True, False]) + def test_get_dataloader_args_with_randomsampler(self, batch_size, shuffle, drop_last): + """ + 测试替换了 sampler 后 get_dataloader_args 的表现 + """ + dataset = PaddleNormalDataset() + batch_sampler = BatchSampler(dataset, batch_size=batch_size, drop_last=drop_last) + batch_sampler.sampler = RandomSampler(dataset, shuffle) + dataloader = DataLoader( + dataset, + batch_sampler=batch_sampler, + ) + res = PaddleSingleDriver.get_dataloader_args(dataloader) + + assert isinstance(res.dataset, PaddleNormalDataset) + assert isinstance(res.batch_sampler, BatchSampler) + assert isinstance(res.sampler, RandomSampler) + assert res.shuffle == shuffle + assert res.batch_size == batch_size + assert res.drop_last == drop_last + + +############################################################################ +# +# 测试 PaddleSingleDrvier 中的一些简单函数 +# +############################################################################ class TestSingleDeviceFunction: """ @@ -242,6 +343,12 @@ class TestSingleDeviceFunction: self.driver.move_data_to_device(paddle.rand((32, 64))) +############################################################################ +# +# 测试 set_dist_repro_dataloader 函数 +# +############################################################################ + class TestSetDistReproDataloder: """ 专门测试 set_dist_repro_dataloader 函数的类 @@ -423,287 +530,204 @@ class TestSetDistReproDataloder: assert len(left_idxes) + len(already_seen_idx) == len(self.dataset) assert len(left_idxes | already_seen_idx) == len(self.dataset) -class TestPaddleDriverFunctions: +############################################################################ +# +# 测试 save 和 load 相关的功能 +# +############################################################################ + +def generate_random_driver(features, labels): """ - 使用 PaddleSingleDriver 测试基类的函数 + 生成driver """ + model = PaddleNormalModel_Classification_1(labels, features) + opt = paddle.optimizer.Adam(parameters=model.parameters(), learning_rate=0.01) + driver = PaddleSingleDriver(model, device="cpu") + driver.set_optimizers(opt) + driver.setup() - @classmethod - def setup_class(self): - model = PaddleNormalModel_Classification_1(10, 32) - self.driver = PaddleSingleDriver(model, device="cpu") - - def test_check_single_optimizer_legality(self): - """ - 测试传入单个optimizer时的表现 - """ - optimizer = paddle.optimizer.Adam( - parameters=self.driver.model.parameters(), - learning_rate=0.01 - ) - - self.driver.set_optimizers(optimizer) + return driver - optimizer = torch.optim.Adam(TorchNormalModel_Classification_1(10, 32).parameters(), 0.01) - # 传入torch的optimizer时,应该报错ValueError - with pytest.raises(ValueError): - self.driver.set_optimizers(optimizer) +@pytest.fixture +def prepare_test_save_load(): + dataset = PaddleRandomMaxDataset(320, 10) + dataloader = DataLoader(dataset, batch_size=32) + driver1, driver2 = generate_random_driver(10, 10), generate_random_driver(10, 10) + return driver1, driver2, dataloader - def test_check_optimizers_legality(self): - """ - 测试传入optimizer list的表现 - """ - optimizers = [ - paddle.optimizer.Adam( - parameters=self.driver.model.parameters(), - learning_rate=0.01 - ) for i in range(10) - ] +@pytest.mark.parametrize("only_state_dict", ([True, False])) +def test_save_and_load_model(prepare_test_save_load, only_state_dict): + """ + 测试 save_model 和 load_model 函数 + """ + try: + path = "model" + driver1, driver2, dataloader = prepare_test_save_load - self.driver.set_optimizers(optimizers) + if only_state_dict: + driver1.save_model(path, only_state_dict) + else: + driver1.save_model(path, only_state_dict, input_spec=[paddle.ones((32, 10))]) + driver2.load_model(path, only_state_dict) - optimizers += [ - torch.optim.Adam(TorchNormalModel_Classification_1(10, 32).parameters(), 0.01) - ] + for batch in dataloader: + batch = driver1.move_data_to_device(batch) + res1 = driver1.model.evaluate_step(**batch) + res2 = driver2.model.evaluate_step(**batch) - with pytest.raises(ValueError): - self.driver.set_optimizers(optimizers) + assert paddle.equal_all(res1["pred"], res2["pred"]) + finally: + if only_state_dict: + synchronize_safe_rm(path) + else: + synchronize_safe_rm(path + ".pdiparams") + synchronize_safe_rm(path + ".pdiparams.info") + synchronize_safe_rm(path + ".pdmodel") - def test_check_dataloader_legality_in_train(self): - """ - 测试is_train参数为True时,_check_dataloader_legality函数的表现 - """ - dataloader = paddle.io.DataLoader(PaddleNormalDataset()) - PaddleSingleDriver.check_dataloader_legality(dataloader, "dataloader", True) +@pytest.mark.parametrize("only_state_dict", ([True, False])) +def test_save_and_load_with_randombatchsampler(only_state_dict): + """ + 测试save和load函数,主要测试 dataloader 被替换了 sampler 之后的情况 + """ - # batch_size 和 batch_sampler 均为 None 的情形 - dataloader = paddle.io.DataLoader(PaddleNormalDataset(), batch_size=None) - with pytest.raises(ValueError): - PaddleSingleDriver.check_dataloader_legality(dataloader, "dataloader", True) + try: + path = "model.ckp" - # 创建torch的dataloader - dataloader = torch.utils.data.DataLoader( - TorchNormalDataset(), - batch_size=32, shuffle=True + driver1, driver2 = generate_random_driver(10, 10), generate_random_driver(10, 10) + dataset = PaddleRandomMaxDataset(40, 10) + dataloader = DataLoader( + dataset=dataset, + batch_sampler=RandomBatchSampler(BatchSampler(dataset, batch_size=4), 4, False) ) - with pytest.raises(ValueError): - PaddleSingleDriver.check_dataloader_legality(dataloader, "dataloader", True) + num_consumed_batches = 2 - def test_check_dataloader_legality_in_test(self): - """ - 测试is_train参数为False时,_check_dataloader_legality函数的表现 - """ - # 此时传入的应该是dict - dataloader = { - "train": paddle.io.DataLoader(PaddleNormalDataset()), - "test":paddle.io.DataLoader(PaddleNormalDataset()) - } - PaddleSingleDriver.check_dataloader_legality(dataloader, "dataloader", False) - - # batch_size 和 batch_sampler 均为 None 的情形 - dataloader = { - "train": paddle.io.DataLoader(PaddleNormalDataset()), - "test":paddle.io.DataLoader(PaddleNormalDataset(), batch_size=None) - } - with pytest.raises(ValueError): - PaddleSingleDriver.check_dataloader_legality(dataloader, "dataloader", False) - - # 传入的不是dict,应该报错 - dataloader = paddle.io.DataLoader(PaddleNormalDataset()) - with pytest.raises(ValueError): - PaddleSingleDriver.check_dataloader_legality(dataloader, "dataloader", False) + already_seen_x_set = set() + already_seen_y_set = set() + for idx, batch in enumerate(dataloader): + if idx >= num_consumed_batches: + break + already_seen_x_set.update(batch["x"]) + already_seen_y_set.update(batch["y"]) - # 创建torch的dataloader - train_loader = torch.utils.data.DataLoader( - TorchNormalDataset(), - batch_size=32, shuffle=True - ) - test_loader = torch.utils.data.DataLoader( - TorchNormalDataset(), - batch_size=32, shuffle=True + sampler_states = dataloader.batch_sampler.state_dict() + save_states = {"num_consumed_batches": num_consumed_batches} + if only_state_dict: + driver1.save(Path(path), save_states, dataloader, only_state_dict, should_save_model=True) + else: + driver1.save(Path(path), save_states, dataloader, only_state_dict, should_save_model=True, input_spec=[paddle.ones((16, 10))]) + # 加载 + # 更改 batch_size + dataloader = DataLoader( + dataset=dataset, + batch_sampler=RandomBatchSampler(BatchSampler(dataset, batch_size=2, shuffle=True), 2, False) ) - dataloader = {"train": train_loader, "test": test_loader} - with pytest.raises(ValueError): - PaddleSingleDriver.check_dataloader_legality(dataloader, "dataloader", False) - - def test_tensor_to_numeric(self): - """ - 测试tensor_to_numeric函数 - """ - # 单个张量 - tensor = paddle.to_tensor(3) - res = PaddleSingleDriver.tensor_to_numeric(tensor) - assert res == 3 - - tensor = paddle.rand((3, 4)) - res = PaddleSingleDriver.tensor_to_numeric(tensor) - assert res == tensor.tolist() - - # 张量list - tensor_list = [paddle.rand((6, 4, 2)) for i in range(10)] - res = PaddleSingleDriver.tensor_to_numeric(tensor_list) - assert isinstance(res, list) - tensor_list = [t.tolist() for t in tensor_list] - assert res == tensor_list - - # 张量tuple - tensor_tuple = tuple([paddle.rand((6, 4, 2)) for i in range(10)]) - res = PaddleSingleDriver.tensor_to_numeric(tensor_tuple) - assert isinstance(res, tuple) - tensor_tuple = tuple([t.tolist() for t in tensor_tuple]) - assert res == tensor_tuple - - # 张量dict - tensor_dict = { - "tensor": paddle.rand((3, 4)), - "list": [paddle.rand((6, 4, 2)) for i in range(10)], - "dict":{ - "list": [paddle.rand((6, 4, 2)) for i in range(10)], - "tensor": paddle.rand((3, 4)) - }, - "int": 2, - "string": "test string" - } - - res = PaddleSingleDriver.tensor_to_numeric(tensor_dict) - assert isinstance(res, dict) - assert res["tensor"] == tensor_dict["tensor"].tolist() - assert isinstance(res["list"], list) - for r, d in zip(res["list"], tensor_dict["list"]): - assert r == d.tolist() - assert isinstance(res["int"], int) - assert isinstance(res["string"], str) - assert isinstance(res["dict"], dict) - assert isinstance(res["dict"]["list"], list) - for r, d in zip(res["dict"]["list"], tensor_dict["dict"]["list"]): - assert r == d.tolist() - assert res["dict"]["tensor"] == tensor_dict["dict"]["tensor"].tolist() + load_states = driver2.load(Path(path), dataloader, only_state_dict, should_load_model=True) + replaced_loader = load_states.pop("dataloader") + # 1. 检查 optimizer 的状态 + # TODO optimizer 的 state_dict 总是为空 - def test_set_model_mode(self): - """ - 测试set_model_mode函数 - """ - self.driver.set_model_mode("train") - assert self.driver.model.training - self.driver.set_model_mode("eval") - assert not self.driver.model.training - # 应该报错 - with pytest.raises(AssertionError): - self.driver.set_model_mode("test") + # 2. 检查 batch_sampler 是否被正确地加载和替换 + assert not (replaced_loader is dataloader) + assert replaced_loader.batch_sampler is dataloader.batch_sampler + assert isinstance(replaced_loader.batch_sampler, RandomBatchSampler) + assert replaced_loader.batch_sampler.index_list == sampler_states["index_list"] + assert replaced_loader.batch_sampler.num_consumed_samples == num_consumed_batches * 4 - def test_move_model_to_device_cpu(self): - """ - 测试move_model_to_device函数 - """ - PaddleSingleDriver.move_model_to_device(self.driver.model, "cpu") - assert self.driver.model.linear1.weight.place.is_cpu_place() + # 3. 检查 model 的参数是否正确 + # 4. 检查 batch_idx + start_batch = load_states.pop('batch_idx_in_epoch') + assert start_batch == 2 * num_consumed_batches + left_x_batches = set() + left_y_batches = set() + for idx, batch in enumerate(replaced_loader): - def test_move_model_to_device_gpu(self): - """ - 测试move_model_to_device函数 - """ - PaddleSingleDriver.move_model_to_device(self.driver.model, "gpu") - assert self.driver.model.linear1.weight.place.is_gpu_place() - assert self.driver.model.linear1.weight.place.gpu_device_id() == 0 + left_x_batches.update(batch["x"]) + left_y_batches.update(batch["y"]) + res1 = driver1.model.evaluate_step(**batch) + res2 = driver2.model.evaluate_step(**batch) + assert paddle.equal_all(res1["pred"], res2["pred"]) - def test_worker_init_function(self): - """ - 测试worker_init_function - """ - # 先确保不影响运行 - # TODO:正确性 - PaddleSingleDriver.worker_init_function(0) + assert len(left_x_batches) + len(already_seen_x_set) == len(dataset) + assert len(left_x_batches | already_seen_x_set) == len(dataset) + assert len(left_y_batches) + len(already_seen_y_set) == len(dataset) + assert len(left_y_batches | already_seen_y_set) == len(dataset) + finally: + synchronize_safe_rm(path) - def test_set_deterministic_dataloader(self): - """ - 测试set_deterministic_dataloader - """ - # 先确保不影响运行 - # TODO:正确性 - dataloader = DataLoader(PaddleNormalDataset()) - self.driver.set_deterministic_dataloader(dataloader) +@pytest.mark.parametrize("only_state_dict", ([True, False])) +def test_save_and_load_with_randomsampler(only_state_dict): + """ + 测试save和load函数,主要测试 dataloader 被替换了 batch_sampler 的情况 + """ - def test_set_sampler_epoch(self): - """ - 测试set_sampler_epoch - """ - # 先确保不影响运行 - # TODO:正确性 - dataloader = DataLoader(PaddleNormalDataset()) - self.driver.set_sampler_epoch(dataloader, 0) + try: + path = "model.ckp" - @pytest.mark.parametrize("batch_size", [16]) - @pytest.mark.parametrize("shuffle", [True, False]) - @pytest.mark.parametrize("drop_last", [True, False]) - def test_get_dataloader_args(self, batch_size, shuffle, drop_last): - """ - 测试正常情况下 get_dataloader_args 的表现 - """ + driver1, driver2 = generate_random_driver(10, 10), generate_random_driver(10, 10) + dataset = PaddleRandomMaxDataset(40, 10) + batch_sampler = BatchSampler(dataset=dataset, batch_size=4) + batch_sampler.sampler = RandomSampler(dataset, True) dataloader = DataLoader( - PaddleNormalDataset(), - batch_size=batch_size, - shuffle=shuffle, - drop_last=drop_last, + dataset, + batch_sampler=batch_sampler ) - res = PaddleSingleDriver.get_dataloader_args(dataloader) + num_consumed_batches = 2 - assert isinstance(res.dataset, PaddleNormalDataset) - assert isinstance(res.batch_sampler, BatchSampler) - if shuffle: - assert isinstance(res.sampler, paddle.io.RandomSampler) - else: - assert isinstance(res.sampler, paddle.io.SequenceSampler) - assert res.shuffle == shuffle - assert res.batch_size == batch_size - assert res.drop_last == drop_last + already_seen_x_set = set() + already_seen_y_set = set() + for idx, batch in enumerate(dataloader): + if idx >= num_consumed_batches: + break + already_seen_x_set.update(batch["x"]) + already_seen_y_set.update(batch["y"]) - @pytest.mark.parametrize("batch_size", [16]) - @pytest.mark.parametrize("shuffle", [True, False]) - @pytest.mark.parametrize("drop_last", [True, False]) - def test_get_dataloader_args_with_randombatchsampler(self, batch_size, shuffle, drop_last): - """ - 测试替换了 batch_sampler 后 get_dataloader_args 的表现 - """ - dataset = PaddleNormalDataset() + sampler_states = dataloader.batch_sampler.sampler.state_dict() + save_states = {"num_consumed_batches": num_consumed_batches} + if only_state_dict: + driver1.save(Path(path), save_states, dataloader, only_state_dict, should_save_model=True) + else: + driver1.save(Path(path), save_states, dataloader, only_state_dict, should_save_model=True, input_spec=[paddle.ones((16, 10))]) + + # 加载 + # 更改 batch_size + batch_sampler = BatchSampler(dataset=dataset, batch_size=2) + batch_sampler.sampler = RandomSampler(dataset, True) dataloader = DataLoader( dataset, - batch_sampler=RandomBatchSampler( - BatchSampler(dataset, batch_size=batch_size, shuffle=shuffle), - batch_size, - drop_last, - ) + batch_sampler=batch_sampler ) - res = PaddleSingleDriver.get_dataloader_args(dataloader) + load_states = driver2.load(Path(path), dataloader, only_state_dict, should_load_model=True) + replaced_loader = load_states.pop("dataloader") - assert isinstance(res.dataset, PaddleNormalDataset) - assert isinstance(res.batch_sampler, RandomBatchSampler) - if shuffle: - assert isinstance(res.sampler, paddle.io.RandomSampler) - else: - assert isinstance(res.sampler, paddle.io.SequenceSampler) - assert res.shuffle == shuffle - assert res.batch_size == batch_size - assert res.drop_last == drop_last + # 1. 检查 optimizer 的状态 + # TODO optimizer 的 state_dict 总是为空 - @pytest.mark.parametrize("batch_size", [16]) - @pytest.mark.parametrize("shuffle", [True, False]) - @pytest.mark.parametrize("drop_last", [True, False]) - def test_get_dataloader_args_with_randomsampler(self, batch_size, shuffle, drop_last): - """ - 测试替换了 sampler 后 get_dataloader_args 的表现 - """ - dataset = PaddleNormalDataset() - batch_sampler = BatchSampler(dataset, batch_size=batch_size, drop_last=drop_last) - batch_sampler.sampler = RandomSampler(dataset, shuffle) - dataloader = DataLoader( - dataset, - batch_sampler=batch_sampler, - ) - res = PaddleSingleDriver.get_dataloader_args(dataloader) + # 2. 检查 sampler 是否被正确地加载和替换 + assert not (replaced_loader is dataloader) + assert isinstance(replaced_loader.batch_sampler.sampler, RandomSampler) + assert replaced_loader.batch_sampler.sampler.seed == sampler_states["seed"] + assert replaced_loader.batch_sampler.sampler.epoch == sampler_states["epoch"] + assert replaced_loader.batch_sampler.sampler.num_consumed_samples == 4 * num_consumed_batches + assert len(replaced_loader.batch_sampler.sampler.dataset) == sampler_states["length"] + assert replaced_loader.batch_sampler.sampler.shuffle == sampler_states["shuffle"] - assert isinstance(res.dataset, PaddleNormalDataset) - assert isinstance(res.batch_sampler, BatchSampler) - assert isinstance(res.sampler, RandomSampler) - assert res.shuffle == shuffle - assert res.batch_size == batch_size - assert res.drop_last == drop_last \ No newline at end of file + # 3. 检查 model 的参数是否正确 + # 4. 检查 batch_idx + start_batch = load_states.pop('batch_idx_in_epoch') + assert start_batch == 2 * num_consumed_batches + left_x_batches = set() + left_y_batches = set() + for idx, batch in enumerate(replaced_loader): + + left_x_batches.update(batch["x"]) + left_y_batches.update(batch["y"]) + res1 = driver1.model.evaluate_step(**batch) + res2 = driver2.model.evaluate_step(**batch) + assert paddle.equal_all(res1["pred"], res2["pred"]) + + assert len(left_x_batches) + len(already_seen_x_set) == len(dataset) + assert len(left_x_batches | already_seen_x_set) == len(dataset) + assert len(left_y_batches) + len(already_seen_y_set) == len(dataset) + assert len(left_y_batches | already_seen_y_set) == len(dataset) + finally: + synchronize_safe_rm(path) From 665d79a3ede01e6252d6ddf9d867900c14adf998 Mon Sep 17 00:00:00 2001 From: MorningForest <2297662686@qq.com> Date: Fri, 15 Apr 2022 20:03:44 +0800 Subject: [PATCH 4/7] =?UTF-8?q?=E5=A2=9E=E5=8A=A0paddle=E5=8D=95=E5=8D=A1?= =?UTF-8?q?=E7=9A=84accuracy=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../metrics/backend/paddle_backend/backend.py | 3 +- .../metrics/backend/torch_backend/backend.py | 3 +- tests/core/metrics/test_accutacy_paddle.py | 59 +++++++++++++++++++ 3 files changed, 62 insertions(+), 3 deletions(-) create mode 100644 tests/core/metrics/test_accutacy_paddle.py diff --git a/fastNLP/core/metrics/backend/paddle_backend/backend.py b/fastNLP/core/metrics/backend/paddle_backend/backend.py index 12216d4b..7a7e7f7a 100644 --- a/fastNLP/core/metrics/backend/paddle_backend/backend.py +++ b/fastNLP/core/metrics/backend/paddle_backend/backend.py @@ -14,11 +14,13 @@ if _NEED_IMPORT_PADDLE: import paddle.distributed as dist from paddle.fluid.dygraph import parallel_helper + def _simple_gather_all_tensors(result, group: Any, world_size: int) -> List: gathered_result = [paddle.zeros_like(result) for _ in range(world_size)] dist.all_gather(gathered_result, result, group) return gathered_result + class PaddleBackend(Backend): def __init__(self): super().__init__() @@ -124,4 +126,3 @@ class PaddleBackend(Backend): # TODO 如果在这里处理的话,会不会在别的地方引起bug? device = get_device_from_visible(device) return paddle_to(tensor, device) - diff --git a/fastNLP/core/metrics/backend/torch_backend/backend.py b/fastNLP/core/metrics/backend/torch_backend/backend.py index 8945ab01..a602434e 100644 --- a/fastNLP/core/metrics/backend/torch_backend/backend.py +++ b/fastNLP/core/metrics/backend/torch_backend/backend.py @@ -11,7 +11,6 @@ from fastNLP.core.drivers.torch_driver.dist_utils import fastnlp_torch_all_gathe if _NEED_IMPORT_TORCH: import torch import torch.distributed as dist - import torch.nn.functional as F def _simple_gather_all_tensors(result, group: Any, world_size: int) -> List: @@ -33,7 +32,7 @@ class TorchBackend(Backend): if dist.is_initialized(): if method is None: raise AggregateMethodError(should_have_aggregate_method=True) - tensor = fastnlp_torch_all_gather(tensor) + tensor = self.all_gather_object(tensor) if isinstance(tensor[0], torch.Tensor): tensor = torch.stack(tensor) # 第一步, aggregate结果 diff --git a/tests/core/metrics/test_accutacy_paddle.py b/tests/core/metrics/test_accutacy_paddle.py new file mode 100644 index 00000000..1580d3a7 --- /dev/null +++ b/tests/core/metrics/test_accutacy_paddle.py @@ -0,0 +1,59 @@ +import os + +import pytest +import paddle +import paddle.distributed +import paddle.distributed.fleet.base.role_maker as role_maker +import paddle.distributed.fleet as fleet +from fastNLP.core.metrics import Accuracy +from fastNLP.core.drivers.paddle_driver.fleet_launcher import FleetLauncher + +############################################################################ +# +# 测试 单机单卡情况下的Accuracy +# +############################################################################ +def test_accuracy_single(): + pred = paddle.to_tensor([[1.19812393, -0.82041764, -0.53517765, -0.73061031, -1.45006669, + 0.46514302], + [-0.85775983, -2.18273783, -1.07505429, -1.45561373, 0.40011844, + 1.02202022], + [-0.39487389, 0.65682763, -0.62424040, 0.53692561, -0.28390560, + -0.02559055], + [-0.22586937, -0.07676325, -0.95977223, 0.36395910, -0.91758579, + -0.83857095], + [0.25136873, 2.49652624, 1.06251311, 1.60194016, 1.01451588, + 0.08403367], + [0.10844281, 1.19017303, -0.11378096, 1.12686944, -0.08654942, + 0.48605862], + [1.27320433, -1.13902378, 1.47072780, -0.98665696, -0.42589864, + 0.64618838], + [0.83809763, -0.05356205, 0.03042423, -0.28371972, 0.81611472, + -0.45802942], + [0.38535264, 0.09721313, 2.27187467, 0.32045507, -0.20711982, + -0.13550705], + [-0.75228405, -1.34161997, 1.08697927, 0.33218071, -1.19470012, + 2.58735061]]) + tg = paddle.to_tensor([1, 2, 1, 3, 5, 4, 4, 2, 1, 5]) + acc_metric = Accuracy() + acc_metric.update(pred, tg) + result = acc_metric.get_metric() + true_result = {'acc': 0.3} + assert true_result == result + + +############################################################################ +# +# 测试 单机多卡情况下的Accuracy +# +############################################################################ +def test_accuracy_ddp(): + launcher = FleetLauncher(devices=[0, 1]) + launcher.launch() + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + if fleet.is_server(): + pass + elif fleet.is_worker(): + print(os.getenv("PADDLE_TRAINER_ID")) + From 262bc1a82e8cfdf4dfbab4a8e5eec83afd823b40 Mon Sep 17 00:00:00 2001 From: x54-729 <17307130121@fudan.edu.cn> Date: Fri, 15 Apr 2022 14:03:08 +0000 Subject: [PATCH 5/7] small --- fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py b/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py index 98655757..2cba6388 100644 --- a/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py +++ b/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py @@ -28,7 +28,7 @@ def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[ """ if is_in_paddle_launch_dist(): if device is not None: - logger.warning("Parameter `device` would be ignored when you are using `paddle.distributed.launch` to pull " + logger.warning_once("Parameter `device` would be ignored when you are using `paddle.distributed.launch` to pull " "up your script. And we will directly get the local device via " "and `os.environ['CUDA_VISIBLE_DEVICES']``.") device = [int(g) for g in os.environ["CUDA_VISIBLE_DEVICES"].split(",")] From 5d1ac72ec9f279c4ab4af4f1fa892bbcd34d8281 Mon Sep 17 00:00:00 2001 From: x54-729 <17307130121@fudan.edu.cn> Date: Fri, 15 Apr 2022 14:38:29 +0000 Subject: [PATCH 6/7] =?UTF-8?q?=E5=8A=A0=E8=BD=BDfp16=E6=97=B6=E5=90=8C?= =?UTF-8?q?=E6=97=B6=E8=AE=BE=E7=BD=AEauto=5Fcast=E5=92=8Cfp16=E5=B1=9E?= =?UTF-8?q?=E6=80=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/drivers/torch_driver/torch_driver.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fastNLP/core/drivers/torch_driver/torch_driver.py b/fastNLP/core/drivers/torch_driver/torch_driver.py index f00d3f1f..3630b593 100644 --- a/fastNLP/core/drivers/torch_driver/torch_driver.py +++ b/fastNLP/core/drivers/torch_driver/torch_driver.py @@ -259,6 +259,8 @@ class TorchDriver(Driver): grad_scaler_state_dict = states.pop('grad_scaler_state_dict') if not isinstance(self.grad_scaler, DummyGradScaler): self.grad_scaler.load_state_dict(grad_scaler_state_dict) + self.auto_cast = torch.cuda.amp.autocast + self.fp16 = True logger.debug("Load grad_scaler state dict...") elif not isinstance(self.grad_scaler, DummyGradScaler): logger.warning(f"Checkpoint {folder} is not trained with fp16=True, while resume to a fp16=True training, " From 8cda30c426db9099ef862e8250abf0f9e07449d9 Mon Sep 17 00:00:00 2001 From: x54-729 <17307130121@fudan.edu.cn> Date: Fri, 15 Apr 2022 15:10:35 +0000 Subject: [PATCH 7/7] small --- fastNLP/core/drivers/torch_driver/torch_driver.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fastNLP/core/drivers/torch_driver/torch_driver.py b/fastNLP/core/drivers/torch_driver/torch_driver.py index 3630b593..2a04e62f 100644 --- a/fastNLP/core/drivers/torch_driver/torch_driver.py +++ b/fastNLP/core/drivers/torch_driver/torch_driver.py @@ -255,13 +255,14 @@ class TorchDriver(Driver): logger.debug("Load model...") # 3. 加载fp16的状态 - if 'grad_scaler_state_dict' in states: - grad_scaler_state_dict = states.pop('grad_scaler_state_dict') - if not isinstance(self.grad_scaler, DummyGradScaler): - self.grad_scaler.load_state_dict(grad_scaler_state_dict) - self.auto_cast = torch.cuda.amp.autocast + if "grad_scaler_state_dict" in states: + grad_scaler_state_dict = states.pop("grad_scaler_state_dict") + if isinstance(self.grad_scaler, DummyGradScaler): + self.auto_cast, _grad_scaler = _build_fp16_env(dummy=False) + self.grad_scaler = _grad_scaler() self.fp16 = True - logger.debug("Load grad_scaler state dict...") + self.grad_scaler.load_state_dict(grad_scaler_state_dict) + logger.debug("Load grad_scaler state dict...") elif not isinstance(self.grad_scaler, DummyGradScaler): logger.warning(f"Checkpoint {folder} is not trained with fp16=True, while resume to a fp16=True training, " f"the training process may be unstable.")