diff --git a/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py b/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py index 9a9d4198..c0489e6e 100644 --- a/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py +++ b/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py @@ -14,7 +14,7 @@ if _NEED_IMPORT_PADDLE: import paddle def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[int]]], - model: paddle.nn.Layer, **kwargs) -> PaddleDriver: + model: "paddle.nn.Layer", **kwargs) -> PaddleDriver: r""" 用来根据参数 `driver` 和 `device` 来确定并且初始化一个具体的 `Driver` 实例然后返回回去; 1、如果检测到当前进程为用户通过 `python -m paddle.distributed.launch xxx.py` 方式拉起的,则将 diff --git a/fastNLP/core/drivers/torch_driver/initialize_torch_driver.py b/fastNLP/core/drivers/torch_driver/initialize_torch_driver.py index 5ee946c4..7cef7316 100644 --- a/fastNLP/core/drivers/torch_driver/initialize_torch_driver.py +++ b/fastNLP/core/drivers/torch_driver/initialize_torch_driver.py @@ -11,8 +11,8 @@ from fastNLP.core.log import logger from fastNLP.envs import FASTNLP_BACKEND_LAUNCH -def initialize_torch_driver(driver: str, device: Optional[Union[str, torch.device, int, List[int]]], - model: torch.nn.Module, **kwargs) -> TorchDriver: +def initialize_torch_driver(driver: str, device: Optional[Union[str, "torch.device", int, List[int]]], + model: "torch.nn.Module", **kwargs) -> TorchDriver: r""" 用来根据参数 `driver` 和 `device` 来确定并且初始化一个具体的 `Driver` 实例然后返回回去; 注意如果输入的 `device` 如果和 `driver` 对应不上就直接报错; diff --git a/fastNLP/core/metrics/accuracy.py b/fastNLP/core/metrics/accuracy.py index d9ccb332..0869d8c8 100644 --- a/fastNLP/core/metrics/accuracy.py +++ b/fastNLP/core/metrics/accuracy.py @@ -28,7 +28,7 @@ class Accuracy(Metric): def get_metric(self) -> dict: r""" - get_metric 函数将根据 evaluate 函数累计的评价指标统计量来计算最终的评价结果. + get_metric 函数将根据 update 函数累计的评价指标统计量来计算最终的评价结果. :return dict evaluate_result: {"acc": float} """ @@ -37,7 +37,7 @@ class Accuracy(Metric): def update(self, pred, target, seq_len=None): r""" - evaluate函数将针对一个批次的预测结果做评价指标的累计 + update 函数将针对一个批次的预测结果做评价指标的累计 :param torch.Tensor pred: 预测的tensor, tensor的形状可以是torch.Size([B,]), torch.Size([B, n_classes]), torch.Size([B, max_len]), 或者torch.Size([B, max_len, n_classes]) diff --git a/fastNLP/core/metrics/classify_f1_pre_rec_metric.py b/fastNLP/core/metrics/classify_f1_pre_rec_metric.py index 2c71602d..8de007ce 100644 --- a/fastNLP/core/metrics/classify_f1_pre_rec_metric.py +++ b/fastNLP/core/metrics/classify_f1_pre_rec_metric.py @@ -56,7 +56,7 @@ class ClassifyFPreRecMetric(Metric): def get_metric(self) -> dict: r""" - get_metric函数将根据evaluate函数累计的评价指标统计量来计算最终的评价结果. + get_metric函数将根据update函数累计的评价指标统计量来计算最终的评价结果. 
:return dict evaluate_result: {"acc": float} """ @@ -117,7 +117,7 @@ class ClassifyFPreRecMetric(Metric): def update(self, pred, target, seq_len=None): r""" - evaluate函数将针对一个批次的预测结果做评价指标的累计 + update 函数将针对一个批次的预测结果做评价指标的累计 :param torch.Tensor pred: 预测的tensor, tensor的形状可以是torch.Size([B,]), torch.Size([B, n_classes]), torch.Size([B, max_len]), 或者torch.Size([B, max_len, n_classes]) diff --git a/fastNLP/core/metrics/utils.py b/fastNLP/core/metrics/utils.py index ce6f618b..6d3fd74a 100644 --- a/fastNLP/core/metrics/utils.py +++ b/fastNLP/core/metrics/utils.py @@ -11,9 +11,8 @@ _IS_ALLENNLP_AVAILABLE = _module_available('allennlp') if _IS_ALLENNLP_AVAILABLE: from allennlp.training.metrics import Metric as allennlp_Metric -if _NEED_IMPORT_TORCH and _IS_TORCHMETRICS_AVAILABLE: - if _IS_TORCHMETRICS_AVAILABLE: - from torchmetrics import Metric as torchmetrics_Metric +if _IS_TORCHMETRICS_AVAILABLE: + from torchmetrics import Metric as torchmetrics_Metric if _NEED_IMPORT_PADDLE: from paddle.metric import Metric as paddle_Metric diff --git a/tests/core/controllers/test_trainer_fleet.py b/tests/core/controllers/_test_trainer_fleet.py similarity index 98% rename from tests/core/controllers/test_trainer_fleet.py rename to tests/core/controllers/_test_trainer_fleet.py index 46201c67..f438b6de 100644 --- a/tests/core/controllers/test_trainer_fleet.py +++ b/tests/core/controllers/_test_trainer_fleet.py @@ -4,7 +4,6 @@ python -m paddle.distributed.launch --gpus=0,2,3 test_trainer_fleet.py """ import os -os.environ["FASTNLP_BACKEND"] = "paddle" import sys sys.path.append("../../../") diff --git a/tests/core/controllers/test_trainer_fleet_outside.py b/tests/core/controllers/_test_trainer_fleet_outside.py similarity index 98% rename from tests/core/controllers/test_trainer_fleet_outside.py rename to tests/core/controllers/_test_trainer_fleet_outside.py index a48434fa..e8c9a244 100644 --- a/tests/core/controllers/test_trainer_fleet_outside.py +++ b/tests/core/controllers/_test_trainer_fleet_outside.py @@ -4,7 +4,6 @@ python -m paddle.distributed.launch --gpus=0,2,3 test_trainer_fleet_outside.py """ import os -os.environ["FASTNLP_BACKEND"] = "paddle" import sys sys.path.append("../../../") diff --git a/tests/core/controllers/test_trainer_paddle.py b/tests/core/controllers/test_trainer_paddle.py index 8a3ab2ce..aaf20105 100644 --- a/tests/core/controllers/test_trainer_paddle.py +++ b/tests/core/controllers/test_trainer_paddle.py @@ -1,6 +1,4 @@ import pytest -import os -os.environ["FASTNLP_BACKEND"] = "paddle" from dataclasses import dataclass from fastNLP.core.controllers.trainer import Trainer @@ -25,7 +23,7 @@ class TrainPaddleConfig: shuffle: bool = True evaluate_every = 2 -@pytest.mark.parametrize("driver,device", [("paddle", "cpu"), ("paddle", 1)]) +@pytest.mark.parametrize("driver,device", [("paddle", "cpu"), ("paddle", 1), ("fleet", [0, 1])]) # @pytest.mark.parametrize("driver,device", [("fleet", [0, 1])]) @pytest.mark.parametrize("callbacks", [[RecordMetricCallback(monitor="acc#acc", metric_threshold=0.0, larger_better=True), RichCallback(5)]]) diff --git a/tests/core/dataloaders/jittor_dataloader/__init__.py b/tests/core/dataloaders/jittor_dataloader/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/core/dataloaders/paddle_dataloader/__init__.py b/tests/core/dataloaders/paddle_dataloader/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/core/dataloaders/torch_dataloader/__init__.py b/tests/core/dataloaders/torch_dataloader/__init__.py new file mode 
100644 index 00000000..e69de29b diff --git a/tests/core/drivers/paddle_driver/test_dist_utils.py b/tests/core/drivers/paddle_driver/test_dist_utils.py index 9b81c38d..bd43378e 100644 --- a/tests/core/drivers/paddle_driver/test_dist_utils.py +++ b/tests/core/drivers/paddle_driver/test_dist_utils.py @@ -3,7 +3,6 @@ import sys import signal import pytest import traceback -os.environ["FASTNLP_BACKEND"] = "paddle" import numpy as np diff --git a/tests/core/drivers/paddle_driver/test_fleet.py b/tests/core/drivers/paddle_driver/test_fleet.py index 34c80888..6190dd8c 100644 --- a/tests/core/drivers/paddle_driver/test_fleet.py +++ b/tests/core/drivers/paddle_driver/test_fleet.py @@ -1,8 +1,6 @@ import pytest -import os from pathlib import Path -os.environ["FASTNLP_BACKEND"] = "paddle" from fastNLP.core.drivers.paddle_driver.fleet import PaddleFleetDriver from fastNLP.core.samplers import ( RandomSampler, diff --git a/tests/core/drivers/paddle_driver/test_initialize_paddle_driver.py b/tests/core/drivers/paddle_driver/test_initialize_paddle_driver.py index df96d746..c8b5bfff 100644 --- a/tests/core/drivers/paddle_driver/test_initialize_paddle_driver.py +++ b/tests/core/drivers/paddle_driver/test_initialize_paddle_driver.py @@ -1,8 +1,5 @@ -import os import pytest -os.environ["FASTNLP_BACKEND"] = "paddle" - from fastNLP.core.drivers import PaddleSingleDriver, PaddleFleetDriver from fastNLP.core.drivers.paddle_driver.initialize_paddle_driver import initialize_paddle_driver from fastNLP.envs import get_gpu_count diff --git a/tests/core/drivers/paddle_driver/test_single_device.py b/tests/core/drivers/paddle_driver/test_single_device.py index 2aa4e0e6..ec40e9f3 100644 --- a/tests/core/drivers/paddle_driver/test_single_device.py +++ b/tests/core/drivers/paddle_driver/test_single_device.py @@ -1,6 +1,3 @@ -import os -from re import S -os.environ["FASTNLP_BACKEND"] = "paddle" import pytest from pathlib import Path diff --git a/tests/core/drivers/paddle_driver/test_utils.py b/tests/core/drivers/paddle_driver/test_utils.py index 690d0fb8..69be8055 100644 --- a/tests/core/drivers/paddle_driver/test_utils.py +++ b/tests/core/drivers/paddle_driver/test_utils.py @@ -1,6 +1,4 @@ -import os import pytest -os.environ["FASTNLP_BACKEND"] = "paddle" from fastNLP.core.drivers.paddle_driver.utils import ( get_device_from_visible, diff --git a/tests/core/drivers/torch_driver/test.py b/tests/core/drivers/torch_driver/test.py new file mode 100644 index 00000000..3a1a280d --- /dev/null +++ b/tests/core/drivers/torch_driver/test.py @@ -0,0 +1,31 @@ +import sys +sys.path.append("../../../../") +from fastNLP.core.drivers.torch_driver.ddp import TorchDDPDriver +from tests.helpers.models.torch_model import TorchNormalModel_Classification_1 + +import torch + +device = [0, 1] +torch_model = TorchNormalModel_Classification_1(10, 10) +torch_opt = torch.optim.Adam(params=torch_model.parameters(), lr=0.01) +device = [torch.device(i) for i in device] +driver = TorchDDPDriver( + model=torch_model, + parallel_device=device, + fp16=False +) +driver.set_optimizers(torch_opt) +driver.setup() +print("-----------first--------------") + +device = [0, 2] +torch_model = TorchNormalModel_Classification_1(10, 10) +torch_opt = torch.optim.Adam(params=torch_model.parameters(), lr=0.01) +device = [torch.device(i) for i in device] +driver = TorchDDPDriver( + model=torch_model, + parallel_device=device, + fp16=False +) +driver.set_optimizers(torch_opt) +driver.setup() \ No newline at end of file diff --git a/tests/core/drivers/torch_driver/test_ddp.py 
b/tests/core/drivers/torch_driver/test_ddp.py index 0e91fe77..87787fbc 100644 --- a/tests/core/drivers/torch_driver/test_ddp.py +++ b/tests/core/drivers/torch_driver/test_ddp.py @@ -1,8 +1,6 @@ import pytest -import os from pathlib import Path -os.environ["FASTNLP_BACKEND"] = "torch" from fastNLP.core.drivers.torch_driver.ddp import TorchDDPDriver from fastNLP.core.samplers import ( RandomSampler, diff --git a/tests/core/drivers/torch_driver/test_initialize_torch_driver.py b/tests/core/drivers/torch_driver/test_initialize_torch_driver.py index 6c47e30e..3e612964 100644 --- a/tests/core/drivers/torch_driver/test_initialize_torch_driver.py +++ b/tests/core/drivers/torch_driver/test_initialize_torch_driver.py @@ -1,8 +1,5 @@ -import os import pytest -os.environ["FASTNLP_BACKEND"] = "torch" - from fastNLP.core.drivers import TorchSingleDriver, TorchDDPDriver from fastNLP.core.drivers.torch_driver.initialize_torch_driver import initialize_torch_driver from fastNLP.envs import get_gpu_count diff --git a/tests/core/drivers/torch_driver/test_single_device.py b/tests/core/drivers/torch_driver/test_single_device.py index b8a8def9..f46f69c0 100644 --- a/tests/core/drivers/torch_driver/test_single_device.py +++ b/tests/core/drivers/torch_driver/test_single_device.py @@ -1,5 +1,3 @@ -import os -os.environ["FASTNLP_BACKEND"] = "torch" import pytest from pathlib import Path diff --git a/tests/core/drivers/torch_driver/test_utils.py b/tests/core/drivers/torch_driver/test_utils.py index 8f0172e0..4df767b5 100644 --- a/tests/core/drivers/torch_driver/test_utils.py +++ b/tests/core/drivers/torch_driver/test_utils.py @@ -1,6 +1,4 @@ -import os import pytest -os.environ["FASTNLP_BACKEND"] = "torch" from fastNLP.core.drivers.torch_driver.utils import ( replace_batch_sampler, diff --git a/tests/core/log/test_logger.py b/tests/core/log/test_logger_torch.py similarity index 100% rename from tests/core/log/test_logger.py rename to tests/core/log/test_logger_torch.py diff --git a/tests/core/samplers/test_reproducible_batch_sampler.py b/tests/core/samplers/test_reproducible_batch_sampler.py index 3514c331..6cf4b7d4 100644 --- a/tests/core/samplers/test_reproducible_batch_sampler.py +++ b/tests/core/samplers/test_reproducible_batch_sampler.py @@ -9,153 +9,153 @@ from fastNLP.core.samplers import RandomBatchSampler, BucketedBatchSampler from fastNLP.core.drivers.torch_driver.utils import replace_batch_sampler from tests.helpers.datasets.torch_data import TorchNormalDataset - -class TestReproducibleBatchSampler: - # TODO 拆分测试,在这里只测试一个东西 - def test_torch_dataloader_1(self): - import torch - from torch.utils.data import DataLoader - # no shuffle - before_batch_size = 7 - dataset = TorchNormalDataset(num_of_data=100) - dataloader = DataLoader(dataset, batch_size=before_batch_size) - re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) - dataloader = replace_batch_sampler(dataloader, re_batchsampler) - - forward_steps = 3 - iter_dataloader = iter(dataloader) - for _ in range(forward_steps): - next(iter_dataloader) - - # 1. 保存状态 - _get_re_batchsampler = dataloader.batch_sampler - assert isinstance(_get_re_batchsampler, RandomBatchSampler) - state = _get_re_batchsampler.state_dict() - assert state == {"index_list": array("I", list(range(100))), "num_consumed_samples": forward_steps*before_batch_size, - "sampler_type": "RandomBatchSampler"} - - # 2. 
断点重训,重新生成一个 dataloader; - # 不改变 batch_size; - dataloader = DataLoader(dataset, batch_size=before_batch_size) - re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) - re_batchsampler.load_state_dict(state) - dataloader = replace_batch_sampler(dataloader, re_batchsampler) - - real_res = [] - supposed_res = (torch.tensor(list(range(21, 28))), torch.tensor(list(range(28, 35)))) - forward_steps = 2 - iter_dataloader = iter(dataloader) - for _ in range(forward_steps): - real_res.append(next(iter_dataloader)) - - for i in range(forward_steps): - assert all(real_res[i] == supposed_res[i]) - - # 改变 batch_size; - after_batch_size = 3 - dataloader = DataLoader(dataset, batch_size=after_batch_size) - re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) - re_batchsampler.load_state_dict(state) - dataloader = replace_batch_sampler(dataloader, re_batchsampler) - - real_res = [] - supposed_res = (torch.tensor(list(range(21, 24))), torch.tensor(list(range(24, 27)))) - forward_steps = 2 - iter_dataloader = iter(dataloader) - for _ in range(forward_steps): - real_res.append(next(iter_dataloader)) - - for i in range(forward_steps): - assert all(real_res[i] == supposed_res[i]) - - # 断点重训的第二轮是否是一个完整的 dataloader; - # 先把断点重训所在的那一个 epoch 跑完; - begin_idx = 27 - while True: - try: - data = next(iter_dataloader) - _batch_size = len(data) - assert all(data == torch.tensor(list(range(begin_idx, begin_idx + _batch_size)))) - begin_idx += _batch_size - except StopIteration: - break - - # 开始新的一轮; - begin_idx = 0 - iter_dataloader = iter(dataloader) - while True: - try: - data = next(iter_dataloader) - _batch_size = len(data) - assert all(data == torch.tensor(list(range(begin_idx, begin_idx + _batch_size)))) - begin_idx += _batch_size - except StopIteration: - break - - def test_torch_dataloader_2(self): - # 测试新的一轮的 index list 是重新生成的,而不是沿用上一轮的; - from torch.utils.data import DataLoader - # no shuffle - before_batch_size = 7 - dataset = TorchNormalDataset(num_of_data=100) - # 开启 shuffle,来检验断点重训后的第二轮的 index list 是不是重新生成的; - dataloader = DataLoader(dataset, batch_size=before_batch_size, shuffle=True) - re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) - dataloader = replace_batch_sampler(dataloader, re_batchsampler) - - # 将一轮的所有数据保存下来,看是否恢复的是正确的; - all_supposed_data = [] - forward_steps = 3 - iter_dataloader = iter(dataloader) - for _ in range(forward_steps): - all_supposed_data.extend(next(iter_dataloader).tolist()) - - # 1. 保存状态 - _get_re_batchsampler = dataloader.batch_sampler - assert isinstance(_get_re_batchsampler, RandomBatchSampler) - state = _get_re_batchsampler.state_dict() - - # 2. 
断点重训,重新生成一个 dataloader; - # 不改变 batch_size; - dataloader = DataLoader(dataset, batch_size=before_batch_size, shuffle=True) - re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) - re_batchsampler.load_state_dict(state) - dataloader = replace_batch_sampler(dataloader, re_batchsampler) - - # 先把这一轮的数据过完; - pre_index_list = dataloader.batch_sampler.state_dict()["index_list"] - while True: - try: - all_supposed_data.extend(next(iter_dataloader).tolist()) - except StopIteration: - break - assert all_supposed_data == list(pre_index_list) - - # 重新开启新的一轮; - for _ in range(3): - iter_dataloader = iter(dataloader) - res = [] - while True: - try: - res.append(next(iter_dataloader)) - except StopIteration: - break - - def test_3(self): - import torch - from torch.utils.data import DataLoader - before_batch_size = 7 - dataset = TorchNormalDataset(num_of_data=100) - # 开启 shuffle,来检验断点重训后的第二轮的 index list 是不是重新生成的; - dataloader = DataLoader(dataset, batch_size=before_batch_size) - - for idx, data in enumerate(dataloader): - if idx > 3: - break - - iterator = iter(dataloader) - for each in iterator: - pass +# +# class TestReproducibleBatchSampler: +# # TODO 拆分测试,在这里只测试一个东西 +# def test_torch_dataloader_1(self): +# import torch +# from torch.utils.data import DataLoader +# # no shuffle +# before_batch_size = 7 +# dataset = TorchNormalDataset(num_of_data=100) +# dataloader = DataLoader(dataset, batch_size=before_batch_size) +# re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) +# dataloader = replace_batch_sampler(dataloader, re_batchsampler) +# +# forward_steps = 3 +# iter_dataloader = iter(dataloader) +# for _ in range(forward_steps): +# next(iter_dataloader) +# +# # 1. 保存状态 +# _get_re_batchsampler = dataloader.batch_sampler +# assert isinstance(_get_re_batchsampler, RandomBatchSampler) +# state = _get_re_batchsampler.state_dict() +# assert state == {"index_list": array("I", list(range(100))), "num_consumed_samples": forward_steps*before_batch_size, +# "sampler_type": "RandomBatchSampler"} +# +# # 2. 
断点重训,重新生成一个 dataloader; +# # 不改变 batch_size; +# dataloader = DataLoader(dataset, batch_size=before_batch_size) +# re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) +# re_batchsampler.load_state_dict(state) +# dataloader = replace_batch_sampler(dataloader, re_batchsampler) +# +# real_res = [] +# supposed_res = (torch.tensor(list(range(21, 28))), torch.tensor(list(range(28, 35)))) +# forward_steps = 2 +# iter_dataloader = iter(dataloader) +# for _ in range(forward_steps): +# real_res.append(next(iter_dataloader)) +# +# for i in range(forward_steps): +# assert all(real_res[i] == supposed_res[i]) +# +# # 改变 batch_size; +# after_batch_size = 3 +# dataloader = DataLoader(dataset, batch_size=after_batch_size) +# re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) +# re_batchsampler.load_state_dict(state) +# dataloader = replace_batch_sampler(dataloader, re_batchsampler) +# +# real_res = [] +# supposed_res = (torch.tensor(list(range(21, 24))), torch.tensor(list(range(24, 27)))) +# forward_steps = 2 +# iter_dataloader = iter(dataloader) +# for _ in range(forward_steps): +# real_res.append(next(iter_dataloader)) +# +# for i in range(forward_steps): +# assert all(real_res[i] == supposed_res[i]) +# +# # 断点重训的第二轮是否是一个完整的 dataloader; +# # 先把断点重训所在的那一个 epoch 跑完; +# begin_idx = 27 +# while True: +# try: +# data = next(iter_dataloader) +# _batch_size = len(data) +# assert all(data == torch.tensor(list(range(begin_idx, begin_idx + _batch_size)))) +# begin_idx += _batch_size +# except StopIteration: +# break +# +# # 开始新的一轮; +# begin_idx = 0 +# iter_dataloader = iter(dataloader) +# while True: +# try: +# data = next(iter_dataloader) +# _batch_size = len(data) +# assert all(data == torch.tensor(list(range(begin_idx, begin_idx + _batch_size)))) +# begin_idx += _batch_size +# except StopIteration: +# break +# +# def test_torch_dataloader_2(self): +# # 测试新的一轮的 index list 是重新生成的,而不是沿用上一轮的; +# from torch.utils.data import DataLoader +# # no shuffle +# before_batch_size = 7 +# dataset = TorchNormalDataset(num_of_data=100) +# # 开启 shuffle,来检验断点重训后的第二轮的 index list 是不是重新生成的; +# dataloader = DataLoader(dataset, batch_size=before_batch_size, shuffle=True) +# re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) +# dataloader = replace_batch_sampler(dataloader, re_batchsampler) +# +# # 将一轮的所有数据保存下来,看是否恢复的是正确的; +# all_supposed_data = [] +# forward_steps = 3 +# iter_dataloader = iter(dataloader) +# for _ in range(forward_steps): +# all_supposed_data.extend(next(iter_dataloader).tolist()) +# +# # 1. 保存状态 +# _get_re_batchsampler = dataloader.batch_sampler +# assert isinstance(_get_re_batchsampler, RandomBatchSampler) +# state = _get_re_batchsampler.state_dict() +# +# # 2. 
断点重训,重新生成一个 dataloader; +# # 不改变 batch_size; +# dataloader = DataLoader(dataset, batch_size=before_batch_size, shuffle=True) +# re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) +# re_batchsampler.load_state_dict(state) +# dataloader = replace_batch_sampler(dataloader, re_batchsampler) +# +# # 先把这一轮的数据过完; +# pre_index_list = dataloader.batch_sampler.state_dict()["index_list"] +# while True: +# try: +# all_supposed_data.extend(next(iter_dataloader).tolist()) +# except StopIteration: +# break +# assert all_supposed_data == list(pre_index_list) +# +# # 重新开启新的一轮; +# for _ in range(3): +# iter_dataloader = iter(dataloader) +# res = [] +# while True: +# try: +# res.append(next(iter_dataloader)) +# except StopIteration: +# break +# +# def test_3(self): +# import torch +# from torch.utils.data import DataLoader +# before_batch_size = 7 +# dataset = TorchNormalDataset(num_of_data=100) +# # 开启 shuffle,来检验断点重训后的第二轮的 index list 是不是重新生成的; +# dataloader = DataLoader(dataset, batch_size=before_batch_size) +# +# for idx, data in enumerate(dataloader): +# if idx > 3: +# break +# +# iterator = iter(dataloader) +# for each in iterator: +# pass class DatasetWithVaryLength: diff --git a/tests/core/samplers/test_unrepeated_sampler.py b/tests/core/samplers/test_unrepeated_sampler.py index 4a271f41..39d4e34f 100644 --- a/tests/core/samplers/test_unrepeated_sampler.py +++ b/tests/core/samplers/test_unrepeated_sampler.py @@ -28,12 +28,12 @@ class TestUnrepeatedSampler: @pytest.mark.parametrize('num_replicas', [2, 3]) @pytest.mark.parametrize('num_of_data', [2, 3, 4, 100]) @pytest.mark.parametrize('shuffle', [False, True]) - def test_multi(self, num_replica, num_of_data, shuffle): + def test_multi(self, num_replicas, num_of_data, shuffle): data = DatasetWithVaryLength(num_of_data=num_of_data) samplers = [] - for i in range(num_replica): + for i in range(num_replicas): sampler = UnrepeatedRandomSampler(dataset=data, shuffle=shuffle) - sampler.set_distributed(num_replica, rank=i) + sampler.set_distributed(num_replicas, rank=i) samplers.append(sampler) indexes = list(chain(*samplers)) @@ -52,12 +52,12 @@ class TestUnrepeatedSortedSampler: @pytest.mark.parametrize('num_replicas', [2, 3]) @pytest.mark.parametrize('num_of_data', [2, 3, 4, 100]) - def test_multi(self, num_replica, num_of_data): + def test_multi(self, num_replicas, num_of_data): data = DatasetWithVaryLength(num_of_data=num_of_data) samplers = [] - for i in range(num_replica): + for i in range(num_replicas): sampler = UnrepeatedSortedSampler(dataset=data, length=data.data) - sampler.set_distributed(num_replica, rank=i) + sampler.set_distributed(num_replicas, rank=i) samplers.append(sampler) # 保证顺序是没乱的 @@ -83,12 +83,12 @@ class TestUnrepeatedSequentialSampler: @pytest.mark.parametrize('num_replicas', [2, 3]) @pytest.mark.parametrize('num_of_data', [2, 3, 4, 100]) - def test_multi(self, num_replica, num_of_data): + def test_multi(self, num_replicas, num_of_data): data = DatasetWithVaryLength(num_of_data=num_of_data) samplers = [] - for i in range(num_replica): + for i in range(num_replicas): sampler = UnrepeatedSequentialSampler(dataset=data, length=data.data) - sampler.set_distributed(num_replica, rank=i) + sampler.set_distributed(num_replicas, rank=i) samplers.append(sampler) # 保证顺序是没乱的 diff --git a/tutorials/fastnlp_tutorial_0.ipynb b/tutorials/fastnlp_tutorial_0.ipynb index 01913ac0..28fcfddf 100644 --- a/tutorials/fastnlp_tutorial_0.ipynb +++ b/tutorials/fastnlp_tutorial_0.ipynb @@ -15,15 +15,15 @@ 
"\n", "    1.3   trainer 内部初始化 evaluater\n", "\n", - "  2   使用 trainer 训练模型\n", + "  2   使用 fastNLP 0.8 搭建 argmax 模型\n", "\n", - "    2.1   argmax 模型实例\n", + "    2.1   trainer_step 和 evaluator_step\n", "\n", - "    2.2   trainer 的参数匹配\n", + "    2.2   trainer 和 evaluator 的参数匹配\n", "\n", - "    2.3   trainer 的实际使用 \n", + "    2.3   一个实际案例:argmax 模型\n", "\n", - "  3   使用 evaluator 评测模型\n", + "  3   使用 fastNLP 0.8 训练 argmax 模型\n", " \n", "    3.1   trainer 外部初始化的 evaluator\n", "\n", @@ -50,21 +50,21 @@ "\n", "```python\n", "trainer = Trainer(\n", - " model=model,\n", - " train_dataloader=train_dataloader,\n", - " optimizers=optimizer,\n", + " model=model, # 模型基于 torch.nn.Module\n", + " train_dataloader=train_dataloader, # 加载模块基于 torch.utils.data.DataLoader \n", + " optimizers=optimizer, # 优化模块基于 torch.optim.*\n", "\t...\n", - "\tdriver=\"torch\",\n", - "\tdevice=0,\n", + "\tdriver=\"torch\", # 使用 pytorch 模块进行训练 \n", + "\tdevice='cuda', # 使用 GPU:0 显卡执行训练\n", "\t...\n", ")\n", "...\n", "evaluator = Evaluator(\n", - " model=model,\n", - " dataloaders=evaluate_dataloader,\n", - " metrics={'acc': Accuracy()} \n", + " model=model, # 模型基于 torch.nn.Module\n", + " dataloaders=evaluate_dataloader, # 加载模块基于 torch.utils.data.DataLoader\n", + " metrics={'acc': Accuracy()}, # 测评方法使用 fastNLP.core.metrics.Accuracy \n", " ...\n", - " driver=trainer.driver,\n", + " driver=trainer.driver, # 保持同 trainer 的 driver 一致\n", "\tdevice=None,\n", " ...\n", ")\n", @@ -88,7 +88,7 @@ "\n", "注:在同一脚本中,`Trainer`和`Evaluator`使用的`driver`应当保持一致\n", "\n", - "  一个不能违背的原则在于:**不要将多卡的`driver`前使用单卡的`driver`**(???),这样使用可能会带来很多意想不到的错误。" + "  一个不能违背的原则在于:**不要将多卡的`driver`前使用单卡的`driver`**(???),这样使用可能会带来很多意想不到的错误" ] }, { @@ -109,10 +109,10 @@ " optimizers=optimizer,\n", "\t...\n", "\tdriver=\"torch\",\n", - "\tdevice=0,\n", + "\tdevice='cuda',\n", "\t...\n", - " evaluate_dataloaders=evaluate_dataloader,\n", - " metrics={'acc': Accuracy()},\n", + " evaluate_dataloaders=evaluate_dataloader, # 传入参数 evaluator_dataloaders\n", + " metrics={'acc': Accuracy()}, # 传入参数 metrics\n", "\t...\n", ")\n", "```" @@ -123,7 +123,7 @@ "id": "0c9c7dda", "metadata": {}, "source": [ - "## 2. 使用 trainer 训练模型" + "## 2. 
argmax 模型的搭建实例" ] }, { "cell_type": "markdown", "id": "524ac200", "metadata": {}, "source": [ - "### 2.1 argmax 模型实例\n", + "### 2.1 trainer_step 和 evaluator_step\n", "\n", - "本节将通过训练`argmax`模型,简单介绍如何`Trainer`模块的使用方式\n", + "在`fastNLP 0.8`中,使用`torch.nn.Module`搭建需要训练的模型,在搭建模型过程中,除了\n", "\n", - "  使用`pytorch`定义`argmax`模型,输入一组固定维度的向量,输出其中数值最大的数的索引\n", - "\n", - "  除了添加`pytorch`要求的`forward`方法外,还需要添加 **`train_step`** 和 **`evaluate_step`** 这两个方法" ] }, { "cell_type": "code", "execution_count": null, "id": "5314482b", "metadata": { "pycharm": { "is_executing": true } }, "outputs": [], "source": [ "import torch\n", "import torch.nn as nn\n", "\n", "class ArgMaxModel(nn.Module):\n", " def __init__(self, num_labels, feature_dimension):\n", " super(ArgMaxModel, self).__init__()\n", " self.num_labels = num_labels\n", "\n", " self.linear1 = nn.Linear(in_features=feature_dimension, out_features=10)\n", " self.ac1 = nn.ReLU()\n", " self.linear2 = nn.Linear(in_features=10, out_features=10)\n", " self.ac2 = nn.ReLU()\n", " self.output = nn.Linear(in_features=10, out_features=num_labels)\n", " self.loss_fn = nn.CrossEntropyLoss()\n", "\n", " def forward(self, x):\n", " x = self.ac1(self.linear1(x))\n", " x = self.ac2(self.linear2(x))\n", " x = self.output(x)\n", " return x\n", "\n", " def train_step(self, x, y):\n", " x = self(x)\n", " return {\"loss\": self.loss_fn(x, y)}\n", "\n", " def evaluate_step(self, x, y):\n", " x = self(x)\n", " x = torch.max(x, dim=-1)[1]\n", " return {\"pred\": x, \"target\": y}" ] }, { "cell_type": "markdown", "id": "ca897322", "metadata": {}, "source": [ + "  添加`pytorch`要求的`forward`方法外,还需要添加 **`train_step`** 和 **`evaluate_step`** 这两个方法\n", + "***\n", + "```python\n", + "class Model(torch.nn.Module):\n", + " def __init__(self):\n", + " super(Model, self).__init__()\n", + " self.loss_fn = torch.nn.CrossEntropyLoss()\n", + " pass\n", "\n", " def forward(self, x):\n", + " pass\n", "\n", " def train_step(self, x, y):\n", + " pred = self(x)\n", + " return {\"loss\": self.loss_fn(pred, y)}\n", "\n", " def evaluate_step(self, x, y):\n", + " pred = self(x)\n", + " pred = torch.max(pred, dim=-1)[1]\n", + " return {\"pred\": pred, \"target\": y}\n", + "```\n", + "***\n", "在`fastNLP 0.8`中,**函数`train_step`是`Trainer`中参数`train_fn`的默认值**\n", "\n", - "  由于,在`Trainer`训练时,**`Trainer`通过参数`_train_fn_`对应的模型方法获得当前数据批次的损失值**\n", + "  由于,在`Trainer`训练时,**`Trainer`通过参数`train_fn`对应的模型方法获得当前数据批次的损失值**\n", "\n", "  因此,在`Trainer`训练时,`Trainer`首先会寻找模型是否定义了`train_step`这一方法\n", "\n", "    如果没有找到,那么`Trainer`会默认使用模型的`forward`函数来进行训练的前向传播过程\n", "\n", - "注:在`fastNLP 0.8`中,`Trainer`要求模型通过`train_step`来返回一个字典,将损失值作为`loss`的键值\n", + "注:在`fastNLP 0.8`中,**`Trainer`要求模型通过`train_step`来返回一个字典**,**满足如`{\"loss\": loss}`的形式**\n", "\n", "  此外,这里也可以通过传入`Trainer`的参数`output_mapping`来实现高度化的定制,具体请见这一note(???)\n", "\n", @@ -205,7 +175,11 @@ "\n", "  从用户角度,模型通过`evaluate_step`方法来返回一个字典,内容与传入`Evaluator`的`metrics`一致\n", "\n", - "" + "  从模块角度,该字典的键值和`metric`中的`update`函数的签名一致,这样的机制在传参时被称为“**参数匹配**”\n", + "\n", + "***\n", + "\n", + "![fastNLP 0.8 中,Trainer 和 Evaluator 的关系图](./figures/T0-fig-trainer-and-evaluator.png)" ] }, { "cell_type": "markdown", "id": "fb3272eb", "metadata": {}, "source": [ - "### 2.2 trainer 的参数匹配\n", + "### 2.2 trainer 和 evaluator 的参数匹配\n", + "\n", + "在`fastNLP 0.8`中,参数匹配涉及到两个方面:\n", + "\n", + "  一方面,**在模型的前向传播中**,**`dataloader`向`train_step`或`evaluate_step`函数传递`batch`**\n", + "\n", + "  另一方面,**在模型的评测过程中**,**`evaluate_dataloader`向`metric`的`update`函数传递`batch`**\n", "\n", - "`fastNLP 
0.8`中的参数匹配涉及到两个方面,一是在模型训练或者评测的前向传播过程中,如果从`dataloader`中出来一个`batch`的数据是一个字典,那么我们会查看模型的`train_step`和`evaluate_step`方法的参数签名,然后对于每一个参数,我们会根据其名字从 batch 这一字典中选择出对应的数据传入进去。例如在接下来的定义`Dataset`的部分,注意`ArgMaxDatset`的`__getitem__`方法,您可以通过在`Trainer`和`Evaluator`中设置参数 `model_wo_auto_param_call`来关闭这一行为。当您关闭了这一行为后,我们会将`batch`直接传给您的`train_step`、`evaluate_step`或者 `forward`函数。\n", + "对于前者,在`Trainer`和`Evaluator`中的参数`model_wo_auto_param_call`被设置为`False`时\n", "\n", - "二是在传入`Trainer`或者`Evaluator metrics`后,我们会在需要评测的时间点主动调用`metrics`来对`evaluate_dataloaders`进行评测,这一功能主要就是通过对`metrics`的`update`方法和一个`batch`的数据进行参数评测实现的。首先需要明确的是一个 metric 的计算通常分为 `update` 和 `get_metric`两步,其中`update`表示更新一个`batch`的评测数据,`get_metric` 表示根据已经得到的评测数据计算出最终的评测值,例如对于 `Accuracy`来说,其在`update`的时候会更新一个`batch`计算正确的数量 right_num 和计算错误的数量 total_num,最终在 `get_metric` 时返回评测值`right_num / total_num`。\n", + "    **`fastNLP 0.8`要求`dataloader`生成的每个`batch`**,**满足如`{\"x\": x, \"y\": y}`的形式**\n", + "\n", + "  同时,`fastNLP 0.8`会查看模型的`train_step`和`evaluate_step`方法的参数签名,并为对应参数传入对应数值\n", + "\n", + "    **字典形式的定义**,**对应在`Dataset`定义的`__getitem__`方法中**,例如下方的`ArgMaxDataset`\n", + "\n", + "  而在`Trainer`和`Evaluator`中的参数`model_wo_auto_param_call`被设置为`True`时\n", + "\n", + "    `fastNLP 0.8`会将`batch`直接传给模型的`train_step`、`evaluate_step`或`forward`函数\n", + "***\n", + "```python\n", + "class Dataset(torch.utils.data.Dataset):\n", + " def __init__(self, x, y):\n", + " self.x = x\n", + " self.y = y\n", + "\n", + " def __len__(self):\n", + " return len(self.x)\n", + "\n", + " def __getitem__(self, item):\n", + " return {\"x\": self.x[item], \"y\": self.y[item]}\n", + "```\n", + "***\n", + "对于后者,首先要明确,在`Trainer`和`Evaluator`中,`metrics`的计算分为`update`和`get_metric`两步\n", "\n", - "因为`fastNLP 0.8`的`metrics`是自动计算的(只需要传给`Trainer`或者`Evaluator`),因此其一定依赖于参数匹配。对于从`evaluate_dataloader`中生成的一个`batch`的数据,我们会查看传给 `Trainer`(最终是传给`Evaluator`)和`Evaluator`的每一个`metric`,然后查看其`update`函数的函数签名,然后根据每一个参数的名字从`batch`字典中选择出对应的数据传入进去。" + "    **`update`函数**,**针对一个`batch`的预测结果**,计算其累计的评价指标\n", + "\n", + "    **`get_metric`函数**,**统计`update`函数累计的评价指标**,来计算最终的评价结果\n", + "\n", + "  例如对于`Accuracy`来说,`update`函数会更新一个`batch`中预测正确的数量`right_num`和样本总数`total_num`\n", + "\n", + "    而`get_metric`函数则会返回所有`batch`的评测值`right_num / total_num`\n", + "\n", + "  在此基础上,**`fastNLP 0.8`要求`evaluate_dataloader`生成的每个`batch`传递给对应的`metric`**\n", + "\n", + "    **以`{\"pred\": y_pred, \"target\": y_true}`的形式**,对应其`update`函数的函数签名" ] }, { "cell_type": "markdown", "id": "f62b7bb1", "metadata": {}, "source": [ - "### 2.3 trainer的实际使用\n", + "### 2.3 一个实际案例:argmax 模型\n", "\n", - "接下来我们创建用于训练的 dataset,其接受三个参数:数据维度、数据量和随机数种子,生成指定数量的维度为 `feature_dimension` 向量,而每一个向量的标签就是该向量中最大值的索引。" + "下文将通过训练`argmax`模型,简单介绍如何使用`Trainer`模块\n", + "\n", + "  首先,使用`torch.nn.Module`定义`argmax`模型,目标是输入一组固定维度的向量,输出其中数值最大的数的索引" ] }, { "cell_type": "code", "execution_count": 1, "id": "5314482b", "metadata": { "pycharm": { "is_executing": false } }, "outputs": [], "source": [ "import torch\n", "import torch.nn as nn\n", "\n", "class ArgMaxModel(nn.Module):\n", " def __init__(self, num_labels, feature_dimension):\n", " super(ArgMaxModel, self).__init__()\n", " self.num_labels = num_labels\n", "\n", " self.linear1 = nn.Linear(in_features=feature_dimension, out_features=10)\n", " self.ac1 = nn.ReLU()\n", " self.linear2 = nn.Linear(in_features=10, out_features=10)\n", " self.ac2 = nn.ReLU()\n", " self.output = nn.Linear(in_features=10, out_features=num_labels)\n", " self.loss_fn = nn.CrossEntropyLoss()\n", "\n", " def forward(self, x):\n", 
+ " pred = self.ac1(self.linear1(x))\n", + " pred = self.ac2(self.linear2(pred))\n", + " pred = self.output(pred)\n", + " return pred\n", + "\n", + " def train_step(self, x, y):\n", + " pred = self(x)\n", + " return {\"loss\": self.loss_fn(pred, y)}\n", + "\n", + " def evaluate_step(self, x, y):\n", + " pred = self(x)\n", + " pred = torch.max(pred, dim=-1)[1]\n", + " return {\"pred\": pred, \"target\": y}" + ] + }, + { + "cell_type": "markdown", + "id": "71f3fa6b", + "metadata": {}, + "source": [ + "  接着,使用`torch.utils.data.Dataset`定义`ArgMaxDataset`数据集\n", + "\n", + "    数据集包含三个参数:维度`feature_dimension`、数据量`data_num`和随机种子`seed`\n", + "\n", + "    数据及初始化是,自动生成指定维度的向量,并为每个向量标注出其中最大值的索引作为预测标签" ] }, { @@ -245,7 +314,7 @@ "source": [ "from torch.utils.data import Dataset\n", "\n", - "class ArgMaxDatset(Dataset):\n", + "class ArgMaxDataset(Dataset):\n", " def __init__(self, feature_dimension, data_num=1000, seed=0):\n", " self.num_labels = feature_dimension\n", " self.feature_dimension = feature_dimension\n", @@ -269,7 +338,9 @@ "id": "2cb96332", "metadata": {}, "source": [ - "现在准备好数据和模型。" + "  然后,根据`ArgMaxModel`类初始化模型实例,保持输入维度`feature_dimension`和输出标签数量`num_labels`一致\n", + "\n", + "    再根据`ArgMaxDataset`类初始化两个数据集实例,分别用来模型测试和模型评测,数据量各1000笔" ] }, { @@ -283,16 +354,10 @@ }, "outputs": [], "source": [ - "from torch.utils.data import DataLoader\n", - "\n", - "train_dataset = ArgMaxDatset(feature_dimension=10, data_num=1000)\n", - "evaluate_dataset = ArgMaxDatset(feature_dimension=10, data_num=100)\n", - "\n", - "train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)\n", - "evaluate_dataloader = DataLoader(evaluate_dataset, batch_size=8)\n", + "model = ArgMaxModel(num_labels=10, feature_dimension=10)\n", "\n", - "# num_labels 设置为 10,与 feature_dimension 保持一致,因为我们是预测十个位置中哪一个的概率最大。\n", - "model = ArgMaxModel(num_labels=10, feature_dimension=10)" + "train_dataset = ArgMaxDataset(feature_dimension=10, data_num=1000)\n", + "evaluate_dataset = ArgMaxDataset(feature_dimension=10, data_num=100)" ] }, { @@ -300,12 +365,33 @@ "id": "4e7d25ee", "metadata": {}, "source": [ - "将优化器也定义好。" + "  此外,使用`torch.utils.data.DataLoader`初始化两个数据加载模块,批量大小同为8,分别用于训练和测评" ] }, { "cell_type": "code", "execution_count": 4, + "id": "363b5b09", + "metadata": {}, + "outputs": [], + "source": [ + "from torch.utils.data import DataLoader\n", + "\n", + "train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)\n", + "evaluate_dataloader = DataLoader(evaluate_dataset, batch_size=8)" + ] + }, + { + "cell_type": "markdown", + "id": "c8d4443f", + "metadata": {}, + "source": [ + "  最后,使用`torch.optim.SGD`初始化一个优化模块,基于随机梯度下降法" + ] + }, + { + "cell_type": "code", + "execution_count": 5, "id": "dc28a2d9", "metadata": { "pycharm": { @@ -321,15 +407,33 @@ }, { "cell_type": "markdown", - "id": "4f1fba81", + "id": "eb8ca6cf", + "metadata": {}, + "source": [ + "## 3. 使用 fastNLP 0.8 训练 argmax 模型\n", + "\n", + "### 3.1 trainer 外部初始化的 evaluator" + ] + }, + { + "cell_type": "markdown", + "id": "55145553", "metadata": {}, "source": [ - "现在万事俱备,开始使用 Trainer 进行训练!" 
+ "通过从`fastNLP`库中导入`Trainer`类,初始化`trainer`实例,对模型进行训练\n", + "\n", + "  需要导入预先定义好的模型`model`、对应的数据加载模块`train_dataloader`、优化模块`optimizer`\n", + "\n", + "  通过`progress_bar`设定进度条格式,默认为`\"auto\"`,此外还有`\"rich\"`、`\"raw\"`和`None`\n", + "\n", + "    但对于`\"auto\"`和`\"rich\"`格式,训练结束后进度条会不显示(???)\n", + "\n", + "  通过`n_epochs`设定优化迭代轮数,默认为20;全部`Trainer`的全部变量与函数可以通过`dir(trainer)`查询" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "b51b7a2d", "metadata": { "pycharm": { @@ -349,167 +453,20 @@ }, "metadata": {}, "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "['__annotations__',\n", - " '__class__',\n", - " '__delattr__',\n", - " '__dict__',\n", - " '__dir__',\n", - " '__doc__',\n", - " '__eq__',\n", - " '__format__',\n", - " '__ge__',\n", - " '__getattribute__',\n", - " '__gt__',\n", - " '__hash__',\n", - " '__init__',\n", - " '__init_subclass__',\n", - " '__le__',\n", - " '__lt__',\n", - " '__module__',\n", - " '__ne__',\n", - " '__new__',\n", - " '__reduce__',\n", - " '__reduce_ex__',\n", - " '__repr__',\n", - " '__setattr__',\n", - " '__sizeof__',\n", - " '__str__',\n", - " '__subclasshook__',\n", - " '__weakref__',\n", - " '_check_callback_called_legality',\n", - " '_check_train_batch_loop_legality',\n", - " '_custom_callbacks',\n", - " '_driver',\n", - " '_evaluate_dataloaders',\n", - " '_fetch_matched_fn_callbacks',\n", - " '_set_num_eval_batch_per_dl',\n", - " '_train_batch_loop',\n", - " '_train_dataloader',\n", - " '_train_step',\n", - " '_train_step_signature_fn',\n", - " 'accumulation_steps',\n", - " 'add_callback_fn',\n", - " 'backward',\n", - " 'batch_idx_in_epoch',\n", - " 'batch_step_fn',\n", - " 'callback_manager',\n", - " 'check_batch_step_fn',\n", - " 'cur_epoch_idx',\n", - " 'data_device',\n", - " 'dataloader',\n", - " 'device',\n", - " 'driver',\n", - " 'driver_name',\n", - " 'epoch_validate',\n", - " 'evaluate_batch_step_fn',\n", - " 'evaluate_dataloaders',\n", - " 'evaluate_every',\n", - " 'evaluate_fn',\n", - " 'evaluator',\n", - " 'extract_loss_from_outputs',\n", - " 'fp16',\n", - " 'get_no_sync_context',\n", - " 'global_forward_batches',\n", - " 'has_checked_train_batch_loop',\n", - " 'input_mapping',\n", - " 'kwargs',\n", - " 'larger_better',\n", - " 'load',\n", - " 'load_model',\n", - " 'marker',\n", - " 'metrics',\n", - " 'model',\n", - " 'model_device',\n", - " 'monitor',\n", - " 'move_data_to_device',\n", - " 'n_epochs',\n", - " 'num_batches_per_epoch',\n", - " 'on',\n", - " 'on_after_backward',\n", - " 'on_after_optimizers_step',\n", - " 'on_after_trainer_initialized',\n", - " 'on_after_zero_grad',\n", - " 'on_before_backward',\n", - " 'on_before_optimizers_step',\n", - " 'on_before_zero_grad',\n", - " 'on_exception',\n", - " 'on_fetch_data_begin',\n", - " 'on_fetch_data_end',\n", - " 'on_load_checkpoint',\n", - " 'on_load_model',\n", - " 'on_sanity_check_begin',\n", - " 'on_sanity_check_end',\n", - " 'on_save_checkpoint',\n", - " 'on_save_model',\n", - " 'on_train_batch_begin',\n", - " 'on_train_batch_end',\n", - " 'on_train_begin',\n", - " 'on_train_end',\n", - " 'on_train_epoch_begin',\n", - " 'on_train_epoch_end',\n", - " 'on_validate_begin',\n", - " 'on_validate_end',\n", - " 'optimizers',\n", - " 'output_mapping',\n", - " 'run',\n", - " 'save',\n", - " 'save_model',\n", - " 'set_grad_to_none',\n", - " 'state',\n", - " 'step',\n", - " 'step_validate',\n", - " 'total_batches',\n", - " 'train_batch_loop',\n", - " 'train_dataloader',\n", - " 'train_fn',\n", - " 'train_step',\n", - " 'trainer_state',\n", - " 
'zero_grad']" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ "from fastNLP import Trainer\n", "\n", - "# 定义一个 Trainer\n", "trainer = Trainer(\n", " model=model,\n", - " driver=\"torch\", # 使用 pytorch 进行训练\n", - " device=0, # 使用 GPU:0\n", + " driver=\"torch\",\n", + " device='cuda',\n", " train_dataloader=train_dataloader,\n", " optimizers=optimizer,\n", - " n_epochs=10, # 训练 40 个 epoch\n", - " progress_bar=\"rich\"\n", - ")\n", - "dir(trainer)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "f8fe9c32", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "FullArgSpec(args=['self', 'num_train_batch_per_epoch', 'num_eval_batch_per_dl', 'num_eval_sanity_batch', 'resume_from', 'resume_training', 'catch_KeyboardInterrupt'], varargs=None, varkw=None, defaults=(-1, -1, 2, None, True, None), kwonlyargs=[], kwonlydefaults=None, annotations={'num_train_batch_per_epoch': , 'num_eval_batch_per_dl': , 'num_eval_sanity_batch': , 'resume_from': , 'resume_training': })\n" - ] - } - ], - "source": [ - "import inspect \n", - "\n", - "print(inspect.getfullargspec(trainer.run))" + " n_epochs=10, # 设定迭代轮数 \n", + " progress_bar=\"auto\" # 设定进度条格式\n", + ")" ] }, { @@ -517,16 +474,20 @@ "id": "6e202d6e", "metadata": {}, "source": [ - "没有问题,那么开始真正的训练!" + "通过使用`Trainer`类的`run`函数,进行训练\n", + "\n", + "  其中,可以通过参数`num_train_batch_per_epoch`决定每个`epoch`运行多少个`batch`后停止,默认全部\n", + "\n", + "  此外,可以通过`inspect.getfullargspec(trainer.run)`查询`run`函数的全部参数列表" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "id": "ba047ead", "metadata": { "pycharm": { - "is_executing": false + "is_executing": true } }, "outputs": [ @@ -585,29 +546,27 @@ "trainer.run()" ] }, - { - "cell_type": "markdown", - "id": "eb8ca6cf", - "metadata": {}, - "source": [ - "## 3. 
使用 evaluator 评测模型" - ] - }, { "cell_type": "markdown", "id": "c16c5fa4", "metadata": {}, "source": [ - "模型训练好了我们开始使用 Evaluator 进行评测,查看效果怎么样吧。" + "通过从`fastNLP`库中导入`Evaluator`类,初始化`evaluator`实例,对模型进行评测\n", + "\n", + "  需要导入预先定义好的模型`model`、对应的数据加载模块`evaluate_dataloader`\n", + "\n", + "  需要注意的是评测方法`metrics`,设定为形如`{'acc': fastNLP.core.metrics.Accuracy()}`的字典\n", + "\n", + "  类似地,也可以通过`progress_bar`限定进度条格式,默认为`\"auto\"`" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "id": "1c6b6b36", "metadata": { "pycharm": { - "is_executing": false + "is_executing": true } }, "outputs": [], @@ -617,100 +576,32 @@ "\n", "evaluator = Evaluator(\n", " model=model,\n", - " driver=trainer.driver, # 使用 trainer 已经启动的 driver;\n", + " driver=trainer.driver, # 需要使用 trainer 已经启动的 driver\n", " device=None,\n", " dataloaders=evaluate_dataloader,\n", - " metrics={'acc': Accuracy()} # 注意这里一定得是一个字典;\n", + " metrics={'acc': Accuracy()} # 需要严格使用此种形式的字典\n", ")" ] }, { - "cell_type": "code", - "execution_count": 11, - "id": "257061df", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['__annotations__',\n", - " '__class__',\n", - " '__delattr__',\n", - " '__dict__',\n", - " '__dir__',\n", - " '__doc__',\n", - " '__eq__',\n", - " '__format__',\n", - " '__ge__',\n", - " '__getattribute__',\n", - " '__gt__',\n", - " '__hash__',\n", - " '__init__',\n", - " '__init_subclass__',\n", - " '__le__',\n", - " '__lt__',\n", - " '__module__',\n", - " '__ne__',\n", - " '__new__',\n", - " '__reduce__',\n", - " '__reduce_ex__',\n", - " '__repr__',\n", - " '__setattr__',\n", - " '__sizeof__',\n", - " '__str__',\n", - " '__subclasshook__',\n", - " '__weakref__',\n", - " '_dist_sampler',\n", - " '_evaluate_batch_loop',\n", - " '_evaluate_step',\n", - " '_evaluate_step_signature_fn',\n", - " '_metric_wrapper',\n", - " '_metrics',\n", - " 'dataloaders',\n", - " 'device',\n", - " 'driver',\n", - " 'evaluate_batch_loop',\n", - " 'evaluate_batch_step_fn',\n", - " 'evaluate_fn',\n", - " 'evaluate_step',\n", - " 'finally_progress_bar',\n", - " 'get_dataloader_metric',\n", - " 'input_mapping',\n", - " 'metrics',\n", - " 'metrics_wrapper',\n", - " 'model',\n", - " 'model_use_eval_mode',\n", - " 'move_data_to_device',\n", - " 'output_mapping',\n", - " 'progress_bar',\n", - " 'remove_progress_bar',\n", - " 'reset',\n", - " 'run',\n", - " 'separator',\n", - " 'start_progress_bar',\n", - " 'update',\n", - " 'update_progress_bar',\n", - " 'verbose']" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "cell_type": "markdown", + "id": "8157bb9b", + "metadata": {}, "source": [ - "dir(evaluator)" + "通过使用`Evaluator`类的`run`函数,进行训练\n", + "\n", + "  其中,可以通过参数`num_eval_batch_per_dl`决定每个`evaluate_dataloader`运行多少个`batch`停止,默认全部\n", + "\n", + "  最终,输出形如`{'acc#acc': acc}`的字典,中间的进度条会在运行结束后丢弃掉(???)" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 9, "id": "f7cb0165", "metadata": { "pycharm": { - "is_executing": false + "is_executing": true } }, "outputs": [ @@ -750,11 +641,11 @@ { "data": { "text/html": [ - "
{'acc#acc': 0.3}\n",
+       "
{'acc#acc': 0.43}\n",
        "
\n" ], "text/plain": [ - "\u001b[1m{\u001b[0m\u001b[32m'acc#acc'\u001b[0m: \u001b[1;36m0.3\u001b[0m\u001b[1m}\u001b[0m\n" + "\u001b[1m{\u001b[0m\u001b[32m'acc#acc'\u001b[0m: \u001b[1;36m0.43\u001b[0m\u001b[1m}\u001b[0m\n" ] }, "metadata": {}, @@ -763,10 +654,10 @@ { "data": { "text/plain": [ - "{'acc#acc': 0.3}" + "{'acc#acc': 0.43}" ] }, - "execution_count": 12, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -780,39 +671,37 @@ "id": "dd9f68fa", "metadata": {}, "source": [ - "## 4. 在 trainer 中加入 metric 来自动评测;" - ] - }, - { - "cell_type": "markdown", - "id": "ca97c9a4", - "metadata": {}, - "source": [ - "现在我们尝试在训练过程中进行评测。" + "### 3.2 trainer 内部初始化的 evaluator \n", + "\n", + "通过在初始化`trainer`实例时加入`evaluate_dataloaders`和`metrics`,可以实现在训练过程中进行评测\n", + "\n", + "  通过`progress_bar`同时设定训练和评估进度条格式,训练结束后进度条会不显示(???)\n", + "\n", + "  **通过`evaluate_every`设定评估频率**,可以为负数、正数或者函数:\n", + "\n", + "    **为负数时**,**表示每隔几个`epoch`评估一次**;**为正数时**,**则表示每隔几个`batch`评估一次**" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 10, "id": "183c7d19", "metadata": { "pycharm": { - "is_executing": false + "is_executing": true } }, "outputs": [], "source": [ - "# 重新定义一个 Trainer\n", - "\n", "trainer = Trainer(\n", " model=model,\n", - " driver=trainer.driver, # 因为我们是在同一脚本中,因此这里的 driver 同样需要重用;\n", + " driver=trainer.driver, # 因为是在同个脚本中,这里的 driver 同样需要重用\n", " train_dataloader=train_dataloader,\n", " evaluate_dataloaders=evaluate_dataloader,\n", " metrics={'acc': Accuracy()},\n", " optimizers=optimizer,\n", - " n_epochs=10, # 训练 40 个 epoch;\n", - " evaluate_every=-1, # 表示每一个 epoch 的结束会进行 evaluate;\n", + " n_epochs=10, \n", + " evaluate_every=-1, # 表示每个 epoch 的结束进行评估\n", ")" ] }, @@ -821,16 +710,18 @@ "id": "714cc404", "metadata": {}, "source": [ - "再次训练。" + "通过使用`Trainer`类的`run`函数,进行训练\n", + "\n", + "  还可以通过参数`num_eval_sanity_batch`决定每次训练前运行多少个`evaluate_batch`进行评测,默认为2" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, "id": "2e4daa2c", "metadata": { "pycharm": { - "is_executing": false + "is_executing": true } }, "outputs": [ @@ -884,96 +775,6 @@ "source": [ "trainer.run()" ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "eabda5eb", - "metadata": {}, - "outputs": [], - "source": [ - "evaluator = Evaluator(\n", - " model=model,\n", - " driver=trainer.driver, # 使用 trainer 已经启动的 driver;\n", - " dataloaders=evaluate_dataloader,\n", - " metrics={'acc': Accuracy()} # 注意这里一定得是一个字典;\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "a310d157", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "
\n",
-       "
\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
{'acc#acc': 0.5}\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1m{\u001b[0m\u001b[32m'acc#acc'\u001b[0m: \u001b[1;36m0.5\u001b[0m\u001b[1m}\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "{'acc#acc': 0.5}" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "evaluator.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f1ef78f0", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/tutorials/figures/T0-fig-trainer-and-evaluator.png b/tutorials/figures/T0-fig-trainer-and-evaluator.png new file mode 100644 index 00000000..a98ab83b Binary files /dev/null and b/tutorials/figures/T0-fig-trainer-and-evaluator.png differ