|
@@ -50,24 +50,24 @@ |
|
|
"\n", |
|
|
"\n", |
|
|
"```python\n", |
|
|
"```python\n", |
|
|
"trainer = Trainer(\n", |
|
|
"trainer = Trainer(\n", |
|
|
" model=model, # 模型基于 torch.nn.Module\n", |
|
|
|
|
|
" train_dataloader=train_dataloader, # 加载模块基于 torch.utils.data.DataLoader \n", |
|
|
|
|
|
" optimizers=optimizer, # 优化模块基于 torch.optim.*\n", |
|
|
|
|
|
"\t...\n", |
|
|
|
|
|
"\tdriver=\"torch\", # 使用 pytorch 模块进行训练 \n", |
|
|
|
|
|
"\tdevice='cuda', # 使用 GPU:0 显卡执行训练\n", |
|
|
|
|
|
"\t...\n", |
|
|
|
|
|
")\n", |
|
|
|
|
|
|
|
|
" model=model, # 模型基于 torch.nn.Module\n", |
|
|
|
|
|
" train_dataloader=train_dataloader, # 加载模块基于 torch.utils.data.DataLoader \n", |
|
|
|
|
|
" optimizers=optimizer, # 优化模块基于 torch.optim.*\n", |
|
|
|
|
|
" ...\n", |
|
|
|
|
|
" driver=\"torch\", # 使用 pytorch 模块进行训练 \n", |
|
|
|
|
|
" device='cuda', # 使用 GPU:0 显卡执行训练\n", |
|
|
|
|
|
" ...\n", |
|
|
|
|
|
" )\n", |
|
|
"...\n", |
|
|
"...\n", |
|
|
"evaluator = Evaluator(\n", |
|
|
"evaluator = Evaluator(\n", |
|
|
" model=model, # 模型基于 torch.nn.Module\n", |
|
|
|
|
|
" dataloaders=evaluate_dataloader, # 加载模块基于 torch.utils.data.DataLoader\n", |
|
|
|
|
|
" metrics={'acc': Accuracy()}, # 测评方法使用 fastNLP.core.metrics.Accuracy \n", |
|
|
|
|
|
" ...\n", |
|
|
|
|
|
" driver=trainer.driver, # 保持同 trainer 的 driver 一致\n", |
|
|
|
|
|
"\tdevice=None,\n", |
|
|
|
|
|
" ...\n", |
|
|
|
|
|
")\n", |
|
|
|
|
|
|
|
|
" model=model, # 模型基于 torch.nn.Module\n", |
|
|
|
|
|
" dataloaders=evaluate_dataloader, # 加载模块基于 torch.utils.data.DataLoader\n", |
|
|
|
|
|
" metrics={'acc': Accuracy()}, # 测评方法使用 fastNLP.core.metrics.Accuracy \n", |
|
|
|
|
|
" ...\n", |
|
|
|
|
|
" driver=trainer.driver, # 保持同 trainer 的 driver 一致\n", |
|
|
|
|
|
" device=None,\n", |
|
|
|
|
|
" ...\n", |
|
|
|
|
|
" )\n", |
|
|
"```" |
|
|
"```" |
|
|
] |
|
|
] |
|
|
}, |
|
|
}, |
|
@@ -84,7 +84,7 @@ |
|
|
"\n", |
|
|
"\n", |
|
|
"在`fastNLP 0.8`中,**`Trainer`和`Evaluator`都依赖于具体的`driver`来完成整体的工作流程**\n", |
|
|
"在`fastNLP 0.8`中,**`Trainer`和`Evaluator`都依赖于具体的`driver`来完成整体的工作流程**\n", |
|
|
"\n", |
|
|
"\n", |
|
|
"  具体`driver`与`Trainer`以及`Evaluator`之间的关系请参考`fastNLP 0.8`的框架设计\n", |
|
|
|
|
|
|
|
|
"  具体`driver`与`Trainer`以及`Evaluator`之间的关系请参考之后`tutorial 4`中的详细介绍\n", |
|
|
"\n", |
|
|
"\n", |
|
|
"注:这里给出一条建议:**在同一脚本中**,**所有的`Trainer`和`Evaluator`使用的`driver`应当保持一致**\n", |
|
|
"注:这里给出一条建议:**在同一脚本中**,**所有的`Trainer`和`Evaluator`使用的`driver`应当保持一致**\n", |
|
|
"\n", |
|
|
"\n", |
|
@@ -106,17 +106,17 @@ |
|
|
"\n", |
|
|
"\n", |
|
|
"```python\n", |
|
|
"```python\n", |
|
|
"trainer = Trainer(\n", |
|
|
"trainer = Trainer(\n", |
|
|
" model=model,\n", |
|
|
|
|
|
" train_dataloader=train_dataloader,\n", |
|
|
|
|
|
" optimizers=optimizer,\n", |
|
|
|
|
|
"\t...\n", |
|
|
|
|
|
"\tdriver=\"torch\",\n", |
|
|
|
|
|
"\tdevice='cuda',\n", |
|
|
|
|
|
"\t...\n", |
|
|
|
|
|
" evaluate_dataloaders=evaluate_dataloader, # 传入参数 evaluate_dataloaders\n", |
|
|
|
|
|
" metrics={'acc': Accuracy()}, # 传入参数 metrics\n", |
|
|
|
|
|
"\t...\n", |
|
|
|
|
|
")\n", |
|
|
|
|
|
|
|
|
" model=model,\n", |
|
|
|
|
|
" train_dataloader=train_dataloader,\n", |
|
|
|
|
|
" optimizers=optimizer,\n", |
|
|
|
|
|
" ...\n", |
|
|
|
|
|
" driver=\"torch\",\n", |
|
|
|
|
|
" device='cuda',\n", |
|
|
|
|
|
" ...\n", |
|
|
|
|
|
" evaluate_dataloaders=evaluate_dataloader, # 传入参数 evaluate_dataloaders\n", |
|
|
|
|
|
" metrics={'acc': Accuracy()}, # 传入参数 metrics\n", |
|
|
|
|
|
" ...\n", |
|
|
|
|
|
" )\n", |
|
|
"```" |
|
|
"```" |
|
|
] |
|
|
] |
|
|
}, |
|
|
}, |
|
@@ -570,7 +570,7 @@ |
|
|
"outputs": [], |
|
|
"outputs": [], |
|
|
"source": [ |
|
|
"source": [ |
|
|
"from fastNLP import Evaluator\n", |
|
|
"from fastNLP import Evaluator\n", |
|
|
"from fastNLP.core.metrics import Accuracy\n", |
|
|
|
|
|
|
|
|
"from fastNLP import Accuracy\n", |
|
|
"\n", |
|
|
"\n", |
|
|
"evaluator = Evaluator(\n", |
|
|
"evaluator = Evaluator(\n", |
|
|
" model=model,\n", |
|
|
" model=model,\n", |
|
@@ -1310,219 +1310,6 @@ |
|
|
"trainer.evaluator.run()" |
|
|
"trainer.evaluator.run()" |
|
|
] |
|
|
] |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
|
|
|
"cell_type": "code", |
|
|
|
|
|
"execution_count": 13, |
|
|
|
|
|
"id": "db784d5b", |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"data": { |
|
|
|
|
|
"text/plain": [ |
|
|
|
|
|
"['__annotations__',\n", |
|
|
|
|
|
" '__class__',\n", |
|
|
|
|
|
" '__delattr__',\n", |
|
|
|
|
|
" '__dict__',\n", |
|
|
|
|
|
" '__dir__',\n", |
|
|
|
|
|
" '__doc__',\n", |
|
|
|
|
|
" '__eq__',\n", |
|
|
|
|
|
" '__format__',\n", |
|
|
|
|
|
" '__ge__',\n", |
|
|
|
|
|
" '__getattribute__',\n", |
|
|
|
|
|
" '__gt__',\n", |
|
|
|
|
|
" '__hash__',\n", |
|
|
|
|
|
" '__init__',\n", |
|
|
|
|
|
" '__init_subclass__',\n", |
|
|
|
|
|
" '__le__',\n", |
|
|
|
|
|
" '__lt__',\n", |
|
|
|
|
|
" '__module__',\n", |
|
|
|
|
|
" '__ne__',\n", |
|
|
|
|
|
" '__new__',\n", |
|
|
|
|
|
" '__reduce__',\n", |
|
|
|
|
|
" '__reduce_ex__',\n", |
|
|
|
|
|
" '__repr__',\n", |
|
|
|
|
|
" '__setattr__',\n", |
|
|
|
|
|
" '__sizeof__',\n", |
|
|
|
|
|
" '__str__',\n", |
|
|
|
|
|
" '__subclasshook__',\n", |
|
|
|
|
|
" '__weakref__',\n", |
|
|
|
|
|
" '_check_callback_called_legality',\n", |
|
|
|
|
|
" '_check_train_batch_loop_legality',\n", |
|
|
|
|
|
" '_custom_callbacks',\n", |
|
|
|
|
|
" '_driver',\n", |
|
|
|
|
|
" '_evaluate_dataloaders',\n", |
|
|
|
|
|
" '_fetch_matched_fn_callbacks',\n", |
|
|
|
|
|
" '_set_num_eval_batch_per_dl',\n", |
|
|
|
|
|
" '_train_batch_loop',\n", |
|
|
|
|
|
" '_train_dataloader',\n", |
|
|
|
|
|
" '_train_step',\n", |
|
|
|
|
|
" '_train_step_signature_fn',\n", |
|
|
|
|
|
" 'accumulation_steps',\n", |
|
|
|
|
|
" 'add_callback_fn',\n", |
|
|
|
|
|
" 'backward',\n", |
|
|
|
|
|
" 'batch_idx_in_epoch',\n", |
|
|
|
|
|
" 'batch_step_fn',\n", |
|
|
|
|
|
" 'callback_manager',\n", |
|
|
|
|
|
" 'check_batch_step_fn',\n", |
|
|
|
|
|
" 'cur_epoch_idx',\n", |
|
|
|
|
|
" 'data_device',\n", |
|
|
|
|
|
" 'dataloader',\n", |
|
|
|
|
|
" 'device',\n", |
|
|
|
|
|
" 'driver',\n", |
|
|
|
|
|
" 'driver_name',\n", |
|
|
|
|
|
" 'epoch_evaluate',\n", |
|
|
|
|
|
" 'evaluate_batch_step_fn',\n", |
|
|
|
|
|
" 'evaluate_dataloaders',\n", |
|
|
|
|
|
" 'evaluate_every',\n", |
|
|
|
|
|
" 'evaluate_fn',\n", |
|
|
|
|
|
" 'evaluator',\n", |
|
|
|
|
|
" 'extract_loss_from_outputs',\n", |
|
|
|
|
|
" 'fp16',\n", |
|
|
|
|
|
" 'get_no_sync_context',\n", |
|
|
|
|
|
" 'global_forward_batches',\n", |
|
|
|
|
|
" 'has_checked_train_batch_loop',\n", |
|
|
|
|
|
" 'input_mapping',\n", |
|
|
|
|
|
" 'kwargs',\n", |
|
|
|
|
|
" 'larger_better',\n", |
|
|
|
|
|
" 'load_checkpoint',\n", |
|
|
|
|
|
" 'load_model',\n", |
|
|
|
|
|
" 'marker',\n", |
|
|
|
|
|
" 'metrics',\n", |
|
|
|
|
|
" 'model',\n", |
|
|
|
|
|
" 'model_device',\n", |
|
|
|
|
|
" 'monitor',\n", |
|
|
|
|
|
" 'move_data_to_device',\n", |
|
|
|
|
|
" 'n_epochs',\n", |
|
|
|
|
|
" 'num_batches_per_epoch',\n", |
|
|
|
|
|
" 'on',\n", |
|
|
|
|
|
" 'on_after_backward',\n", |
|
|
|
|
|
" 'on_after_optimizers_step',\n", |
|
|
|
|
|
" 'on_after_trainer_initialized',\n", |
|
|
|
|
|
" 'on_after_zero_grad',\n", |
|
|
|
|
|
" 'on_before_backward',\n", |
|
|
|
|
|
" 'on_before_optimizers_step',\n", |
|
|
|
|
|
" 'on_before_zero_grad',\n", |
|
|
|
|
|
" 'on_evaluate_begin',\n", |
|
|
|
|
|
" 'on_evaluate_end',\n", |
|
|
|
|
|
" 'on_exception',\n", |
|
|
|
|
|
" 'on_fetch_data_begin',\n", |
|
|
|
|
|
" 'on_fetch_data_end',\n", |
|
|
|
|
|
" 'on_load_checkpoint',\n", |
|
|
|
|
|
" 'on_load_model',\n", |
|
|
|
|
|
" 'on_sanity_check_begin',\n", |
|
|
|
|
|
" 'on_sanity_check_end',\n", |
|
|
|
|
|
" 'on_save_checkpoint',\n", |
|
|
|
|
|
" 'on_save_model',\n", |
|
|
|
|
|
" 'on_train_batch_begin',\n", |
|
|
|
|
|
" 'on_train_batch_end',\n", |
|
|
|
|
|
" 'on_train_begin',\n", |
|
|
|
|
|
" 'on_train_end',\n", |
|
|
|
|
|
" 'on_train_epoch_begin',\n", |
|
|
|
|
|
" 'on_train_epoch_end',\n", |
|
|
|
|
|
" 'optimizers',\n", |
|
|
|
|
|
" 'output_mapping',\n", |
|
|
|
|
|
" 'progress_bar',\n", |
|
|
|
|
|
" 'run',\n", |
|
|
|
|
|
" 'run_evaluate',\n", |
|
|
|
|
|
" 'save_checkpoint',\n", |
|
|
|
|
|
" 'save_model',\n", |
|
|
|
|
|
" 'start_batch_idx_in_epoch',\n", |
|
|
|
|
|
" 'state',\n", |
|
|
|
|
|
" 'step',\n", |
|
|
|
|
|
" 'step_evaluate',\n", |
|
|
|
|
|
" 'total_batches',\n", |
|
|
|
|
|
" 'train_batch_loop',\n", |
|
|
|
|
|
" 'train_dataloader',\n", |
|
|
|
|
|
" 'train_fn',\n", |
|
|
|
|
|
" 'train_step',\n", |
|
|
|
|
|
" 'trainer_state',\n", |
|
|
|
|
|
" 'zero_grad']" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
"execution_count": 13, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"output_type": "execute_result" |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
|
|
|
"source": [ |
|
|
|
|
|
"dir(trainer)" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
|
|
|
"cell_type": "code", |
|
|
|
|
|
"execution_count": 14, |
|
|
|
|
|
"id": "953533c4", |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"Help on method run in module fastNLP.core.controllers.trainer:\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"run(num_train_batch_per_epoch: int = -1, num_eval_batch_per_dl: int = -1, num_eval_sanity_batch: int = 2, resume_from: str = None, resume_training: bool = True, catch_KeyboardInterrupt=None) method of fastNLP.core.controllers.trainer.Trainer instance\n", |
|
|
|
|
|
" 该函数是在 ``Trainer`` 初始化后用于真正开始训练的函数;\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" 注意如果是断点重训的第一次训练,即还没有保存任何用于断点重训的文件,那么其应当置 resume_from 为 None,并且使用 ``CheckpointCallback``\n", |
|
|
|
|
|
" 去保存断点重训的文件;\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" :param num_train_batch_per_epoch: 每个 epoch 训练多少个 batch 后停止,*-1* 表示使用 train_dataloader 本身的长度;\n", |
|
|
|
|
|
" :param num_eval_batch_per_dl: 每个 evaluate_dataloader 验证多少个 batch 停止,*-1* 表示使用 evaluate_dataloader 本身的长度;\n", |
|
|
|
|
|
" :param num_eval_sanity_batch: 在训练之前运行多少个 evaluation batch 来检测一下 evaluation 的过程是否有错误。为 0 表示不检测;\n", |
|
|
|
|
|
" :param resume_from: 从哪个路径下恢复 trainer 的状态,注意该值需要为一个文件夹,例如使用 ``CheckpointCallback`` 时帮助您创建的保存的子文件夹;\n", |
|
|
|
|
|
" :param resume_training: 是否按照 checkpoint 中训练状态恢复。如果为 False,则只恢复 model 和 optimizers 的状态;该参数如果为 ``True``,\n", |
|
|
|
|
|
" 在下一次断点重训的时候我们会精确到上次训练截止的具体的 sample 进行训练;否则我们只会恢复 model 和 optimizers 的状态,而 ``Trainer`` 中的\n", |
|
|
|
|
|
" 其余状态都是保持初始化时的状态不会改变;\n", |
|
|
|
|
|
" :param catch_KeyboardInterrupt: 是否捕获 KeyboardInterrupt;如果该参数为 ``True``,在训练时如果您使用 ``ctrl+c`` 来终止程序,\n", |
|
|
|
|
|
" ``Trainer`` 不会抛出异常,但是会提前退出,然后 ``trainer.run()`` 之后的代码会继续运行。注意该参数在您使用分布式训练的 ``Driver``\n", |
|
|
|
|
|
" 时无效,例如 ``TorchDDPDriver``;非分布式训练的 ``Driver`` 下该参数默认为 True;\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" .. warning::\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" 注意初始化的 ``Trainer`` 只能调用一次 ``run`` 函数,即之后的调用 ``run`` 函数实际不会运行,因为此时\n", |
|
|
|
|
|
" ``trainer.cur_epoch_idx == trainer.n_epochs``;\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" 这意味着如果您需要再次调用 ``run`` 函数,您需要重新再初始化一个 ``Trainer``;\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" .. note::\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" 您可以使用 ``num_train_batch_per_epoch`` 来简单地对您的训练过程进行验证,例如,当您指定 ``num_train_batch_per_epoch=10`` 后,\n", |
|
|
|
|
|
" 每一个 epoch 下实际训练的 batch 的数量则会被修改为 10。您可以先使用该值来设定一个较小的训练长度,在验证整体的训练流程没有错误后,再将\n", |
|
|
|
|
|
" 该值设定为 **-1** 开始真正的训练;\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" ``num_eval_batch_per_dl`` 的意思和 ``num_train_batch_per_epoch`` 类似,即您可以通过设定 ``num_eval_batch_per_dl`` 来验证\n", |
|
|
|
|
|
" 整体的验证流程是否正确;\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" ``num_eval_sanity_batch`` 的作用可能会让人产生迷惑,其本质和 ``num_eval_batch_per_dl`` 作用一致,但是其只被 ``Trainer`` 使用;\n", |
|
|
|
|
|
" 并且其只会在训练的一开始使用,意思为:我们在训练的开始时会先使用 ``Evaluator``(如果其不为 ``None``) 进行验证,此时验证的 batch 的\n", |
|
|
|
|
|
" 数量只有 ``num_eval_sanity_batch`` 个;但是对于 ``num_eval_batch_per_dl`` 而言,其表示在实际的整体的训练过程中,每次 ``Evaluator``\n", |
|
|
|
|
|
" 进行验证时会验证的 batch 的数量。\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" 并且,在实际真正的训练中,``num_train_batch_per_epoch`` 和 ``num_eval_batch_per_dl`` 应当都被设置为 **-1**,但是 ``num_eval_sanity_batch``\n", |
|
|
|
|
|
" 应当为一个很小的正整数,例如 2;\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" .. note::\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" 参数 ``resume_from`` 和 ``resume_training`` 的设立是为了支持断点重训功能;仅当 ``resume_from`` 不为 ``None`` 时,``resume_training`` 才有效;\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" 断点重训的意思为将上一次训练过程中的 ``Trainer`` 的状态保存下来,包括模型和优化器的状态、当前训练过的 epoch 的数量、对于当前的 epoch\n", |
|
|
|
|
|
" 已经训练过的 batch 的数量、callbacks 的状态等等;然后在下一次训练时直接加载这些状态,从而直接恢复到上一次训练过程的某一个具体时间点的状态开始训练;\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" fastNLP 将断点重训分为了 **保存状态** 和 **恢复断点重训** 两部分:\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" 1. 您需要使用 ``CheckpointCallback`` 来保存训练过程中的 ``Trainer`` 的状态;具体详见 :class:`~fastNLP.core.callbacks.CheckpointCallback`;\n", |
|
|
|
|
|
" ``CheckpointCallback`` 会帮助您把 ``Trainer`` 的状态保存到一个具体的文件夹下,这个文件夹的名字由 ``CheckpointCallback`` 自己生成;\n", |
|
|
|
|
|
" 2. 在第二次训练开始时,您需要找到您想要加载的 ``Trainer`` 状态所存放的文件夹,然后传入给参数 ``resume_from``;\n", |
|
|
|
|
|
" \n", |
|
|
|
|
|
" 需要注意的是 **保存状态** 和 **恢复断点重训** 是互不影响的。\n", |
|
|
|
|
|
"\n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
|
|
|
"source": [ |
|
|
|
|
|
"help(trainer.run)" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
"execution_count": null, |
|
|