Browse Source

fix save_pretrained & load_checkpoint bug in DDP mode

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/11012439

    * fix save_pretrained & load_checkpoint bug in DDP mode
master^2
pangda huangjun.hj 3 years ago
parent
commit
f59f9146de
1 changed file with 6 additions and 1 deletion
  1. +6
    -1
      modelscope/trainers/hooks/checkpoint_hook.py

+ 6
- 1
modelscope/trainers/hooks/checkpoint_hook.py View File

@@ -215,6 +215,10 @@ class CheckpointHook(Hook):
# TODO a temp fix to avoid pipeline_name and task mismatch
config['pipeline'] = {'type': config['task']}

+ # remove parallel module that is not JSON serializable
+ if 'parallel' in config and 'module' in config['parallel']:
+     del config['parallel']['module']

class SaveConfig:

def __init__(self, output_dir, config):
@@ -422,4 +426,5 @@ class BestCkptSaverHook(CheckpointHook):

      def after_run(self, trainer):
          if self.restore_best:
-             self.load_checkpoint(self._best_ckpt_file, trainer)
+             if is_master():
+                 self.load_checkpoint(self._best_ckpt_file, trainer)

Loading…
Cancel
Save