Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9644184 * fix distributed training and eval
@@ -36,20 +36,8 @@ class NAFNetForImageDenoise(TorchModel): | |||
model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) | |||
self.model = NAFNet(**self.config.model.network_g) | |||
self.loss = PSNRLoss() | |||
if torch.cuda.is_available(): | |||
self._device = torch.device('cuda') | |||
else: | |||
self._device = torch.device('cpu') | |||
self.model = self.model.to(self._device) | |||
self.model = self._load_pretrained(self.model, model_path) | |||
if self.training: | |||
self.model.train() | |||
else: | |||
self.model.eval() | |||
def _load_pretrained(self, | |||
net, | |||
load_path, | |||
@@ -109,8 +97,6 @@ class NAFNetForImageDenoise(TorchModel): | |||
Returns: | |||
Dict[str, Tensor]: results | |||
""" | |||
for key, value in inputs.items(): | |||
inputs[key] = inputs[key].to(self._device) | |||
if self.training: | |||
return self._train_forward(**inputs) | |||
elif 'target' in inputs: | |||
@@ -6,7 +6,7 @@ from modelscope.utils.import_utils import LazyImportModule | |||
if TYPE_CHECKING: | |||
from .base import Preprocessor | |||
from .builder import PREPROCESSORS, build_preprocessor | |||
from .common import Compose | |||
from .common import Compose, ToTensor, Filter | |||
from .asr import WavToScp | |||
from .audio import LinearAECAndFbank | |||
from .image import (LoadImage, load_image, | |||
@@ -33,7 +33,7 @@ else: | |||
_import_structure = { | |||
'base': ['Preprocessor'], | |||
'builder': ['PREPROCESSORS', 'build_preprocessor'], | |||
'common': ['Compose'], | |||
'common': ['Compose', 'ToTensor', 'Filter'], | |||
'audio': ['LinearAECAndFbank'], | |||
'asr': ['WavToScp'], | |||
'video': ['ReadVideoData'], | |||
@@ -2,6 +2,10 @@ | |||
import time | |||
from collections.abc import Sequence | |||
from typing import Mapping | |||
import numpy as np | |||
import torch | |||
from .builder import PREPROCESSORS, build_preprocessor | |||
@@ -25,12 +29,18 @@ class Compose(object): | |||
if isinstance(transform, dict): | |||
if self.field_name is None: | |||
transform = build_preprocessor(transform, field_name) | |||
self.transforms.append(transform) | |||
else: | |||
# if the key is not found under field_name, fall back to field_name=None (default_group)
try: | |||
transform = build_preprocessor(transform, field_name) | |||
except KeyError: | |||
transform = build_preprocessor(transform, None) | |||
elif callable(transform): | |||
self.transforms.append(transform) | |||
pass | |||
else: | |||
raise TypeError('transform must be callable or a dict, but got' | |||
f' {type(transform)}') | |||
self.transforms.append(transform) | |||
def __call__(self, data): | |||
for t in self.transforms: | |||
@@ -52,3 +62,82 @@ class Compose(object): | |||
format_string += f'\n {t}' | |||
format_string += '\n)' | |||
return format_string | |||
def to_tensor(data): | |||
"""Convert objects of various python types to :obj:`torch.Tensor`. | |||
Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, | |||
:class:`Sequence`, :class:`int` and :class:`float`. | |||
Args: | |||
data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to | |||
be converted. | |||
""" | |||
if isinstance(data, torch.Tensor): | |||
return data | |||
elif isinstance(data, np.ndarray): | |||
return torch.from_numpy(data) | |||
elif isinstance(data, Sequence) and not isinstance(data, str): | |||
return torch.tensor(data) | |||
elif isinstance(data, int): | |||
return torch.LongTensor([data]) | |||
elif isinstance(data, float): | |||
return torch.FloatTensor([data]) | |||
else: | |||
raise TypeError(f'type {type(data)} cannot be converted to tensor.') | |||
@PREPROCESSORS.register_module() | |||
class ToTensor(object): | |||
"""Convert target object to tensor. | |||
Args: | |||
keys (Sequence[str]): Keys of the data to be converted to Tensor.
Only valid when the data is a `Mapping`. If `keys` is None,
all values will be converted to tensors by default.
""" | |||
def __init__(self, keys=None): | |||
self.keys = keys | |||
def __call__(self, data): | |||
if isinstance(data, Mapping): | |||
if self.keys is None: | |||
self.keys = list(data.keys()) | |||
for key in self.keys: | |||
data[key] = to_tensor(data[key]) | |||
else: | |||
data = to_tensor(data) | |||
return data | |||
def __repr__(self): | |||
return self.__class__.__name__ + f'(keys={self.keys})' | |||
@PREPROCESSORS.register_module() | |||
class Filter(object): | |||
"""This is usually the last stage of the dataloader transform. | |||
Only data of reserved keys will be kept and passed directly to the model, others will be removed. | |||
Args: | |||
keys (Sequence[str]): Keys of data to be reserved, others will be removed. | |||
""" | |||
def __init__(self, reserved_keys): | |||
self.reserved_keys = reserved_keys | |||
def __call__(self, data): | |||
assert isinstance(data, Mapping) | |||
reserved_data = {} | |||
for key in self.reserved_keys: | |||
reserved_data[key] = data[key] | |||
return reserved_data | |||
def __repr__(self): | |||
return self.__class__.__name__ + f'(keys={self.reserved_keys})' |
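For context, here is a minimal usage sketch of the two preprocessors added above; the sample keys mirror the new unit tests, and nothing in this snippet is part of the diff itself:

```python
from modelscope.preprocessors import Filter, ToTensor

sample = {'img': [1, 2, 3], 'label': 1, 'path': 'test.jpg'}

# ToTensor converts only the requested keys; other values are left untouched.
sample = ToTensor(keys=['img'])(sample)                   # sample['img'] is now a torch.Tensor

# Filter keeps only the reserved keys, dropping metadata such as file paths.
sample = Filter(reserved_keys=['img', 'label'])(sample)   # 'path' is removed
```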
@@ -151,6 +151,11 @@ class ImageDenoisePreprocessor(Preprocessor): | |||
super().__init__(*args, **kwargs) | |||
self.model_dir: str = model_dir | |||
from .common import Filter | |||
# TODO: `Filter` should be moved to the configuration file of each model
self._transforms = [Filter(reserved_keys=['input', 'target'])] | |||
def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: | |||
"""process the raw input data | |||
@@ -160,6 +165,9 @@ class ImageDenoisePreprocessor(Preprocessor): | |||
Returns: | |||
Dict[str, Any]: the preprocessed data | |||
""" | |||
for t in self._transforms: | |||
data = t(data) | |||
return data | |||
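A hedged illustration of why the Filter stage is wired into ImageDenoisePreprocessor: the NAFNet hunk above calls `self._train_forward(**inputs)`, i.e. the batch is unpacked as keyword arguments, so any extra dataset fields have to be dropped during preprocessing. The sample below, including the extra metadata key, is invented for illustration:

```python
import numpy as np

# Hypothetical raw sample as it might come out of the denoise dataset.
raw = {
    'input': np.random.rand(64, 64, 3).astype(np.float32),   # noisy image
    'target': np.random.rand(64, 64, 3).astype(np.float32),  # clean reference image
    'image_path': '/data/denoise/0001.png',                   # metadata the model forward does not accept
}
# After ImageDenoisePreprocessor.__call__, only 'input' and 'target' remain.
```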
@@ -4,6 +4,7 @@ import os.path as osp | |||
import uuid | |||
from typing import Any, Dict, Iterable, Optional, Tuple, Union | |||
import numpy as np | |||
from transformers import AutoTokenizer | |||
from modelscope.metainfo import Models, Preprocessors | |||
@@ -191,6 +192,10 @@ class NLPTokenizerPreprocessorBase(Preprocessor): | |||
text_b, | |||
return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, | |||
**self.tokenize_kwargs) | |||
output = { | |||
k: np.array(v) if isinstance(v, list) else v | |||
for k, v in output.items() | |||
} | |||
self.labels_to_id(labels, output) | |||
return output | |||
@@ -240,13 +245,13 @@ class NLPTokenizerPreprocessorBase(Preprocessor): | |||
if labels is not None: | |||
if isinstance(labels, Iterable) and all([label_can_be_mapped(label) for label in labels]) \ | |||
and self.label2id is not None: | |||
output[OutputKeys.LABEL] = [ | |||
output[OutputKeys.LABELS] = [ | |||
self.label2id[str(label)] for label in labels | |||
] | |||
elif label_can_be_mapped(labels) and self.label2id is not None: | |||
output[OutputKeys.LABEL] = self.label2id[str(labels)] | |||
output[OutputKeys.LABELS] = self.label2id[str(labels)] | |||
else: | |||
output[OutputKeys.LABEL] = labels | |||
output[OutputKeys.LABELS] = labels | |||
@PREPROCESSORS.register_module( | |||
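A brief note on the key rename above: once a label mapping is available, the tokenizer output for a training sample now stores the mapped id(s) under OutputKeys.LABELS (plural) instead of OutputKeys.LABEL. This matches the `labels` column used by the updated dummy test dataset later in this diff and keeps the batch key aligned with the `labels` argument most of the NLP model forwards expect.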
@@ -40,7 +40,6 @@ class ImagePortraitEnhancementTrainer(EpochBasedTrainer): | |||
train_outputs = dict() | |||
self._mode = ModeKeys.TRAIN | |||
inputs = self.collate_fn(inputs) | |||
# call model forward but not __call__ to skip postprocess | |||
if isinstance(inputs, Mapping): | |||
d_loss = model._train_forward_d(**inputs) | |||
@@ -110,9 +110,11 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): | |||
self.train_keys = build_dataset_keys( | |||
self.cfg.dataset.train if hasattr(self.cfg, 'dataset') | |||
and hasattr(self.cfg.dataset, 'train') else None) | |||
# TODO eval may has special keys, which is now not supported. | |||
# because there is only one preprocessor in the trainer, and it only supports one group of keys. | |||
self.eval_keys = self.train_keys | |||
self.eval_keys = build_dataset_keys( | |||
self.cfg.dataset.val if hasattr(self.cfg, 'dataset') | |||
and hasattr(self.cfg.dataset, 'val') else None) | |||
if len(self.eval_keys) == 0: | |||
self.eval_keys = self.train_keys | |||
super().__init__( | |||
model=model_dir, | |||
@@ -148,7 +150,7 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): | |||
elif isinstance(model, nn.Module): | |||
return model | |||
def build_preprocessor(self) -> Preprocessor: | |||
def build_preprocessor(self) -> Tuple[Preprocessor, Preprocessor]: | |||
"""Build the preprocessor. | |||
Users can override this method to implement custom logic.
@@ -159,16 +161,38 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): | |||
model_args = {} if self.label2id is None else { | |||
'label2id': self.label2id | |||
} | |||
cfg = ConfigDict({ | |||
**getattr(self.cfg, 'preprocessor'), | |||
'model_dir': | |||
self.model_dir, | |||
**model_args, | |||
'mode': | |||
ModeKeys.TRAIN, | |||
**self.train_keys, | |||
}) | |||
return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task)) | |||
field_name = Tasks.find_field_by_task(self.cfg.task) | |||
train_preprocessor, eval_preprocessor = None, None | |||
_train_cfg, _eval_cfg = {}, {} | |||
if 'type' not in self.cfg.preprocessor and ( | |||
'train' in self.cfg.preprocessor | |||
or 'val' in self.cfg.preprocessor): | |||
if 'train' in self.cfg.preprocessor: | |||
_train_cfg = self.cfg.preprocessor.train | |||
if 'val' in self.cfg.preprocessor: | |||
_eval_cfg = self.cfg.preprocessor.val | |||
else: | |||
_train_cfg = self.cfg.preprocessor | |||
_eval_cfg = self.cfg.preprocessor | |||
if len(_train_cfg): | |||
_train_cfg.update({ | |||
'model_dir': self.model_dir, | |||
**model_args, | |||
**self.train_keys, 'mode': ModeKeys.TRAIN | |||
}) | |||
train_preprocessor = build_preprocessor(_train_cfg, field_name) | |||
if len(_eval_cfg): | |||
_eval_cfg.update({ | |||
'model_dir': self.model_dir, | |||
**model_args, | |||
**self.eval_keys, 'mode': ModeKeys.EVAL | |||
}) | |||
eval_preprocessor = build_preprocessor(_eval_cfg, field_name) | |||
return train_preprocessor, eval_preprocessor | |||
@TRAINERS.register_module(module_name=Trainers.nlp_veco_trainer) | |||
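For reference, a sketch of the configuration shape this `build_preprocessor` override now understands. The split into `preprocessor.train` / `preprocessor.val` and the injected arguments (`model_dir`, label mapping, dataset keys, `mode`) come from the diff; the `type` and field values below are invented placeholders:

```python
# Hypothetical excerpt of a model's configuration, expressed as a Python dict.
preprocessor_cfg = {
    'train': {
        'type': 'some-tokenizer-preprocessor',   # placeholder preprocessor type
        'first_sequence': 'sentence1',
        'second_sequence': 'sentence2',
    },
    'val': {
        'type': 'some-tokenizer-preprocessor',
        'first_sequence': 'sentence1',
        'second_sequence': 'sentence2',
    },
}
# With this shape the trainer builds two preprocessors, adding model_dir,
# label2id, the dataset keys and mode=TRAIN / mode=EVAL before calling
# build_preprocessor(cfg, field_name). A flat config with a top-level 'type'
# is still accepted and is reused for both training and evaluation.
```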
@@ -5,15 +5,15 @@ import time | |||
from collections.abc import Mapping | |||
from distutils.version import LooseVersion | |||
from functools import partial | |||
from typing import Callable, List, Optional, Tuple, Union | |||
from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union | |||
import json | |||
import numpy as np | |||
import torch | |||
from addict import Dict | |||
from torch import distributed as dist | |||
from torch import nn | |||
from torch.utils.data import DataLoader, Dataset | |||
from torch.utils.data.dataloader import default_collate | |||
from torch.utils.data.distributed import DistributedSampler | |||
from modelscope.hub.snapshot_download import snapshot_download | |||
@@ -21,8 +21,9 @@ from modelscope.metainfo import Trainers | |||
from modelscope.metrics import build_metric, task_default_metrics | |||
from modelscope.models.base import Model, TorchModel | |||
from modelscope.msdatasets.ms_dataset import MsDataset | |||
from modelscope.preprocessors import build_preprocessor | |||
from modelscope.preprocessors.base import Preprocessor | |||
from modelscope.preprocessors.builder import build_preprocessor | |||
from modelscope.preprocessors.common import Compose | |||
from modelscope.task_datasets.builder import build_task_dataset | |||
from modelscope.task_datasets.torch_base_dataset import TorchTaskDataset | |||
from modelscope.trainers.hooks.builder import HOOKS | |||
@@ -30,14 +31,15 @@ from modelscope.trainers.hooks.priority import Priority, get_priority | |||
from modelscope.trainers.lrscheduler.builder import build_lr_scheduler | |||
from modelscope.trainers.optimizer.builder import build_optimizer | |||
from modelscope.utils.config import Config, ConfigDict | |||
from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Hubs, ModeKeys, | |||
ModelFile, Tasks, TrainerStages) | |||
from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigFields, | |||
ConfigKeys, Hubs, ModeKeys, ModelFile, | |||
Tasks, TrainerStages) | |||
from modelscope.utils.data_utils import to_device | |||
from modelscope.utils.file_utils import func_receive_dict_inputs | |||
from modelscope.utils.logger import get_logger | |||
from modelscope.utils.registry import build_from_cfg | |||
from modelscope.utils.tensor_utils import torch_default_data_collator | |||
from modelscope.utils.torch_utils import (broadcast, create_device, | |||
get_dist_info, init_dist) | |||
from modelscope.utils.torch_utils import (create_device, get_dist_info, | |||
init_dist) | |||
from .base import BaseTrainer | |||
from .builder import TRAINERS | |||
from .default_config import DEFAULT_CONFIG | |||
@@ -83,7 +85,8 @@ class EpochBasedTrainer(BaseTrainer): | |||
data_collator: Optional[Callable] = None, | |||
train_dataset: Optional[Union[MsDataset, Dataset]] = None, | |||
eval_dataset: Optional[Union[MsDataset, Dataset]] = None, | |||
preprocessor: Optional[Preprocessor] = None, | |||
preprocessor: Optional[Union[Preprocessor, | |||
Dict[str, Preprocessor]]] = None, | |||
optimizers: Tuple[torch.optim.Optimizer, | |||
torch.optim.lr_scheduler._LRScheduler] = (None, | |||
None), | |||
@@ -120,24 +123,46 @@ class EpochBasedTrainer(BaseTrainer): | |||
else: | |||
self.work_dir = self.cfg.train.get('work_dir', './work_dir') | |||
self.preprocessor = None | |||
self.train_preprocessor, self.eval_preprocessor = None, None | |||
if isinstance(preprocessor, Preprocessor): | |||
self.preprocessor = preprocessor | |||
elif hasattr(self.cfg, 'preprocessor'): | |||
self.preprocessor = self.build_preprocessor() | |||
if self.preprocessor is not None: | |||
self.preprocessor.mode = ModeKeys.TRAIN | |||
self.train_preprocessor = preprocessor | |||
self.eval_preprocessor = preprocessor | |||
elif isinstance(preprocessor, Mapping): | |||
if not (ConfigKeys.train in preprocessor | |||
or ConfigKeys.val in preprocessor): | |||
raise ValueError( | |||
f'Preprocessor must split with `{ConfigKeys.train}` and `{ConfigKeys.val}` keys!' | |||
) | |||
if ConfigKeys.train in preprocessor: | |||
assert isinstance(preprocessor[ConfigKeys.train], Preprocessor) | |||
self.train_preprocessor = preprocessor[ConfigKeys.train] | |||
if ConfigKeys.val in preprocessor: | |||
assert isinstance(preprocessor[ConfigKeys.val], Preprocessor) | |||
self.eval_preprocessor = preprocessor[ConfigKeys.val] | |||
elif hasattr(self.cfg, ConfigFields.preprocessor): | |||
self.train_preprocessor, self.eval_preprocessor = self.build_preprocessor( | |||
) | |||
if self.train_preprocessor is not None: | |||
self.train_preprocessor.mode = ModeKeys.TRAIN | |||
if self.eval_preprocessor is not None: | |||
self.eval_preprocessor.mode = ModeKeys.EVAL | |||
device_name = kwargs.get('device', 'gpu') | |||
assert device_name in ['gpu', | |||
'cpu'], 'device should be either cpu or gpu.' | |||
self.device = create_device(device_name == 'cpu') | |||
self.train_dataset = self.to_task_dataset( | |||
train_dataset, mode=ModeKeys.TRAIN, preprocessor=self.preprocessor) | |||
train_dataset, | |||
mode=ModeKeys.TRAIN, | |||
preprocessor=self.train_preprocessor) | |||
self.eval_dataset = self.to_task_dataset( | |||
eval_dataset, mode=ModeKeys.EVAL, preprocessor=self.preprocessor) | |||
eval_dataset, | |||
mode=ModeKeys.EVAL, | |||
preprocessor=self.eval_preprocessor) | |||
self.data_collator = data_collator if data_collator is not None else torch_default_data_collator | |||
self.data_collator = data_collator if data_collator is not None else default_collate | |||
self.metrics = self.get_metrics() | |||
self._metric_values = None | |||
self.optimizers = optimizers | |||
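A hedged sketch of the new trainer-side contract introduced in this hunk: `preprocessor` may be a single `Preprocessor` or a dict keyed by `ConfigKeys.train` / `ConfigKeys.val` (a dict missing both keys raises a ValueError), and batches are collated with torch's stock `default_collate` unless a custom `data_collator` is supplied. The model id, datasets and preprocessor instances below are placeholders, not values from this diff:

```python
from modelscope.trainers import build_trainer
from modelscope.utils.constant import ConfigKeys

kwargs = dict(
    model='damo/some-model-id',          # placeholder model id
    train_dataset=my_train_dataset,      # placeholder MsDataset / torch Dataset
    eval_dataset=my_eval_dataset,        # placeholder
    preprocessor={
        ConfigKeys.train: my_train_preprocessor,   # placeholder Preprocessor instances
        ConfigKeys.val: my_eval_preprocessor,
    },
    device='cpu',                        # 'gpu' (default) or 'cpu'
)
trainer = build_trainer(default_args=kwargs)
trainer.train()
```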
@@ -229,12 +254,12 @@ class EpochBasedTrainer(BaseTrainer): | |||
return datasets | |||
elif isinstance(datasets, MsDataset): | |||
datasets = datasets.to_torch_dataset( | |||
preprocessors=self.preprocessor) | |||
preprocessors=preprocessor) | |||
return datasets | |||
elif isinstance(datasets, List) and isinstance( | |||
datasets[0], MsDataset): | |||
datasets = [ | |||
d.to_torch_dataset(preprocessor=self.preprocessor) | |||
d.to_torch_dataset(preprocessor=preprocessor) | |||
for d in datasets | |||
] | |||
cfg = ConfigDict( | |||
@@ -258,24 +283,44 @@ class EpochBasedTrainer(BaseTrainer): | |||
else: | |||
return datasets | |||
def build_preprocessor(self) -> Preprocessor: | |||
"""Build the preprocessor. | |||
def build_preprocessor(self) -> Tuple[Preprocessor, Preprocessor]: | |||
"""Build train and eval preprocessor. | |||
User can override this method to implement custom logits. | |||
Returns: The preprocessor instance. | |||
Returns: The train and eval preprocessor instances.
""" | |||
# TODO @wenmeng.zwm @jiangnana.jnn add support for different preprocessor | |||
# when they are different ones in training and evaluation | |||
cfg = ConfigDict({ | |||
**getattr(self.cfg, 'preprocessor'), | |||
'model_dir': | |||
self.model_dir, | |||
'mode': | |||
ModeKeys.TRAIN, | |||
}) | |||
return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task)) | |||
field_name = Tasks.find_field_by_task(self.cfg.task) | |||
train_preprocessor, eval_preprocessor = None, None | |||
_train_cfg, _eval_cfg = {}, {} | |||
_default_args = {'model_dir': self.model_dir}
if 'type' not in self.cfg.preprocessor and ( | |||
'train' in self.cfg.preprocessor | |||
or 'val' in self.cfg.preprocessor): | |||
if 'train' in self.cfg.preprocessor: | |||
_train_cfg = self.cfg.preprocessor.train | |||
if 'val' in self.cfg.preprocessor: | |||
_eval_cfg = self.cfg.preprocessor.val | |||
else: | |||
_train_cfg = self.cfg.preprocessor | |||
_eval_cfg = self.cfg.preprocessor | |||
if len(_train_cfg): | |||
if isinstance(_train_cfg, Sequence): | |||
# TODO: for Sequence, need to adapt the `mode` and `model_dir` args,
# and add mode support for Compose or other plans
raise NotImplementedError('Not supported yet!')
_train_cfg.update(_default_args)
train_preprocessor = build_preprocessor(_train_cfg, field_name) | |||
if len(_eval_cfg): | |||
if isinstance(_eval_cfg, Sequence): | |||
raise NotImplementedError('Not supported yet!') | |||
_eval_cfg.update(_default_args)
eval_preprocessor = build_preprocessor(_eval_cfg, field_name) | |||
return train_preprocessor, eval_preprocessor | |||
def get_metrics(self) -> List[str]: | |||
"""Get the metric class types. | |||
@@ -373,34 +418,6 @@ class EpochBasedTrainer(BaseTrainer): | |||
return build_parallel(dp_cfg) | |||
def collate_fn(self, data): | |||
"""Prepare the input just before the forward function. | |||
This method will move the tensors to the right device. | |||
Usually this method does not need to be overridden. | |||
Args: | |||
data: The data out of the dataloader. | |||
Returns: The processed data. | |||
""" | |||
from torch.utils.data.dataloader import default_collate | |||
if isinstance(data, dict) or isinstance(data, Mapping): | |||
return type(data)({k: self.collate_fn(v) for k, v in data.items()}) | |||
elif isinstance(data, (tuple, list)): | |||
if isinstance(data[0], (int, float)): | |||
return default_collate(data).to(self.device) | |||
else: | |||
return type(data)(self.collate_fn(v) for v in data) | |||
elif isinstance(data, np.ndarray): | |||
return self.collate_fn(torch.from_numpy(data)) | |||
elif isinstance(data, torch.Tensor): | |||
return data.to(self.device) | |||
elif isinstance(data, (str, int, float, bool)): | |||
return data | |||
else: | |||
raise ValueError(f'Unsupported data type {type(data)}') | |||
def train_step(self, model, inputs): | |||
""" Perform a training step on a batch of inputs. | |||
@@ -421,7 +438,6 @@ class EpochBasedTrainer(BaseTrainer): | |||
# TODO: find a prettier way to change mode
model.train() | |||
self._mode = ModeKeys.TRAIN | |||
inputs = self.collate_fn(inputs) | |||
# call model forward but not __call__ to skip postprocess | |||
if isinstance(inputs, | |||
Mapping) and not func_receive_dict_inputs(model.forward): | |||
@@ -486,7 +502,9 @@ class EpochBasedTrainer(BaseTrainer): | |||
if self.train_dataset is None: | |||
train_data = self.cfg.dataset.train | |||
self.train_dataset = self.build_dataset( | |||
train_data, mode=ModeKeys.TRAIN) | |||
train_data, | |||
mode=ModeKeys.TRAIN, | |||
preprocessor=self.train_preprocessor) | |||
data_loader = self._build_dataloader_with_dataset( | |||
self.train_dataset, | |||
@@ -505,7 +523,9 @@ class EpochBasedTrainer(BaseTrainer): | |||
if self.eval_dataset is None: | |||
val_data = self.cfg.dataset.val | |||
self.eval_dataset = self.build_dataset( | |||
val_data, mode=ModeKeys.EVAL) | |||
val_data, | |||
mode=ModeKeys.EVAL, | |||
preprocessor=self.eval_preprocessor) | |||
batch_size = self.cfg.evaluation.batch_size | |||
workers = self.cfg.evaluation.workers | |||
@@ -521,7 +541,7 @@ class EpochBasedTrainer(BaseTrainer): | |||
) | |||
return data_loader | |||
def build_dataset(self, data_cfg, mode): | |||
def build_dataset(self, data_cfg, mode, preprocessor=None): | |||
""" Build torch dataset object using data config | |||
""" | |||
dataset = MsDataset.load( | |||
@@ -531,8 +551,7 @@ class EpochBasedTrainer(BaseTrainer): | |||
data_cfg, 'subset_name') else None, | |||
hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope, | |||
) | |||
torch_dataset = dataset.to_torch_dataset( | |||
preprocessors=self.preprocessor, ) | |||
torch_dataset = dataset.to_torch_dataset(preprocessors=preprocessor) | |||
dataset = self.to_task_dataset(torch_dataset, mode) | |||
return dataset | |||
@@ -698,6 +717,7 @@ class EpochBasedTrainer(BaseTrainer): | |||
self.invoke_hook(TrainerStages.before_train_epoch) | |||
time.sleep(2) # Prevent possible deadlock during epoch transition | |||
for i, data_batch in enumerate(data_loader): | |||
data_batch = to_device(data_batch, self.device) | |||
self.data_batch = data_batch | |||
self._inner_iter = i | |||
self.invoke_hook(TrainerStages.before_train_iter) | |||
@@ -721,16 +741,16 @@ class EpochBasedTrainer(BaseTrainer): | |||
metric_values = multi_gpu_test( | |||
self.model, | |||
data_loader, | |||
device=self.device, | |||
tmpdir=None, | |||
gpu_collect=False, | |||
data_collate_fn=self.collate_fn, | |||
metric_classes=metric_classes) | |||
else: | |||
from modelscope.trainers.utils.inference import single_gpu_test | |||
metric_values = single_gpu_test( | |||
self.model, | |||
data_loader, | |||
data_collate_fn=self.collate_fn, | |||
device=self.device, | |||
metric_classes=metric_classes) | |||
return metric_values | |||
@@ -10,21 +10,19 @@ import torch | |||
from torch import distributed as dist | |||
from tqdm import tqdm | |||
from modelscope.utils.data_utils import to_device | |||
from modelscope.utils.file_utils import func_receive_dict_inputs | |||
from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master, | |||
make_tmp_dir) | |||
def single_gpu_test(model, | |||
data_loader, | |||
data_collate_fn=None, | |||
metric_classes=None): | |||
def single_gpu_test(model, data_loader, device, metric_classes=None): | |||
"""Test model with a single gpu. | |||
Args: | |||
model (nn.Module): Model to be tested. | |||
data_loader (nn.Dataloader): Pytorch data loader. | |||
data_collate_fn: An optional data_collate_fn before fed into the model | |||
device (str | torch.device): The target device for the data.
metric_classes (List): List of Metric classes used to collect metrics
Returns: | |||
@@ -34,8 +32,7 @@ def single_gpu_test(model, | |||
dataset = data_loader.dataset | |||
with tqdm(total=len(dataset), desc='test samples') as pbar: | |||
for data in data_loader: | |||
if data_collate_fn is not None: | |||
data = data_collate_fn(data) | |||
data = to_device(data, device) | |||
with torch.no_grad(): | |||
if isinstance(data, Mapping) and not func_receive_dict_inputs( | |||
model.forward): | |||
@@ -62,9 +59,9 @@ def single_gpu_test(model, | |||
def multi_gpu_test(model, | |||
data_loader, | |||
device, | |||
tmpdir=None, | |||
gpu_collect=False, | |||
data_collate_fn=None, | |||
metric_classes=None): | |||
"""Test model with multiple gpus. | |||
@@ -77,10 +74,10 @@ def multi_gpu_test(model, | |||
Args: | |||
model (nn.Module): Model to be tested. | |||
data_loader (nn.Dataloader): Pytorch data loader. | |||
device (str | torch.device): The target device for the data.
tmpdir (str): Path of directory to save the temporary results from | |||
different gpus under cpu mode. | |||
gpu_collect (bool): Option to use either gpu or cpu to collect results. | |||
data_collate_fn: An optional data_collate_fn before fed into the model | |||
metric_classes (List): List of Metric classes used to collect metrics
Returns: | |||
@@ -98,8 +95,7 @@ def multi_gpu_test(model, | |||
count = 0 | |||
with tqdm(total=len(dataset), desc='test samples with multi gpus') as pbar: | |||
for _, data in enumerate(data_loader): | |||
if data_collate_fn is not None: | |||
data = data_collate_fn(data) | |||
data = to_device(data, device) | |||
data_list.append(data) | |||
with torch.no_grad(): | |||
if isinstance(data, Mapping) and not func_receive_dict_inputs( | |||
@@ -219,6 +219,12 @@ class ConfigFields(object): | |||
evaluation = 'evaluation' | |||
class ConfigKeys(object): | |||
"""Fixed keywords in configuration file""" | |||
train = 'train' | |||
val = 'val' | |||
class Requirements(object): | |||
"""Requirement names for each module | |||
""" | |||
@@ -0,0 +1,23 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from collections.abc import Mapping | |||
import torch | |||
def to_device(batch, device, non_blocking=False): | |||
"""Put the data to the target cuda device just before the forward function. | |||
Args: | |||
batch: The batch data out of the dataloader. | |||
device: (str | torch.device): The target device for the data. | |||
Returns: The data to the target device. | |||
""" | |||
if isinstance(batch, dict) or isinstance(batch, Mapping): | |||
return type(batch)({k: to_device(v, device) for k, v in batch.items()}) | |||
elif isinstance(batch, (tuple, list)): | |||
return type(batch)(to_device(v, device) for v in batch) | |||
elif isinstance(batch, torch.Tensor): | |||
return batch.to(device, non_blocking=non_blocking) | |||
else: | |||
return batch |
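A small usage sketch of the new `to_device` helper (the batch contents are invented): nested dicts and lists are traversed recursively, tensors are moved to the target device, and non-tensor leaves such as strings are returned unchanged, which is why the trainer loop and the test utilities can call it on whole data batches:

```python
import torch
from modelscope.utils.data_utils import to_device

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch = {
    'input': torch.rand(4, 3),
    'meta': {'ids': torch.arange(4), 'paths': ['a.png', 'b.png', 'c.png', 'd.png']},
}
batch = to_device(batch, device)
# batch['input'].device == device; batch['meta']['paths'] is still a plain list of str
```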
@@ -24,65 +24,3 @@ def torch_nested_detach(tensors): | |||
if isinstance(tensors, torch.Tensor): | |||
return tensors.detach() | |||
return tensors | |||
def torch_default_data_collator(features): | |||
# TODO @jiangnana.jnn refine this default data collator | |||
import torch | |||
first = features[0] | |||
if isinstance(first, Mapping): | |||
batch = {} | |||
# Special handling for labels. | |||
# Ensure that tensor is created with the correct type | |||
# (it should be automatically the case, but let's make sure of it.) | |||
if 'label' in first and first['label'] is not None: | |||
label = first['label'].item() if isinstance( | |||
first['label'], torch.Tensor) else first['label'] | |||
# the msdataset return a 0-dimension np.array with a single value, the following part handle this. | |||
if isinstance(label, np.ndarray): | |||
src_dtype = label[()].dtype | |||
dtype = torch.long if label[( | |||
)].dtype == np.int64 else torch.float | |||
else: | |||
src_dtype = type(label) | |||
dtype = torch.long if isinstance(label, int) else torch.float | |||
# add dtype to np.array to fix "TypeError: can't convert np.ndarray of type numpy.object_" | |||
batch['labels'] = torch.tensor( | |||
np.array([f['label'] for f in features], dtype=src_dtype), | |||
dtype=dtype) | |||
elif 'label_ids' in first and first['label_ids'] is not None: | |||
if isinstance(first['label_ids'], torch.Tensor): | |||
batch['labels'] = torch.stack( | |||
[f['label_ids'] for f in features]) | |||
else: | |||
dtype = torch.long if type( | |||
first['label_ids'][0]) is int else torch.float | |||
batch['labels'] = torch.tensor( | |||
[f['label_ids'] for f in features], dtype=dtype) | |||
# Handling of all other possible keys. | |||
# Again, we will use the first element to figure out which key/values are not None for this model. | |||
for k, v in first.items(): | |||
if k not in ('label', 'label_ids' | |||
) and v is not None and not isinstance(v, str): | |||
if isinstance(v, torch.Tensor): | |||
batch[k] = torch.stack([f[k] for f in features]) | |||
elif isinstance(v, list) and isinstance(v[0], torch.Tensor): | |||
batch[k] = torch.stack([d for f in features for d in f[k]]) | |||
else: | |||
batch[k] = torch.tensor(np.array([f[k] for f in features])) | |||
elif isinstance(first, tuple): | |||
batch = [] | |||
for idx in range(len(first)): | |||
if isinstance(first[idx], torch.Tensor): | |||
batch.append(torch.stack([f[idx] for f in features])) | |||
else: | |||
batch.append(torch.tensor([f[idx] for f in features])) | |||
else: | |||
if isinstance(first, torch.Tensor): | |||
batch = torch.stack(features) | |||
else: | |||
batch = torch.tensor(features) | |||
return batch |
@@ -50,7 +50,7 @@ def set_test_level(level: int): | |||
def create_dummy_test_dataset(feat, label, num): | |||
return MsDataset.from_hf_dataset( | |||
Dataset.from_dict(dict(feat=[feat] * num, label=[label] * num))) | |||
Dataset.from_dict(dict(feat=[feat] * num, labels=[label] * num))) | |||
def download_and_untar(fpath, furl, dst) -> str: | |||
@@ -2,7 +2,10 @@ | |||
import unittest | |||
from modelscope.preprocessors import PREPROCESSORS, Compose, Preprocessor | |||
import torch | |||
from modelscope.preprocessors import (PREPROCESSORS, Compose, Filter, | |||
Preprocessor, ToTensor) | |||
class ComposeTest(unittest.TestCase): | |||
@@ -35,5 +38,27 @@ class ComposeTest(unittest.TestCase): | |||
self.assertEqual(output['tmp2'], 'tmp2') | |||
class ToTensorTest(unittest.TestCase): | |||
def test_totensor(self): | |||
to_tensor_op = ToTensor(keys=['img']) | |||
inputs = {'img': [1, 2, 3], 'label': 1, 'path': 'test.jpg'} | |||
inputs = to_tensor_op(inputs) | |||
self.assertIsInstance(inputs['img'], torch.Tensor) | |||
self.assertEqual(inputs['label'], 1) | |||
self.assertEqual(inputs['path'], 'test.jpg') | |||
class FilterTest(unittest.TestCase): | |||
def test_filter(self): | |||
filter_op = Filter(reserved_keys=['img', 'label']) | |||
inputs = {'img': [1, 2, 3], 'label': 1, 'path': 'test.jpg'} | |||
inputs = filter_op(inputs) | |||
self.assertIn('img', inputs) | |||
self.assertIn('label', inputs) | |||
self.assertNotIn('path', inputs) | |||
if __name__ == '__main__': | |||
unittest.main() |
@@ -12,7 +12,7 @@ from torch import nn | |||
from modelscope.metainfo import Trainers | |||
from modelscope.metrics.builder import METRICS, MetricKeys | |||
from modelscope.trainers import build_trainer | |||
from modelscope.utils.constant import LogKeys, ModelFile | |||
from modelscope.utils.constant import ModelFile | |||
from modelscope.utils.registry import default_group | |||
from modelscope.utils.test_utils import create_dummy_test_dataset | |||
@@ -9,7 +9,7 @@ import numpy as np | |||
import torch | |||
from torch import nn | |||
from torch.optim import SGD | |||
from torch.optim.lr_scheduler import MultiStepLR, ReduceLROnPlateau | |||
from torch.optim.lr_scheduler import MultiStepLR | |||
from modelscope.metainfo import Trainers | |||
from modelscope.metrics.builder import METRICS, MetricKeys | |||
@@ -96,7 +96,8 @@ class LrSchedulerHookTest(unittest.TestCase): | |||
model=model, | |||
train_dataset=dummy_dataset, | |||
optimizers=(optimizer, lr_scheduler), | |||
max_epochs=5) | |||
max_epochs=5, | |||
device='cpu') | |||
trainer = build_trainer(trainer_name, kwargs) | |||
train_dataloader = trainer._build_dataloader_with_dataset( | |||
@@ -160,15 +161,13 @@ class LrSchedulerHookTest(unittest.TestCase): | |||
json.dump(json_cfg, f) | |||
model = DummyModel() | |||
# optimmizer = SGD(model.parameters(), lr=0.01) | |||
# lr_scheduler = MultiStepLR(optimmizer, milestones=[2, 4]) | |||
trainer_name = Trainers.default | |||
kwargs = dict( | |||
cfg_file=config_path, | |||
model=model, | |||
train_dataset=dummy_dataset, | |||
# optimizers=(optimmizer, lr_scheduler), | |||
max_epochs=7) | |||
max_epochs=7, | |||
device='cpu') | |||
trainer = build_trainer(trainer_name, kwargs) | |||
train_dataloader = trainer._build_dataloader_with_dataset( | |||
@@ -266,7 +265,8 @@ class PlateauLrSchedulerHookTest(unittest.TestCase): | |||
train_dataset=dummy_dataset, | |||
eval_dataset=dummy_dataset, | |||
optimizers=(optimizer, None), | |||
max_epochs=5) | |||
max_epochs=5, | |||
device='cpu') | |||
trainer = build_trainer(trainer_name, kwargs) | |||
train_dataloader = trainer._build_dataloader_with_dataset( | |||
@@ -17,7 +17,7 @@ from modelscope.utils.constant import ModelFile, TrainerStages | |||
from modelscope.utils.test_utils import create_dummy_test_dataset | |||
dummy_dataset = create_dummy_test_dataset( | |||
np.random.random(size=(2, 2)), np.random.randint(0, 2, (1, )), 10) | |||
np.random.random(size=(2, )), np.random.randint(0, 2, (1, )), 10) | |||
class DummyModel(nn.Module): | |||
@@ -71,7 +71,8 @@ class OptimizerHookTest(unittest.TestCase): | |||
model=model, | |||
train_dataset=dummy_dataset, | |||
optimizers=(optimizer, lr_scheduler), | |||
max_epochs=2) | |||
max_epochs=2, | |||
device='cpu') | |||
trainer = build_trainer(trainer_name, kwargs) | |||
train_dataloader = trainer._build_dataloader_with_dataset( | |||
@@ -75,7 +75,8 @@ class IterTimerHookTest(unittest.TestCase): | |||
model=model, | |||
train_dataset=dummy_dataset, | |||
optimizers=(optimizer, lr_scheduler), | |||
max_epochs=5) | |||
max_epochs=5, | |||
device='cpu') | |||
trainer = build_trainer(trainer_name, kwargs) | |||
train_dataloader = trainer._build_dataloader_with_dataset( | |||
@@ -3,19 +3,16 @@ import os | |||
import shutil | |||
import tempfile | |||
import unittest | |||
from abc import ABCMeta | |||
import json | |||
import numpy as np | |||
import torch | |||
from datasets import Dataset | |||
from torch import nn | |||
from torch.optim import SGD | |||
from torch.optim.lr_scheduler import StepLR | |||
from modelscope.metainfo import Metrics, Trainers | |||
from modelscope.metrics.builder import MetricKeys | |||
from modelscope.msdatasets import MsDataset | |||
from modelscope.trainers import build_trainer | |||
from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile | |||
from modelscope.utils.test_utils import create_dummy_test_dataset, test_level | |||
@@ -116,7 +113,8 @@ class TrainerTest(unittest.TestCase): | |||
data_collator=None, | |||
train_dataset=dummy_dataset_small, | |||
eval_dataset=dummy_dataset_small, | |||
max_epochs=3) | |||
max_epochs=3, | |||
device='cpu') | |||
trainer = build_trainer(trainer_name, kwargs) | |||
trainer.train() | |||
@@ -175,7 +173,8 @@ class TrainerTest(unittest.TestCase): | |||
train_dataset=dummy_dataset_small, | |||
eval_dataset=dummy_dataset_small, | |||
optimizers=(optimmizer, lr_scheduler), | |||
max_epochs=3) | |||
max_epochs=3, | |||
device='cpu') | |||
trainer = build_trainer(trainer_name, kwargs) | |||
trainer.train() | |||
@@ -225,7 +224,8 @@ class TrainerTest(unittest.TestCase): | |||
train_dataset=dummy_dataset_big, | |||
eval_dataset=dummy_dataset_small, | |||
optimizers=(optimmizer, lr_scheduler), | |||
max_epochs=3) | |||
max_epochs=3, | |||
device='cpu') | |||
trainer = build_trainer(trainer_name, kwargs) | |||
trainer.train() | |||
@@ -37,7 +37,8 @@ class TestTrainerWithNlp(unittest.TestCase): | |||
model=model_id, | |||
train_dataset=self.dataset, | |||
eval_dataset=self.dataset, | |||
work_dir=self.tmp_dir) | |||
work_dir=self.tmp_dir, | |||
model_revision='beta') | |||
trainer = build_trainer(default_args=kwargs) | |||
trainer.train() | |||
@@ -53,7 +54,8 @@ class TestTrainerWithNlp(unittest.TestCase): | |||
model=model_id, | |||
train_dataset=self.dataset, | |||
eval_dataset=self.dataset, | |||
work_dir=self.tmp_dir) | |||
work_dir=self.tmp_dir, | |||
model_revision='beta') | |||
trainer = build_trainer(default_args=kwargs) | |||
trainer.train() | |||
@@ -69,7 +71,7 @@ class TestTrainerWithNlp(unittest.TestCase): | |||
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
def test_trainer_with_user_defined_config(self): | |||
model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base' | |||
cfg = read_config(model_id) | |||
cfg = read_config(model_id, revision='beta') | |||
cfg.train.max_epochs = 20 | |||
cfg.train.work_dir = self.tmp_dir | |||
cfg_file = os.path.join(self.tmp_dir, 'config.json') | |||
@@ -78,7 +80,8 @@ class TestTrainerWithNlp(unittest.TestCase): | |||
model=model_id, | |||
train_dataset=self.dataset, | |||
eval_dataset=self.dataset, | |||
cfg_file=cfg_file) | |||
cfg_file=cfg_file, | |||
model_revision='beta') | |||
trainer = build_trainer(default_args=kwargs) | |||
trainer.train() | |||
@@ -98,7 +101,7 @@ class TestTrainerWithNlp(unittest.TestCase): | |||
os.makedirs(tmp_dir) | |||
model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base' | |||
cache_path = snapshot_download(model_id) | |||
cache_path = snapshot_download(model_id, revision='beta') | |||
model = SbertForSequenceClassification.from_pretrained(cache_path) | |||
kwargs = dict( | |||
cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION), | |||
@@ -0,0 +1,116 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os | |||
import shutil | |||
import tempfile | |||
import unittest | |||
import torch | |||
from torch import nn | |||
from torch.utils.data import DataLoader | |||
from modelscope.metrics.builder import MetricKeys | |||
from modelscope.metrics.sequence_classification_metric import \ | |||
SequenceClassificationMetric | |||
from modelscope.trainers.utils.inference import multi_gpu_test, single_gpu_test | |||
from modelscope.utils.test_utils import (DistributedTestCase, | |||
create_dummy_test_dataset, test_level) | |||
from modelscope.utils.torch_utils import get_dist_info, init_dist | |||
dummy_dataset = create_dummy_test_dataset( | |||
torch.rand((5, )), torch.randint(0, 4, (1, )), 20) | |||
class DummyModel(nn.Module): | |||
def __init__(self): | |||
super().__init__() | |||
self.linear = nn.Linear(5, 4) | |||
self.bn = nn.BatchNorm1d(4) | |||
def forward(self, feat, labels): | |||
x = self.linear(feat) | |||
x = self.bn(x) | |||
loss = torch.sum(x) | |||
return dict(logits=x, loss=loss) | |||
def test_func(dist=False): | |||
dummy_model = DummyModel() | |||
dataset = dummy_dataset.to_torch_dataset() | |||
dummy_loader = DataLoader( | |||
dataset, | |||
batch_size=2, | |||
) | |||
metric_class = SequenceClassificationMetric() | |||
if dist: | |||
init_dist(launcher='pytorch') | |||
rank, world_size = get_dist_info() | |||
device = torch.device(f'cuda:{rank}') | |||
dummy_model.cuda() | |||
if world_size > 1: | |||
from torch.nn.parallel.distributed import DistributedDataParallel | |||
dummy_model = DistributedDataParallel( | |||
dummy_model, device_ids=[torch.cuda.current_device()]) | |||
test_func = multi_gpu_test | |||
else: | |||
test_func = single_gpu_test | |||
metric_results = test_func( | |||
dummy_model, | |||
dummy_loader, | |||
device=device, | |||
metric_classes=[metric_class]) | |||
return metric_results | |||
@unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest') | |||
class SingleGpuTestTest(unittest.TestCase): | |||
def setUp(self): | |||
print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) | |||
self.tmp_dir = tempfile.TemporaryDirectory().name | |||
if not os.path.exists(self.tmp_dir): | |||
os.makedirs(self.tmp_dir) | |||
def tearDown(self): | |||
super().tearDown() | |||
shutil.rmtree(self.tmp_dir) | |||
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
def test_single_gpu_test(self): | |||
metric_results = test_func() | |||
self.assertIn(MetricKeys.ACCURACY, metric_results) | |||
@unittest.skipIf(not torch.cuda.is_available() | |||
or torch.cuda.device_count() <= 1, 'distributed unittest') | |||
class MultiGpuTestTest(DistributedTestCase): | |||
def setUp(self): | |||
print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) | |||
self.tmp_dir = tempfile.TemporaryDirectory().name | |||
if not os.path.exists(self.tmp_dir): | |||
os.makedirs(self.tmp_dir) | |||
def tearDown(self): | |||
super().tearDown() | |||
shutil.rmtree(self.tmp_dir) | |||
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
def test_multi_gpu_test(self): | |||
self.start( | |||
test_func, | |||
num_gpus=2, | |||
assert_callback=lambda x: self.assertIn(MetricKeys.ACCURACY, x), | |||
dist=True) | |||
if __name__ == '__main__': | |||
unittest.main() |