Browse Source

[to #43850241] fix processor and collate_fn

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9644184

    * fix distributed training and eval
master
jiangnana.jnn 3 years ago
parent
commit
76482cc3ea
22 changed files with 442 additions and 202 deletions
  1. +0
    -14
      modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py
  2. +2
    -2
      modelscope/preprocessors/__init__.py
  3. +91
    -2
      modelscope/preprocessors/common.py
  4. +8
    -0
      modelscope/preprocessors/image.py
  5. +8
    -3
      modelscope/preprocessors/nlp.py
  6. +0
    -1
      modelscope/trainers/cv/image_portrait_enhancement_trainer.py
  7. +38
    -14
      modelscope/trainers/nlp_trainer.py
  8. +89
    -69
      modelscope/trainers/trainer.py
  9. +7
    -11
      modelscope/trainers/utils/inference.py
  10. +6
    -0
      modelscope/utils/constant.py
  11. +23
    -0
      modelscope/utils/data_utils.py
  12. +0
    -62
      modelscope/utils/tensor_utils.py
  13. +1
    -1
      modelscope/utils/test_utils.py
  14. +26
    -1
      tests/preprocessors/test_common.py
  15. +1
    -1
      tests/trainers/hooks/test_evaluation_hook.py
  16. +7
    -7
      tests/trainers/hooks/test_lr_scheduler_hook.py
  17. +3
    -2
      tests/trainers/hooks/test_optimizer_hook.py
  18. +2
    -1
      tests/trainers/hooks/test_timer_hook.py
  19. +6
    -6
      tests/trainers/test_trainer.py
  20. +8
    -5
      tests/trainers/test_trainer_with_nlp.py
  21. +0
    -0
      tests/trainers/utils/__init__.py
  22. +116
    -0
      tests/trainers/utils/test_inference.py

+ 0
- 14
modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py View File

@@ -36,20 +36,8 @@ class NAFNetForImageDenoise(TorchModel):
model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
self.model = NAFNet(**self.config.model.network_g)
self.loss = PSNRLoss()

if torch.cuda.is_available():
self._device = torch.device('cuda')
else:
self._device = torch.device('cpu')

self.model = self.model.to(self._device)
self.model = self._load_pretrained(self.model, model_path)

if self.training:
self.model.train()
else:
self.model.eval()

def _load_pretrained(self,
net,
load_path,
@@ -109,8 +97,6 @@ class NAFNetForImageDenoise(TorchModel):
Returns:
Dict[str, Tensor]: results
"""
for key, value in inputs.items():
inputs[key] = inputs[key].to(self._device)
if self.training:
return self._train_forward(**inputs)
elif 'target' in inputs:


+ 2
- 2
modelscope/preprocessors/__init__.py View File

@@ -6,7 +6,7 @@ from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING:
from .base import Preprocessor
from .builder import PREPROCESSORS, build_preprocessor
from .common import Compose
from .common import Compose, ToTensor, Filter
from .asr import WavToScp
from .audio import LinearAECAndFbank
from .image import (LoadImage, load_image,
@@ -33,7 +33,7 @@ else:
_import_structure = {
'base': ['Preprocessor'],
'builder': ['PREPROCESSORS', 'build_preprocessor'],
'common': ['Compose'],
'common': ['Compose', 'ToTensor', 'Filter'],
'audio': ['LinearAECAndFbank'],
'asr': ['WavToScp'],
'video': ['ReadVideoData'],


+ 91
- 2
modelscope/preprocessors/common.py View File

@@ -2,6 +2,10 @@

import time
from collections.abc import Sequence
from typing import Mapping

import numpy as np
import torch

from .builder import PREPROCESSORS, build_preprocessor

@@ -25,12 +29,18 @@ class Compose(object):
if isinstance(transform, dict):
if self.field_name is None:
transform = build_preprocessor(transform, field_name)
self.transforms.append(transform)
else:
# if not found key in field_name, try field_name=None(default_group)
try:
transform = build_preprocessor(transform, field_name)
except KeyError:
transform = build_preprocessor(transform, None)
elif callable(transform):
self.transforms.append(transform)
pass
else:
raise TypeError('transform must be callable or a dict, but got'
f' {type(transform)}')
self.transforms.append(transform)

def __call__(self, data):
for t in self.transforms:
@@ -52,3 +62,82 @@ class Compose(object):
format_string += f'\n {t}'
format_string += '\n)'
return format_string


def to_tensor(data):
    """Convert a supported python object to a :obj:`torch.Tensor`.

    Supported input types: :class:`numpy.ndarray`, :class:`torch.Tensor`,
    non-string :class:`Sequence`, :class:`int` and :class:`float`.

    Args:
        data (torch.Tensor | numpy.ndarray | Sequence | int | float):
            Object to be converted.

    Returns:
        torch.Tensor: The converted tensor.

    Raises:
        TypeError: If ``data`` is of an unsupported type.
    """
    # Tensors pass through unchanged; arrays share memory via from_numpy.
    if isinstance(data, torch.Tensor):
        return data
    if isinstance(data, np.ndarray):
        return torch.from_numpy(data)
    # Strings are Sequences too, but cannot be turned into tensors.
    if isinstance(data, Sequence) and not isinstance(data, str):
        return torch.tensor(data)
    # Scalars become 1-element tensors: ints map to long, floats to float32.
    if isinstance(data, int):
        return torch.LongTensor([data])
    if isinstance(data, float):
        return torch.FloatTensor([data])
    raise TypeError(f'type {type(data)} cannot be converted to tensor.')


@PREPROCESSORS.register_module()
class ToTensor(object):
    """Convert target object to tensor.

    Args:
        keys (Sequence[str]): Keys of data to be converted to Tensor.
            Only valid when data is of type `Mapping`. If `keys` is None,
            the values of all keys of the incoming mapping are converted
            to tensor by default.
    """

    def __init__(self, keys=None):
        # keys is None means "convert every key of each sample".
        self.keys = keys

    def __call__(self, data):
        if isinstance(data, Mapping):
            # Resolve the key list per sample instead of caching it on the
            # instance: the previous implementation assigned
            # `self.keys = list(data.keys())` on the first call, so later
            # samples were converted according to the *first* sample's keys
            # (and a missing key would raise KeyError).
            keys = self.keys if self.keys is not None else list(data.keys())
            for key in keys:
                data[key] = to_tensor(data[key])
        else:
            data = to_tensor(data)

        return data

    def __repr__(self):
        return self.__class__.__name__ + f'(keys={self.keys})'


@PREPROCESSORS.register_module()
class Filter(object):
    """Keep only the reserved keys of a mapping.

    This is usually the last stage of the dataloader transform: data under
    the reserved keys is passed directly to the model, everything else is
    dropped.

    Args:
        reserved_keys (Sequence[str]): Keys of data to be reserved; all
            other keys are removed.
    """

    def __init__(self, reserved_keys):
        self.reserved_keys = reserved_keys

    def __call__(self, data):
        assert isinstance(data, Mapping)
        # Missing reserved keys raise KeyError, same as the explicit loop.
        return {key: data[key] for key in self.reserved_keys}

    def __repr__(self):
        return self.__class__.__name__ + f'(keys={self.reserved_keys})'

+ 8
- 0
modelscope/preprocessors/image.py View File

@@ -151,6 +151,11 @@ class ImageDenoisePreprocessor(Preprocessor):
super().__init__(*args, **kwargs)
self.model_dir: str = model_dir

from .common import Filter

# TODO: `Filter` should be moved to configuration file of each model
self._transforms = [Filter(reserved_keys=['input', 'target'])]

def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
"""process the raw input data

@@ -160,6 +165,9 @@ class ImageDenoisePreprocessor(Preprocessor):
Returns:
Dict[str, Any]: the preprocessed data
"""
for t in self._transforms:
data = t(data)

return data




+ 8
- 3
modelscope/preprocessors/nlp.py View File

@@ -4,6 +4,7 @@ import os.path as osp
import uuid
from typing import Any, Dict, Iterable, Optional, Tuple, Union

import numpy as np
from transformers import AutoTokenizer

from modelscope.metainfo import Models, Preprocessors
@@ -191,6 +192,10 @@ class NLPTokenizerPreprocessorBase(Preprocessor):
text_b,
return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None,
**self.tokenize_kwargs)
output = {
k: np.array(v) if isinstance(v, list) else v
for k, v in output.items()
}
self.labels_to_id(labels, output)
return output

@@ -240,13 +245,13 @@ class NLPTokenizerPreprocessorBase(Preprocessor):
if labels is not None:
if isinstance(labels, Iterable) and all([label_can_be_mapped(label) for label in labels]) \
and self.label2id is not None:
output[OutputKeys.LABEL] = [
output[OutputKeys.LABELS] = [
self.label2id[str(label)] for label in labels
]
elif label_can_be_mapped(labels) and self.label2id is not None:
output[OutputKeys.LABEL] = self.label2id[str(labels)]
output[OutputKeys.LABELS] = self.label2id[str(labels)]
else:
output[OutputKeys.LABEL] = labels
output[OutputKeys.LABELS] = labels


@PREPROCESSORS.register_module(


+ 0
- 1
modelscope/trainers/cv/image_portrait_enhancement_trainer.py View File

@@ -40,7 +40,6 @@ class ImagePortraitEnhancementTrainer(EpochBasedTrainer):

train_outputs = dict()
self._mode = ModeKeys.TRAIN
inputs = self.collate_fn(inputs)
# call model forward but not __call__ to skip postprocess
if isinstance(inputs, Mapping):
d_loss = model._train_forward_d(**inputs)


+ 38
- 14
modelscope/trainers/nlp_trainer.py View File

@@ -110,9 +110,11 @@ class NlpEpochBasedTrainer(EpochBasedTrainer):
self.train_keys = build_dataset_keys(
self.cfg.dataset.train if hasattr(self.cfg, 'dataset')
and hasattr(self.cfg.dataset, 'train') else None)
# TODO eval may have special keys, which is now not supported.
# because there is only one preprocessor in the trainer, and it only supports one group of keys.
self.eval_keys = self.train_keys
self.eval_keys = build_dataset_keys(
self.cfg.dataset.val if hasattr(self.cfg, 'dataset')
and hasattr(self.cfg.dataset, 'val') else None)
if len(self.eval_keys) == 0:
self.eval_keys = self.train_keys

super().__init__(
model=model_dir,
@@ -148,7 +150,7 @@ class NlpEpochBasedTrainer(EpochBasedTrainer):
elif isinstance(model, nn.Module):
return model

def build_preprocessor(self) -> Preprocessor:
def build_preprocessor(self) -> Tuple[Preprocessor, Preprocessor]:
"""Build the preprocessor.

User can override this method to implement custom logits.
@@ -159,16 +161,38 @@ class NlpEpochBasedTrainer(EpochBasedTrainer):
model_args = {} if self.label2id is None else {
'label2id': self.label2id
}
cfg = ConfigDict({
**getattr(self.cfg, 'preprocessor'),
'model_dir':
self.model_dir,
**model_args,
'mode':
ModeKeys.TRAIN,
**self.train_keys,
})
return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task))

field_name = Tasks.find_field_by_task(self.cfg.task)
train_preprocessor, eval_preprocessor = None, None
_train_cfg, _eval_cfg = {}, {}

if 'type' not in self.cfg.preprocessor and (
'train' in self.cfg.preprocessor
or 'val' in self.cfg.preprocessor):
if 'train' in self.cfg.preprocessor:
_train_cfg = self.cfg.preprocessor.train
if 'val' in self.cfg.preprocessor:
_eval_cfg = self.cfg.preprocessor.val
else:
_train_cfg = self.cfg.preprocessor
_eval_cfg = self.cfg.preprocessor

if len(_train_cfg):
_train_cfg.update({
'model_dir': self.model_dir,
**model_args,
**self.train_keys, 'mode': ModeKeys.TRAIN
})
train_preprocessor = build_preprocessor(_train_cfg, field_name)
if len(_eval_cfg):
_eval_cfg.update({
'model_dir': self.model_dir,
**model_args,
**self.eval_keys, 'mode': ModeKeys.EVAL
})
eval_preprocessor = build_preprocessor(_eval_cfg, field_name)

return train_preprocessor, eval_preprocessor


@TRAINERS.register_module(module_name=Trainers.nlp_veco_trainer)


+ 89
- 69
modelscope/trainers/trainer.py View File

@@ -5,15 +5,15 @@ import time
from collections.abc import Mapping
from distutils.version import LooseVersion
from functools import partial
from typing import Callable, List, Optional, Tuple, Union
from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union

import json
import numpy as np
import torch
from addict import Dict
from torch import distributed as dist
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.dataloader import default_collate
from torch.utils.data.distributed import DistributedSampler

from modelscope.hub.snapshot_download import snapshot_download
@@ -21,8 +21,9 @@ from modelscope.metainfo import Trainers
from modelscope.metrics import build_metric, task_default_metrics
from modelscope.models.base import Model, TorchModel
from modelscope.msdatasets.ms_dataset import MsDataset
from modelscope.preprocessors import build_preprocessor
from modelscope.preprocessors.base import Preprocessor
from modelscope.preprocessors.builder import build_preprocessor
from modelscope.preprocessors.common import Compose
from modelscope.task_datasets.builder import build_task_dataset
from modelscope.task_datasets.torch_base_dataset import TorchTaskDataset
from modelscope.trainers.hooks.builder import HOOKS
@@ -30,14 +31,15 @@ from modelscope.trainers.hooks.priority import Priority, get_priority
from modelscope.trainers.lrscheduler.builder import build_lr_scheduler
from modelscope.trainers.optimizer.builder import build_optimizer
from modelscope.utils.config import Config, ConfigDict
from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Hubs, ModeKeys,
ModelFile, Tasks, TrainerStages)
from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigFields,
ConfigKeys, Hubs, ModeKeys, ModelFile,
Tasks, TrainerStages)
from modelscope.utils.data_utils import to_device
from modelscope.utils.file_utils import func_receive_dict_inputs
from modelscope.utils.logger import get_logger
from modelscope.utils.registry import build_from_cfg
from modelscope.utils.tensor_utils import torch_default_data_collator
from modelscope.utils.torch_utils import (broadcast, create_device,
get_dist_info, init_dist)
from modelscope.utils.torch_utils import (create_device, get_dist_info,
init_dist)
from .base import BaseTrainer
from .builder import TRAINERS
from .default_config import DEFAULT_CONFIG
@@ -83,7 +85,8 @@ class EpochBasedTrainer(BaseTrainer):
data_collator: Optional[Callable] = None,
train_dataset: Optional[Union[MsDataset, Dataset]] = None,
eval_dataset: Optional[Union[MsDataset, Dataset]] = None,
preprocessor: Optional[Preprocessor] = None,
preprocessor: Optional[Union[Preprocessor,
Dict[str, Preprocessor]]] = None,
optimizers: Tuple[torch.optim.Optimizer,
torch.optim.lr_scheduler._LRScheduler] = (None,
None),
@@ -120,24 +123,46 @@ class EpochBasedTrainer(BaseTrainer):
else:
self.work_dir = self.cfg.train.get('work_dir', './work_dir')

self.preprocessor = None
self.train_preprocessor, self.eval_preprocessor = None, None
if isinstance(preprocessor, Preprocessor):
self.preprocessor = preprocessor
elif hasattr(self.cfg, 'preprocessor'):
self.preprocessor = self.build_preprocessor()
if self.preprocessor is not None:
self.preprocessor.mode = ModeKeys.TRAIN
self.train_preprocessor = preprocessor
self.eval_preprocessor = preprocessor
elif isinstance(preprocessor, Mapping):
if not (ConfigKeys.train in preprocessor
or ConfigKeys.val in preprocessor):
raise ValueError(
f'Preprocessor must split with `{ConfigKeys.train}` and `{ConfigKeys.val}` keys!'
)
if ConfigKeys.train in preprocessor:
assert isinstance(preprocessor[ConfigKeys.train], Preprocessor)
self.train_preprocessor = preprocessor[ConfigKeys.train]
if ConfigKeys.val in preprocessor:
assert isinstance(preprocessor[ConfigKeys.val], Preprocessor)
self.eval_preprocessor = preprocessor[ConfigKeys.val]
elif hasattr(self.cfg, ConfigFields.preprocessor):
self.train_preprocessor, self.eval_preprocessor = self.build_preprocessor(
)

if self.train_preprocessor is not None:
self.train_preprocessor.mode = ModeKeys.TRAIN
if self.eval_preprocessor is not None:
self.eval_preprocessor.mode = ModeKeys.EVAL

device_name = kwargs.get('device', 'gpu')
assert device_name in ['gpu',
'cpu'], 'device should be either cpu or gpu.'
self.device = create_device(device_name == 'cpu')

self.train_dataset = self.to_task_dataset(
train_dataset, mode=ModeKeys.TRAIN, preprocessor=self.preprocessor)
train_dataset,
mode=ModeKeys.TRAIN,
preprocessor=self.train_preprocessor)
self.eval_dataset = self.to_task_dataset(
eval_dataset, mode=ModeKeys.EVAL, preprocessor=self.preprocessor)
eval_dataset,
mode=ModeKeys.EVAL,
preprocessor=self.eval_preprocessor)

self.data_collator = data_collator if data_collator is not None else torch_default_data_collator
self.data_collator = data_collator if data_collator is not None else default_collate
self.metrics = self.get_metrics()
self._metric_values = None
self.optimizers = optimizers
@@ -229,12 +254,12 @@ class EpochBasedTrainer(BaseTrainer):
return datasets
elif isinstance(datasets, MsDataset):
datasets = datasets.to_torch_dataset(
preprocessors=self.preprocessor)
preprocessors=preprocessor)
return datasets
elif isinstance(datasets, List) and isinstance(
datasets[0], MsDataset):
datasets = [
d.to_torch_dataset(preprocessor=self.preprocessor)
d.to_torch_dataset(preprocessor=preprocessor)
for d in datasets
]
cfg = ConfigDict(
@@ -258,24 +283,44 @@ class EpochBasedTrainer(BaseTrainer):
else:
return datasets

def build_preprocessor(self) -> Preprocessor:
"""Build the preprocessor.
def build_preprocessor(self) -> Tuple[Preprocessor, Preprocessor]:
"""Build train and eval preprocessor.

User can override this method to implement custom logits.

Returns: The preprocessor instance.
Returns: The train preprocessor and eval preprocessor instance.

"""
# TODO @wenmeng.zwm @jiangnana.jnn add support for different preprocessor
# when they are different ones in training and evaluation
cfg = ConfigDict({
**getattr(self.cfg, 'preprocessor'),
'model_dir':
self.model_dir,
'mode':
ModeKeys.TRAIN,
})
return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task))
field_name = Tasks.find_field_by_task(self.cfg.task)
train_preprocessor, eval_preprocessor = None, None
_train_cfg, _eval_cfg = {}, {}
_dafault_args = {'model_dir': self.model_dir}

if 'type' not in self.cfg.preprocessor and (
'train' in self.cfg.preprocessor
or 'val' in self.cfg.preprocessor):
if 'train' in self.cfg.preprocessor:
_train_cfg = self.cfg.preprocessor.train
if 'val' in self.cfg.preprocessor:
_eval_cfg = self.cfg.preprocessor.val
else:
_train_cfg = self.cfg.preprocessor
_eval_cfg = self.cfg.preprocessor

if len(_train_cfg):
if isinstance(_train_cfg, Sequence):
# TODO: for Sequence, need adapt to `mode` and `mode_dir` args,
# and add mode for Compose or other plans
raise NotImplementedError('Not supported yet!')
_train_cfg.update(_dafault_args)
train_preprocessor = build_preprocessor(_train_cfg, field_name)
if len(_eval_cfg):
if isinstance(_eval_cfg, Sequence):
raise NotImplementedError('Not supported yet!')
_eval_cfg.update(_dafault_args)
eval_preprocessor = build_preprocessor(_eval_cfg, field_name)

return train_preprocessor, eval_preprocessor

def get_metrics(self) -> List[str]:
"""Get the metric class types.
@@ -373,34 +418,6 @@ class EpochBasedTrainer(BaseTrainer):

return build_parallel(dp_cfg)

def collate_fn(self, data):
    """Prepare the input just before the forward function.
    This method will move the tensors to the right device.
    Usually this method does not need to be overridden.

    Args:
        data: The data out of the dataloader. May be a Mapping, a
            tuple/list, a numpy array, a tensor, or a plain scalar/str.

    Returns: The processed data, with every tensor moved to `self.device`
        and container types preserved.

    Raises:
        ValueError: If `data` (or a nested element) has an unsupported type.
    """
    from torch.utils.data.dataloader import default_collate
    if isinstance(data, dict) or isinstance(data, Mapping):
        # Recurse per value; type(data)(...) preserves dict subclasses.
        return type(data)({k: self.collate_fn(v) for k, v in data.items()})
    elif isinstance(data, (tuple, list)):
        # NOTE(review): assumes a non-empty sequence — data[0] raises
        # IndexError on []; confirm the dataloader never yields empty lists.
        if isinstance(data[0], (int, float)):
            # A list of python scalars: let default_collate stack them into
            # one tensor, then move that tensor to the target device.
            return default_collate(data).to(self.device)
        else:
            return type(data)(self.collate_fn(v) for v in data)
    elif isinstance(data, np.ndarray):
        # Convert to tensor first, then fall through to the tensor branch.
        return self.collate_fn(torch.from_numpy(data))
    elif isinstance(data, torch.Tensor):
        return data.to(self.device)
    elif isinstance(data, (str, int, float, bool)):
        # Non-tensor leaves are passed through unchanged.
        return data
    else:
        raise ValueError(f'Unsupported data type {type(data)}')

def train_step(self, model, inputs):
""" Perform a training step on a batch of inputs.

@@ -421,7 +438,6 @@ class EpochBasedTrainer(BaseTrainer):
# TODO: find more pretty way to change mode
model.train()
self._mode = ModeKeys.TRAIN
inputs = self.collate_fn(inputs)
# call model forward but not __call__ to skip postprocess
if isinstance(inputs,
Mapping) and not func_receive_dict_inputs(model.forward):
@@ -486,7 +502,9 @@ class EpochBasedTrainer(BaseTrainer):
if self.train_dataset is None:
train_data = self.cfg.dataset.train
self.train_dataset = self.build_dataset(
train_data, mode=ModeKeys.TRAIN)
train_data,
mode=ModeKeys.TRAIN,
preprocessor=self.train_preprocessor)

data_loader = self._build_dataloader_with_dataset(
self.train_dataset,
@@ -505,7 +523,9 @@ class EpochBasedTrainer(BaseTrainer):
if self.eval_dataset is None:
val_data = self.cfg.dataset.val
self.eval_dataset = self.build_dataset(
val_data, mode=ModeKeys.EVAL)
val_data,
mode=ModeKeys.EVAL,
preprocessor=self.eval_preprocessor)

batch_size = self.cfg.evaluation.batch_size
workers = self.cfg.evaluation.workers
@@ -521,7 +541,7 @@ class EpochBasedTrainer(BaseTrainer):
)
return data_loader

def build_dataset(self, data_cfg, mode):
def build_dataset(self, data_cfg, mode, preprocessor=None):
""" Build torch dataset object using data config
"""
dataset = MsDataset.load(
@@ -531,8 +551,7 @@ class EpochBasedTrainer(BaseTrainer):
data_cfg, 'subset_name') else None,
hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope,
)
torch_dataset = dataset.to_torch_dataset(
preprocessors=self.preprocessor, )
torch_dataset = dataset.to_torch_dataset(preprocessors=preprocessor)
dataset = self.to_task_dataset(torch_dataset, mode)
return dataset

@@ -698,6 +717,7 @@ class EpochBasedTrainer(BaseTrainer):
self.invoke_hook(TrainerStages.before_train_epoch)
time.sleep(2) # Prevent possible deadlock during epoch transition
for i, data_batch in enumerate(data_loader):
data_batch = to_device(data_batch, self.device)
self.data_batch = data_batch
self._inner_iter = i
self.invoke_hook(TrainerStages.before_train_iter)
@@ -721,16 +741,16 @@ class EpochBasedTrainer(BaseTrainer):
metric_values = multi_gpu_test(
self.model,
data_loader,
device=self.device,
tmpdir=None,
gpu_collect=False,
data_collate_fn=self.collate_fn,
metric_classes=metric_classes)
else:
from modelscope.trainers.utils.inference import single_gpu_test
metric_values = single_gpu_test(
self.model,
data_loader,
data_collate_fn=self.collate_fn,
device=self.device,
metric_classes=metric_classes)

return metric_values


+ 7
- 11
modelscope/trainers/utils/inference.py View File

@@ -10,21 +10,19 @@ import torch
from torch import distributed as dist
from tqdm import tqdm

from modelscope.utils.data_utils import to_device
from modelscope.utils.file_utils import func_receive_dict_inputs
from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master,
make_tmp_dir)


def single_gpu_test(model,
data_loader,
data_collate_fn=None,
metric_classes=None):
def single_gpu_test(model, data_loader, device, metric_classes=None):
"""Test model with a single gpu.

Args:
model (nn.Module): Model to be tested.
data_loader (nn.Dataloader): Pytorch data loader.
data_collate_fn: An optional data_collate_fn before fed into the model
device: (str | torch.device): The target device for the data.
metric_classes(List): List of Metric class that uses to collect metrics

Returns:
@@ -34,8 +32,7 @@ def single_gpu_test(model,
dataset = data_loader.dataset
with tqdm(total=len(dataset), desc='test samples') as pbar:
for data in data_loader:
if data_collate_fn is not None:
data = data_collate_fn(data)
data = to_device(data, device)
with torch.no_grad():
if isinstance(data, Mapping) and not func_receive_dict_inputs(
model.forward):
@@ -62,9 +59,9 @@ def single_gpu_test(model,

def multi_gpu_test(model,
data_loader,
device,
tmpdir=None,
gpu_collect=False,
data_collate_fn=None,
metric_classes=None):
"""Test model with multiple gpus.

@@ -77,10 +74,10 @@ def multi_gpu_test(model,
Args:
model (nn.Module): Model to be tested.
data_loader (nn.Dataloader): Pytorch data loader.
device: (str | torch.device): The target device for the data.
tmpdir (str): Path of directory to save the temporary results from
different gpus under cpu mode.
gpu_collect (bool): Option to use either gpu or cpu to collect results.
data_collate_fn: An optional data_collate_fn before fed into the model
metric_classes(List): List of Metric class that uses to collect metrics

Returns:
@@ -98,8 +95,7 @@ def multi_gpu_test(model,
count = 0
with tqdm(total=len(dataset), desc='test samples with multi gpus') as pbar:
for _, data in enumerate(data_loader):
if data_collate_fn is not None:
data = data_collate_fn(data)
data = to_device(data, device)
data_list.append(data)
with torch.no_grad():
if isinstance(data, Mapping) and not func_receive_dict_inputs(


+ 6
- 0
modelscope/utils/constant.py View File

@@ -219,6 +219,12 @@ class ConfigFields(object):
evaluation = 'evaluation'


class ConfigKeys(object):
    """Fixed keyword names used in the configuration file."""

    # Split names for train/eval preprocessor and dataset sections.
    train = 'train'
    val = 'val'


class Requirements(object):
"""Requirement names for each module
"""


+ 23
- 0
modelscope/utils/data_utils.py View File

@@ -0,0 +1,23 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from collections.abc import Mapping

import torch


def to_device(batch, device, non_blocking=False):
    """Put the data to the target device just before the forward function.

    Mappings, tuples and lists are traversed recursively (preserving their
    concrete type); tensors are moved; any other object is returned
    unchanged.

    Args:
        batch: The batch data out of the dataloader.
        device: (str | torch.device): The target device for the data.
        non_blocking (bool): Passed through to ``Tensor.to``; allows
            asynchronous host-to-device copies when the source tensor is in
            pinned memory.

    Returns: The same structure with every tensor moved to the target
        device.

    """
    # dict is itself a Mapping, so a single isinstance check suffices.
    if isinstance(batch, Mapping):
        return type(batch)({
            k: to_device(v, device, non_blocking)
            for k, v in batch.items()
        })
    elif isinstance(batch, (tuple, list)):
        # Fix: propagate non_blocking through the recursion (it was
        # previously dropped for every nested element).
        return type(batch)(to_device(v, device, non_blocking) for v in batch)
    elif isinstance(batch, torch.Tensor):
        return batch.to(device, non_blocking=non_blocking)
    else:
        return batch

+ 0
- 62
modelscope/utils/tensor_utils.py View File

@@ -24,65 +24,3 @@ def torch_nested_detach(tensors):
if isinstance(tensors, torch.Tensor):
return tensors.detach()
return tensors


def torch_default_data_collator(features):
    """Collate a list of dataset samples into one batch.

    Mapping samples produce a dict batch whose ``labels`` entry is built
    from the per-sample ``label`` or ``label_ids`` field; tuple samples
    produce a list of stacked columns; tensor/scalar samples are stacked
    or wrapped directly.

    Args:
        features: Non-empty list of samples; each sample may be a Mapping,
            a tuple, a tensor or a scalar. All samples are assumed to share
            the structure of the first one.

    Returns:
        dict, list or torch.Tensor depending on the sample type.
    """
    # TODO @jiangnana.jnn refine this default data collator
    import torch
    # The first sample decides which keys/columns exist and their dtypes.
    first = features[0]

    if isinstance(first, Mapping):
        batch = {}
        # Special handling for labels.
        # Ensure that tensor is created with the correct type
        # (it should be automatically the case, but let's make sure of it.)
        if 'label' in first and first['label'] is not None:
            label = first['label'].item() if isinstance(
                first['label'], torch.Tensor) else first['label']
            # the msdataset return a 0-dimension np.array with a single value, the following part handle this.
            if isinstance(label, np.ndarray):
                # label[()] extracts the scalar from a 0-d array.
                src_dtype = label[()].dtype
                dtype = torch.long if label[(
                )].dtype == np.int64 else torch.float
            else:
                src_dtype = type(label)
                dtype = torch.long if isinstance(label, int) else torch.float
            # add dtype to np.array to fix "TypeError: can't convert np.ndarray of type numpy.object_"
            batch['labels'] = torch.tensor(
                np.array([f['label'] for f in features], dtype=src_dtype),
                dtype=dtype)
        elif 'label_ids' in first and first['label_ids'] is not None:
            if isinstance(first['label_ids'], torch.Tensor):
                batch['labels'] = torch.stack(
                    [f['label_ids'] for f in features])
            else:
                # Integer ids -> long, anything else -> float.
                dtype = torch.long if type(
                    first['label_ids'][0]) is int else torch.float
                batch['labels'] = torch.tensor(
                    [f['label_ids'] for f in features], dtype=dtype)

        # Handling of all other possible keys.
        # Again, we will use the first element to figure out which key/values are not None for this model.
        for k, v in first.items():
            if k not in ('label', 'label_ids'
                         ) and v is not None and not isinstance(v, str):
                if isinstance(v, torch.Tensor):
                    batch[k] = torch.stack([f[k] for f in features])
                elif isinstance(v, list) and isinstance(v[0], torch.Tensor):
                    # A list of tensors per sample is flattened into one
                    # stacked tensor across all samples.
                    batch[k] = torch.stack([d for f in features for d in f[k]])
                else:
                    batch[k] = torch.tensor(np.array([f[k] for f in features]))
    elif isinstance(first, tuple):
        # Tuple samples: collate column by column into a list.
        batch = []
        for idx in range(len(first)):
            if isinstance(first[idx], torch.Tensor):
                batch.append(torch.stack([f[idx] for f in features]))
            else:
                batch.append(torch.tensor([f[idx] for f in features]))
    else:
        # Plain tensors are stacked; plain scalars become one tensor.
        if isinstance(first, torch.Tensor):
            batch = torch.stack(features)
        else:
            batch = torch.tensor(features)

    return batch

+ 1
- 1
modelscope/utils/test_utils.py View File

@@ -50,7 +50,7 @@ def set_test_level(level: int):

def create_dummy_test_dataset(feat, label, num):
return MsDataset.from_hf_dataset(
Dataset.from_dict(dict(feat=[feat] * num, label=[label] * num)))
Dataset.from_dict(dict(feat=[feat] * num, labels=[label] * num)))


def download_and_untar(fpath, furl, dst) -> str:


+ 26
- 1
tests/preprocessors/test_common.py View File

@@ -2,7 +2,10 @@

import unittest

from modelscope.preprocessors import PREPROCESSORS, Compose, Preprocessor
import torch

from modelscope.preprocessors import (PREPROCESSORS, Compose, Filter,
Preprocessor, ToTensor)


class ComposeTest(unittest.TestCase):
@@ -35,5 +38,27 @@ class ComposeTest(unittest.TestCase):
self.assertEqual(output['tmp2'], 'tmp2')


class ToTensorTest(unittest.TestCase):

    def test_totensor(self):
        # Only 'img' is listed, so 'label' and 'path' must pass through
        # untouched while 'img' becomes a tensor.
        op = ToTensor(keys=['img'])
        result = op({'img': [1, 2, 3], 'label': 1, 'path': 'test.jpg'})
        self.assertIsInstance(result['img'], torch.Tensor)
        self.assertEqual(result['label'], 1)
        self.assertEqual(result['path'], 'test.jpg')


class FilterTest(unittest.TestCase):

    def test_filter(self):
        # Only the reserved keys survive the transform.
        op = Filter(reserved_keys=['img', 'label'])
        result = op({'img': [1, 2, 3], 'label': 1, 'path': 'test.jpg'})
        self.assertIn('img', result)
        self.assertIn('label', result)
        self.assertNotIn('path', result)


if __name__ == '__main__':
unittest.main()

+ 1
- 1
tests/trainers/hooks/test_evaluation_hook.py View File

@@ -12,7 +12,7 @@ from torch import nn
from modelscope.metainfo import Trainers
from modelscope.metrics.builder import METRICS, MetricKeys
from modelscope.trainers import build_trainer
from modelscope.utils.constant import LogKeys, ModelFile
from modelscope.utils.constant import ModelFile
from modelscope.utils.registry import default_group
from modelscope.utils.test_utils import create_dummy_test_dataset



+ 7
- 7
tests/trainers/hooks/test_lr_scheduler_hook.py View File

@@ -9,7 +9,7 @@ import numpy as np
import torch
from torch import nn
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiStepLR, ReduceLROnPlateau
from torch.optim.lr_scheduler import MultiStepLR

from modelscope.metainfo import Trainers
from modelscope.metrics.builder import METRICS, MetricKeys
@@ -96,7 +96,8 @@ class LrSchedulerHookTest(unittest.TestCase):
model=model,
train_dataset=dummy_dataset,
optimizers=(optimizer, lr_scheduler),
max_epochs=5)
max_epochs=5,
device='cpu')

trainer = build_trainer(trainer_name, kwargs)
train_dataloader = trainer._build_dataloader_with_dataset(
@@ -160,15 +161,13 @@ class LrSchedulerHookTest(unittest.TestCase):
json.dump(json_cfg, f)

model = DummyModel()
# optimmizer = SGD(model.parameters(), lr=0.01)
# lr_scheduler = MultiStepLR(optimmizer, milestones=[2, 4])
trainer_name = Trainers.default
kwargs = dict(
cfg_file=config_path,
model=model,
train_dataset=dummy_dataset,
# optimizers=(optimmizer, lr_scheduler),
max_epochs=7)
max_epochs=7,
device='cpu')

trainer = build_trainer(trainer_name, kwargs)
train_dataloader = trainer._build_dataloader_with_dataset(
@@ -266,7 +265,8 @@ class PlateauLrSchedulerHookTest(unittest.TestCase):
train_dataset=dummy_dataset,
eval_dataset=dummy_dataset,
optimizers=(optimizer, None),
max_epochs=5)
max_epochs=5,
device='cpu')

trainer = build_trainer(trainer_name, kwargs)
train_dataloader = trainer._build_dataloader_with_dataset(


+ 3
- 2
tests/trainers/hooks/test_optimizer_hook.py View File

@@ -17,7 +17,7 @@ from modelscope.utils.constant import ModelFile, TrainerStages
from modelscope.utils.test_utils import create_dummy_test_dataset

dummy_dataset = create_dummy_test_dataset(
np.random.random(size=(2, 2)), np.random.randint(0, 2, (1, )), 10)
np.random.random(size=(2, )), np.random.randint(0, 2, (1, )), 10)


class DummyModel(nn.Module):
@@ -71,7 +71,8 @@ class OptimizerHookTest(unittest.TestCase):
model=model,
train_dataset=dummy_dataset,
optimizers=(optimizer, lr_scheduler),
max_epochs=2)
max_epochs=2,
device='cpu')

trainer = build_trainer(trainer_name, kwargs)
train_dataloader = trainer._build_dataloader_with_dataset(


+ 2
- 1
tests/trainers/hooks/test_timer_hook.py View File

@@ -75,7 +75,8 @@ class IterTimerHookTest(unittest.TestCase):
model=model,
train_dataset=dummy_dataset,
optimizers=(optimizer, lr_scheduler),
max_epochs=5)
max_epochs=5,
device='cpu')

trainer = build_trainer(trainer_name, kwargs)
train_dataloader = trainer._build_dataloader_with_dataset(


+ 6
- 6
tests/trainers/test_trainer.py View File

@@ -3,19 +3,16 @@ import os
import shutil
import tempfile
import unittest
from abc import ABCMeta

import json
import numpy as np
import torch
from datasets import Dataset
from torch import nn
from torch.optim import SGD
from torch.optim.lr_scheduler import StepLR

from modelscope.metainfo import Metrics, Trainers
from modelscope.metrics.builder import MetricKeys
from modelscope.msdatasets import MsDataset
from modelscope.trainers import build_trainer
from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile
from modelscope.utils.test_utils import create_dummy_test_dataset, test_level
@@ -116,7 +113,8 @@ class TrainerTest(unittest.TestCase):
data_collator=None,
train_dataset=dummy_dataset_small,
eval_dataset=dummy_dataset_small,
max_epochs=3)
max_epochs=3,
device='cpu')

trainer = build_trainer(trainer_name, kwargs)
trainer.train()
@@ -175,7 +173,8 @@ class TrainerTest(unittest.TestCase):
train_dataset=dummy_dataset_small,
eval_dataset=dummy_dataset_small,
optimizers=(optimmizer, lr_scheduler),
max_epochs=3)
max_epochs=3,
device='cpu')

trainer = build_trainer(trainer_name, kwargs)
trainer.train()
@@ -225,7 +224,8 @@ class TrainerTest(unittest.TestCase):
train_dataset=dummy_dataset_big,
eval_dataset=dummy_dataset_small,
optimizers=(optimmizer, lr_scheduler),
max_epochs=3)
max_epochs=3,
device='cpu')

trainer = build_trainer(trainer_name, kwargs)
trainer.train()


+ 8
- 5
tests/trainers/test_trainer_with_nlp.py View File

@@ -37,7 +37,8 @@ class TestTrainerWithNlp(unittest.TestCase):
model=model_id,
train_dataset=self.dataset,
eval_dataset=self.dataset,
work_dir=self.tmp_dir)
work_dir=self.tmp_dir,
model_revision='beta')

trainer = build_trainer(default_args=kwargs)
trainer.train()
@@ -53,7 +54,8 @@ class TestTrainerWithNlp(unittest.TestCase):
model=model_id,
train_dataset=self.dataset,
eval_dataset=self.dataset,
work_dir=self.tmp_dir)
work_dir=self.tmp_dir,
model_revision='beta')

trainer = build_trainer(default_args=kwargs)
trainer.train()
@@ -69,7 +71,7 @@ class TestTrainerWithNlp(unittest.TestCase):
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_trainer_with_user_defined_config(self):
model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base'
cfg = read_config(model_id)
cfg = read_config(model_id, revision='beta')
cfg.train.max_epochs = 20
cfg.train.work_dir = self.tmp_dir
cfg_file = os.path.join(self.tmp_dir, 'config.json')
@@ -78,7 +80,8 @@ class TestTrainerWithNlp(unittest.TestCase):
model=model_id,
train_dataset=self.dataset,
eval_dataset=self.dataset,
cfg_file=cfg_file)
cfg_file=cfg_file,
model_revision='beta')

trainer = build_trainer(default_args=kwargs)
trainer.train()
@@ -98,7 +101,7 @@ class TestTrainerWithNlp(unittest.TestCase):
os.makedirs(tmp_dir)

model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
cache_path = snapshot_download(model_id)
cache_path = snapshot_download(model_id, revision='beta')
model = SbertForSequenceClassification.from_pretrained(cache_path)
kwargs = dict(
cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),


+ 0
- 0
tests/trainers/utils/__init__.py View File


+ 116
- 0
tests/trainers/utils/test_inference.py View File

@@ -0,0 +1,116 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import shutil
import tempfile
import unittest

import torch
from torch import nn
from torch.utils.data import DataLoader

from modelscope.metrics.builder import MetricKeys
from modelscope.metrics.sequence_classification_metric import \
SequenceClassificationMetric
from modelscope.trainers.utils.inference import multi_gpu_test, single_gpu_test
from modelscope.utils.test_utils import (DistributedTestCase,
create_dummy_test_dataset, test_level)
from modelscope.utils.torch_utils import get_dist_info, init_dist

# Shared fixture: 20 samples, each a 5-dim float feature with one random
# integer label in [0, 4) — matches DummyModel's Linear(5, 4) head.
dummy_dataset = create_dummy_test_dataset(
    torch.rand((5, )), torch.randint(0, 4, (1, )), 20)


class DummyModel(nn.Module):
    """Tiny classifier used by the inference tests.

    A 5->4 linear projection followed by 1-d batch norm. ``forward``
    returns a dict with ``logits`` and a scalar ``loss`` (sum of the
    logits), mimicking the output contract the test runners expect.
    """

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(5, 4)
        self.bn = nn.BatchNorm1d(4)

    def forward(self, feat, labels):
        # ``labels`` is accepted only for signature compatibility with the
        # data pipeline; the dummy loss does not depend on it.
        logits = self.bn(self.linear(feat))
        return dict(logits=logits, loss=logits.sum())


def test_func(dist=False):
    """Evaluate DummyModel on the dummy dataset and return metric results.

    With ``dist=True`` the process group is initialized and, when more
    than one rank is present, the model is wrapped in DDP and evaluated
    via ``multi_gpu_test``; otherwise ``single_gpu_test`` is used.
    Requires CUDA in both paths.
    """
    model = DummyModel()
    loader = DataLoader(dummy_dataset.to_torch_dataset(), batch_size=2)
    metric = SequenceClassificationMetric()

    if dist:
        init_dist(launcher='pytorch')

    rank, world_size = get_dist_info()
    device = torch.device(f'cuda:{rank}')
    model.cuda()

    if world_size <= 1:
        runner = single_gpu_test
    else:
        from torch.nn.parallel.distributed import DistributedDataParallel
        model = DistributedDataParallel(
            model, device_ids=[torch.cuda.current_device()])
        runner = multi_gpu_test

    return runner(
        model, loader, device=device, metric_classes=[metric])


@unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest')
class SingleGpuTestTest(unittest.TestCase):
    """Exercises ``single_gpu_test`` on a single CUDA device."""

    def setUp(self):
        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
        # Fix: tempfile.TemporaryDirectory().name drops the manager object
        # immediately, so its finalizer deletes the directory it just
        # created; the old code then re-created it by hand. mkdtemp()
        # creates a directory that persists until tearDown removes it.
        self.tmp_dir = tempfile.mkdtemp()

    def tearDown(self):
        super().tearDown()
        shutil.rmtree(self.tmp_dir)

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_single_gpu_test(self):
        metric_results = test_func()
        self.assertIn(MetricKeys.ACCURACY, metric_results)


@unittest.skipIf(not torch.cuda.is_available()
                 or torch.cuda.device_count() <= 1, 'distributed unittest')
class MultiGpuTestTest(DistributedTestCase):
    """Exercises ``multi_gpu_test`` by launching ``test_func`` on 2 GPUs."""

    def setUp(self):
        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
        # Fix: tempfile.TemporaryDirectory().name lets the manager object
        # be finalized at once, deleting the fresh directory; mkdtemp()
        # yields a directory that lives until tearDown cleans it up.
        self.tmp_dir = tempfile.mkdtemp()

    def tearDown(self):
        super().tearDown()
        shutil.rmtree(self.tmp_dir)

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_multi_gpu_test(self):
        self.start(
            test_func,
            num_gpus=2,
            assert_callback=lambda x: self.assertIn(MetricKeys.ACCURACY, x),
            dist=True)


# Allow running this test module directly (python test_inference.py).
if __name__ == '__main__':
    unittest.main()

Loading…
Cancel
Save