Browse Source

[to #43850241] fix processor and collate_fn

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9644184

    * fix distributed training and eval
master
jiangnana.jnn 3 years ago
parent
commit
76482cc3ea
22 changed files with 442 additions and 202 deletions
  1. +0
    -14
      modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py
  2. +2
    -2
      modelscope/preprocessors/__init__.py
  3. +91
    -2
      modelscope/preprocessors/common.py
  4. +8
    -0
      modelscope/preprocessors/image.py
  5. +8
    -3
      modelscope/preprocessors/nlp.py
  6. +0
    -1
      modelscope/trainers/cv/image_portrait_enhancement_trainer.py
  7. +38
    -14
      modelscope/trainers/nlp_trainer.py
  8. +89
    -69
      modelscope/trainers/trainer.py
  9. +7
    -11
      modelscope/trainers/utils/inference.py
  10. +6
    -0
      modelscope/utils/constant.py
  11. +23
    -0
      modelscope/utils/data_utils.py
  12. +0
    -62
      modelscope/utils/tensor_utils.py
  13. +1
    -1
      modelscope/utils/test_utils.py
  14. +26
    -1
      tests/preprocessors/test_common.py
  15. +1
    -1
      tests/trainers/hooks/test_evaluation_hook.py
  16. +7
    -7
      tests/trainers/hooks/test_lr_scheduler_hook.py
  17. +3
    -2
      tests/trainers/hooks/test_optimizer_hook.py
  18. +2
    -1
      tests/trainers/hooks/test_timer_hook.py
  19. +6
    -6
      tests/trainers/test_trainer.py
  20. +8
    -5
      tests/trainers/test_trainer_with_nlp.py
  21. +0
    -0
      tests/trainers/utils/__init__.py
  22. +116
    -0
      tests/trainers/utils/test_inference.py

+ 0
- 14
modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py View File

@@ -36,20 +36,8 @@ class NAFNetForImageDenoise(TorchModel):
model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
self.model = NAFNet(**self.config.model.network_g) self.model = NAFNet(**self.config.model.network_g)
self.loss = PSNRLoss() self.loss = PSNRLoss()

if torch.cuda.is_available():
self._device = torch.device('cuda')
else:
self._device = torch.device('cpu')

self.model = self.model.to(self._device)
self.model = self._load_pretrained(self.model, model_path) self.model = self._load_pretrained(self.model, model_path)


if self.training:
self.model.train()
else:
self.model.eval()

def _load_pretrained(self, def _load_pretrained(self,
net, net,
load_path, load_path,
@@ -109,8 +97,6 @@ class NAFNetForImageDenoise(TorchModel):
Returns: Returns:
Dict[str, Tensor]: results Dict[str, Tensor]: results
""" """
for key, value in inputs.items():
inputs[key] = inputs[key].to(self._device)
if self.training: if self.training:
return self._train_forward(**inputs) return self._train_forward(**inputs)
elif 'target' in inputs: elif 'target' in inputs:


+ 2
- 2
modelscope/preprocessors/__init__.py View File

@@ -6,7 +6,7 @@ from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING: if TYPE_CHECKING:
from .base import Preprocessor from .base import Preprocessor
from .builder import PREPROCESSORS, build_preprocessor from .builder import PREPROCESSORS, build_preprocessor
from .common import Compose
from .common import Compose, ToTensor, Filter
from .asr import WavToScp from .asr import WavToScp
from .audio import LinearAECAndFbank from .audio import LinearAECAndFbank
from .image import (LoadImage, load_image, from .image import (LoadImage, load_image,
@@ -33,7 +33,7 @@ else:
_import_structure = { _import_structure = {
'base': ['Preprocessor'], 'base': ['Preprocessor'],
'builder': ['PREPROCESSORS', 'build_preprocessor'], 'builder': ['PREPROCESSORS', 'build_preprocessor'],
'common': ['Compose'],
'common': ['Compose', 'ToTensor', 'Filter'],
'audio': ['LinearAECAndFbank'], 'audio': ['LinearAECAndFbank'],
'asr': ['WavToScp'], 'asr': ['WavToScp'],
'video': ['ReadVideoData'], 'video': ['ReadVideoData'],


+ 91
- 2
modelscope/preprocessors/common.py View File

@@ -2,6 +2,10 @@


import time import time
from collections.abc import Sequence from collections.abc import Sequence
from typing import Mapping

import numpy as np
import torch


from .builder import PREPROCESSORS, build_preprocessor from .builder import PREPROCESSORS, build_preprocessor


@@ -25,12 +29,18 @@ class Compose(object):
if isinstance(transform, dict): if isinstance(transform, dict):
if self.field_name is None: if self.field_name is None:
transform = build_preprocessor(transform, field_name) transform = build_preprocessor(transform, field_name)
self.transforms.append(transform)
else:
# if not found key in field_name, try field_name=None(default_group)
try:
transform = build_preprocessor(transform, field_name)
except KeyError:
transform = build_preprocessor(transform, None)
elif callable(transform): elif callable(transform):
self.transforms.append(transform)
pass
else: else:
raise TypeError('transform must be callable or a dict, but got' raise TypeError('transform must be callable or a dict, but got'
f' {type(transform)}') f' {type(transform)}')
self.transforms.append(transform)


def __call__(self, data): def __call__(self, data):
for t in self.transforms: for t in self.transforms:
@@ -52,3 +62,82 @@ class Compose(object):
format_string += f'\n {t}' format_string += f'\n {t}'
format_string += '\n)' format_string += '\n)'
return format_string return format_string


def to_tensor(data):
    """Convert objects of various python types to :obj:`torch.Tensor`.

    Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
    :class:`Sequence`, :class:`int` and :class:`float`.

    Args:
        data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
            be converted.

    Raises:
        TypeError: If `data` is of an unsupported type.
    """
    # Already a tensor: hand it back untouched.
    if isinstance(data, torch.Tensor):
        return data
    # Numpy arrays convert zero-copy via from_numpy.
    if isinstance(data, np.ndarray):
        return torch.from_numpy(data)
    # Non-string sequences (lists, tuples, ...) are copied into a new tensor.
    if isinstance(data, Sequence) and not isinstance(data, str):
        return torch.tensor(data)
    # Python scalars become 1-element tensors of the matching dtype.
    if isinstance(data, int):
        return torch.LongTensor([data])
    if isinstance(data, float):
        return torch.FloatTensor([data])
    raise TypeError(f'type {type(data)} cannot be converted to tensor.')


@PREPROCESSORS.register_module()
class ToTensor(object):
    """Convert target object(s) to :obj:`torch.Tensor`.

    Args:
        keys (Sequence[str]): Keys of data to be converted to Tensor.
            Only valid when data is type of `Mapping`. If `keys` is None,
            the values of all keys will be converted to tensor by default.
    """

    def __init__(self, keys=None):
        self.keys = keys

    def __call__(self, data):
        if isinstance(data, Mapping):
            # Use a local key list instead of assigning back to `self.keys`:
            # caching the first sample's keys would silently skip keys (or
            # raise KeyError) on later samples with a different key set.
            keys = self.keys if self.keys is not None else list(data.keys())
            for key in keys:
                data[key] = to_tensor(data[key])
        else:
            data = to_tensor(data)

        return data

    def __repr__(self):
        return self.__class__.__name__ + f'(keys={self.keys})'


@PREPROCESSORS.register_module()
class Filter(object):
    """This is usually the last stage of the dataloader transform.
    Only data of reserved keys will be kept and passed directly to the model,
    others will be removed.

    Args:
        reserved_keys (Sequence[str]): Keys of data to be reserved, others
            will be removed.
    """

    def __init__(self, reserved_keys):
        self.reserved_keys = reserved_keys

    def __call__(self, data):
        # Only mappings can be filtered by key.
        assert isinstance(data, Mapping)
        # Build a fresh dict containing just the reserved entries; a missing
        # reserved key raises KeyError, same as explicit indexing would.
        return {key: data[key] for key in self.reserved_keys}

    def __repr__(self):
        return self.__class__.__name__ + f'(keys={self.reserved_keys})'

+ 8
- 0
modelscope/preprocessors/image.py View File

@@ -151,6 +151,11 @@ class ImageDenoisePreprocessor(Preprocessor):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self.model_dir: str = model_dir self.model_dir: str = model_dir


from .common import Filter

# TODO: `Filter` should be moved to configuration file of each model
self._transforms = [Filter(reserved_keys=['input', 'target'])]

def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
"""process the raw input data """process the raw input data


@@ -160,6 +165,9 @@ class ImageDenoisePreprocessor(Preprocessor):
Returns: Returns:
Dict[str, Any]: the preprocessed data Dict[str, Any]: the preprocessed data
""" """
for t in self._transforms:
data = t(data)

return data return data






+ 8
- 3
modelscope/preprocessors/nlp.py View File

@@ -4,6 +4,7 @@ import os.path as osp
import uuid import uuid
from typing import Any, Dict, Iterable, Optional, Tuple, Union from typing import Any, Dict, Iterable, Optional, Tuple, Union


import numpy as np
from transformers import AutoTokenizer from transformers import AutoTokenizer


from modelscope.metainfo import Models, Preprocessors from modelscope.metainfo import Models, Preprocessors
@@ -191,6 +192,10 @@ class NLPTokenizerPreprocessorBase(Preprocessor):
text_b, text_b,
return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None,
**self.tokenize_kwargs) **self.tokenize_kwargs)
output = {
k: np.array(v) if isinstance(v, list) else v
for k, v in output.items()
}
self.labels_to_id(labels, output) self.labels_to_id(labels, output)
return output return output


@@ -240,13 +245,13 @@ class NLPTokenizerPreprocessorBase(Preprocessor):
if labels is not None: if labels is not None:
if isinstance(labels, Iterable) and all([label_can_be_mapped(label) for label in labels]) \ if isinstance(labels, Iterable) and all([label_can_be_mapped(label) for label in labels]) \
and self.label2id is not None: and self.label2id is not None:
output[OutputKeys.LABEL] = [
output[OutputKeys.LABELS] = [
self.label2id[str(label)] for label in labels self.label2id[str(label)] for label in labels
] ]
elif label_can_be_mapped(labels) and self.label2id is not None: elif label_can_be_mapped(labels) and self.label2id is not None:
output[OutputKeys.LABEL] = self.label2id[str(labels)]
output[OutputKeys.LABELS] = self.label2id[str(labels)]
else: else:
output[OutputKeys.LABEL] = labels
output[OutputKeys.LABELS] = labels




@PREPROCESSORS.register_module( @PREPROCESSORS.register_module(


+ 0
- 1
modelscope/trainers/cv/image_portrait_enhancement_trainer.py View File

@@ -40,7 +40,6 @@ class ImagePortraitEnhancementTrainer(EpochBasedTrainer):


train_outputs = dict() train_outputs = dict()
self._mode = ModeKeys.TRAIN self._mode = ModeKeys.TRAIN
inputs = self.collate_fn(inputs)
# call model forward but not __call__ to skip postprocess # call model forward but not __call__ to skip postprocess
if isinstance(inputs, Mapping): if isinstance(inputs, Mapping):
d_loss = model._train_forward_d(**inputs) d_loss = model._train_forward_d(**inputs)


+ 38
- 14
modelscope/trainers/nlp_trainer.py View File

@@ -110,9 +110,11 @@ class NlpEpochBasedTrainer(EpochBasedTrainer):
self.train_keys = build_dataset_keys( self.train_keys = build_dataset_keys(
self.cfg.dataset.train if hasattr(self.cfg, 'dataset') self.cfg.dataset.train if hasattr(self.cfg, 'dataset')
and hasattr(self.cfg.dataset, 'train') else None) and hasattr(self.cfg.dataset, 'train') else None)
# TODO eval may has special keys, which is now not supported.
# because there is only one preprocessor in the trainer, and it only supports one group of keys.
self.eval_keys = self.train_keys
self.eval_keys = build_dataset_keys(
self.cfg.dataset.val if hasattr(self.cfg, 'dataset')
and hasattr(self.cfg.dataset, 'val') else None)
if len(self.eval_keys) == 0:
self.eval_keys = self.train_keys


super().__init__( super().__init__(
model=model_dir, model=model_dir,
@@ -148,7 +150,7 @@ class NlpEpochBasedTrainer(EpochBasedTrainer):
elif isinstance(model, nn.Module): elif isinstance(model, nn.Module):
return model return model


def build_preprocessor(self) -> Preprocessor:
def build_preprocessor(self) -> Tuple[Preprocessor, Preprocessor]:
"""Build the preprocessor. """Build the preprocessor.


User can override this method to implement custom logits. User can override this method to implement custom logits.
@@ -159,16 +161,38 @@ class NlpEpochBasedTrainer(EpochBasedTrainer):
model_args = {} if self.label2id is None else { model_args = {} if self.label2id is None else {
'label2id': self.label2id 'label2id': self.label2id
} }
cfg = ConfigDict({
**getattr(self.cfg, 'preprocessor'),
'model_dir':
self.model_dir,
**model_args,
'mode':
ModeKeys.TRAIN,
**self.train_keys,
})
return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task))

field_name = Tasks.find_field_by_task(self.cfg.task)
train_preprocessor, eval_preprocessor = None, None
_train_cfg, _eval_cfg = {}, {}

if 'type' not in self.cfg.preprocessor and (
'train' in self.cfg.preprocessor
or 'val' in self.cfg.preprocessor):
if 'train' in self.cfg.preprocessor:
_train_cfg = self.cfg.preprocessor.train
if 'val' in self.cfg.preprocessor:
_eval_cfg = self.cfg.preprocessor.val
else:
_train_cfg = self.cfg.preprocessor
_eval_cfg = self.cfg.preprocessor

if len(_train_cfg):
_train_cfg.update({
'model_dir': self.model_dir,
**model_args,
**self.train_keys, 'mode': ModeKeys.TRAIN
})
train_preprocessor = build_preprocessor(_train_cfg, field_name)
if len(_eval_cfg):
_eval_cfg.update({
'model_dir': self.model_dir,
**model_args,
**self.eval_keys, 'mode': ModeKeys.EVAL
})
eval_preprocessor = build_preprocessor(_eval_cfg, field_name)

return train_preprocessor, eval_preprocessor




@TRAINERS.register_module(module_name=Trainers.nlp_veco_trainer) @TRAINERS.register_module(module_name=Trainers.nlp_veco_trainer)


+ 89
- 69
modelscope/trainers/trainer.py View File

@@ -5,15 +5,15 @@ import time
from collections.abc import Mapping from collections.abc import Mapping
from distutils.version import LooseVersion from distutils.version import LooseVersion
from functools import partial from functools import partial
from typing import Callable, List, Optional, Tuple, Union
from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union


import json import json
import numpy as np import numpy as np
import torch import torch
from addict import Dict
from torch import distributed as dist from torch import distributed as dist
from torch import nn from torch import nn
from torch.utils.data import DataLoader, Dataset from torch.utils.data import DataLoader, Dataset
from torch.utils.data.dataloader import default_collate
from torch.utils.data.distributed import DistributedSampler from torch.utils.data.distributed import DistributedSampler


from modelscope.hub.snapshot_download import snapshot_download from modelscope.hub.snapshot_download import snapshot_download
@@ -21,8 +21,9 @@ from modelscope.metainfo import Trainers
from modelscope.metrics import build_metric, task_default_metrics from modelscope.metrics import build_metric, task_default_metrics
from modelscope.models.base import Model, TorchModel from modelscope.models.base import Model, TorchModel
from modelscope.msdatasets.ms_dataset import MsDataset from modelscope.msdatasets.ms_dataset import MsDataset
from modelscope.preprocessors import build_preprocessor
from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.base import Preprocessor
from modelscope.preprocessors.builder import build_preprocessor
from modelscope.preprocessors.common import Compose
from modelscope.task_datasets.builder import build_task_dataset from modelscope.task_datasets.builder import build_task_dataset
from modelscope.task_datasets.torch_base_dataset import TorchTaskDataset from modelscope.task_datasets.torch_base_dataset import TorchTaskDataset
from modelscope.trainers.hooks.builder import HOOKS from modelscope.trainers.hooks.builder import HOOKS
@@ -30,14 +31,15 @@ from modelscope.trainers.hooks.priority import Priority, get_priority
from modelscope.trainers.lrscheduler.builder import build_lr_scheduler from modelscope.trainers.lrscheduler.builder import build_lr_scheduler
from modelscope.trainers.optimizer.builder import build_optimizer from modelscope.trainers.optimizer.builder import build_optimizer
from modelscope.utils.config import Config, ConfigDict from modelscope.utils.config import Config, ConfigDict
from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Hubs, ModeKeys,
ModelFile, Tasks, TrainerStages)
from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigFields,
ConfigKeys, Hubs, ModeKeys, ModelFile,
Tasks, TrainerStages)
from modelscope.utils.data_utils import to_device
from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.file_utils import func_receive_dict_inputs
from modelscope.utils.logger import get_logger from modelscope.utils.logger import get_logger
from modelscope.utils.registry import build_from_cfg from modelscope.utils.registry import build_from_cfg
from modelscope.utils.tensor_utils import torch_default_data_collator
from modelscope.utils.torch_utils import (broadcast, create_device,
get_dist_info, init_dist)
from modelscope.utils.torch_utils import (create_device, get_dist_info,
init_dist)
from .base import BaseTrainer from .base import BaseTrainer
from .builder import TRAINERS from .builder import TRAINERS
from .default_config import DEFAULT_CONFIG from .default_config import DEFAULT_CONFIG
@@ -83,7 +85,8 @@ class EpochBasedTrainer(BaseTrainer):
data_collator: Optional[Callable] = None, data_collator: Optional[Callable] = None,
train_dataset: Optional[Union[MsDataset, Dataset]] = None, train_dataset: Optional[Union[MsDataset, Dataset]] = None,
eval_dataset: Optional[Union[MsDataset, Dataset]] = None, eval_dataset: Optional[Union[MsDataset, Dataset]] = None,
preprocessor: Optional[Preprocessor] = None,
preprocessor: Optional[Union[Preprocessor,
Dict[str, Preprocessor]]] = None,
optimizers: Tuple[torch.optim.Optimizer, optimizers: Tuple[torch.optim.Optimizer,
torch.optim.lr_scheduler._LRScheduler] = (None, torch.optim.lr_scheduler._LRScheduler] = (None,
None), None),
@@ -120,24 +123,46 @@ class EpochBasedTrainer(BaseTrainer):
else: else:
self.work_dir = self.cfg.train.get('work_dir', './work_dir') self.work_dir = self.cfg.train.get('work_dir', './work_dir')


self.preprocessor = None
self.train_preprocessor, self.eval_preprocessor = None, None
if isinstance(preprocessor, Preprocessor): if isinstance(preprocessor, Preprocessor):
self.preprocessor = preprocessor
elif hasattr(self.cfg, 'preprocessor'):
self.preprocessor = self.build_preprocessor()
if self.preprocessor is not None:
self.preprocessor.mode = ModeKeys.TRAIN
self.train_preprocessor = preprocessor
self.eval_preprocessor = preprocessor
elif isinstance(preprocessor, Mapping):
if not (ConfigKeys.train in preprocessor
or ConfigKeys.val in preprocessor):
raise ValueError(
f'Preprocessor must split with `{ConfigKeys.train}` and `{ConfigKeys.val}` keys!'
)
if ConfigKeys.train in preprocessor:
assert isinstance(preprocessor[ConfigKeys.train], Preprocessor)
self.train_preprocessor = preprocessor[ConfigKeys.train]
if ConfigKeys.val in preprocessor:
assert isinstance(preprocessor[ConfigKeys.val], Preprocessor)
self.eval_preprocessor = preprocessor[ConfigKeys.val]
elif hasattr(self.cfg, ConfigFields.preprocessor):
self.train_preprocessor, self.eval_preprocessor = self.build_preprocessor(
)

if self.train_preprocessor is not None:
self.train_preprocessor.mode = ModeKeys.TRAIN
if self.eval_preprocessor is not None:
self.eval_preprocessor.mode = ModeKeys.EVAL

device_name = kwargs.get('device', 'gpu') device_name = kwargs.get('device', 'gpu')
assert device_name in ['gpu', assert device_name in ['gpu',
'cpu'], 'device should be either cpu or gpu.' 'cpu'], 'device should be either cpu or gpu.'
self.device = create_device(device_name == 'cpu') self.device = create_device(device_name == 'cpu')


self.train_dataset = self.to_task_dataset( self.train_dataset = self.to_task_dataset(
train_dataset, mode=ModeKeys.TRAIN, preprocessor=self.preprocessor)
train_dataset,
mode=ModeKeys.TRAIN,
preprocessor=self.train_preprocessor)
self.eval_dataset = self.to_task_dataset( self.eval_dataset = self.to_task_dataset(
eval_dataset, mode=ModeKeys.EVAL, preprocessor=self.preprocessor)
eval_dataset,
mode=ModeKeys.EVAL,
preprocessor=self.eval_preprocessor)


self.data_collator = data_collator if data_collator is not None else torch_default_data_collator
self.data_collator = data_collator if data_collator is not None else default_collate
self.metrics = self.get_metrics() self.metrics = self.get_metrics()
self._metric_values = None self._metric_values = None
self.optimizers = optimizers self.optimizers = optimizers
@@ -229,12 +254,12 @@ class EpochBasedTrainer(BaseTrainer):
return datasets return datasets
elif isinstance(datasets, MsDataset): elif isinstance(datasets, MsDataset):
datasets = datasets.to_torch_dataset( datasets = datasets.to_torch_dataset(
preprocessors=self.preprocessor)
preprocessors=preprocessor)
return datasets return datasets
elif isinstance(datasets, List) and isinstance( elif isinstance(datasets, List) and isinstance(
datasets[0], MsDataset): datasets[0], MsDataset):
datasets = [ datasets = [
d.to_torch_dataset(preprocessor=self.preprocessor)
d.to_torch_dataset(preprocessor=preprocessor)
for d in datasets for d in datasets
] ]
cfg = ConfigDict( cfg = ConfigDict(
@@ -258,24 +283,44 @@ class EpochBasedTrainer(BaseTrainer):
else: else:
return datasets return datasets


def build_preprocessor(self) -> Preprocessor:
"""Build the preprocessor.
def build_preprocessor(self) -> Tuple[Preprocessor, Preprocessor]:
"""Build train and eval preprocessor.


User can override this method to implement custom logits. User can override this method to implement custom logits.


Returns: The preprocessor instance.
Returns: The train preprocessor and eval preprocessor instance.


""" """
# TODO @wenmeng.zwm @jiangnana.jnn add support for different preprocessor
# when they are different ones in training and evaluation
cfg = ConfigDict({
**getattr(self.cfg, 'preprocessor'),
'model_dir':
self.model_dir,
'mode':
ModeKeys.TRAIN,
})
return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task))
field_name = Tasks.find_field_by_task(self.cfg.task)
train_preprocessor, eval_preprocessor = None, None
_train_cfg, _eval_cfg = {}, {}
_dafault_args = {'model_dir': self.model_dir}

if 'type' not in self.cfg.preprocessor and (
'train' in self.cfg.preprocessor
or 'val' in self.cfg.preprocessor):
if 'train' in self.cfg.preprocessor:
_train_cfg = self.cfg.preprocessor.train
if 'val' in self.cfg.preprocessor:
_eval_cfg = self.cfg.preprocessor.val
else:
_train_cfg = self.cfg.preprocessor
_eval_cfg = self.cfg.preprocessor

if len(_train_cfg):
if isinstance(_train_cfg, Sequence):
# TODO: for Sequence, need adapt to `mode` and `mode_dir` args,
# and add mode for Compose or other plans
raise NotImplementedError('Not supported yet!')
_train_cfg.update(_dafault_args)
train_preprocessor = build_preprocessor(_train_cfg, field_name)
if len(_eval_cfg):
if isinstance(_eval_cfg, Sequence):
raise NotImplementedError('Not supported yet!')
_eval_cfg.update(_dafault_args)
eval_preprocessor = build_preprocessor(_eval_cfg, field_name)

return train_preprocessor, eval_preprocessor


def get_metrics(self) -> List[str]: def get_metrics(self) -> List[str]:
"""Get the metric class types. """Get the metric class types.
@@ -373,34 +418,6 @@ class EpochBasedTrainer(BaseTrainer):


return build_parallel(dp_cfg) return build_parallel(dp_cfg)


def collate_fn(self, data):
"""Prepare the input just before the forward function.
This method will move the tensors to the right device.
Usually this method does not need to be overridden.

Args:
data: The data out of the dataloader.

Returns: The processed data.

"""
from torch.utils.data.dataloader import default_collate
if isinstance(data, dict) or isinstance(data, Mapping):
return type(data)({k: self.collate_fn(v) for k, v in data.items()})
elif isinstance(data, (tuple, list)):
if isinstance(data[0], (int, float)):
return default_collate(data).to(self.device)
else:
return type(data)(self.collate_fn(v) for v in data)
elif isinstance(data, np.ndarray):
return self.collate_fn(torch.from_numpy(data))
elif isinstance(data, torch.Tensor):
return data.to(self.device)
elif isinstance(data, (str, int, float, bool)):
return data
else:
raise ValueError(f'Unsupported data type {type(data)}')

def train_step(self, model, inputs): def train_step(self, model, inputs):
""" Perform a training step on a batch of inputs. """ Perform a training step on a batch of inputs.


@@ -421,7 +438,6 @@ class EpochBasedTrainer(BaseTrainer):
# TODO: find more pretty way to change mode # TODO: find more pretty way to change mode
model.train() model.train()
self._mode = ModeKeys.TRAIN self._mode = ModeKeys.TRAIN
inputs = self.collate_fn(inputs)
# call model forward but not __call__ to skip postprocess # call model forward but not __call__ to skip postprocess
if isinstance(inputs, if isinstance(inputs,
Mapping) and not func_receive_dict_inputs(model.forward): Mapping) and not func_receive_dict_inputs(model.forward):
@@ -486,7 +502,9 @@ class EpochBasedTrainer(BaseTrainer):
if self.train_dataset is None: if self.train_dataset is None:
train_data = self.cfg.dataset.train train_data = self.cfg.dataset.train
self.train_dataset = self.build_dataset( self.train_dataset = self.build_dataset(
train_data, mode=ModeKeys.TRAIN)
train_data,
mode=ModeKeys.TRAIN,
preprocessor=self.train_preprocessor)


data_loader = self._build_dataloader_with_dataset( data_loader = self._build_dataloader_with_dataset(
self.train_dataset, self.train_dataset,
@@ -505,7 +523,9 @@ class EpochBasedTrainer(BaseTrainer):
if self.eval_dataset is None: if self.eval_dataset is None:
val_data = self.cfg.dataset.val val_data = self.cfg.dataset.val
self.eval_dataset = self.build_dataset( self.eval_dataset = self.build_dataset(
val_data, mode=ModeKeys.EVAL)
val_data,
mode=ModeKeys.EVAL,
preprocessor=self.eval_preprocessor)


batch_size = self.cfg.evaluation.batch_size batch_size = self.cfg.evaluation.batch_size
workers = self.cfg.evaluation.workers workers = self.cfg.evaluation.workers
@@ -521,7 +541,7 @@ class EpochBasedTrainer(BaseTrainer):
) )
return data_loader return data_loader


def build_dataset(self, data_cfg, mode):
def build_dataset(self, data_cfg, mode, preprocessor=None):
""" Build torch dataset object using data config """ Build torch dataset object using data config
""" """
dataset = MsDataset.load( dataset = MsDataset.load(
@@ -531,8 +551,7 @@ class EpochBasedTrainer(BaseTrainer):
data_cfg, 'subset_name') else None, data_cfg, 'subset_name') else None,
hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope, hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope,
) )
torch_dataset = dataset.to_torch_dataset(
preprocessors=self.preprocessor, )
torch_dataset = dataset.to_torch_dataset(preprocessors=preprocessor)
dataset = self.to_task_dataset(torch_dataset, mode) dataset = self.to_task_dataset(torch_dataset, mode)
return dataset return dataset


@@ -698,6 +717,7 @@ class EpochBasedTrainer(BaseTrainer):
self.invoke_hook(TrainerStages.before_train_epoch) self.invoke_hook(TrainerStages.before_train_epoch)
time.sleep(2) # Prevent possible deadlock during epoch transition time.sleep(2) # Prevent possible deadlock during epoch transition
for i, data_batch in enumerate(data_loader): for i, data_batch in enumerate(data_loader):
data_batch = to_device(data_batch, self.device)
self.data_batch = data_batch self.data_batch = data_batch
self._inner_iter = i self._inner_iter = i
self.invoke_hook(TrainerStages.before_train_iter) self.invoke_hook(TrainerStages.before_train_iter)
@@ -721,16 +741,16 @@ class EpochBasedTrainer(BaseTrainer):
metric_values = multi_gpu_test( metric_values = multi_gpu_test(
self.model, self.model,
data_loader, data_loader,
device=self.device,
tmpdir=None, tmpdir=None,
gpu_collect=False, gpu_collect=False,
data_collate_fn=self.collate_fn,
metric_classes=metric_classes) metric_classes=metric_classes)
else: else:
from modelscope.trainers.utils.inference import single_gpu_test from modelscope.trainers.utils.inference import single_gpu_test
metric_values = single_gpu_test( metric_values = single_gpu_test(
self.model, self.model,
data_loader, data_loader,
data_collate_fn=self.collate_fn,
device=self.device,
metric_classes=metric_classes) metric_classes=metric_classes)


return metric_values return metric_values


+ 7
- 11
modelscope/trainers/utils/inference.py View File

@@ -10,21 +10,19 @@ import torch
from torch import distributed as dist from torch import distributed as dist
from tqdm import tqdm from tqdm import tqdm


from modelscope.utils.data_utils import to_device
from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.file_utils import func_receive_dict_inputs
from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master, from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master,
make_tmp_dir) make_tmp_dir)




def single_gpu_test(model,
data_loader,
data_collate_fn=None,
metric_classes=None):
def single_gpu_test(model, data_loader, device, metric_classes=None):
"""Test model with a single gpu. """Test model with a single gpu.


Args: Args:
model (nn.Module): Model to be tested. model (nn.Module): Model to be tested.
data_loader (nn.Dataloader): Pytorch data loader. data_loader (nn.Dataloader): Pytorch data loader.
data_collate_fn: An optional data_collate_fn before fed into the model
device: (str | torch.device): The target device for the data.
metric_classes(List): List of Metric class that uses to collect metrics metric_classes(List): List of Metric class that uses to collect metrics


Returns: Returns:
@@ -34,8 +32,7 @@ def single_gpu_test(model,
dataset = data_loader.dataset dataset = data_loader.dataset
with tqdm(total=len(dataset), desc='test samples') as pbar: with tqdm(total=len(dataset), desc='test samples') as pbar:
for data in data_loader: for data in data_loader:
if data_collate_fn is not None:
data = data_collate_fn(data)
data = to_device(data, device)
with torch.no_grad(): with torch.no_grad():
if isinstance(data, Mapping) and not func_receive_dict_inputs( if isinstance(data, Mapping) and not func_receive_dict_inputs(
model.forward): model.forward):
@@ -62,9 +59,9 @@ def single_gpu_test(model,


def multi_gpu_test(model, def multi_gpu_test(model,
data_loader, data_loader,
device,
tmpdir=None, tmpdir=None,
gpu_collect=False, gpu_collect=False,
data_collate_fn=None,
metric_classes=None): metric_classes=None):
"""Test model with multiple gpus. """Test model with multiple gpus.


@@ -77,10 +74,10 @@ def multi_gpu_test(model,
Args: Args:
model (nn.Module): Model to be tested. model (nn.Module): Model to be tested.
data_loader (nn.Dataloader): Pytorch data loader. data_loader (nn.Dataloader): Pytorch data loader.
device: (str | torch.device): The target device for the data.
tmpdir (str): Path of directory to save the temporary results from tmpdir (str): Path of directory to save the temporary results from
different gpus under cpu mode. different gpus under cpu mode.
gpu_collect (bool): Option to use either gpu or cpu to collect results. gpu_collect (bool): Option to use either gpu or cpu to collect results.
data_collate_fn: An optional data_collate_fn before fed into the model
metric_classes(List): List of Metric class that uses to collect metrics metric_classes(List): List of Metric class that uses to collect metrics


Returns: Returns:
@@ -98,8 +95,7 @@ def multi_gpu_test(model,
count = 0 count = 0
with tqdm(total=len(dataset), desc='test samples with multi gpus') as pbar: with tqdm(total=len(dataset), desc='test samples with multi gpus') as pbar:
for _, data in enumerate(data_loader): for _, data in enumerate(data_loader):
if data_collate_fn is not None:
data = data_collate_fn(data)
data = to_device(data, device)
data_list.append(data) data_list.append(data)
with torch.no_grad(): with torch.no_grad():
if isinstance(data, Mapping) and not func_receive_dict_inputs( if isinstance(data, Mapping) and not func_receive_dict_inputs(


+ 6
- 0
modelscope/utils/constant.py View File

@@ -219,6 +219,12 @@ class ConfigFields(object):
evaluation = 'evaluation' evaluation = 'evaluation'




class ConfigKeys(object):
"""Fixed keywords in configuration file"""
train = 'train'
val = 'val'


class Requirements(object): class Requirements(object):
"""Requirement names for each module """Requirement names for each module
""" """


+ 23
- 0
modelscope/utils/data_utils.py View File

@@ -0,0 +1,23 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from collections.abc import Mapping

import torch


def to_device(batch, device, non_blocking=False):
    """Recursively move a batch of data to the target device.

    Called just before the forward pass to transfer dataloader output
    (possibly nested dicts/lists/tuples of tensors) onto the device.

    Args:
        batch: The batch data out of the dataloader. May be a tensor, a
            Mapping, a tuple/list (nested to any depth), or any other
            object (returned unchanged).
        device (str | torch.device): The target device for the data.
        non_blocking (bool): Forwarded to ``torch.Tensor.to`` so copies
            from pinned host memory can overlap with compute.

    Returns:
        The same structure with every contained tensor moved to ``device``.
    """
    # dict is itself a collections.abc.Mapping, so one check covers both.
    if isinstance(batch, Mapping):
        # Propagate non_blocking into the recursion; previously it was
        # dropped here, so the flag never reached any nested tensor.
        return type(batch)(
            {k: to_device(v, device, non_blocking)
             for k, v in batch.items()})
    elif isinstance(batch, (tuple, list)):
        # NOTE(review): assumes the sequence type accepts an iterable
        # constructor; namedtuples would need special handling.
        return type(batch)(to_device(v, device, non_blocking) for v in batch)
    elif isinstance(batch, torch.Tensor):
        return batch.to(device, non_blocking=non_blocking)
    else:
        return batch

+ 0
- 62
modelscope/utils/tensor_utils.py View File

@@ -24,65 +24,3 @@ def torch_nested_detach(tensors):
if isinstance(tensors, torch.Tensor): if isinstance(tensors, torch.Tensor):
return tensors.detach() return tensors.detach()
return tensors return tensors


def torch_default_data_collator(features):
    """Collate a list of samples into a batch for torch models.

    Handles three sample layouts: Mapping samples (keys collated into a
    dict batch, with 'label'/'label_ids' normalized into 'labels'),
    tuple samples (collated position-wise into a list), and bare
    tensors/scalars (stacked or wrapped into a single tensor).

    Args:
        features: List of samples from a dataset; all samples are assumed
            to share the structure of ``features[0]``.

    Returns:
        A dict, list, or tensor batch mirroring the sample layout.
    """
    # TODO @jiangnana.jnn refine this default data collator
    import torch
    # The first sample decides the layout and which keys are collated.
    first = features[0]

    if isinstance(first, Mapping):
        batch = {}
        # Special handling for labels.
        # Ensure that tensor is created with the correct type
        # (it should be automatically the case, but let's make sure of it.)
        if 'label' in first and first['label'] is not None:
            label = first['label'].item() if isinstance(
                first['label'], torch.Tensor) else first['label']
            # the msdataset return a 0-dimension np.array with a single value, the following part handle this.
            if isinstance(label, np.ndarray):
                # label[()] extracts the scalar from a 0-d array.
                src_dtype = label[()].dtype
                dtype = torch.long if label[(
                )].dtype == np.int64 else torch.float
            else:
                src_dtype = type(label)
                dtype = torch.long if isinstance(label, int) else torch.float
            # add dtype to np.array to fix "TypeError: can't convert np.ndarray of type numpy.object_"
            batch['labels'] = torch.tensor(
                np.array([f['label'] for f in features], dtype=src_dtype),
                dtype=dtype)
        elif 'label_ids' in first and first['label_ids'] is not None:
            if isinstance(first['label_ids'], torch.Tensor):
                batch['labels'] = torch.stack(
                    [f['label_ids'] for f in features])
            else:
                # int label ids -> long, anything else -> float.
                dtype = torch.long if type(
                    first['label_ids'][0]) is int else torch.float
                batch['labels'] = torch.tensor(
                    [f['label_ids'] for f in features], dtype=dtype)

        # Handling of all other possible keys.
        # Again, we will use the first element to figure out which key/values are not None for this model.
        for k, v in first.items():
            # str values are skipped: they cannot be turned into tensors.
            if k not in ('label', 'label_ids'
                         ) and v is not None and not isinstance(v, str):
                if isinstance(v, torch.Tensor):
                    batch[k] = torch.stack([f[k] for f in features])
                elif isinstance(v, list) and isinstance(v[0], torch.Tensor):
                    # A list of tensors per sample is flattened then stacked.
                    batch[k] = torch.stack([d for f in features for d in f[k]])
                else:
                    batch[k] = torch.tensor(np.array([f[k] for f in features]))
    elif isinstance(first, tuple):
        # Tuple samples: collate each position independently.
        batch = []
        for idx in range(len(first)):
            if isinstance(first[idx], torch.Tensor):
                batch.append(torch.stack([f[idx] for f in features]))
            else:
                batch.append(torch.tensor([f[idx] for f in features]))
    else:
        # Bare tensor or scalar samples.
        if isinstance(first, torch.Tensor):
            batch = torch.stack(features)
        else:
            batch = torch.tensor(features)

    return batch

+ 1
- 1
modelscope/utils/test_utils.py View File

@@ -50,7 +50,7 @@ def set_test_level(level: int):


def create_dummy_test_dataset(feat, label, num): def create_dummy_test_dataset(feat, label, num):
return MsDataset.from_hf_dataset( return MsDataset.from_hf_dataset(
Dataset.from_dict(dict(feat=[feat] * num, label=[label] * num)))
Dataset.from_dict(dict(feat=[feat] * num, labels=[label] * num)))




def download_and_untar(fpath, furl, dst) -> str: def download_and_untar(fpath, furl, dst) -> str:


+ 26
- 1
tests/preprocessors/test_common.py View File

@@ -2,7 +2,10 @@


import unittest import unittest


from modelscope.preprocessors import PREPROCESSORS, Compose, Preprocessor
import torch

from modelscope.preprocessors import (PREPROCESSORS, Compose, Filter,
Preprocessor, ToTensor)




class ComposeTest(unittest.TestCase): class ComposeTest(unittest.TestCase):
@@ -35,5 +38,27 @@ class ComposeTest(unittest.TestCase):
self.assertEqual(output['tmp2'], 'tmp2') self.assertEqual(output['tmp2'], 'tmp2')




class ToTensorTest(unittest.TestCase):

def test_totensor(self):
to_tensor_op = ToTensor(keys=['img'])
inputs = {'img': [1, 2, 3], 'label': 1, 'path': 'test.jpg'}
inputs = to_tensor_op(inputs)
self.assertIsInstance(inputs['img'], torch.Tensor)
self.assertEqual(inputs['label'], 1)
self.assertEqual(inputs['path'], 'test.jpg')


class FilterTest(unittest.TestCase):

def test_filter(self):
filter_op = Filter(reserved_keys=['img', 'label'])
inputs = {'img': [1, 2, 3], 'label': 1, 'path': 'test.jpg'}
inputs = filter_op(inputs)
self.assertIn('img', inputs)
self.assertIn('label', inputs)
self.assertNotIn('path', inputs)


if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

+ 1
- 1
tests/trainers/hooks/test_evaluation_hook.py View File

@@ -12,7 +12,7 @@ from torch import nn
from modelscope.metainfo import Trainers from modelscope.metainfo import Trainers
from modelscope.metrics.builder import METRICS, MetricKeys from modelscope.metrics.builder import METRICS, MetricKeys
from modelscope.trainers import build_trainer from modelscope.trainers import build_trainer
from modelscope.utils.constant import LogKeys, ModelFile
from modelscope.utils.constant import ModelFile
from modelscope.utils.registry import default_group from modelscope.utils.registry import default_group
from modelscope.utils.test_utils import create_dummy_test_dataset from modelscope.utils.test_utils import create_dummy_test_dataset




+ 7
- 7
tests/trainers/hooks/test_lr_scheduler_hook.py View File

@@ -9,7 +9,7 @@ import numpy as np
import torch import torch
from torch import nn from torch import nn
from torch.optim import SGD from torch.optim import SGD
from torch.optim.lr_scheduler import MultiStepLR, ReduceLROnPlateau
from torch.optim.lr_scheduler import MultiStepLR


from modelscope.metainfo import Trainers from modelscope.metainfo import Trainers
from modelscope.metrics.builder import METRICS, MetricKeys from modelscope.metrics.builder import METRICS, MetricKeys
@@ -96,7 +96,8 @@ class LrSchedulerHookTest(unittest.TestCase):
model=model, model=model,
train_dataset=dummy_dataset, train_dataset=dummy_dataset,
optimizers=(optimizer, lr_scheduler), optimizers=(optimizer, lr_scheduler),
max_epochs=5)
max_epochs=5,
device='cpu')


trainer = build_trainer(trainer_name, kwargs) trainer = build_trainer(trainer_name, kwargs)
train_dataloader = trainer._build_dataloader_with_dataset( train_dataloader = trainer._build_dataloader_with_dataset(
@@ -160,15 +161,13 @@ class LrSchedulerHookTest(unittest.TestCase):
json.dump(json_cfg, f) json.dump(json_cfg, f)


model = DummyModel() model = DummyModel()
# optimmizer = SGD(model.parameters(), lr=0.01)
# lr_scheduler = MultiStepLR(optimmizer, milestones=[2, 4])
trainer_name = Trainers.default trainer_name = Trainers.default
kwargs = dict( kwargs = dict(
cfg_file=config_path, cfg_file=config_path,
model=model, model=model,
train_dataset=dummy_dataset, train_dataset=dummy_dataset,
# optimizers=(optimmizer, lr_scheduler),
max_epochs=7)
max_epochs=7,
device='cpu')


trainer = build_trainer(trainer_name, kwargs) trainer = build_trainer(trainer_name, kwargs)
train_dataloader = trainer._build_dataloader_with_dataset( train_dataloader = trainer._build_dataloader_with_dataset(
@@ -266,7 +265,8 @@ class PlateauLrSchedulerHookTest(unittest.TestCase):
train_dataset=dummy_dataset, train_dataset=dummy_dataset,
eval_dataset=dummy_dataset, eval_dataset=dummy_dataset,
optimizers=(optimizer, None), optimizers=(optimizer, None),
max_epochs=5)
max_epochs=5,
device='cpu')


trainer = build_trainer(trainer_name, kwargs) trainer = build_trainer(trainer_name, kwargs)
train_dataloader = trainer._build_dataloader_with_dataset( train_dataloader = trainer._build_dataloader_with_dataset(


+ 3
- 2
tests/trainers/hooks/test_optimizer_hook.py View File

@@ -17,7 +17,7 @@ from modelscope.utils.constant import ModelFile, TrainerStages
from modelscope.utils.test_utils import create_dummy_test_dataset from modelscope.utils.test_utils import create_dummy_test_dataset


dummy_dataset = create_dummy_test_dataset( dummy_dataset = create_dummy_test_dataset(
np.random.random(size=(2, 2)), np.random.randint(0, 2, (1, )), 10)
np.random.random(size=(2, )), np.random.randint(0, 2, (1, )), 10)




class DummyModel(nn.Module): class DummyModel(nn.Module):
@@ -71,7 +71,8 @@ class OptimizerHookTest(unittest.TestCase):
model=model, model=model,
train_dataset=dummy_dataset, train_dataset=dummy_dataset,
optimizers=(optimizer, lr_scheduler), optimizers=(optimizer, lr_scheduler),
max_epochs=2)
max_epochs=2,
device='cpu')


trainer = build_trainer(trainer_name, kwargs) trainer = build_trainer(trainer_name, kwargs)
train_dataloader = trainer._build_dataloader_with_dataset( train_dataloader = trainer._build_dataloader_with_dataset(


+ 2
- 1
tests/trainers/hooks/test_timer_hook.py View File

@@ -75,7 +75,8 @@ class IterTimerHookTest(unittest.TestCase):
model=model, model=model,
train_dataset=dummy_dataset, train_dataset=dummy_dataset,
optimizers=(optimizer, lr_scheduler), optimizers=(optimizer, lr_scheduler),
max_epochs=5)
max_epochs=5,
device='cpu')


trainer = build_trainer(trainer_name, kwargs) trainer = build_trainer(trainer_name, kwargs)
train_dataloader = trainer._build_dataloader_with_dataset( train_dataloader = trainer._build_dataloader_with_dataset(


+ 6
- 6
tests/trainers/test_trainer.py View File

@@ -3,19 +3,16 @@ import os
import shutil import shutil
import tempfile import tempfile
import unittest import unittest
from abc import ABCMeta


import json import json
import numpy as np import numpy as np
import torch import torch
from datasets import Dataset
from torch import nn from torch import nn
from torch.optim import SGD from torch.optim import SGD
from torch.optim.lr_scheduler import StepLR from torch.optim.lr_scheduler import StepLR


from modelscope.metainfo import Metrics, Trainers from modelscope.metainfo import Metrics, Trainers
from modelscope.metrics.builder import MetricKeys from modelscope.metrics.builder import MetricKeys
from modelscope.msdatasets import MsDataset
from modelscope.trainers import build_trainer from modelscope.trainers import build_trainer
from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile
from modelscope.utils.test_utils import create_dummy_test_dataset, test_level from modelscope.utils.test_utils import create_dummy_test_dataset, test_level
@@ -116,7 +113,8 @@ class TrainerTest(unittest.TestCase):
data_collator=None, data_collator=None,
train_dataset=dummy_dataset_small, train_dataset=dummy_dataset_small,
eval_dataset=dummy_dataset_small, eval_dataset=dummy_dataset_small,
max_epochs=3)
max_epochs=3,
device='cpu')


trainer = build_trainer(trainer_name, kwargs) trainer = build_trainer(trainer_name, kwargs)
trainer.train() trainer.train()
@@ -175,7 +173,8 @@ class TrainerTest(unittest.TestCase):
train_dataset=dummy_dataset_small, train_dataset=dummy_dataset_small,
eval_dataset=dummy_dataset_small, eval_dataset=dummy_dataset_small,
optimizers=(optimmizer, lr_scheduler), optimizers=(optimmizer, lr_scheduler),
max_epochs=3)
max_epochs=3,
device='cpu')


trainer = build_trainer(trainer_name, kwargs) trainer = build_trainer(trainer_name, kwargs)
trainer.train() trainer.train()
@@ -225,7 +224,8 @@ class TrainerTest(unittest.TestCase):
train_dataset=dummy_dataset_big, train_dataset=dummy_dataset_big,
eval_dataset=dummy_dataset_small, eval_dataset=dummy_dataset_small,
optimizers=(optimmizer, lr_scheduler), optimizers=(optimmizer, lr_scheduler),
max_epochs=3)
max_epochs=3,
device='cpu')


trainer = build_trainer(trainer_name, kwargs) trainer = build_trainer(trainer_name, kwargs)
trainer.train() trainer.train()


+ 8
- 5
tests/trainers/test_trainer_with_nlp.py View File

@@ -37,7 +37,8 @@ class TestTrainerWithNlp(unittest.TestCase):
model=model_id, model=model_id,
train_dataset=self.dataset, train_dataset=self.dataset,
eval_dataset=self.dataset, eval_dataset=self.dataset,
work_dir=self.tmp_dir)
work_dir=self.tmp_dir,
model_revision='beta')


trainer = build_trainer(default_args=kwargs) trainer = build_trainer(default_args=kwargs)
trainer.train() trainer.train()
@@ -53,7 +54,8 @@ class TestTrainerWithNlp(unittest.TestCase):
model=model_id, model=model_id,
train_dataset=self.dataset, train_dataset=self.dataset,
eval_dataset=self.dataset, eval_dataset=self.dataset,
work_dir=self.tmp_dir)
work_dir=self.tmp_dir,
model_revision='beta')


trainer = build_trainer(default_args=kwargs) trainer = build_trainer(default_args=kwargs)
trainer.train() trainer.train()
@@ -69,7 +71,7 @@ class TestTrainerWithNlp(unittest.TestCase):
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level') @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_trainer_with_user_defined_config(self): def test_trainer_with_user_defined_config(self):
model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base' model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base'
cfg = read_config(model_id)
cfg = read_config(model_id, revision='beta')
cfg.train.max_epochs = 20 cfg.train.max_epochs = 20
cfg.train.work_dir = self.tmp_dir cfg.train.work_dir = self.tmp_dir
cfg_file = os.path.join(self.tmp_dir, 'config.json') cfg_file = os.path.join(self.tmp_dir, 'config.json')
@@ -78,7 +80,8 @@ class TestTrainerWithNlp(unittest.TestCase):
model=model_id, model=model_id,
train_dataset=self.dataset, train_dataset=self.dataset,
eval_dataset=self.dataset, eval_dataset=self.dataset,
cfg_file=cfg_file)
cfg_file=cfg_file,
model_revision='beta')


trainer = build_trainer(default_args=kwargs) trainer = build_trainer(default_args=kwargs)
trainer.train() trainer.train()
@@ -98,7 +101,7 @@ class TestTrainerWithNlp(unittest.TestCase):
os.makedirs(tmp_dir) os.makedirs(tmp_dir)


model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base' model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
cache_path = snapshot_download(model_id)
cache_path = snapshot_download(model_id, revision='beta')
model = SbertForSequenceClassification.from_pretrained(cache_path) model = SbertForSequenceClassification.from_pretrained(cache_path)
kwargs = dict( kwargs = dict(
cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION), cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),


+ 0
- 0
tests/trainers/utils/__init__.py View File


+ 116
- 0
tests/trainers/utils/test_inference.py View File

@@ -0,0 +1,116 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import shutil
import tempfile
import unittest

import torch
from torch import nn
from torch.utils.data import DataLoader

from modelscope.metrics.builder import MetricKeys
from modelscope.metrics.sequence_classification_metric import \
SequenceClassificationMetric
from modelscope.trainers.utils.inference import multi_gpu_test, single_gpu_test
from modelscope.utils.test_utils import (DistributedTestCase,
create_dummy_test_dataset, test_level)
from modelscope.utils.torch_utils import get_dist_info, init_dist

dummy_dataset = create_dummy_test_dataset(
torch.rand((5, )), torch.randint(0, 4, (1, )), 20)


class DummyModel(nn.Module):
    """Minimal model used to exercise the inference test helpers.

    Projects a 5-dim feature vector to 4 logits, applies BatchNorm1d,
    and reports the sum of the normalized logits as the "loss".
    """

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(5, 4)
        self.bn = nn.BatchNorm1d(4)

    def forward(self, feat, labels):
        # ``labels`` is accepted only to match the trainer's calling
        # convention; it does not influence the output.
        logits = self.bn(self.linear(feat))
        return dict(logits=logits, loss=logits.sum())


def test_func(dist=False):
    """Run single- or multi-gpu evaluation over the dummy dataset.

    Args:
        dist (bool): If True, initialize a torch.distributed process
            group (pytorch launcher) and evaluate with ``multi_gpu_test``;
            otherwise evaluate with ``single_gpu_test``.

    Returns:
        The metric dict produced by the metric classes.
    """
    dummy_model = DummyModel()
    dataset = dummy_dataset.to_torch_dataset()

    dummy_loader = DataLoader(
        dataset,
        batch_size=2,
    )

    metric_class = SequenceClassificationMetric()

    if dist:
        init_dist(launcher='pytorch')

    rank, world_size = get_dist_info()
    device = torch.device(f'cuda:{rank}')
    dummy_model.cuda()

    if world_size > 1:
        from torch.nn.parallel.distributed import DistributedDataParallel
        dummy_model = DistributedDataParallel(
            dummy_model, device_ids=[torch.cuda.current_device()])
        # Renamed from ``test_func`` to avoid shadowing this function's
        # own name with the selected runner.
        run_test = multi_gpu_test
    else:
        run_test = single_gpu_test

    metric_results = run_test(
        dummy_model,
        dummy_loader,
        device=device,
        metric_classes=[metric_class])

    return metric_results


@unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest')
class SingleGpuTestTest(unittest.TestCase):
    """Exercises ``single_gpu_test`` on one CUDA device."""

    def setUp(self):
        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
        # ``tempfile.TemporaryDirectory().name`` creates a directory whose
        # finalizer removes it as soon as the object is garbage-collected
        # (emitting a ResourceWarning), which forced an exists/makedirs
        # dance here. ``mkdtemp`` creates a persistent directory directly.
        self.tmp_dir = tempfile.mkdtemp()

    def tearDown(self):
        super().tearDown()
        shutil.rmtree(self.tmp_dir)

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_single_gpu_test(self):
        metric_results = test_func()
        self.assertIn(MetricKeys.ACCURACY, metric_results)


@unittest.skipIf(not torch.cuda.is_available()
                 or torch.cuda.device_count() <= 1, 'distributed unittest')
class MultiGpuTestTest(DistributedTestCase):
    """Exercises ``multi_gpu_test`` across two CUDA devices."""

    def setUp(self):
        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
        # ``tempfile.TemporaryDirectory().name`` creates a directory whose
        # finalizer removes it as soon as the object is garbage-collected
        # (emitting a ResourceWarning), which forced an exists/makedirs
        # dance here. ``mkdtemp`` creates a persistent directory directly.
        self.tmp_dir = tempfile.mkdtemp()

    def tearDown(self):
        super().tearDown()
        shutil.rmtree(self.tmp_dir)

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_multi_gpu_test(self):
        self.start(
            test_func,
            num_gpus=2,
            assert_callback=lambda x: self.assertIn(MetricKeys.ACCURACY, x),
            dist=True)


if __name__ == '__main__':
unittest.main()

Loading…
Cancel
Save