diff --git a/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py b/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py index eaf5d0c5..c484b37b 100644 --- a/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py +++ b/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py @@ -36,20 +36,8 @@ class NAFNetForImageDenoise(TorchModel): model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) self.model = NAFNet(**self.config.model.network_g) self.loss = PSNRLoss() - - if torch.cuda.is_available(): - self._device = torch.device('cuda') - else: - self._device = torch.device('cpu') - - self.model = self.model.to(self._device) self.model = self._load_pretrained(self.model, model_path) - if self.training: - self.model.train() - else: - self.model.eval() - def _load_pretrained(self, net, load_path, @@ -109,8 +97,6 @@ class NAFNetForImageDenoise(TorchModel): Returns: Dict[str, Tensor]: results """ - for key, value in inputs.items(): - inputs[key] = inputs[key].to(self._device) if self.training: return self._train_forward(**inputs) elif 'target' in inputs: diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index 9a2adb04..c5c6a33c 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -6,7 +6,7 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .base import Preprocessor from .builder import PREPROCESSORS, build_preprocessor - from .common import Compose + from .common import Compose, ToTensor, Filter from .asr import WavToScp from .audio import LinearAECAndFbank from .image import (LoadImage, load_image, @@ -33,7 +33,7 @@ else: _import_structure = { 'base': ['Preprocessor'], 'builder': ['PREPROCESSORS', 'build_preprocessor'], - 'common': ['Compose'], + 'common': ['Compose', 'ToTensor', 'Filter'], 'audio': ['LinearAECAndFbank'], 'asr': ['WavToScp'], 'video': ['ReadVideoData'], diff --git a/modelscope/preprocessors/common.py b/modelscope/preprocessors/common.py index 89fa859d..aa1db84c 100644 --- a/modelscope/preprocessors/common.py +++ b/modelscope/preprocessors/common.py @@ -2,6 +2,10 @@ import time from collections.abc import Sequence +from typing import Mapping + +import numpy as np +import torch from .builder import PREPROCESSORS, build_preprocessor @@ -25,12 +29,18 @@ class Compose(object): if isinstance(transform, dict): if self.field_name is None: transform = build_preprocessor(transform, field_name) - self.transforms.append(transform) + else: + # if not found key in field_name, try field_name=None(default_group) + try: + transform = build_preprocessor(transform, field_name) + except KeyError: + transform = build_preprocessor(transform, None) elif callable(transform): - self.transforms.append(transform) + pass else: raise TypeError('transform must be callable or a dict, but got' f' {type(transform)}') + self.transforms.append(transform) def __call__(self, data): for t in self.transforms: @@ -52,3 +62,82 @@ class Compose(object): format_string += f'\n {t}' format_string += '\n)' return format_string + + +def to_tensor(data): + """Convert objects of various python types to :obj:`torch.Tensor`. + + Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, + :class:`Sequence`, :class:`int` and :class:`float`. + + Args: + data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to + be converted. 
+ """ + + if isinstance(data, torch.Tensor): + return data + elif isinstance(data, np.ndarray): + return torch.from_numpy(data) + elif isinstance(data, Sequence) and not isinstance(data, str): + return torch.tensor(data) + elif isinstance(data, int): + return torch.LongTensor([data]) + elif isinstance(data, float): + return torch.FloatTensor([data]) + else: + raise TypeError(f'type {type(data)} cannot be converted to tensor.') + + +@PREPROCESSORS.register_module() +class ToTensor(object): + """Convert target object to tensor. + + Args: + keys (Sequence[str]): Key of data to be converted to Tensor. + Only valid when data is type of `Mapping`. If `keys` is None, + all values of keys ​​will be converted to tensor by default. + """ + + def __init__(self, keys=None): + self.keys = keys + + def __call__(self, data): + if isinstance(data, Mapping): + if self.keys is None: + self.keys = list(data.keys()) + + for key in self.keys: + data[key] = to_tensor(data[key]) + else: + data = to_tensor(data) + + return data + + def __repr__(self): + return self.__class__.__name__ + f'(keys={self.keys})' + + +@PREPROCESSORS.register_module() +class Filter(object): + """This is usually the last stage of the dataloader transform. + Only data of reserved keys will be kept and passed directly to the model, others will be removed. + + Args: + keys (Sequence[str]): Keys of data to be reserved, others will be removed. + """ + + def __init__(self, reserved_keys): + self.reserved_keys = reserved_keys + + def __call__(self, data): + assert isinstance(data, Mapping) + + reserved_data = {} + for key in self.reserved_keys: + reserved_data[key] = data[key] + + return reserved_data + + def __repr__(self): + return self.__class__.__name__ + f'(keys={self.reserved_keys})' diff --git a/modelscope/preprocessors/image.py b/modelscope/preprocessors/image.py index 775514a2..6932371d 100644 --- a/modelscope/preprocessors/image.py +++ b/modelscope/preprocessors/image.py @@ -151,6 +151,11 @@ class ImageDenoisePreprocessor(Preprocessor): super().__init__(*args, **kwargs) self.model_dir: str = model_dir + from .common import Filter + + # TODO: `Filter` should be moved to configurarion file of each model + self._transforms = [Filter(reserved_keys=['input', 'target'])] + def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: """process the raw input data @@ -160,6 +165,9 @@ class ImageDenoisePreprocessor(Preprocessor): Returns: Dict[str, Any]: the preprocessed data """ + for t in self._transforms: + data = t(data) + return data diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py index f231df9a..8bf9943c 100644 --- a/modelscope/preprocessors/nlp.py +++ b/modelscope/preprocessors/nlp.py @@ -4,6 +4,7 @@ import os.path as osp import uuid from typing import Any, Dict, Iterable, Optional, Tuple, Union +import numpy as np from transformers import AutoTokenizer from modelscope.metainfo import Models, Preprocessors @@ -191,6 +192,10 @@ class NLPTokenizerPreprocessorBase(Preprocessor): text_b, return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, **self.tokenize_kwargs) + output = { + k: np.array(v) if isinstance(v, list) else v + for k, v in output.items() + } self.labels_to_id(labels, output) return output @@ -240,13 +245,13 @@ class NLPTokenizerPreprocessorBase(Preprocessor): if labels is not None: if isinstance(labels, Iterable) and all([label_can_be_mapped(label) for label in labels]) \ and self.label2id is not None: - output[OutputKeys.LABEL] = [ + output[OutputKeys.LABELS] = [ 
self.label2id[str(label)] for label in labels ] elif label_can_be_mapped(labels) and self.label2id is not None: - output[OutputKeys.LABEL] = self.label2id[str(labels)] + output[OutputKeys.LABELS] = self.label2id[str(labels)] else: - output[OutputKeys.LABEL] = labels + output[OutputKeys.LABELS] = labels @PREPROCESSORS.register_module( diff --git a/modelscope/trainers/cv/image_portrait_enhancement_trainer.py b/modelscope/trainers/cv/image_portrait_enhancement_trainer.py index 7ef0de79..0941d1cd 100644 --- a/modelscope/trainers/cv/image_portrait_enhancement_trainer.py +++ b/modelscope/trainers/cv/image_portrait_enhancement_trainer.py @@ -40,7 +40,6 @@ class ImagePortraitEnhancementTrainer(EpochBasedTrainer): train_outputs = dict() self._mode = ModeKeys.TRAIN - inputs = self.collate_fn(inputs) # call model forward but not __call__ to skip postprocess if isinstance(inputs, Mapping): d_loss = model._train_forward_d(**inputs) diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py index 322070a1..9922d374 100644 --- a/modelscope/trainers/nlp_trainer.py +++ b/modelscope/trainers/nlp_trainer.py @@ -110,9 +110,11 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): self.train_keys = build_dataset_keys( self.cfg.dataset.train if hasattr(self.cfg, 'dataset') and hasattr(self.cfg.dataset, 'train') else None) - # TODO eval may has special keys, which is now not supported. - # because there is only one preprocessor in the trainer, and it only supports one group of keys. - self.eval_keys = self.train_keys + self.eval_keys = build_dataset_keys( + self.cfg.dataset.val if hasattr(self.cfg, 'dataset') + and hasattr(self.cfg.dataset, 'val') else None) + if len(self.eval_keys) == 0: + self.eval_keys = self.train_keys super().__init__( model=model_dir, @@ -148,7 +150,7 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): elif isinstance(model, nn.Module): return model - def build_preprocessor(self) -> Preprocessor: + def build_preprocessor(self) -> Tuple[Preprocessor, Preprocessor]: """Build the preprocessor. User can override this method to implement custom logits. 
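A minimal sketch of the two preprocessor configuration shapes that the hunk below appears to accept; the `type` value and field names are illustrative placeholders (not real registry entries), and the dicts stand in for the loaded `preprocessor` section of the model configuration:

# legacy shape: a single preprocessor definition, reused for both training and evaluation
preprocessor_cfg = {'type': 'some-tokenizer-preprocessor', 'first_sequence': 'sentence'}

# split shape: per-mode definitions; each is built with its own mode and dataset keys
preprocessor_cfg = {
    'train': {'type': 'some-tokenizer-preprocessor', 'first_sequence': 'sentence'},
    'val': {'type': 'some-tokenizer-preprocessor', 'first_sequence': 'sentence'},
}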
@@ -159,16 +161,38 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): model_args = {} if self.label2id is None else { 'label2id': self.label2id } - cfg = ConfigDict({ - **getattr(self.cfg, 'preprocessor'), - 'model_dir': - self.model_dir, - **model_args, - 'mode': - ModeKeys.TRAIN, - **self.train_keys, - }) - return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task)) + + field_name = Tasks.find_field_by_task(self.cfg.task) + train_preprocessor, eval_preprocessor = None, None + _train_cfg, _eval_cfg = {}, {} + + if 'type' not in self.cfg.preprocessor and ( + 'train' in self.cfg.preprocessor + or 'val' in self.cfg.preprocessor): + if 'train' in self.cfg.preprocessor: + _train_cfg = self.cfg.preprocessor.train + if 'val' in self.cfg.preprocessor: + _eval_cfg = self.cfg.preprocessor.val + else: + _train_cfg = self.cfg.preprocessor + _eval_cfg = self.cfg.preprocessor + + if len(_train_cfg): + _train_cfg.update({ + 'model_dir': self.model_dir, + **model_args, + **self.train_keys, 'mode': ModeKeys.TRAIN + }) + train_preprocessor = build_preprocessor(_train_cfg, field_name) + if len(_eval_cfg): + _eval_cfg.update({ + 'model_dir': self.model_dir, + **model_args, + **self.eval_keys, 'mode': ModeKeys.EVAL + }) + eval_preprocessor = build_preprocessor(_eval_cfg, field_name) + + return train_preprocessor, eval_preprocessor @TRAINERS.register_module(module_name=Trainers.nlp_veco_trainer) diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index a96c186c..b275bba4 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -5,15 +5,15 @@ import time from collections.abc import Mapping from distutils.version import LooseVersion from functools import partial -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union import json import numpy as np import torch -from addict import Dict from torch import distributed as dist from torch import nn from torch.utils.data import DataLoader, Dataset +from torch.utils.data.dataloader import default_collate from torch.utils.data.distributed import DistributedSampler from modelscope.hub.snapshot_download import snapshot_download @@ -21,8 +21,9 @@ from modelscope.metainfo import Trainers from modelscope.metrics import build_metric, task_default_metrics from modelscope.models.base import Model, TorchModel from modelscope.msdatasets.ms_dataset import MsDataset -from modelscope.preprocessors import build_preprocessor from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.builder import build_preprocessor +from modelscope.preprocessors.common import Compose from modelscope.task_datasets.builder import build_task_dataset from modelscope.task_datasets.torch_base_dataset import TorchTaskDataset from modelscope.trainers.hooks.builder import HOOKS @@ -30,14 +31,15 @@ from modelscope.trainers.hooks.priority import Priority, get_priority from modelscope.trainers.lrscheduler.builder import build_lr_scheduler from modelscope.trainers.optimizer.builder import build_optimizer from modelscope.utils.config import Config, ConfigDict -from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Hubs, ModeKeys, - ModelFile, Tasks, TrainerStages) +from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigFields, + ConfigKeys, Hubs, ModeKeys, ModelFile, + Tasks, TrainerStages) +from modelscope.utils.data_utils import to_device from modelscope.utils.file_utils import func_receive_dict_inputs from 
modelscope.utils.logger import get_logger from modelscope.utils.registry import build_from_cfg -from modelscope.utils.tensor_utils import torch_default_data_collator -from modelscope.utils.torch_utils import (broadcast, create_device, - get_dist_info, init_dist) +from modelscope.utils.torch_utils import (create_device, get_dist_info, + init_dist) from .base import BaseTrainer from .builder import TRAINERS from .default_config import DEFAULT_CONFIG @@ -83,7 +85,8 @@ class EpochBasedTrainer(BaseTrainer): data_collator: Optional[Callable] = None, train_dataset: Optional[Union[MsDataset, Dataset]] = None, eval_dataset: Optional[Union[MsDataset, Dataset]] = None, - preprocessor: Optional[Preprocessor] = None, + preprocessor: Optional[Union[Preprocessor, + Dict[str, Preprocessor]]] = None, optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler._LRScheduler] = (None, None), @@ -120,24 +123,46 @@ class EpochBasedTrainer(BaseTrainer): else: self.work_dir = self.cfg.train.get('work_dir', './work_dir') - self.preprocessor = None + self.train_preprocessor, self.eval_preprocessor = None, None if isinstance(preprocessor, Preprocessor): - self.preprocessor = preprocessor - elif hasattr(self.cfg, 'preprocessor'): - self.preprocessor = self.build_preprocessor() - if self.preprocessor is not None: - self.preprocessor.mode = ModeKeys.TRAIN + self.train_preprocessor = preprocessor + self.eval_preprocessor = preprocessor + elif isinstance(preprocessor, Mapping): + if not (ConfigKeys.train in preprocessor + or ConfigKeys.val in preprocessor): + raise ValueError( + f'Preprocessor must split with `{ConfigKeys.train}` and `{ConfigKeys.val}` keys!' + ) + if ConfigKeys.train in preprocessor: + assert isinstance(preprocessor[ConfigKeys.train], Preprocessor) + self.train_preprocessor = preprocessor[ConfigKeys.train] + if ConfigKeys.val in preprocessor: + assert isinstance(preprocessor[ConfigKeys.val], Preprocessor) + self.eval_preprocessor = preprocessor[ConfigKeys.val] + elif hasattr(self.cfg, ConfigFields.preprocessor): + self.train_preprocessor, self.eval_preprocessor = self.build_preprocessor( + ) + + if self.train_preprocessor is not None: + self.train_preprocessor.mode = ModeKeys.TRAIN + if self.eval_preprocessor is not None: + self.eval_preprocessor.mode = ModeKeys.EVAL + device_name = kwargs.get('device', 'gpu') assert device_name in ['gpu', 'cpu'], 'device should be either cpu or gpu.' 
self.device = create_device(device_name == 'cpu') self.train_dataset = self.to_task_dataset( - train_dataset, mode=ModeKeys.TRAIN, preprocessor=self.preprocessor) + train_dataset, + mode=ModeKeys.TRAIN, + preprocessor=self.train_preprocessor) self.eval_dataset = self.to_task_dataset( - eval_dataset, mode=ModeKeys.EVAL, preprocessor=self.preprocessor) + eval_dataset, + mode=ModeKeys.EVAL, + preprocessor=self.eval_preprocessor) - self.data_collator = data_collator if data_collator is not None else torch_default_data_collator + self.data_collator = data_collator if data_collator is not None else default_collate self.metrics = self.get_metrics() self._metric_values = None self.optimizers = optimizers @@ -229,12 +254,12 @@ class EpochBasedTrainer(BaseTrainer): return datasets elif isinstance(datasets, MsDataset): datasets = datasets.to_torch_dataset( - preprocessors=self.preprocessor) + preprocessors=preprocessor) return datasets elif isinstance(datasets, List) and isinstance( datasets[0], MsDataset): datasets = [ - d.to_torch_dataset(preprocessor=self.preprocessor) + d.to_torch_dataset(preprocessor=preprocessor) for d in datasets ] cfg = ConfigDict( @@ -258,24 +283,44 @@ class EpochBasedTrainer(BaseTrainer): else: return datasets - def build_preprocessor(self) -> Preprocessor: - """Build the preprocessor. + def build_preprocessor(self) -> Tuple[Preprocessor, Preprocessor]: + """Build train and eval preprocessor. User can override this method to implement custom logits. - Returns: The preprocessor instance. + Returns: The train preprocessor and eval preprocessor instance. """ - # TODO @wenmeng.zwm @jiangnana.jnn add support for different preprocessor - # when they are different ones in training and evaluation - cfg = ConfigDict({ - **getattr(self.cfg, 'preprocessor'), - 'model_dir': - self.model_dir, - 'mode': - ModeKeys.TRAIN, - }) - return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task)) + field_name = Tasks.find_field_by_task(self.cfg.task) + train_preprocessor, eval_preprocessor = None, None + _train_cfg, _eval_cfg = {}, {} + _dafault_args = {'model_dir': self.model_dir} + + if 'type' not in self.cfg.preprocessor and ( + 'train' in self.cfg.preprocessor + or 'val' in self.cfg.preprocessor): + if 'train' in self.cfg.preprocessor: + _train_cfg = self.cfg.preprocessor.train + if 'val' in self.cfg.preprocessor: + _eval_cfg = self.cfg.preprocessor.val + else: + _train_cfg = self.cfg.preprocessor + _eval_cfg = self.cfg.preprocessor + + if len(_train_cfg): + if isinstance(_train_cfg, Sequence): + # TODO: for Sequence, need adapt to `mode` and `mode_dir` args, + # and add mode for Compose or other plans + raise NotImplementedError('Not supported yet!') + _train_cfg.update(_dafault_args) + train_preprocessor = build_preprocessor(_train_cfg, field_name) + if len(_eval_cfg): + if isinstance(_eval_cfg, Sequence): + raise NotImplementedError('Not supported yet!') + _eval_cfg.update(_dafault_args) + eval_preprocessor = build_preprocessor(_eval_cfg, field_name) + + return train_preprocessor, eval_preprocessor def get_metrics(self) -> List[str]: """Get the metric class types. @@ -373,34 +418,6 @@ class EpochBasedTrainer(BaseTrainer): return build_parallel(dp_cfg) - def collate_fn(self, data): - """Prepare the input just before the forward function. - This method will move the tensors to the right device. - Usually this method does not need to be overridden. - - Args: - data: The data out of the dataloader. - - Returns: The processed data. 
- - """ - from torch.utils.data.dataloader import default_collate - if isinstance(data, dict) or isinstance(data, Mapping): - return type(data)({k: self.collate_fn(v) for k, v in data.items()}) - elif isinstance(data, (tuple, list)): - if isinstance(data[0], (int, float)): - return default_collate(data).to(self.device) - else: - return type(data)(self.collate_fn(v) for v in data) - elif isinstance(data, np.ndarray): - return self.collate_fn(torch.from_numpy(data)) - elif isinstance(data, torch.Tensor): - return data.to(self.device) - elif isinstance(data, (str, int, float, bool)): - return data - else: - raise ValueError(f'Unsupported data type {type(data)}') - def train_step(self, model, inputs): """ Perform a training step on a batch of inputs. @@ -421,7 +438,6 @@ class EpochBasedTrainer(BaseTrainer): # TODO: find more pretty way to change mode model.train() self._mode = ModeKeys.TRAIN - inputs = self.collate_fn(inputs) # call model forward but not __call__ to skip postprocess if isinstance(inputs, Mapping) and not func_receive_dict_inputs(model.forward): @@ -486,7 +502,9 @@ class EpochBasedTrainer(BaseTrainer): if self.train_dataset is None: train_data = self.cfg.dataset.train self.train_dataset = self.build_dataset( - train_data, mode=ModeKeys.TRAIN) + train_data, + mode=ModeKeys.TRAIN, + preprocessor=self.train_preprocessor) data_loader = self._build_dataloader_with_dataset( self.train_dataset, @@ -505,7 +523,9 @@ class EpochBasedTrainer(BaseTrainer): if self.eval_dataset is None: val_data = self.cfg.dataset.val self.eval_dataset = self.build_dataset( - val_data, mode=ModeKeys.EVAL) + val_data, + mode=ModeKeys.EVAL, + preprocessor=self.eval_preprocessor) batch_size = self.cfg.evaluation.batch_size workers = self.cfg.evaluation.workers @@ -521,7 +541,7 @@ class EpochBasedTrainer(BaseTrainer): ) return data_loader - def build_dataset(self, data_cfg, mode): + def build_dataset(self, data_cfg, mode, preprocessor=None): """ Build torch dataset object using data config """ dataset = MsDataset.load( @@ -531,8 +551,7 @@ class EpochBasedTrainer(BaseTrainer): data_cfg, 'subset_name') else None, hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope, ) - torch_dataset = dataset.to_torch_dataset( - preprocessors=self.preprocessor, ) + torch_dataset = dataset.to_torch_dataset(preprocessors=preprocessor) dataset = self.to_task_dataset(torch_dataset, mode) return dataset @@ -698,6 +717,7 @@ class EpochBasedTrainer(BaseTrainer): self.invoke_hook(TrainerStages.before_train_epoch) time.sleep(2) # Prevent possible deadlock during epoch transition for i, data_batch in enumerate(data_loader): + data_batch = to_device(data_batch, self.device) self.data_batch = data_batch self._inner_iter = i self.invoke_hook(TrainerStages.before_train_iter) @@ -721,16 +741,16 @@ class EpochBasedTrainer(BaseTrainer): metric_values = multi_gpu_test( self.model, data_loader, + device=self.device, tmpdir=None, gpu_collect=False, - data_collate_fn=self.collate_fn, metric_classes=metric_classes) else: from modelscope.trainers.utils.inference import single_gpu_test metric_values = single_gpu_test( self.model, data_loader, - data_collate_fn=self.collate_fn, + device=self.device, metric_classes=metric_classes) return metric_values diff --git a/modelscope/trainers/utils/inference.py b/modelscope/trainers/utils/inference.py index a90a58b6..ea3b351b 100644 --- a/modelscope/trainers/utils/inference.py +++ b/modelscope/trainers/utils/inference.py @@ -10,21 +10,19 @@ import torch from torch import distributed as dist from tqdm 
import tqdm +from modelscope.utils.data_utils import to_device from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master, make_tmp_dir) -def single_gpu_test(model, - data_loader, - data_collate_fn=None, - metric_classes=None): +def single_gpu_test(model, data_loader, device, metric_classes=None): """Test model with a single gpu. Args: model (nn.Module): Model to be tested. data_loader (nn.Dataloader): Pytorch data loader. - data_collate_fn: An optional data_collate_fn before fed into the model + device: (str | torch.device): The target device for the data. metric_classes(List): List of Metric class that uses to collect metrics Returns: @@ -34,8 +32,7 @@ def single_gpu_test(model, dataset = data_loader.dataset with tqdm(total=len(dataset), desc='test samples') as pbar: for data in data_loader: - if data_collate_fn is not None: - data = data_collate_fn(data) + data = to_device(data, device) with torch.no_grad(): if isinstance(data, Mapping) and not func_receive_dict_inputs( model.forward): @@ -62,9 +59,9 @@ def single_gpu_test(model, def multi_gpu_test(model, data_loader, + device, tmpdir=None, gpu_collect=False, - data_collate_fn=None, metric_classes=None): """Test model with multiple gpus. @@ -77,10 +74,10 @@ def multi_gpu_test(model, Args: model (nn.Module): Model to be tested. data_loader (nn.Dataloader): Pytorch data loader. + device: (str | torch.device): The target device for the data. tmpdir (str): Path of directory to save the temporary results from different gpus under cpu mode. gpu_collect (bool): Option to use either gpu or cpu to collect results. - data_collate_fn: An optional data_collate_fn before fed into the model metric_classes(List): List of Metric class that uses to collect metrics Returns: @@ -98,8 +95,7 @@ def multi_gpu_test(model, count = 0 with tqdm(total=len(dataset), desc='test samples with multi gpus') as pbar: for _, data in enumerate(data_loader): - if data_collate_fn is not None: - data = data_collate_fn(data) + data = to_device(data, device) data_list.append(data) with torch.no_grad(): if isinstance(data, Mapping) and not func_receive_dict_inputs( diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 927eafbd..5f327ddc 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -219,6 +219,12 @@ class ConfigFields(object): evaluation = 'evaluation' +class ConfigKeys(object): + """Fixed keywords in configuration file""" + train = 'train' + val = 'val' + + class Requirements(object): """Requirement names for each module """ diff --git a/modelscope/utils/data_utils.py b/modelscope/utils/data_utils.py new file mode 100644 index 00000000..2bc88e19 --- /dev/null +++ b/modelscope/utils/data_utils.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from collections.abc import Mapping + +import torch + + +def to_device(batch, device, non_blocking=False): + """Put the data to the target cuda device just before the forward function. + Args: + batch: The batch data out of the dataloader. + device: (str | torch.device): The target device for the data. + + Returns: The data to the target device. 
+ + """ + if isinstance(batch, dict) or isinstance(batch, Mapping): + return type(batch)({k: to_device(v, device) for k, v in batch.items()}) + elif isinstance(batch, (tuple, list)): + return type(batch)(to_device(v, device) for v in batch) + elif isinstance(batch, torch.Tensor): + return batch.to(device, non_blocking=non_blocking) + else: + return batch diff --git a/modelscope/utils/tensor_utils.py b/modelscope/utils/tensor_utils.py index aca103d2..7889d944 100644 --- a/modelscope/utils/tensor_utils.py +++ b/modelscope/utils/tensor_utils.py @@ -24,65 +24,3 @@ def torch_nested_detach(tensors): if isinstance(tensors, torch.Tensor): return tensors.detach() return tensors - - -def torch_default_data_collator(features): - # TODO @jiangnana.jnn refine this default data collator - import torch - first = features[0] - - if isinstance(first, Mapping): - batch = {} - # Special handling for labels. - # Ensure that tensor is created with the correct type - # (it should be automatically the case, but let's make sure of it.) - if 'label' in first and first['label'] is not None: - label = first['label'].item() if isinstance( - first['label'], torch.Tensor) else first['label'] - # the msdataset return a 0-dimension np.array with a single value, the following part handle this. - if isinstance(label, np.ndarray): - src_dtype = label[()].dtype - dtype = torch.long if label[( - )].dtype == np.int64 else torch.float - else: - src_dtype = type(label) - dtype = torch.long if isinstance(label, int) else torch.float - # add dtype to np.array to fix "TypeError: can't convert np.ndarray of type numpy.object_" - batch['labels'] = torch.tensor( - np.array([f['label'] for f in features], dtype=src_dtype), - dtype=dtype) - elif 'label_ids' in first and first['label_ids'] is not None: - if isinstance(first['label_ids'], torch.Tensor): - batch['labels'] = torch.stack( - [f['label_ids'] for f in features]) - else: - dtype = torch.long if type( - first['label_ids'][0]) is int else torch.float - batch['labels'] = torch.tensor( - [f['label_ids'] for f in features], dtype=dtype) - - # Handling of all other possible keys. - # Again, we will use the first element to figure out which key/values are not None for this model. 
- for k, v in first.items(): - if k not in ('label', 'label_ids' - ) and v is not None and not isinstance(v, str): - if isinstance(v, torch.Tensor): - batch[k] = torch.stack([f[k] for f in features]) - elif isinstance(v, list) and isinstance(v[0], torch.Tensor): - batch[k] = torch.stack([d for f in features for d in f[k]]) - else: - batch[k] = torch.tensor(np.array([f[k] for f in features])) - elif isinstance(first, tuple): - batch = [] - for idx in range(len(first)): - if isinstance(first[idx], torch.Tensor): - batch.append(torch.stack([f[idx] for f in features])) - else: - batch.append(torch.tensor([f[idx] for f in features])) - else: - if isinstance(first, torch.Tensor): - batch = torch.stack(features) - else: - batch = torch.tensor(features) - - return batch diff --git a/modelscope/utils/test_utils.py b/modelscope/utils/test_utils.py index 5a606f9c..7adba982 100644 --- a/modelscope/utils/test_utils.py +++ b/modelscope/utils/test_utils.py @@ -50,7 +50,7 @@ def set_test_level(level: int): def create_dummy_test_dataset(feat, label, num): return MsDataset.from_hf_dataset( - Dataset.from_dict(dict(feat=[feat] * num, label=[label] * num))) + Dataset.from_dict(dict(feat=[feat] * num, labels=[label] * num))) def download_and_untar(fpath, furl, dst) -> str: diff --git a/tests/preprocessors/test_common.py b/tests/preprocessors/test_common.py index 1ee13589..714b8588 100644 --- a/tests/preprocessors/test_common.py +++ b/tests/preprocessors/test_common.py @@ -2,7 +2,10 @@ import unittest -from modelscope.preprocessors import PREPROCESSORS, Compose, Preprocessor +import torch + +from modelscope.preprocessors import (PREPROCESSORS, Compose, Filter, + Preprocessor, ToTensor) class ComposeTest(unittest.TestCase): @@ -35,5 +38,27 @@ class ComposeTest(unittest.TestCase): self.assertEqual(output['tmp2'], 'tmp2') +class ToTensorTest(unittest.TestCase): + + def test_totensor(self): + to_tensor_op = ToTensor(keys=['img']) + inputs = {'img': [1, 2, 3], 'label': 1, 'path': 'test.jpg'} + inputs = to_tensor_op(inputs) + self.assertIsInstance(inputs['img'], torch.Tensor) + self.assertEqual(inputs['label'], 1) + self.assertEqual(inputs['path'], 'test.jpg') + + +class FilterTest(unittest.TestCase): + + def test_filter(self): + filter_op = Filter(reserved_keys=['img', 'label']) + inputs = {'img': [1, 2, 3], 'label': 1, 'path': 'test.jpg'} + inputs = filter_op(inputs) + self.assertIn('img', inputs) + self.assertIn('label', inputs) + self.assertNotIn('path', inputs) + + if __name__ == '__main__': unittest.main() diff --git a/tests/trainers/hooks/test_evaluation_hook.py b/tests/trainers/hooks/test_evaluation_hook.py index 9e65f127..1338bb2c 100644 --- a/tests/trainers/hooks/test_evaluation_hook.py +++ b/tests/trainers/hooks/test_evaluation_hook.py @@ -12,7 +12,7 @@ from torch import nn from modelscope.metainfo import Trainers from modelscope.metrics.builder import METRICS, MetricKeys from modelscope.trainers import build_trainer -from modelscope.utils.constant import LogKeys, ModelFile +from modelscope.utils.constant import ModelFile from modelscope.utils.registry import default_group from modelscope.utils.test_utils import create_dummy_test_dataset diff --git a/tests/trainers/hooks/test_lr_scheduler_hook.py b/tests/trainers/hooks/test_lr_scheduler_hook.py index eb30fb52..86d53ecc 100644 --- a/tests/trainers/hooks/test_lr_scheduler_hook.py +++ b/tests/trainers/hooks/test_lr_scheduler_hook.py @@ -9,7 +9,7 @@ import numpy as np import torch from torch import nn from torch.optim import SGD -from torch.optim.lr_scheduler 
import MultiStepLR, ReduceLROnPlateau +from torch.optim.lr_scheduler import MultiStepLR from modelscope.metainfo import Trainers from modelscope.metrics.builder import METRICS, MetricKeys @@ -96,7 +96,8 @@ class LrSchedulerHookTest(unittest.TestCase): model=model, train_dataset=dummy_dataset, optimizers=(optimizer, lr_scheduler), - max_epochs=5) + max_epochs=5, + device='cpu') trainer = build_trainer(trainer_name, kwargs) train_dataloader = trainer._build_dataloader_with_dataset( @@ -160,15 +161,13 @@ class LrSchedulerHookTest(unittest.TestCase): json.dump(json_cfg, f) model = DummyModel() - # optimmizer = SGD(model.parameters(), lr=0.01) - # lr_scheduler = MultiStepLR(optimmizer, milestones=[2, 4]) trainer_name = Trainers.default kwargs = dict( cfg_file=config_path, model=model, train_dataset=dummy_dataset, - # optimizers=(optimmizer, lr_scheduler), - max_epochs=7) + max_epochs=7, + device='cpu') trainer = build_trainer(trainer_name, kwargs) train_dataloader = trainer._build_dataloader_with_dataset( @@ -266,7 +265,8 @@ class PlateauLrSchedulerHookTest(unittest.TestCase): train_dataset=dummy_dataset, eval_dataset=dummy_dataset, optimizers=(optimizer, None), - max_epochs=5) + max_epochs=5, + device='cpu') trainer = build_trainer(trainer_name, kwargs) train_dataloader = trainer._build_dataloader_with_dataset( diff --git a/tests/trainers/hooks/test_optimizer_hook.py b/tests/trainers/hooks/test_optimizer_hook.py index 62c70632..25457c1c 100644 --- a/tests/trainers/hooks/test_optimizer_hook.py +++ b/tests/trainers/hooks/test_optimizer_hook.py @@ -17,7 +17,7 @@ from modelscope.utils.constant import ModelFile, TrainerStages from modelscope.utils.test_utils import create_dummy_test_dataset dummy_dataset = create_dummy_test_dataset( - np.random.random(size=(2, 2)), np.random.randint(0, 2, (1, )), 10) + np.random.random(size=(2, )), np.random.randint(0, 2, (1, )), 10) class DummyModel(nn.Module): @@ -71,7 +71,8 @@ class OptimizerHookTest(unittest.TestCase): model=model, train_dataset=dummy_dataset, optimizers=(optimizer, lr_scheduler), - max_epochs=2) + max_epochs=2, + device='cpu') trainer = build_trainer(trainer_name, kwargs) train_dataloader = trainer._build_dataloader_with_dataset( diff --git a/tests/trainers/hooks/test_timer_hook.py b/tests/trainers/hooks/test_timer_hook.py index 6f24809b..ecb727b8 100644 --- a/tests/trainers/hooks/test_timer_hook.py +++ b/tests/trainers/hooks/test_timer_hook.py @@ -75,7 +75,8 @@ class IterTimerHookTest(unittest.TestCase): model=model, train_dataset=dummy_dataset, optimizers=(optimizer, lr_scheduler), - max_epochs=5) + max_epochs=5, + device='cpu') trainer = build_trainer(trainer_name, kwargs) train_dataloader = trainer._build_dataloader_with_dataset( diff --git a/tests/trainers/test_trainer.py b/tests/trainers/test_trainer.py index b7639024..051fab6b 100644 --- a/tests/trainers/test_trainer.py +++ b/tests/trainers/test_trainer.py @@ -3,19 +3,16 @@ import os import shutil import tempfile import unittest -from abc import ABCMeta import json import numpy as np import torch -from datasets import Dataset from torch import nn from torch.optim import SGD from torch.optim.lr_scheduler import StepLR from modelscope.metainfo import Metrics, Trainers from modelscope.metrics.builder import MetricKeys -from modelscope.msdatasets import MsDataset from modelscope.trainers import build_trainer from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile from modelscope.utils.test_utils import create_dummy_test_dataset, test_level @@ -116,7 +113,8 @@ class 
TrainerTest(unittest.TestCase): data_collator=None, train_dataset=dummy_dataset_small, eval_dataset=dummy_dataset_small, - max_epochs=3) + max_epochs=3, + device='cpu') trainer = build_trainer(trainer_name, kwargs) trainer.train() @@ -175,7 +173,8 @@ class TrainerTest(unittest.TestCase): train_dataset=dummy_dataset_small, eval_dataset=dummy_dataset_small, optimizers=(optimmizer, lr_scheduler), - max_epochs=3) + max_epochs=3, + device='cpu') trainer = build_trainer(trainer_name, kwargs) trainer.train() @@ -225,7 +224,8 @@ class TrainerTest(unittest.TestCase): train_dataset=dummy_dataset_big, eval_dataset=dummy_dataset_small, optimizers=(optimmizer, lr_scheduler), - max_epochs=3) + max_epochs=3, + device='cpu') trainer = build_trainer(trainer_name, kwargs) trainer.train() diff --git a/tests/trainers/test_trainer_with_nlp.py b/tests/trainers/test_trainer_with_nlp.py index 7e488c6b..213b6b4f 100644 --- a/tests/trainers/test_trainer_with_nlp.py +++ b/tests/trainers/test_trainer_with_nlp.py @@ -37,7 +37,8 @@ class TestTrainerWithNlp(unittest.TestCase): model=model_id, train_dataset=self.dataset, eval_dataset=self.dataset, - work_dir=self.tmp_dir) + work_dir=self.tmp_dir, + model_revision='beta') trainer = build_trainer(default_args=kwargs) trainer.train() @@ -53,7 +54,8 @@ class TestTrainerWithNlp(unittest.TestCase): model=model_id, train_dataset=self.dataset, eval_dataset=self.dataset, - work_dir=self.tmp_dir) + work_dir=self.tmp_dir, + model_revision='beta') trainer = build_trainer(default_args=kwargs) trainer.train() @@ -69,7 +71,7 @@ class TestTrainerWithNlp(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_trainer_with_user_defined_config(self): model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base' - cfg = read_config(model_id) + cfg = read_config(model_id, revision='beta') cfg.train.max_epochs = 20 cfg.train.work_dir = self.tmp_dir cfg_file = os.path.join(self.tmp_dir, 'config.json') @@ -78,7 +80,8 @@ class TestTrainerWithNlp(unittest.TestCase): model=model_id, train_dataset=self.dataset, eval_dataset=self.dataset, - cfg_file=cfg_file) + cfg_file=cfg_file, + model_revision='beta') trainer = build_trainer(default_args=kwargs) trainer.train() @@ -98,7 +101,7 @@ class TestTrainerWithNlp(unittest.TestCase): os.makedirs(tmp_dir) model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base' - cache_path = snapshot_download(model_id) + cache_path = snapshot_download(model_id, revision='beta') model = SbertForSequenceClassification.from_pretrained(cache_path) kwargs = dict( cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION), diff --git a/tests/trainers/utils/__init__.py b/tests/trainers/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/trainers/utils/test_inference.py b/tests/trainers/utils/test_inference.py new file mode 100644 index 00000000..87e5320e --- /dev/null +++ b/tests/trainers/utils/test_inference.py @@ -0,0 +1,116 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +import shutil +import tempfile +import unittest + +import torch +from torch import nn +from torch.utils.data import DataLoader + +from modelscope.metrics.builder import MetricKeys +from modelscope.metrics.sequence_classification_metric import \ + SequenceClassificationMetric +from modelscope.trainers.utils.inference import multi_gpu_test, single_gpu_test +from modelscope.utils.test_utils import (DistributedTestCase, + create_dummy_test_dataset, test_level) +from modelscope.utils.torch_utils import get_dist_info, init_dist + +dummy_dataset = create_dummy_test_dataset( + torch.rand((5, )), torch.randint(0, 4, (1, )), 20) + + +class DummyModel(nn.Module): + + def __init__(self): + super().__init__() + self.linear = nn.Linear(5, 4) + self.bn = nn.BatchNorm1d(4) + + def forward(self, feat, labels): + x = self.linear(feat) + + x = self.bn(x) + loss = torch.sum(x) + return dict(logits=x, loss=loss) + + +def test_func(dist=False): + dummy_model = DummyModel() + dataset = dummy_dataset.to_torch_dataset() + + dummy_loader = DataLoader( + dataset, + batch_size=2, + ) + + metric_class = SequenceClassificationMetric() + + if dist: + init_dist(launcher='pytorch') + + rank, world_size = get_dist_info() + device = torch.device(f'cuda:{rank}') + dummy_model.cuda() + + if world_size > 1: + from torch.nn.parallel.distributed import DistributedDataParallel + dummy_model = DistributedDataParallel( + dummy_model, device_ids=[torch.cuda.current_device()]) + test_func = multi_gpu_test + else: + test_func = single_gpu_test + + metric_results = test_func( + dummy_model, + dummy_loader, + device=device, + metric_classes=[metric_class]) + + return metric_results + + +@unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest') +class SingleGpuTestTest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + super().tearDown() + shutil.rmtree(self.tmp_dir) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_single_gpu_test(self): + metric_results = test_func() + self.assertIn(MetricKeys.ACCURACY, metric_results) + + +@unittest.skipIf(not torch.cuda.is_available() + or torch.cuda.device_count() <= 1, 'distributed unittest') +class MultiGpuTestTest(DistributedTestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + super().tearDown() + shutil.rmtree(self.tmp_dir) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_multi_gpu_test(self): + self.start( + test_func, + num_gpus=2, + assert_callback=lambda x: self.assertIn(MetricKeys.ACCURACY, x), + dist=True) + + +if __name__ == '__main__': + unittest.main()
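A minimal usage sketch tying the new pieces together, assuming this patch is applied; the sample dict and its keys are hypothetical and mirror the added unit tests rather than any real model input:

import torch

from modelscope.preprocessors import Filter, ToTensor
from modelscope.utils.data_utils import to_device

# convert selected values to tensors, then keep only the keys the model consumes
sample = {'input': [0.1, 0.2, 0.3], 'target': [0.1, 0.2, 0.3], 'path': 'noisy.png'}
sample = ToTensor(keys=['input', 'target'])(sample)
sample = Filter(reserved_keys=['input', 'target'])(sample)

# move the whole (possibly nested) batch to the target device,
# which is what the trainer now does in place of the removed collate_fn
device = 'cuda' if torch.cuda.is_available() else 'cpu'
sample = to_device(sample, device)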