
[to #43875101]

msdatasets: add COCO-style dataset support
unify task_datasets and MsDataset
fix Hugging Face datasets loading
master · feiwu.yfw · 3 years ago · commit 35548bd492
20 changed files with 296 additions and 122 deletions
  1. modelscope/hub/api.py (+4, -2)
  2. modelscope/models/cv/image_instance_segmentation/__init__.py (+0, -2)
  3. modelscope/models/cv/image_instance_segmentation/datasets/__init__.py (+0, -1)
  4. modelscope/msdatasets/ms_dataset.py (+54, -32)
  5. modelscope/msdatasets/task_datasets/__init__.py (+3, -0)
  6. modelscope/msdatasets/task_datasets/base.py (+0, -0)
  7. modelscope/msdatasets/task_datasets/builder.py (+0, -0)
  8. modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py (+35, -26)
  9. modelscope/msdatasets/task_datasets/torch_base_dataset.py (+0, -0)
  10. modelscope/msdatasets/task_datasets/veco_dataset.py (+0, -0)
  11. modelscope/msdatasets/utils/dataset_builder.py (+92, -3)
  12. modelscope/msdatasets/utils/dataset_utils.py (+28, -11)
  13. modelscope/trainers/cv/image_instance_segmentation_trainer.py (+0, -4)
  14. modelscope/trainers/nlp_trainer.py (+1, -1)
  15. modelscope/trainers/trainer.py (+20, -7)
  16. modelscope/utils/ast_utils.py (+2, -2)
  17. requirements/runtime.txt (+1, -2)
  18. tests/msdatasets/test_ms_dataset.py (+11, -0)
  19. tests/taskdataset/test_veco_dataset.py (+1, -1)
  20. tests/trainers/test_image_instance_segmentation_trainer.py (+44, -28)

modelscope/hub/api.py (+4, -2)

@@ -362,8 +362,10 @@ class HubApi:
dataset_name: str,
namespace: str,
revision: Optional[str] = DEFAULT_DATASET_REVISION):
return f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
f'Revision={revision}&FilePath={file_name}'
if file_name.endswith('.csv'):
file_name = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
f'Revision={revision}&FilePath={file_name}'
return file_name

def get_dataset_access_config(
self,
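
Below is a minimal sketch of the new behavior (not part of the diff; the .csv file name is a hypothetical example): only .csv meta files are expanded to a repo URL, while any other value, including the empty meta entry now passed from dataset_utils.py, is returned unchanged.

from modelscope.hub.api import HubApi

api = HubApi()
# A .csv meta file is expanded to the full dataset repo URL.
csv_url = api.get_dataset_file_url('train.csv', 'pets_small', 'modelscope')
# A non-.csv value (e.g. the empty meta entry of a zip-only split) passes through unchanged.
unchanged = api.get_dataset_file_url('', 'pets_small', 'modelscope')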


modelscope/models/cv/image_instance_segmentation/__init__.py (+0, -2)

@@ -7,13 +7,11 @@ if TYPE_CHECKING:
from .cascade_mask_rcnn_swin import CascadeMaskRCNNSwin
from .model import CascadeMaskRCNNSwinModel
from .postprocess_utils import get_img_ins_seg_result
from .datasets import ImageInstanceSegmentationCocoDataset
else:
_import_structure = {
'cascade_mask_rcnn_swin': ['CascadeMaskRCNNSwin'],
'model': ['CascadeMaskRCNNSwinModel'],
'postprocess_utils': ['get_img_ins_seg_result'],
'datasets': ['ImageInstanceSegmentationCocoDataset']
}

import sys


modelscope/models/cv/image_instance_segmentation/datasets/__init__.py (+0, -1)

@@ -1,2 +1 @@
from .dataset import ImageInstanceSegmentationCocoDataset
from .transforms import build_preprocess_transform

modelscope/msdatasets/ms_dataset.py (+54, -32)

@@ -13,9 +13,12 @@ from datasets.utils.file_utils import (is_relative_path,
relative_to_absolute_path)

from modelscope.msdatasets.config import MS_DATASETS_CACHE
from modelscope.utils.config import ConfigDict
from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
DatasetFormations, DownloadMode, Hubs)
from modelscope.utils.logger import get_logger
from .task_datasets.builder import build_task_dataset
from .utils.dataset_builder import ExternalDataset
from .utils.dataset_utils import (get_dataset_files,
get_target_dataset_structure,
load_dataset_builder)
@@ -67,9 +70,16 @@ class MsDataset:
def __len__(self):
return len(self._hf_ds)

@property
def config_kwargs(self):
if isinstance(self._hf_ds, ExternalDataset):
return self._hf_ds.config_kwargs
else:
return None

@classmethod
def from_hf_dataset(cls,
hf_ds: Union[Dataset, DatasetDict],
hf_ds: Union[Dataset, DatasetDict, ExternalDataset],
target: str = None) -> Union[dict, 'MsDataset']:
if isinstance(hf_ds, Dataset):
return cls(hf_ds, target)
@@ -77,6 +87,8 @@ class MsDataset:
if len(hf_ds.keys()) == 1:
return cls(next(iter(hf_ds.values())), target)
return {k: cls(v, target) for k, v in hf_ds.items()}
elif isinstance(hf_ds, ExternalDataset):
return cls(hf_ds)
else:
raise TypeError(
f'"hf_ds" must be a Dataset or DatasetDict, but got {type(hf_ds)}'
@@ -96,7 +108,8 @@ class MsDataset:
Mapping[str, Union[str,
Sequence[str]]]]] = None,
download_mode: Optional[DownloadMode] = DownloadMode.
REUSE_DATASET_IF_EXISTS
REUSE_DATASET_IF_EXISTS,
**config_kwargs,
) -> Union[dict, 'MsDataset']:
"""Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
Args:
@@ -113,6 +126,7 @@ class MsDataset:
hub (Hubs or str, optional): When loading from a remote hub, where it is from. default Hubs.modelscope
download_mode (DownloadMode or str, optional): How to treat existing datasets. default
DownloadMode.REUSE_DATASET_IF_EXISTS
**config_kwargs (additional keyword arguments): Keyword arguments to be passed

Returns:
MsDataset (obj:`MsDataset`): MsDataset object for a certain dataset.
@@ -128,7 +142,8 @@ class MsDataset:
split=split,
data_dir=data_dir,
data_files=data_files,
download_mode=download_mode.value)
download_mode=download_mode.value,
**config_kwargs)
return MsDataset.from_hf_dataset(dataset, target=target)
elif hub == Hubs.modelscope:
return MsDataset._load_ms_dataset(
@@ -140,22 +155,22 @@ class MsDataset:
split=split,
data_dir=data_dir,
data_files=data_files,
download_mode=download_mode)
download_mode=download_mode,
**config_kwargs)

@staticmethod
def _load_ms_dataset(
dataset_name: Union[str, list],
namespace: Optional[str] = None,
target: Optional[str] = None,
version: Optional[str] = DEFAULT_DATASET_REVISION,
subset_name: Optional[str] = None,
split: Optional[str] = None,
data_dir: Optional[str] = None,
data_files: Optional[Union[str, Sequence[str],
Mapping[str, Union[str,
Sequence[str]]]]] = None,
download_mode: Optional[DownloadMode] = None
) -> Union[dict, 'MsDataset']:
def _load_ms_dataset(dataset_name: Union[str, list],
namespace: Optional[str] = None,
target: Optional[str] = None,
version: Optional[str] = DEFAULT_DATASET_REVISION,
subset_name: Optional[str] = None,
split: Optional[str] = None,
data_dir: Optional[str] = None,
data_files: Optional[Union[
str, Sequence[str],
Mapping[str, Union[str, Sequence[str]]]]] = None,
download_mode: Optional[DownloadMode] = None,
**config_kwargs) -> Union[dict, 'MsDataset']:
if isinstance(dataset_name, str):
dataset_formation = DatasetFormations.native
if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \
@@ -184,7 +199,8 @@ class MsDataset:
data_dir=data_dir,
data_files=data_files,
cache_dir=MS_DATASETS_CACHE,
download_mode=download_mode.value)
download_mode=download_mode.value,
**config_kwargs)
else:
dataset = MsDataset._load_from_ms(
dataset_name,
@@ -195,7 +211,7 @@ class MsDataset:
subset_name=subset_name,
split=split,
download_mode=download_mode,
)
**config_kwargs)
elif isinstance(dataset_name, list):
if target is None:
target = 'target'
@@ -206,16 +222,15 @@ class MsDataset:
return MsDataset.from_hf_dataset(dataset, target=target)

@staticmethod
def _load_from_ms(
dataset_name: str,
dataset_files: dict,
download_dir: str,
namespace: Optional[str] = None,
version: Optional[str] = DEFAULT_DATASET_REVISION,
subset_name: Optional[str] = None,
split: Optional[str] = None,
download_mode: Optional[DownloadMode] = None,
) -> Union[Dataset, DatasetDict]:
def _load_from_ms(dataset_name: str,
dataset_files: dict,
download_dir: str,
namespace: Optional[str] = None,
version: Optional[str] = DEFAULT_DATASET_REVISION,
subset_name: Optional[str] = None,
split: Optional[str] = None,
download_mode: Optional[DownloadMode] = None,
**config_kwargs) -> Union[Dataset, DatasetDict]:
for json_path in dataset_files['.json']:
if json_path.endswith(f'{dataset_name}.json'):
with open(json_path, encoding='utf-8') as dataset_json_file:
@@ -226,7 +241,6 @@ class MsDataset:
meta_map, file_map = get_dataset_files(target_dataset_structure,
dataset_name, namespace,
version)

builder = load_dataset_builder(
dataset_name,
subset_name,
@@ -235,7 +249,8 @@ class MsDataset:
zip_data_files=file_map,
cache_dir=MS_DATASETS_CACHE,
version=version,
split=list(target_dataset_structure.keys()))
split=list(target_dataset_structure.keys()),
**config_kwargs)

download_config = DownloadConfig(
cache_dir=download_dir,
@@ -253,7 +268,6 @@ class MsDataset:
data_dir=download_dir,
)
builder.download_and_prepare(
download_config=download_config,
dl_manager=dl_manager,
download_mode=download_mode.value,
try_from_hf_gcs=False)
@@ -338,6 +352,8 @@ class MsDataset:
self,
columns: Union[str, List[str]] = None,
preprocessors: Union[Callable, List[Callable]] = None,
task_name: str = None,
task_data_config: ConfigDict = None,
**format_kwargs,
):
"""Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to
@@ -350,6 +366,8 @@ class MsDataset:
columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only). If the
preprocessor is None, the arg columns must have at least one column. If the `preprocessors` is not None,
the output fields of processors will also be added.
task_name (str, default None): task name, refer to :obj:`Tasks` for more details
task_data_config (ConfigDict, default None): config dict for model object.
format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`.

Returns:
@@ -360,6 +378,10 @@ class MsDataset:
raise ImportError(
'The function to_torch_dataset requires pytorch to be installed'
)
if isinstance(self._hf_ds, ExternalDataset):
task_data_config.update({'preprocessor': preprocessors})
return build_task_dataset(task_data_config, task_name,
self._hf_ds.config_kwargs)
if preprocessors is not None:
return self.to_torch_dataset_with_processors(
preprocessors, columns=columns)
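
Putting the ms_dataset.py changes together: extra keyword arguments given to MsDataset.load are forwarded to the dataset builder, and when the backing object is an ExternalDataset, to_torch_dataset builds the registered task dataset instead of mapping columns. A minimal sketch following the new tests (the task_data_config values are an illustrative assumption, not mandated by the diff):

from modelscope.metainfo import Models
from modelscope.msdatasets import MsDataset
from modelscope.utils.config import ConfigDict
from modelscope.utils.constant import DownloadMode, Tasks

# classes is not a datasets.load_dataset argument; it travels via **config_kwargs
# and ends up in ExternalDataset.config_kwargs.
ds = MsDataset.load(
    'pets_small',
    namespace='modelscope',
    split='train',
    download_mode=DownloadMode.FORCE_REDOWNLOAD,
    classes=('Cat', 'Dog'))
print(ds.config_kwargs)  # extracted split paths plus the forwarded kwargs

# For ExternalDataset-backed data, to_torch_dataset defers to build_task_dataset.
torch_ds = ds.to_torch_dataset(
    task_name=Tasks.image_segmentation,
    task_data_config=ConfigDict(type=Models.cascade_mask_rcnn_swin, mode='train'))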


modelscope/task_datasets/__init__.py → modelscope/msdatasets/task_datasets/__init__.py (+3, -0)

@@ -8,6 +8,7 @@ if TYPE_CHECKING:
from .builder import TASK_DATASETS, build_task_dataset
from .torch_base_dataset import TorchTaskDataset
from .veco_dataset import VecoDataset
from .image_instance_segmentation_coco_dataset import ImageInstanceSegmentationCocoDataset

else:
_import_structure = {
@@ -15,6 +16,8 @@ else:
'builder': ['TASK_DATASETS', 'build_task_dataset'],
'torch_base_dataset': ['TorchTaskDataset'],
'veco_dataset': ['VecoDataset'],
'image_instance_segmentation_coco_dataset':
['ImageInstanceSegmentationCocoDataset']
}
import sys
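
With the package moved under modelscope.msdatasets, the public imports now look like this (a trivial sketch; the names are re-exported lazily via _import_structure):

from modelscope.msdatasets.task_datasets import (
    ImageInstanceSegmentationCocoDataset, TorchTaskDataset, VecoDataset)
from modelscope.msdatasets.task_datasets.builder import (TASK_DATASETS,
                                                          build_task_dataset)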


modelscope/task_datasets/base.py → modelscope/msdatasets/task_datasets/base.py (renamed, +0, -0)


modelscope/task_datasets/builder.py → modelscope/msdatasets/task_datasets/builder.py (renamed, +0, -0)


modelscope/models/cv/image_instance_segmentation/datasets/dataset.py → modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py (+35, -26)

@@ -2,14 +2,32 @@ import os.path as osp

import numpy as np
from pycocotools.coco import COCO
from torch.utils.data import Dataset


class ImageInstanceSegmentationCocoDataset(Dataset):
from modelscope.metainfo import Models
from modelscope.utils.constant import Tasks
from .builder import TASK_DATASETS
from .torch_base_dataset import TorchTaskDataset

DATASET_STRUCTURE = {
'train': {
'annotation': 'annotations/instances_train.json',
'images': 'images/train'
},
'validation': {
'annotation': 'annotations/instances_val.json',
'images': 'images/val'
}
}


@TASK_DATASETS.register_module(
module_name=Models.cascade_mask_rcnn_swin,
group_key=Tasks.image_segmentation)
class ImageInstanceSegmentationCocoDataset(TorchTaskDataset):
"""Coco-style dataset for image instance segmentation.

Args:
ann_file (str): Annotation file path.
split_config (dict): Annotation file path. {"train":"xxxxx"}
classes (Sequence[str], optional): Specify classes to load.
If is None, ``cls.CLASSES`` will be used. Default: None.
data_root (str, optional): Data root for ``ann_file``,
@@ -37,30 +55,27 @@ class ImageInstanceSegmentationCocoDataset(Dataset):
'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush')

def __init__(self,
ann_file,
split_config: dict,
preprocessor=None,
classes=None,
data_root=None,
img_prefix='',
seg_prefix=None,
test_mode=False,
filter_empty_gt=True):
self.ann_file = ann_file
self.data_root = data_root
self.img_prefix = img_prefix
filter_empty_gt=True,
**kwargs):
self.data_root = next(iter(split_config.values()))
self.split = next(iter(split_config.keys()))
self.preprocessor = preprocessor

self.ann_file = osp.join(self.data_root,
DATASET_STRUCTURE[self.split]['annotation'])

self.img_prefix = osp.join(self.data_root,
DATASET_STRUCTURE[self.split]['images'])
self.seg_prefix = seg_prefix
self.test_mode = test_mode
self.filter_empty_gt = filter_empty_gt
self.CLASSES = self.get_classes(classes)

# join paths if data_root is specified
if self.data_root is not None:
if not osp.isabs(self.ann_file):
self.ann_file = osp.join(self.data_root, self.ann_file)
if not (self.img_prefix is None or osp.isabs(self.img_prefix)):
self.img_prefix = osp.join(self.data_root, self.img_prefix)
if not (self.seg_prefix is None or osp.isabs(self.seg_prefix)):
self.seg_prefix = osp.join(self.data_root, self.seg_prefix)

# load annotations
self.data_infos = self.load_annotations(self.ann_file)

@@ -71,8 +86,6 @@ class ImageInstanceSegmentationCocoDataset(Dataset):
# set group flag for the sampler
self._set_group_flag()

self.preprocessor = None

def __len__(self):
"""Total number of samples of data."""
return len(self.data_infos)
@@ -326,7 +339,3 @@ class ImageInstanceSegmentationCocoDataset(Dataset):
raise ValueError(f'Unsupported type {type(classes)} of classes.')

return class_names

def to_torch_dataset(self, preprocessors=None):
self.preprocessor = preprocessors
return self
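
Construction now starts from split_config (split name mapped to an extracted data root) instead of explicit annotation/image paths; both are derived from DATASET_STRUCTURE. A minimal sketch, assuming a COCO-style layout like the trainer test's toy data extracted under ./toydata (the path is illustrative):

from modelscope.msdatasets.task_datasets import ImageInstanceSegmentationCocoDataset

# Resolves annotations to ./toydata/annotations/instances_train.json and
# images to ./toydata/images/train via DATASET_STRUCTURE.
train_ds = ImageInstanceSegmentationCocoDataset(
    split_config={'train': './toydata'},
    classes=('Cat', 'Dog'),
    test_mode=False)
print(len(train_ds))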

modelscope/task_datasets/torch_base_dataset.py → modelscope/msdatasets/task_datasets/torch_base_dataset.py (renamed, +0, -0)


modelscope/task_datasets/veco_dataset.py → modelscope/msdatasets/task_datasets/veco_dataset.py (renamed, +0, -0)


modelscope/msdatasets/utils/dataset_builder.py (+92, -3)

@@ -8,6 +8,7 @@ from datasets.info import DatasetInfo
from datasets.packaged_modules import csv
from datasets.utils.filelock import FileLock

from modelscope.utils.constant import DownloadMode
from modelscope.utils.logger import get_logger

logger = get_logger()
@@ -26,11 +27,11 @@ class MsCsvDatasetBuilder(csv.Csv):
zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None,
**config_kwargs,
):
self.namespace = namespace
super().__init__(
cache_dir=cache_dir,
name=subset_name,
hash=hash,
namespace=namespace,
data_files=meta_data_files,
**config_kwargs)

@@ -56,6 +57,25 @@ class MsCsvDatasetBuilder(csv.Csv):
os.rmdir(self._cache_dir)
self.zip_data_files = zip_data_files

def _relative_data_dir(self, with_version=True, with_hash=True) -> str:
"""Relative path of this dataset in cache_dir:
Will be:
self.name/self.config.version/self.hash/
or if a namespace has been specified:
self.namespace___self.name/self.config.version/self.hash/
"""
builder_data_dir = self.name if self.namespace is None else f'{self.namespace}___{self.name}'
builder_config = self.config
hash = self.hash
if builder_config:
builder_data_dir = os.path.join(builder_data_dir, self.config_id)
if with_version:
builder_data_dir = os.path.join(builder_data_dir,
str(self.config.version))
if with_hash and hash and isinstance(hash, str):
builder_data_dir = os.path.join(builder_data_dir, hash)
return builder_data_dir

def _build_cache_dir(self):
builder_data_dir = os.path.join(
self._cache_dir_root,
@@ -77,8 +97,15 @@ class MsCsvDatasetBuilder(csv.Csv):
datasets.SplitGenerator(
name=split_name,
gen_kwargs={
'files': dl_manager.iter_files(files),
'base_dir': zip_data_files.get(split_name)
'files':
dl_manager.iter_files(files),
'base_dir':
os.path.join(
zip_data_files.get(split_name),
os.path.splitext(
self.zip_data_files.get(split_name))[0])
if self.zip_data_files.get(split_name) else
zip_data_files.get(split_name)
}))
return splits

@@ -111,3 +138,65 @@ class MsCsvDatasetBuilder(csv.Csv):
logger.error(
f"Failed to read file '{file}' with error {type(e)}: {e}")
raise


class TaskSpecificDatasetBuilder(MsCsvDatasetBuilder):

def __init__(
self,
dataset_name: str,
cache_dir: str,
namespace: str,
subset_name: str,
hash: str,
meta_data_files: Mapping[str, Union[str, Sequence[str]]],
zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None,
**config_kwargs,
):
self.name = dataset_name
self.subset_name = subset_name
self.namespace = namespace
self.hash = hash
self.data_files = meta_data_files
self.zip_data_files = zip_data_files
self.split_path_dict = None
self.config = None
self._cache_dir_root = os.path.expanduser(cache_dir)
self._cache_dir = self._build_cache_dir()
self._config_kwargs = config_kwargs

def download_and_prepare(self, download_mode, dl_manager,
**download_kwargs):
# Prevent parallel disk operations
lock_path = os.path.join(
self._cache_dir_root,
self._cache_dir.replace(os.sep, '_') + '.lock')
with FileLock(lock_path):
data_exists = os.path.exists(self._cache_dir)
if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS:
logger.warning(
f'Reusing dataset {self.name} ({self._cache_dir})')
return
logger.info(f'Generating dataset {self.name} ({self._cache_dir})')
self._download_and_prepare(dl_manager=dl_manager)

def _download_and_prepare(self, dl_manager):
split_path_dict = dl_manager.download_and_extract(self.zip_data_files)
self.split_path_dict = {
k: os.path.join(v,
os.path.splitext(self.zip_data_files[k])[0])
for k, v in split_path_dict.items()
}

def as_dataset(self):
return ExternalDataset(self.split_path_dict, self._config_kwargs)


class ExternalDataset(object):

def __init__(self, split_path_dict, config_kwargs):
config_kwargs.update({'split_config': split_path_dict})
self.config_kwargs = config_kwargs

def __len__(self):
return len(self.config_kwargs['split_config'])
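
TaskSpecificDatasetBuilder skips CSV parsing entirely: download_and_prepare only downloads and extracts the per-split zip files, and as_dataset() wraps the resulting paths in an ExternalDataset. A small sketch of what that container holds (the path is illustrative):

from modelscope.msdatasets.utils.dataset_builder import ExternalDataset

ext = ExternalDataset(
    split_path_dict={'train': '/path/to/extracted/train'},  # illustrative path
    config_kwargs={'classes': ('Cat', 'Dog')})
print(ext.config_kwargs['split_config'])  # {'train': '/path/to/extracted/train'}
print(len(ext))                           # number of splits: 1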

modelscope/msdatasets/utils/dataset_utils.py (+28, -11)

@@ -6,7 +6,7 @@ from datasets.builder import DatasetBuilder

from modelscope.utils.constant import DEFAULT_DATASET_REVISION
from modelscope.utils.logger import get_logger
from .dataset_builder import MsCsvDatasetBuilder
from .dataset_builder import MsCsvDatasetBuilder, TaskSpecificDatasetBuilder

logger = get_logger()

@@ -87,7 +87,7 @@ def get_dataset_files(subset_split_into: dict,
modelscope_api = HubApi()
for split, info in subset_split_into.items():
meta_map[split] = modelscope_api.get_dataset_file_url(
info['meta'], dataset_name, namespace, revision)
info.get('meta', ''), dataset_name, namespace, revision)
if info.get('file'):
file_map[split] = info['file']
return meta_map, file_map
@@ -99,15 +99,32 @@ def load_dataset_builder(dataset_name: str, subset_name: str, namespace: str,
zip_data_files: Mapping[str, Union[str,
Sequence[str]]],
cache_dir: str, version: Optional[Union[str]],
split: Sequence[str]) -> DatasetBuilder:
split: Sequence[str],
**config_kwargs) -> DatasetBuilder:
sub_dir = os.path.join(version, '_'.join(split))
builder_instance = MsCsvDatasetBuilder(
dataset_name=dataset_name,
namespace=namespace,
cache_dir=cache_dir,
subset_name=subset_name,
meta_data_files=meta_data_files,
zip_data_files=zip_data_files,
hash=sub_dir)
meta_data_file = next(iter(meta_data_files.values()))
if not meta_data_file:
builder_instance = TaskSpecificDatasetBuilder(
dataset_name=dataset_name,
namespace=namespace,
cache_dir=cache_dir,
subset_name=subset_name,
meta_data_files=meta_data_files,
zip_data_files=zip_data_files,
hash=sub_dir,
**config_kwargs)
elif meta_data_file.endswith('.csv'):
builder_instance = MsCsvDatasetBuilder(
dataset_name=dataset_name,
namespace=namespace,
cache_dir=cache_dir,
subset_name=subset_name,
meta_data_files=meta_data_files,
zip_data_files=zip_data_files,
hash=sub_dir)
else:
raise NotImplementedError(
f'Dataset mete file extensions "{os.path.splitext(meta_data_file)[-1]}" is not implemented yet'
)

return builder_instance
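
load_dataset_builder now dispatches on the split's meta entry: an empty meta selects TaskSpecificDatasetBuilder (zip-only data), a .csv meta keeps the existing MsCsvDatasetBuilder, and any other extension raises NotImplementedError. A hedged sketch of the zip-only path (file names, cache dir and revision are illustrative):

from modelscope.msdatasets.utils.dataset_utils import load_dataset_builder

builder = load_dataset_builder(
    dataset_name='pets_small',
    subset_name=None,
    namespace='modelscope',
    meta_data_files={'train': ''},          # empty meta -> TaskSpecificDatasetBuilder
    zip_data_files={'train': 'train.zip'},  # illustrative zip name
    cache_dir='/tmp/ms_datasets_cache',     # illustrative cache dir
    version='master',
    split=['train'],
    classes=('Cat', 'Dog'))                 # forwarded via **config_kwargs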

modelscope/trainers/cv/image_instance_segmentation_trainer.py (+0, -4)

@@ -22,7 +22,3 @@ class ImageInstanceSegmentationTrainer(EpochBasedTrainer):

def prediction_step(self, model, inputs):
pass

def to_task_dataset(self, datasets, mode, preprocessor=None):
# wait for dataset interface to become stable...
return datasets.to_torch_dataset(preprocessor)

modelscope/trainers/nlp_trainer.py (+1, -1)

@@ -202,7 +202,7 @@ class VecoTrainer(NlpEpochBasedTrainer):
"""Veco evaluates the datasets one by one.

"""
from modelscope.task_datasets import VecoDataset
from modelscope.msdatasets.task_datasets import VecoDataset
self.model.eval()
self._mode = ModeKeys.EVAL
metric_values = {}


modelscope/trainers/trainer.py (+20, -7)

@@ -21,11 +21,12 @@ from modelscope.metainfo import Trainers
from modelscope.metrics import build_metric, task_default_metrics
from modelscope.models.base import Model, TorchModel
from modelscope.msdatasets.ms_dataset import MsDataset
from modelscope.msdatasets.task_datasets.builder import build_task_dataset
from modelscope.msdatasets.task_datasets.torch_base_dataset import \
TorchTaskDataset
from modelscope.preprocessors.base import Preprocessor
from modelscope.preprocessors.builder import build_preprocessor
from modelscope.preprocessors.common import Compose
from modelscope.task_datasets.builder import build_task_dataset
from modelscope.task_datasets.torch_base_dataset import TorchTaskDataset
from modelscope.trainers.hooks.builder import HOOKS
from modelscope.trainers.hooks.priority import Priority, get_priority
from modelscope.trainers.lrscheduler.builder import build_lr_scheduler
@@ -288,14 +289,21 @@ class EpochBasedTrainer(BaseTrainer):
if isinstance(datasets, TorchTaskDataset):
return datasets
elif isinstance(datasets, MsDataset):
datasets = datasets.to_torch_dataset(
cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \
else ConfigDict(type=None, mode=mode)
return datasets.to_torch_dataset(
task_data_config=cfg,
task_name=self.cfg.task,
preprocessors=preprocessor)
return datasets
elif isinstance(datasets, List) and isinstance(
datasets[0], MsDataset):
cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \
else ConfigDict(type=None, mode=mode)
datasets = [
d.to_torch_dataset(preprocessor=preprocessor)
for d in datasets
d.to_torch_dataset(
task_data_config=cfg,
task_name=self.cfg.task,
preprocessors=preprocessor) for d in datasets
]
cfg = ConfigDict(
type=self.cfg.task, mode=mode, datasets=datasets)
@@ -585,8 +593,13 @@ class EpochBasedTrainer(BaseTrainer):
subset_name=data_cfg.subset_name if hasattr(
data_cfg, 'subset_name') else None,
hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope,
**data_cfg,
)
torch_dataset = dataset.to_torch_dataset(preprocessors=preprocessor)
cfg = ConfigDict(type=self.cfg.model.type, mode=mode)
torch_dataset = dataset.to_torch_dataset(
task_data_config=cfg,
task_name=self.cfg.task,
preprocessors=self.preprocessor)
dataset = self.to_task_dataset(torch_dataset, mode)
return dataset
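
The trainer no longer needs per-task to_task_dataset overrides: it packs the model type and mode into a ConfigDict and lets MsDataset/build_task_dataset resolve the registered dataset class. A standalone sketch of that lookup (the extracted path is illustrative):

from modelscope.metainfo import Models
from modelscope.msdatasets.task_datasets.builder import build_task_dataset
from modelscope.utils.config import ConfigDict
from modelscope.utils.constant import Tasks

# Model type + mode select the registered task dataset; the task is the group key.
task_data_config = ConfigDict(
    type=Models.cascade_mask_rcnn_swin, mode='train', preprocessor=None)
task_dataset = build_task_dataset(
    task_data_config,
    Tasks.image_segmentation,
    {'split_config': {'train': './toydata'},  # illustrative extracted split path
     'classes': ('Cat', 'Dog')})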



modelscope/utils/ast_utils.py (+2, -2)

@@ -30,8 +30,8 @@ MODELSCOPE_PATH = '/'.join(os.path.dirname(__file__).split('/')[:-1])
REGISTER_MODULE = 'register_module'
IGNORED_PACKAGES = ['modelscope', '.']
SCAN_SUB_FOLDERS = [
'models', 'metrics', 'pipelines', 'preprocessors', 'task_datasets',
'trainers'
'models', 'metrics', 'pipelines', 'preprocessors',
'msdatasets/task_datasets', 'trainers'
]
INDEXER_FILE = 'ast_indexer'
DECORATOR_KEY = 'decorators'


requirements/runtime.txt (+1, -2)

@@ -1,6 +1,5 @@
addict
#version above 2.1.0 introduces backward-compatability issue which is being resolved
datasets==2.1.0
datasets
easydict
einops
filelock>=3.3.0


tests/msdatasets/test_ms_dataset.py (+11, -0)

@@ -4,6 +4,7 @@ from modelscope.models import Model
from modelscope.msdatasets import MsDataset
from modelscope.preprocessors import SequenceClassificationPreprocessor
from modelscope.preprocessors.base import Preprocessor
from modelscope.utils.constant import DownloadMode
from modelscope.utils.test_utils import require_tf, require_torch, test_level


@@ -30,6 +31,16 @@ class ImgPreprocessor(Preprocessor):

class MsDatasetTest(unittest.TestCase):

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_coco(self):
ms_ds_train = MsDataset.load(
'pets_small',
namespace='modelscope',
split='train',
download_mode=DownloadMode.FORCE_REDOWNLOAD,
classes=('1', '2'))
print(ms_ds_train._hf_ds.config_kwargs)

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_ms_csv_basic(self):
ms_ds_train = MsDataset.load(


tests/taskdataset/test_veco_dataset.py (+1, -1)

@@ -2,7 +2,7 @@

import unittest

from modelscope.task_datasets.veco_dataset import VecoDataset
from modelscope.msdatasets.task_datasets.veco_dataset import VecoDataset
from modelscope.utils.test_utils import test_level




tests/trainers/test_image_instance_segmentation_trainer.py (+44, -28)

@@ -8,10 +8,13 @@ from functools import partial

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.metainfo import Trainers
from modelscope.models.cv.image_instance_segmentation import (
CascadeMaskRCNNSwinModel, ImageInstanceSegmentationCocoDataset)
from modelscope.models.cv.image_instance_segmentation import \
CascadeMaskRCNNSwinModel
from modelscope.msdatasets import MsDataset
from modelscope.msdatasets.task_datasets import \
ImageInstanceSegmentationCocoDataset
from modelscope.trainers import build_trainer
from modelscope.utils.config import Config
from modelscope.utils.config import Config, ConfigDict
from modelscope.utils.constant import ModelFile
from modelscope.utils.test_utils import test_level

@@ -27,34 +30,47 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase):
config_path = os.path.join(cache_path, ModelFile.CONFIGURATION)
cfg = Config.from_file(config_path)

data_root = cfg.dataset.data_root
classes = tuple(cfg.dataset.classes)
max_epochs = cfg.train.max_epochs
samples_per_gpu = cfg.train.dataloader.batch_size_per_gpu

if data_root is None:
try:
train_data_cfg = cfg.dataset.train
val_data_cfg = cfg.dataset.val
except Exception:
train_data_cfg = None
val_data_cfg = None
if train_data_cfg is None:
# use default toy data
dataset_path = os.path.join(cache_path, 'toydata.zip')
with zipfile.ZipFile(dataset_path, 'r') as zipf:
zipf.extractall(cache_path)
data_root = cache_path + '/toydata/'
classes = ('Cat', 'Dog')

self.train_dataset = ImageInstanceSegmentationCocoDataset(
data_root + 'annotations/instances_train.json',
classes=classes,
data_root=data_root,
img_prefix=data_root + 'images/train/',
seg_prefix=None,
test_mode=False)

self.eval_dataset = ImageInstanceSegmentationCocoDataset(
data_root + 'annotations/instances_val.json',
classes=classes,
data_root=data_root,
img_prefix=data_root + 'images/val/',
seg_prefix=None,
test_mode=True)
train_data_cfg = ConfigDict(
name='pets_small',
split='train',
classes=('Cat', 'Dog'),
test_mode=False)
if val_data_cfg is None:
val_data_cfg = ConfigDict(
name='pets_small',
split='validation',
classes=('Cat', 'Dog'),
test_mode=True)

self.train_dataset = MsDataset.load(
dataset_name=train_data_cfg.name,
split=train_data_cfg.split,
classes=train_data_cfg.classes,
test_mode=train_data_cfg.test_mode)
assert self.train_dataset.config_kwargs[
'classes'] == train_data_cfg.classes
assert next(
iter(self.train_dataset.config_kwargs['split_config'].values()))

self.eval_dataset = MsDataset.load(
dataset_name=val_data_cfg.name,
split=val_data_cfg.split,
classes=val_data_cfg.classes,
test_mode=val_data_cfg.test_mode)
assert self.eval_dataset.config_kwargs[
'classes'] == val_data_cfg.classes
assert next(
iter(self.eval_dataset.config_kwargs['split_config'].values()))

from mmcv.parallel import collate
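
The rest of the test then feeds these MsDataset objects to the trainer just as the hand-built datasets were before; a hedged sketch of that wiring (the trainer name constant and the model argument are assumptions, not shown in this diff):

from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer

kwargs = dict(
    model='<model-id-or-local-snapshot-dir>',    # placeholder, not a real model id
    train_dataset=self.train_dataset,            # built in setUp above
    eval_dataset=self.eval_dataset)
trainer = build_trainer(
    name=Trainers.image_instance_segmentation,   # assumed registered trainer name
    default_args=kwargs)
trainer.train()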


