
[to #43875101]

msdatasets: add COCO dataset
unify TaskDataset and MsDataset
fix HF datasets
feiwu.yfw committed 3 years ago · branch master · commit 35548bd492
20 changed files with 296 additions and 122 deletions
1. modelscope/hub/api.py (+4, -2)
2. modelscope/models/cv/image_instance_segmentation/__init__.py (+0, -2)
3. modelscope/models/cv/image_instance_segmentation/datasets/__init__.py (+0, -1)
4. modelscope/msdatasets/ms_dataset.py (+54, -32)
5. modelscope/msdatasets/task_datasets/__init__.py (+3, -0)
6. modelscope/msdatasets/task_datasets/base.py (+0, -0)
7. modelscope/msdatasets/task_datasets/builder.py (+0, -0)
8. modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py (+35, -26)
9. modelscope/msdatasets/task_datasets/torch_base_dataset.py (+0, -0)
10. modelscope/msdatasets/task_datasets/veco_dataset.py (+0, -0)
11. modelscope/msdatasets/utils/dataset_builder.py (+92, -3)
12. modelscope/msdatasets/utils/dataset_utils.py (+28, -11)
13. modelscope/trainers/cv/image_instance_segmentation_trainer.py (+0, -4)
14. modelscope/trainers/nlp_trainer.py (+1, -1)
15. modelscope/trainers/trainer.py (+20, -7)
16. modelscope/utils/ast_utils.py (+2, -2)
17. requirements/runtime.txt (+1, -2)
18. tests/msdatasets/test_ms_dataset.py (+11, -0)
19. tests/taskdataset/test_veco_dataset.py (+1, -1)
20. tests/trainers/test_image_instance_segmentation_trainer.py (+44, -28)

modelscope/hub/api.py (+4, -2)

@@ -362,8 +362,10 @@ class HubApi:
                              dataset_name: str,
                              namespace: str,
                              revision: Optional[str] = DEFAULT_DATASET_REVISION):
-        return f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
-               f'Revision={revision}&FilePath={file_name}'
+        if file_name.endswith('.csv'):
+            file_name = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
+                        f'Revision={revision}&FilePath={file_name}'
+        return file_name

     def get_dataset_access_config(
             self,

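In short, the URL helper (get_dataset_file_url, as called from dataset_utils.py further down) now only expands .csv meta entries into a repo URL and returns anything else untouched. A small sketch with illustrative dataset name and namespace:

    from modelscope.hub.api import HubApi

    api = HubApi()
    # a .csv meta file is rewritten into a full dataset-repo URL
    csv_url = api.get_dataset_file_url('train.csv', 'pets_small', 'modelscope')
    # any other entry (e.g. a zip archive resolved later by the builder) comes back as-is
    assert api.get_dataset_file_url('train.zip', 'pets_small', 'modelscope') == 'train.zip'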

modelscope/models/cv/image_instance_segmentation/__init__.py (+0, -2)

@@ -7,13 +7,11 @@ if TYPE_CHECKING:
     from .cascade_mask_rcnn_swin import CascadeMaskRCNNSwin
     from .model import CascadeMaskRCNNSwinModel
     from .postprocess_utils import get_img_ins_seg_result
-    from .datasets import ImageInstanceSegmentationCocoDataset
 else:
     _import_structure = {
         'cascade_mask_rcnn_swin': ['CascadeMaskRCNNSwin'],
         'model': ['CascadeMaskRCNNSwinModel'],
         'postprocess_utils': ['get_img_ins_seg_result'],
-        'datasets': ['ImageInstanceSegmentationCocoDataset']
     }

     import sys


modelscope/models/cv/image_instance_segmentation/datasets/__init__.py (+0, -1)

@@ -1,2 +1 @@
-from .dataset import ImageInstanceSegmentationCocoDataset
 from .transforms import build_preprocess_transform

modelscope/msdatasets/ms_dataset.py (+54, -32)

@@ -13,9 +13,12 @@ from datasets.utils.file_utils import (is_relative_path,
                                        relative_to_absolute_path)

 from modelscope.msdatasets.config import MS_DATASETS_CACHE
+from modelscope.utils.config import ConfigDict
 from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
                                        DatasetFormations, DownloadMode, Hubs)
 from modelscope.utils.logger import get_logger
+from .task_datasets.builder import build_task_dataset
+from .utils.dataset_builder import ExternalDataset
 from .utils.dataset_utils import (get_dataset_files,
                                   get_target_dataset_structure,
                                   load_dataset_builder)
@@ -67,9 +70,16 @@ class MsDataset:
     def __len__(self):
         return len(self._hf_ds)

+    @property
+    def config_kwargs(self):
+        if isinstance(self._hf_ds, ExternalDataset):
+            return self._hf_ds.config_kwargs
+        else:
+            return None
+
     @classmethod
     def from_hf_dataset(cls,
-                        hf_ds: Union[Dataset, DatasetDict],
+                        hf_ds: Union[Dataset, DatasetDict, ExternalDataset],
                         target: str = None) -> Union[dict, 'MsDataset']:
         if isinstance(hf_ds, Dataset):
             return cls(hf_ds, target)
@@ -77,6 +87,8 @@ class MsDataset:
             if len(hf_ds.keys()) == 1:
                 return cls(next(iter(hf_ds.values())), target)
             return {k: cls(v, target) for k, v in hf_ds.items()}
+        elif isinstance(hf_ds, ExternalDataset):
+            return cls(hf_ds)
         else:
             raise TypeError(
                 f'"hf_ds" must be a Dataset or DatasetDict, but got {type(hf_ds)}'
@@ -96,7 +108,8 @@ class MsDataset:
                                        Mapping[str, Union[str,
                                                           Sequence[str]]]]] = None,
             download_mode: Optional[DownloadMode] = DownloadMode.
-            REUSE_DATASET_IF_EXISTS
+            REUSE_DATASET_IF_EXISTS,
+            **config_kwargs,
     ) -> Union[dict, 'MsDataset']:
         """Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
         Args:
@@ -113,6 +126,7 @@ class MsDataset:
             hub (Hubs or str, optional): When loading from a remote hub, where it is from. default Hubs.modelscope
             download_mode (DownloadMode or str, optional): How to treat existing datasets. default
                                                            DownloadMode.REUSE_DATASET_IF_EXISTS
+            **config_kwargs (additional keyword arguments): Keyword arguments to be passed

         Returns:
             MsDataset (obj:`MsDataset`): MsDataset object for a certain dataset.
@@ -128,7 +142,8 @@ class MsDataset:
                 split=split,
                 data_dir=data_dir,
                 data_files=data_files,
-                download_mode=download_mode.value)
+                download_mode=download_mode.value,
+                **config_kwargs)
             return MsDataset.from_hf_dataset(dataset, target=target)
         elif hub == Hubs.modelscope:
             return MsDataset._load_ms_dataset(
@@ -140,22 +155,22 @@ class MsDataset:
             split=split,
             data_dir=data_dir,
             data_files=data_files,
-            download_mode=download_mode)
+            download_mode=download_mode,
+            **config_kwargs)

     @staticmethod
-    def _load_ms_dataset(
-            dataset_name: Union[str, list],
-            namespace: Optional[str] = None,
-            target: Optional[str] = None,
-            version: Optional[str] = DEFAULT_DATASET_REVISION,
-            subset_name: Optional[str] = None,
-            split: Optional[str] = None,
-            data_dir: Optional[str] = None,
-            data_files: Optional[Union[str, Sequence[str],
-                                       Mapping[str, Union[str,
-                                                          Sequence[str]]]]] = None,
-            download_mode: Optional[DownloadMode] = None
-    ) -> Union[dict, 'MsDataset']:
+    def _load_ms_dataset(dataset_name: Union[str, list],
+                         namespace: Optional[str] = None,
+                         target: Optional[str] = None,
+                         version: Optional[str] = DEFAULT_DATASET_REVISION,
+                         subset_name: Optional[str] = None,
+                         split: Optional[str] = None,
+                         data_dir: Optional[str] = None,
+                         data_files: Optional[Union[
+                             str, Sequence[str],
+                             Mapping[str, Union[str, Sequence[str]]]]] = None,
+                         download_mode: Optional[DownloadMode] = None,
+                         **config_kwargs) -> Union[dict, 'MsDataset']:
         if isinstance(dataset_name, str):
             dataset_formation = DatasetFormations.native
             if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \
@@ -184,7 +199,8 @@ class MsDataset:
                     data_dir=data_dir,
                     data_files=data_files,
                     cache_dir=MS_DATASETS_CACHE,
-                    download_mode=download_mode.value)
+                    download_mode=download_mode.value,
+                    **config_kwargs)
             else:
                 dataset = MsDataset._load_from_ms(
                     dataset_name,
@@ -195,7 +211,7 @@ class MsDataset:
                     subset_name=subset_name,
                     split=split,
                     download_mode=download_mode,
-                )
+                    **config_kwargs)
         elif isinstance(dataset_name, list):
             if target is None:
                 target = 'target'
@@ -206,16 +222,15 @@ class MsDataset:
         return MsDataset.from_hf_dataset(dataset, target=target)

     @staticmethod
-    def _load_from_ms(
-            dataset_name: str,
-            dataset_files: dict,
-            download_dir: str,
-            namespace: Optional[str] = None,
-            version: Optional[str] = DEFAULT_DATASET_REVISION,
-            subset_name: Optional[str] = None,
-            split: Optional[str] = None,
-            download_mode: Optional[DownloadMode] = None,
-    ) -> Union[Dataset, DatasetDict]:
+    def _load_from_ms(dataset_name: str,
+                      dataset_files: dict,
+                      download_dir: str,
+                      namespace: Optional[str] = None,
+                      version: Optional[str] = DEFAULT_DATASET_REVISION,
+                      subset_name: Optional[str] = None,
+                      split: Optional[str] = None,
+                      download_mode: Optional[DownloadMode] = None,
+                      **config_kwargs) -> Union[Dataset, DatasetDict]:
         for json_path in dataset_files['.json']:
             if json_path.endswith(f'{dataset_name}.json'):
                 with open(json_path, encoding='utf-8') as dataset_json_file:
@@ -226,7 +241,6 @@ class MsDataset:
         meta_map, file_map = get_dataset_files(target_dataset_structure,
                                                dataset_name, namespace,
                                                version)
-
         builder = load_dataset_builder(
             dataset_name,
             subset_name,
@@ -235,7 +249,8 @@ class MsDataset:
             zip_data_files=file_map,
             cache_dir=MS_DATASETS_CACHE,
             version=version,
-            split=list(target_dataset_structure.keys()))
+            split=list(target_dataset_structure.keys()),
+            **config_kwargs)

         download_config = DownloadConfig(
             cache_dir=download_dir,
@@ -253,7 +268,6 @@ class MsDataset:
             data_dir=download_dir,
         )
         builder.download_and_prepare(
-            download_config=download_config,
             dl_manager=dl_manager,
             download_mode=download_mode.value,
             try_from_hf_gcs=False)
@@ -338,6 +352,8 @@ class MsDataset:
             self,
             columns: Union[str, List[str]] = None,
             preprocessors: Union[Callable, List[Callable]] = None,
+            task_name: str = None,
+            task_data_config: ConfigDict = None,
             **format_kwargs,
     ):
         """Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to
@@ -350,6 +366,8 @@ class MsDataset:
             columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only). If the
                 preprocessor is None, the arg columns must have at least one column. If the `preprocessors` is not None,
                 the output fields of processors will also be added.
+            task_name (str, default None): task name, refer to :obj:`Tasks` for more details
+            task_data_config (ConfigDict, default None): config dict for model object.
             format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`.

         Returns:
@@ -360,6 +378,10 @@ class MsDataset:
             raise ImportError(
                 'The function to_torch_dataset requires pytorch to be installed'
             )
+        if isinstance(self._hf_ds, ExternalDataset):
+            task_data_config.update({'preprocessor': preprocessors})
+            return build_task_dataset(task_data_config, task_name,
+                                      self._hf_ds.config_kwargs)
         if preprocessors is not None:
             return self.to_torch_dataset_with_processors(
                 preprocessors, columns=columns)

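Taken together, these changes let extra keyword arguments flow from MsDataset.load through the new TaskSpecificDatasetBuilder into a registered task dataset. A minimal sketch based on the pets_small test data used below (requires torch installed; Models.cascade_mask_rcnn_swin and Tasks.image_segmentation are the keys registered by the COCO task dataset):

    from modelscope.metainfo import Models
    from modelscope.msdatasets import MsDataset
    from modelscope.utils.config import ConfigDict
    from modelscope.utils.constant import Tasks

    # 'classes' is not a declared load() parameter; it travels via **config_kwargs
    ds = MsDataset.load(
        'pets_small', namespace='modelscope', split='train', classes=('Cat', 'Dog'))
    print(ds.config_kwargs)  # split_config plus the forwarded kwargs (ExternalDataset only)

    # for an ExternalDataset, to_torch_dataset dispatches to the registered task dataset
    torch_ds = ds.to_torch_dataset(
        task_name=Tasks.image_segmentation,
        task_data_config=ConfigDict(type=Models.cascade_mask_rcnn_swin))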

modelscope/task_datasets/__init__.py → modelscope/msdatasets/task_datasets/__init__.py

@@ -8,6 +8,7 @@ if TYPE_CHECKING:
     from .builder import TASK_DATASETS, build_task_dataset
     from .torch_base_dataset import TorchTaskDataset
     from .veco_dataset import VecoDataset
+    from .image_instance_segmentation_coco_dataset import ImageInstanceSegmentationCocoDataset

 else:
     _import_structure = {
@@ -15,6 +16,8 @@ else:
         'builder': ['TASK_DATASETS', 'build_task_dataset'],
         'torch_base_dataset': ['TorchTaskDataset'],
         'veco_dataset': ['VecoDataset'],
+        'image_instance_segmentation_coco_dataset':
+        ['ImageInstanceSegmentationCocoDataset']
     }
     import sys


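After the move, every task dataset is importable from a single package; the old modelscope.task_datasets package is gone (see the renamed base.py and builder.py below). For example (pycocotools must be installed for the COCO dataset):

    from modelscope.msdatasets.task_datasets import (
        ImageInstanceSegmentationCocoDataset, TorchTaskDataset, VecoDataset)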

modelscope/task_datasets/base.py → modelscope/msdatasets/task_datasets/base.py


modelscope/task_datasets/builder.py → modelscope/msdatasets/task_datasets/builder.py


modelscope/models/cv/image_instance_segmentation/datasets/dataset.py → modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py

@@ -2,14 +2,32 @@ import os.path as osp

 import numpy as np
 from pycocotools.coco import COCO
-from torch.utils.data import Dataset

+from modelscope.metainfo import Models
+from modelscope.utils.constant import Tasks
+from .builder import TASK_DATASETS
+from .torch_base_dataset import TorchTaskDataset

-class ImageInstanceSegmentationCocoDataset(Dataset):
+DATASET_STRUCTURE = {
+    'train': {
+        'annotation': 'annotations/instances_train.json',
+        'images': 'images/train'
+    },
+    'validation': {
+        'annotation': 'annotations/instances_val.json',
+        'images': 'images/val'
+    }
+}
+
+
+@TASK_DATASETS.register_module(
+    module_name=Models.cascade_mask_rcnn_swin,
+    group_key=Tasks.image_segmentation)
+class ImageInstanceSegmentationCocoDataset(TorchTaskDataset):
     """Coco-style dataset for image instance segmentation.

     Args:
-        ann_file (str): Annotation file path.
+        split_config (dict): Annotation file path. {"train":"xxxxx"}
         classes (Sequence[str], optional): Specify classes to load.
             If is None, ``cls.CLASSES`` will be used. Default: None.
         data_root (str, optional): Data root for ``ann_file``,
@@ -37,30 +55,27 @@ class ImageInstanceSegmentationCocoDataset(Dataset):
         'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush')

     def __init__(self,
-                 ann_file,
+                 split_config: dict,
+                 preprocessor=None,
                  classes=None,
-                 data_root=None,
-                 img_prefix='',
                  seg_prefix=None,
                  test_mode=False,
-                 filter_empty_gt=True):
-        self.ann_file = ann_file
-        self.data_root = data_root
-        self.img_prefix = img_prefix
+                 filter_empty_gt=True,
+                 **kwargs):
+        self.data_root = next(iter(split_config.values()))
+        self.split = next(iter(split_config.keys()))
+        self.preprocessor = preprocessor
+
+        self.ann_file = osp.join(self.data_root,
+                                 DATASET_STRUCTURE[self.split]['annotation'])
+
+        self.img_prefix = osp.join(self.data_root,
+                                   DATASET_STRUCTURE[self.split]['images'])
         self.seg_prefix = seg_prefix
         self.test_mode = test_mode
         self.filter_empty_gt = filter_empty_gt
         self.CLASSES = self.get_classes(classes)

-        # join paths if data_root is specified
-        if self.data_root is not None:
-            if not osp.isabs(self.ann_file):
-                self.ann_file = osp.join(self.data_root, self.ann_file)
-            if not (self.img_prefix is None or osp.isabs(self.img_prefix)):
-                self.img_prefix = osp.join(self.data_root, self.img_prefix)
-            if not (self.seg_prefix is None or osp.isabs(self.seg_prefix)):
-                self.seg_prefix = osp.join(self.data_root, self.seg_prefix)
-
         # load annotations
         self.data_infos = self.load_annotations(self.ann_file)
@@ -71,8 +86,6 @@ class ImageInstanceSegmentationCocoDataset(Dataset):
         # set group flag for the sampler
         self._set_group_flag()

-        self.preprocessor = None
-
     def __len__(self):
         """Total number of samples of data."""
         return len(self.data_infos)
@@ -326,7 +339,3 @@ class ImageInstanceSegmentationCocoDataset(Dataset):
             raise ValueError(f'Unsupported type {type(classes)} of classes.')

         return class_names
-
-    def to_torch_dataset(self, preprocessors=None):
-        self.preprocessor = preprocessors
-        return self

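The constructor no longer takes ann_file / data_root / img_prefix; it derives both paths from a single split_config entry and the fixed DATASET_STRUCTURE layout. A sketch, assuming a local directory (placeholder path) that contains annotations/instances_train.json and images/train:

    from modelscope.msdatasets.task_datasets import ImageInstanceSegmentationCocoDataset

    train_ds = ImageInstanceSegmentationCocoDataset(
        split_config={'train': '/data/toydata'},  # placeholder data root
        classes=('Cat', 'Dog'),
        test_mode=False)
    print(len(train_ds))  # number of loaded (and optionally filtered) images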
modelscope/task_datasets/torch_base_dataset.py → modelscope/msdatasets/task_datasets/torch_base_dataset.py


modelscope/task_datasets/veco_dataset.py → modelscope/msdatasets/task_datasets/veco_dataset.py


modelscope/msdatasets/utils/dataset_builder.py (+92, -3)

@@ -8,6 +8,7 @@ from datasets.info import DatasetInfo
 from datasets.packaged_modules import csv
 from datasets.utils.filelock import FileLock

+from modelscope.utils.constant import DownloadMode
 from modelscope.utils.logger import get_logger

 logger = get_logger()
@@ -26,11 +27,11 @@ class MsCsvDatasetBuilder(csv.Csv):
         zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None,
         **config_kwargs,
     ):
+        self.namespace = namespace
         super().__init__(
             cache_dir=cache_dir,
             name=subset_name,
             hash=hash,
-            namespace=namespace,
             data_files=meta_data_files,
             **config_kwargs)
@@ -56,6 +57,25 @@ class MsCsvDatasetBuilder(csv.Csv):
             os.rmdir(self._cache_dir)
         self.zip_data_files = zip_data_files

+    def _relative_data_dir(self, with_version=True, with_hash=True) -> str:
+        """Relative path of this dataset in cache_dir:
+        Will be:
+            self.name/self.config.version/self.hash/
+        or if a namespace has been specified:
+            self.namespace___self.name/self.config.version/self.hash/
+        """
+        builder_data_dir = self.name if self.namespace is None else f'{self.namespace}___{self.name}'
+        builder_config = self.config
+        hash = self.hash
+        if builder_config:
+            builder_data_dir = os.path.join(builder_data_dir, self.config_id)
+            if with_version:
+                builder_data_dir = os.path.join(builder_data_dir,
+                                                str(self.config.version))
+        if with_hash and hash and isinstance(hash, str):
+            builder_data_dir = os.path.join(builder_data_dir, hash)
+        return builder_data_dir
+
     def _build_cache_dir(self):
         builder_data_dir = os.path.join(
             self._cache_dir_root,
@@ -77,8 +97,15 @@ class MsCsvDatasetBuilder(csv.Csv):
             datasets.SplitGenerator(
                 name=split_name,
                 gen_kwargs={
-                    'files': dl_manager.iter_files(files),
-                    'base_dir': zip_data_files.get(split_name)
+                    'files':
+                    dl_manager.iter_files(files),
+                    'base_dir':
+                    os.path.join(
+                        zip_data_files.get(split_name),
+                        os.path.splitext(
+                            self.zip_data_files.get(split_name))[0])
+                    if self.zip_data_files.get(split_name) else
+                    zip_data_files.get(split_name)
                 }))
         return splits
@@ -111,3 +138,65 @@ class MsCsvDatasetBuilder(csv.Csv):
                 logger.error(
                     f"Failed to read file '{file}' with error {type(e)}: {e}")
                 raise
+
+
+class TaskSpecificDatasetBuilder(MsCsvDatasetBuilder):
+
+    def __init__(
+        self,
+        dataset_name: str,
+        cache_dir: str,
+        namespace: str,
+        subset_name: str,
+        hash: str,
+        meta_data_files: Mapping[str, Union[str, Sequence[str]]],
+        zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None,
+        **config_kwargs,
+    ):
+        self.name = dataset_name
+        self.subset_name = subset_name
+        self.namespace = namespace
+        self.hash = hash
+        self.data_files = meta_data_files
+        self.zip_data_files = zip_data_files
+        self.split_path_dict = None
+        self.config = None
+        self._cache_dir_root = os.path.expanduser(cache_dir)
+        self._cache_dir = self._build_cache_dir()
+        self._config_kwargs = config_kwargs
+
+    def download_and_prepare(self, download_mode, dl_manager,
+                             **download_kwargs):
+        # Prevent parallel disk operations
+        lock_path = os.path.join(
+            self._cache_dir_root,
+            self._cache_dir.replace(os.sep, '_') + '.lock')
+        with FileLock(lock_path):
+            data_exists = os.path.exists(self._cache_dir)
+            if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS:
+                logger.warning(
+                    f'Reusing dataset {self.name} ({self._cache_dir})')
+                return
+            logger.info(f'Generating dataset {self.name} ({self._cache_dir})')
+            self._download_and_prepare(dl_manager=dl_manager)
+
+    def _download_and_prepare(self, dl_manager):
+        split_path_dict = dl_manager.download_and_extract(self.zip_data_files)
+        self.split_path_dict = {
+            k: os.path.join(v,
+                            os.path.splitext(self.zip_data_files[k])[0])
+            for k, v in split_path_dict.items()
+        }
+
+    def as_dataset(self):
+        return ExternalDataset(self.split_path_dict, self._config_kwargs)
+
+
+class ExternalDataset(object):
+
+    def __init__(self, split_path_dict, config_kwargs):
+        config_kwargs.update({'split_config': split_path_dict})
+        self.config_kwargs = config_kwargs
+
+    def __len__(self):
+        return len(self.config_kwargs['split_config'])

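ExternalDataset is just a thin carrier: TaskSpecificDatasetBuilder downloads and extracts the split archives, then as_dataset() hands the per-split paths plus any remaining builder kwargs onward. A standalone sketch with made-up paths:

    from modelscope.msdatasets.utils.dataset_builder import ExternalDataset

    ext = ExternalDataset(
        split_path_dict={'train': '/cache/pets_small/train',      # hypothetical
                         'validation': '/cache/pets_small/val'},  # extracted dirs
        config_kwargs={'classes': ('Cat', 'Dog')})
    print(ext.config_kwargs['split_config'])  # the split -> extracted-path mapping
    print(len(ext))                           # 2, one entry per split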
modelscope/msdatasets/utils/dataset_utils.py (+28, -11)

@@ -6,7 +6,7 @@ from datasets.builder import DatasetBuilder

 from modelscope.utils.constant import DEFAULT_DATASET_REVISION
 from modelscope.utils.logger import get_logger
-from .dataset_builder import MsCsvDatasetBuilder
+from .dataset_builder import MsCsvDatasetBuilder, TaskSpecificDatasetBuilder

 logger = get_logger()
@@ -87,7 +87,7 @@ def get_dataset_files(subset_split_into: dict,
     modelscope_api = HubApi()
     for split, info in subset_split_into.items():
         meta_map[split] = modelscope_api.get_dataset_file_url(
-            info['meta'], dataset_name, namespace, revision)
+            info.get('meta', ''), dataset_name, namespace, revision)
         if info.get('file'):
             file_map[split] = info['file']
     return meta_map, file_map
@@ -99,15 +99,32 @@ def load_dataset_builder(dataset_name: str, subset_name: str, namespace: str,
                          zip_data_files: Mapping[str, Union[str,
                                                             Sequence[str]]],
                          cache_dir: str, version: Optional[Union[str]],
-                         split: Sequence[str]) -> DatasetBuilder:
+                         split: Sequence[str],
+                         **config_kwargs) -> DatasetBuilder:
     sub_dir = os.path.join(version, '_'.join(split))
-    builder_instance = MsCsvDatasetBuilder(
-        dataset_name=dataset_name,
-        namespace=namespace,
-        cache_dir=cache_dir,
-        subset_name=subset_name,
-        meta_data_files=meta_data_files,
-        zip_data_files=zip_data_files,
-        hash=sub_dir)
+    meta_data_file = next(iter(meta_data_files.values()))
+    if not meta_data_file:
+        builder_instance = TaskSpecificDatasetBuilder(
+            dataset_name=dataset_name,
+            namespace=namespace,
+            cache_dir=cache_dir,
+            subset_name=subset_name,
+            meta_data_files=meta_data_files,
+            zip_data_files=zip_data_files,
+            hash=sub_dir,
+            **config_kwargs)
+    elif meta_data_file.endswith('.csv'):
+        builder_instance = MsCsvDatasetBuilder(
+            dataset_name=dataset_name,
+            namespace=namespace,
+            cache_dir=cache_dir,
+            subset_name=subset_name,
+            meta_data_files=meta_data_files,
+            zip_data_files=zip_data_files,
+            hash=sub_dir)
+    else:
+        raise NotImplementedError(
+            f'Dataset mete file extensions "{os.path.splitext(meta_data_file)[-1]}" is not implemented yet'
+        )

     return builder_instance

modelscope/trainers/cv/image_instance_segmentation_trainer.py (+0, -4)

@@ -22,7 +22,3 @@ class ImageInstanceSegmentationTrainer(EpochBasedTrainer):

     def prediction_step(self, model, inputs):
         pass
-
-    def to_task_dataset(self, datasets, mode, preprocessor=None):
-        # wait for dataset interface to become stable...
-        return datasets.to_torch_dataset(preprocessor)

modelscope/trainers/nlp_trainer.py (+1, -1)

@@ -202,7 +202,7 @@ class VecoTrainer(NlpEpochBasedTrainer):
         """Veco evaluates the datasets one by one.

         """
-        from modelscope.task_datasets import VecoDataset
+        from modelscope.msdatasets.task_datasets import VecoDataset
         self.model.eval()
         self._mode = ModeKeys.EVAL
         metric_values = {}


modelscope/trainers/trainer.py (+20, -7)

@@ -21,11 +21,12 @@ from modelscope.metainfo import Trainers
 from modelscope.metrics import build_metric, task_default_metrics
 from modelscope.models.base import Model, TorchModel
 from modelscope.msdatasets.ms_dataset import MsDataset
+from modelscope.msdatasets.task_datasets.builder import build_task_dataset
+from modelscope.msdatasets.task_datasets.torch_base_dataset import \
+    TorchTaskDataset
 from modelscope.preprocessors.base import Preprocessor
 from modelscope.preprocessors.builder import build_preprocessor
 from modelscope.preprocessors.common import Compose
-from modelscope.task_datasets.builder import build_task_dataset
-from modelscope.task_datasets.torch_base_dataset import TorchTaskDataset
 from modelscope.trainers.hooks.builder import HOOKS
 from modelscope.trainers.hooks.priority import Priority, get_priority
 from modelscope.trainers.lrscheduler.builder import build_lr_scheduler
@@ -288,14 +289,21 @@ class EpochBasedTrainer(BaseTrainer):
         if isinstance(datasets, TorchTaskDataset):
             return datasets
         elif isinstance(datasets, MsDataset):
-            datasets = datasets.to_torch_dataset(
+            cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \
+                else ConfigDict(type=None, mode=mode)
+            return datasets.to_torch_dataset(
+                task_data_config=cfg,
+                task_name=self.cfg.task,
                 preprocessors=preprocessor)
-            return datasets
         elif isinstance(datasets, List) and isinstance(
                 datasets[0], MsDataset):
+            cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \
+                else ConfigDict(type=None, mode=mode)
             datasets = [
-                d.to_torch_dataset(preprocessor=preprocessor)
-                for d in datasets
+                d.to_torch_dataset(
+                    task_data_config=cfg,
+                    task_name=self.cfg.task,
+                    preprocessors=preprocessor) for d in datasets
             ]
             cfg = ConfigDict(
                 type=self.cfg.task, mode=mode, datasets=datasets)
@@ -585,8 +593,13 @@ class EpochBasedTrainer(BaseTrainer):
             subset_name=data_cfg.subset_name if hasattr(
                 data_cfg, 'subset_name') else None,
             hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope,
+            **data_cfg,
         )
-        torch_dataset = dataset.to_torch_dataset(preprocessors=preprocessor)
+        cfg = ConfigDict(type=self.cfg.model.type, mode=mode)
+        torch_dataset = dataset.to_torch_dataset(
+            task_data_config=cfg,
+            task_name=self.cfg.task,
+            preprocessors=self.preprocessor)
         dataset = self.to_task_dataset(torch_dataset, mode)
         return dataset


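In other words, the trainer now builds a small ConfigDict (model type plus mode) and defers to MsDataset.to_torch_dataset, which decides whether to route through build_task_dataset. A rough standalone equivalent of the single-dataset branch (using a plain 'model' string where the trainer uses ConfigFields.model):

    from modelscope.msdatasets import MsDataset
    from modelscope.utils.config import ConfigDict

    def to_task_dataset_sketch(ms_ds: MsDataset, cfg, mode, preprocessor=None):
        # mirrors the MsDataset branch of EpochBasedTrainer.to_task_dataset
        task_data_config = ConfigDict(type=cfg.model.type, mode=mode) \
            if hasattr(cfg, 'model') else ConfigDict(type=None, mode=mode)
        return ms_ds.to_torch_dataset(
            task_data_config=task_data_config,
            task_name=cfg.task,
            preprocessors=preprocessor)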


modelscope/utils/ast_utils.py (+2, -2)

@@ -30,8 +30,8 @@ MODELSCOPE_PATH = '/'.join(os.path.dirname(__file__).split('/')[:-1])
 REGISTER_MODULE = 'register_module'
 IGNORED_PACKAGES = ['modelscope', '.']
 SCAN_SUB_FOLDERS = [
-    'models', 'metrics', 'pipelines', 'preprocessors', 'task_datasets',
-    'trainers'
+    'models', 'metrics', 'pipelines', 'preprocessors',
+    'msdatasets/task_datasets', 'trainers'
 ]
 INDEXER_FILE = 'ast_indexer'
 DECORATOR_KEY = 'decorators'


requirements/runtime.txt (+1, -2)

@@ -1,6 +1,5 @@
 addict
-#version above 2.1.0 introduces backward-compatability issue which is being resolved
-datasets==2.1.0
+datasets
 easydict
 einops
 filelock>=3.3.0


tests/msdatasets/test_ms_dataset.py (+11, -0)

@@ -4,6 +4,7 @@ from modelscope.models import Model
 from modelscope.msdatasets import MsDataset
 from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.preprocessors.base import Preprocessor
+from modelscope.utils.constant import DownloadMode
 from modelscope.utils.test_utils import require_tf, require_torch, test_level
@@ -30,6 +31,16 @@ class ImgPreprocessor(Preprocessor):

 class MsDatasetTest(unittest.TestCase):

+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_coco(self):
+        ms_ds_train = MsDataset.load(
+            'pets_small',
+            namespace='modelscope',
+            split='train',
+            download_mode=DownloadMode.FORCE_REDOWNLOAD,
+            classes=('1', '2'))
+        print(ms_ds_train._hf_ds.config_kwargs)
+
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_ms_csv_basic(self):
         ms_ds_train = MsDataset.load(


tests/taskdataset/test_veco_dataset.py (+1, -1)

@@ -2,7 +2,7 @@

 import unittest

-from modelscope.task_datasets.veco_dataset import VecoDataset
+from modelscope.msdatasets.task_datasets.veco_dataset import VecoDataset
 from modelscope.utils.test_utils import test_level






+ 44
- 28
tests/trainers/test_image_instance_segmentation_trainer.py View File

@@ -8,10 +8,13 @@ from functools import partial


from modelscope.hub.snapshot_download import snapshot_download from modelscope.hub.snapshot_download import snapshot_download
from modelscope.metainfo import Trainers from modelscope.metainfo import Trainers
from modelscope.models.cv.image_instance_segmentation import (
CascadeMaskRCNNSwinModel, ImageInstanceSegmentationCocoDataset)
from modelscope.models.cv.image_instance_segmentation import \
CascadeMaskRCNNSwinModel
from modelscope.msdatasets import MsDataset
from modelscope.msdatasets.task_datasets import \
ImageInstanceSegmentationCocoDataset
from modelscope.trainers import build_trainer from modelscope.trainers import build_trainer
from modelscope.utils.config import Config
from modelscope.utils.config import Config, ConfigDict
from modelscope.utils.constant import ModelFile from modelscope.utils.constant import ModelFile
from modelscope.utils.test_utils import test_level from modelscope.utils.test_utils import test_level


@@ -27,34 +30,47 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase):
config_path = os.path.join(cache_path, ModelFile.CONFIGURATION) config_path = os.path.join(cache_path, ModelFile.CONFIGURATION)
cfg = Config.from_file(config_path) cfg = Config.from_file(config_path)


data_root = cfg.dataset.data_root
classes = tuple(cfg.dataset.classes)
max_epochs = cfg.train.max_epochs max_epochs = cfg.train.max_epochs
samples_per_gpu = cfg.train.dataloader.batch_size_per_gpu samples_per_gpu = cfg.train.dataloader.batch_size_per_gpu

if data_root is None:
try:
train_data_cfg = cfg.dataset.train
val_data_cfg = cfg.dataset.val
except Exception:
train_data_cfg = None
val_data_cfg = None
if train_data_cfg is None:
# use default toy data # use default toy data
dataset_path = os.path.join(cache_path, 'toydata.zip')
with zipfile.ZipFile(dataset_path, 'r') as zipf:
zipf.extractall(cache_path)
data_root = cache_path + '/toydata/'
classes = ('Cat', 'Dog')

self.train_dataset = ImageInstanceSegmentationCocoDataset(
data_root + 'annotations/instances_train.json',
classes=classes,
data_root=data_root,
img_prefix=data_root + 'images/train/',
seg_prefix=None,
test_mode=False)

self.eval_dataset = ImageInstanceSegmentationCocoDataset(
data_root + 'annotations/instances_val.json',
classes=classes,
data_root=data_root,
img_prefix=data_root + 'images/val/',
seg_prefix=None,
test_mode=True)
train_data_cfg = ConfigDict(
name='pets_small',
split='train',
classes=('Cat', 'Dog'),
test_mode=False)
if val_data_cfg is None:
val_data_cfg = ConfigDict(
name='pets_small',
split='validation',
classes=('Cat', 'Dog'),
test_mode=True)

self.train_dataset = MsDataset.load(
dataset_name=train_data_cfg.name,
split=train_data_cfg.split,
classes=train_data_cfg.classes,
test_mode=train_data_cfg.test_mode)
assert self.train_dataset.config_kwargs[
'classes'] == train_data_cfg.classes
assert next(
iter(self.train_dataset.config_kwargs['split_config'].values()))

self.eval_dataset = MsDataset.load(
dataset_name=val_data_cfg.name,
split=val_data_cfg.split,
classes=val_data_cfg.classes,
test_mode=val_data_cfg.test_mode)
assert self.eval_dataset.config_kwargs[
'classes'] == val_data_cfg.classes
assert next(
iter(self.eval_dataset.config_kwargs['split_config'].values()))


from mmcv.parallel import collate from mmcv.parallel import collate



