msdataset: add coco dataset, unify taskdataset and ms dataset, fix hf datasets (into master)
@@ -362,8 +362,10 @@ class HubApi:
             dataset_name: str,
             namespace: str,
             revision: Optional[str] = DEFAULT_DATASET_REVISION):
-        return f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
-               f'Revision={revision}&FilePath={file_name}'
+        if file_name.endswith('.csv'):
+            file_name = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
+                        f'Revision={revision}&FilePath={file_name}'
+        return file_name

     def get_dataset_access_config(
             self,
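With this change, `get_dataset_file_url` rewrites only `.csv` meta files into repo download URLs; any other file name (for example a zip archive that the task-specific builder resolves later) is returned unchanged. A minimal sketch of the intended behavior, with a made-up namespace and dataset name and the revision left at its default:

```python
from modelscope.hub.api import HubApi

api = HubApi()

# A csv meta file is rewritten into a repo file URL of the form
#   {dataset_endpoint}/api/v1/datasets/modelscope/pets_small/repo?Revision=<revision>&FilePath=train.csv
csv_url = api.get_dataset_file_url('train.csv', 'pets_small', 'modelscope')

# A non-csv entry (e.g. an archive handled by TaskSpecificDatasetBuilder) passes through untouched.
zip_name = api.get_dataset_file_url('train.zip', 'pets_small', 'modelscope')
assert zip_name == 'train.zip'
```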
@@ -7,13 +7,11 @@ if TYPE_CHECKING:
     from .cascade_mask_rcnn_swin import CascadeMaskRCNNSwin
     from .model import CascadeMaskRCNNSwinModel
     from .postprocess_utils import get_img_ins_seg_result
-    from .datasets import ImageInstanceSegmentationCocoDataset

 else:
     _import_structure = {
         'cascade_mask_rcnn_swin': ['CascadeMaskRCNNSwin'],
         'model': ['CascadeMaskRCNNSwinModel'],
         'postprocess_utils': ['get_img_ins_seg_result'],
-        'datasets': ['ImageInstanceSegmentationCocoDataset']
     }

     import sys
@@ -1,2 +1 @@
-from .dataset import ImageInstanceSegmentationCocoDataset
 from .transforms import build_preprocess_transform
@@ -13,9 +13,12 @@ from datasets.utils.file_utils import (is_relative_path,
                                        relative_to_absolute_path)

 from modelscope.msdatasets.config import MS_DATASETS_CACHE
+from modelscope.utils.config import ConfigDict
 from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
                                        DatasetFormations, DownloadMode, Hubs)
 from modelscope.utils.logger import get_logger
+from .task_datasets.builder import build_task_dataset
+from .utils.dataset_builder import ExternalDataset
 from .utils.dataset_utils import (get_dataset_files,
                                   get_target_dataset_structure,
                                   load_dataset_builder)
@@ -67,9 +70,16 @@ class MsDataset:
     def __len__(self):
         return len(self._hf_ds)

+    @property
+    def config_kwargs(self):
+        if isinstance(self._hf_ds, ExternalDataset):
+            return self._hf_ds.config_kwargs
+        else:
+            return None
+
     @classmethod
     def from_hf_dataset(cls,
-                        hf_ds: Union[Dataset, DatasetDict],
+                        hf_ds: Union[Dataset, DatasetDict, ExternalDataset],
                         target: str = None) -> Union[dict, 'MsDataset']:
         if isinstance(hf_ds, Dataset):
             return cls(hf_ds, target)
@@ -77,6 +87,8 @@ class MsDataset:
             if len(hf_ds.keys()) == 1:
                 return cls(next(iter(hf_ds.values())), target)
             return {k: cls(v, target) for k, v in hf_ds.items()}
+        elif isinstance(hf_ds, ExternalDataset):
+            return cls(hf_ds)
         else:
             raise TypeError(
                 f'"hf_ds" must be a Dataset or DatasetDict, but got {type(hf_ds)}'
@@ -96,7 +108,8 @@ class MsDataset:
                            Mapping[str, Union[str,
                                               Sequence[str]]]]] = None,
             download_mode: Optional[DownloadMode] = DownloadMode.
-            REUSE_DATASET_IF_EXISTS
+            REUSE_DATASET_IF_EXISTS,
+            **config_kwargs,
     ) -> Union[dict, 'MsDataset']:
         """Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
         Args:
@@ -113,6 +126,7 @@ class MsDataset:
             hub (Hubs or str, optional): When loading from a remote hub, where it is from. default Hubs.modelscope
             download_mode (DownloadMode or str, optional): How to treat existing datasets. default
                                                            DownloadMode.REUSE_DATASET_IF_EXISTS
+            **config_kwargs (additional keyword arguments): Keyword arguments to be passed to the dataset builder.

         Returns:
             MsDataset (obj:`MsDataset`): MsDataset object for a certain dataset.
@@ -128,7 +142,8 @@ class MsDataset:
                 split=split,
                 data_dir=data_dir,
                 data_files=data_files,
-                download_mode=download_mode.value)
+                download_mode=download_mode.value,
+                **config_kwargs)
             return MsDataset.from_hf_dataset(dataset, target=target)
         elif hub == Hubs.modelscope:
             return MsDataset._load_ms_dataset(
@@ -140,22 +155,22 @@ class MsDataset:
                 split=split,
                 data_dir=data_dir,
                 data_files=data_files,
-                download_mode=download_mode)
+                download_mode=download_mode,
+                **config_kwargs)

     @staticmethod
-    def _load_ms_dataset(
-            dataset_name: Union[str, list],
-            namespace: Optional[str] = None,
-            target: Optional[str] = None,
-            version: Optional[str] = DEFAULT_DATASET_REVISION,
-            subset_name: Optional[str] = None,
-            split: Optional[str] = None,
-            data_dir: Optional[str] = None,
-            data_files: Optional[Union[str, Sequence[str],
-                                       Mapping[str, Union[str,
-                                                          Sequence[str]]]]] = None,
-            download_mode: Optional[DownloadMode] = None
-    ) -> Union[dict, 'MsDataset']:
+    def _load_ms_dataset(dataset_name: Union[str, list],
+                         namespace: Optional[str] = None,
+                         target: Optional[str] = None,
+                         version: Optional[str] = DEFAULT_DATASET_REVISION,
+                         subset_name: Optional[str] = None,
+                         split: Optional[str] = None,
+                         data_dir: Optional[str] = None,
+                         data_files: Optional[Union[
+                             str, Sequence[str],
+                             Mapping[str, Union[str, Sequence[str]]]]] = None,
+                         download_mode: Optional[DownloadMode] = None,
+                         **config_kwargs) -> Union[dict, 'MsDataset']:
         if isinstance(dataset_name, str):
             dataset_formation = DatasetFormations.native
             if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \
@@ -184,7 +199,8 @@ class MsDataset:
                     data_dir=data_dir,
                     data_files=data_files,
                     cache_dir=MS_DATASETS_CACHE,
-                    download_mode=download_mode.value)
+                    download_mode=download_mode.value,
+                    **config_kwargs)
             else:
                 dataset = MsDataset._load_from_ms(
                     dataset_name,
@@ -195,7 +211,7 @@ class MsDataset:
                     subset_name=subset_name,
                     split=split,
                     download_mode=download_mode,
-                )
+                    **config_kwargs)
         elif isinstance(dataset_name, list):
             if target is None:
                 target = 'target'
@@ -206,16 +222,15 @@ class MsDataset:
         return MsDataset.from_hf_dataset(dataset, target=target)

     @staticmethod
-    def _load_from_ms(
-            dataset_name: str,
-            dataset_files: dict,
-            download_dir: str,
-            namespace: Optional[str] = None,
-            version: Optional[str] = DEFAULT_DATASET_REVISION,
-            subset_name: Optional[str] = None,
-            split: Optional[str] = None,
-            download_mode: Optional[DownloadMode] = None,
-    ) -> Union[Dataset, DatasetDict]:
+    def _load_from_ms(dataset_name: str,
+                      dataset_files: dict,
+                      download_dir: str,
+                      namespace: Optional[str] = None,
+                      version: Optional[str] = DEFAULT_DATASET_REVISION,
+                      subset_name: Optional[str] = None,
+                      split: Optional[str] = None,
+                      download_mode: Optional[DownloadMode] = None,
+                      **config_kwargs) -> Union[Dataset, DatasetDict]:
         for json_path in dataset_files['.json']:
             if json_path.endswith(f'{dataset_name}.json'):
                 with open(json_path, encoding='utf-8') as dataset_json_file:
@@ -226,7 +241,6 @@ class MsDataset:
         meta_map, file_map = get_dataset_files(target_dataset_structure,
                                                dataset_name, namespace,
                                                version)
-
         builder = load_dataset_builder(
             dataset_name,
             subset_name,
@@ -235,7 +249,8 @@ class MsDataset:
             zip_data_files=file_map,
             cache_dir=MS_DATASETS_CACHE,
             version=version,
-            split=list(target_dataset_structure.keys()))
+            split=list(target_dataset_structure.keys()),
+            **config_kwargs)

         download_config = DownloadConfig(
             cache_dir=download_dir,
@@ -253,7 +268,6 @@ class MsDataset:
             data_dir=download_dir,
         )
         builder.download_and_prepare(
-            download_config=download_config,
             dl_manager=dl_manager,
             download_mode=download_mode.value,
             try_from_hf_gcs=False)
@@ -338,6 +352,8 @@ class MsDataset:
             self,
             columns: Union[str, List[str]] = None,
             preprocessors: Union[Callable, List[Callable]] = None,
+            task_name: str = None,
+            task_data_config: ConfigDict = None,
             **format_kwargs,
     ):
         """Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to
@@ -350,6 +366,8 @@ class MsDataset:
             columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only). If the
                 preprocessor is None, the arg columns must have at least one column. If the `preprocessors` is not None,
                 the output fields of processors will also be added.
+            task_name (str, default None): task name, refer to :obj:`Tasks` for more details.
+            task_data_config (ConfigDict, default None): config dict for the task dataset.
             format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`.

         Returns:
@@ -360,6 +378,10 @@ class MsDataset:
             raise ImportError(
                 'The function to_torch_dataset requires pytorch to be installed'
             )
+        if isinstance(self._hf_ds, ExternalDataset):
+            task_data_config.update({'preprocessor': preprocessors})
+            return build_task_dataset(task_data_config, task_name,
+                                      self._hf_ds.config_kwargs)
         if preprocessors is not None:
             return self.to_torch_dataset_with_processors(
                 preprocessors, columns=columns)
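When the backing object is an `ExternalDataset`, `to_torch_dataset` no longer iterates rows itself; it folds the preprocessor into the task config and delegates to `build_task_dataset`, so the dataset registered for that task does the actual loading. Roughly, as a sketch (the `mode` value is an assumption; in practice the trainer supplies the config and task name from its own configuration):

```python
from modelscope.metainfo import Models
from modelscope.msdatasets import MsDataset
from modelscope.utils.config import ConfigDict
from modelscope.utils.constant import Tasks

ds = MsDataset.load(
    'pets_small', namespace='modelscope', split='train', classes=('Cat', 'Dog'))

# mode='train' is assumed here; the trainer derives it from its own state.
task_cfg = ConfigDict(type=Models.cascade_mask_rcnn_swin, mode='train')
torch_ds = ds.to_torch_dataset(
    task_name=Tasks.image_segmentation,
    task_data_config=task_cfg,
    preprocessors=None)
# torch_ds is the registered ImageInstanceSegmentationCocoDataset, built via build_task_dataset.
```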
@@ -8,6 +8,7 @@ if TYPE_CHECKING:
     from .builder import TASK_DATASETS, build_task_dataset
     from .torch_base_dataset import TorchTaskDataset
     from .veco_dataset import VecoDataset
+    from .image_instance_segmentation_coco_dataset import ImageInstanceSegmentationCocoDataset

 else:
     _import_structure = {
@@ -15,6 +16,8 @@ else:
         'builder': ['TASK_DATASETS', 'build_task_dataset'],
         'torch_base_dataset': ['TorchTaskDataset'],
         'veco_dataset': ['VecoDataset'],
+        'image_instance_segmentation_coco_dataset':
+        ['ImageInstanceSegmentationCocoDataset']
     }

     import sys
@@ -2,14 +2,32 @@ import os.path as osp

 import numpy as np
 from pycocotools.coco import COCO
-from torch.utils.data import Dataset

+from modelscope.metainfo import Models
+from modelscope.utils.constant import Tasks
+from .builder import TASK_DATASETS
+from .torch_base_dataset import TorchTaskDataset
+
+DATASET_STRUCTURE = {
+    'train': {
+        'annotation': 'annotations/instances_train.json',
+        'images': 'images/train'
+    },
+    'validation': {
+        'annotation': 'annotations/instances_val.json',
+        'images': 'images/val'
+    }
+}
+

-class ImageInstanceSegmentationCocoDataset(Dataset):
+@TASK_DATASETS.register_module(
+    module_name=Models.cascade_mask_rcnn_swin,
+    group_key=Tasks.image_segmentation)
+class ImageInstanceSegmentationCocoDataset(TorchTaskDataset):
     """Coco-style dataset for image instance segmentation.

     Args:
-        ann_file (str): Annotation file path.
+        split_config (dict): Path to the extracted data for each split, e.g. {"train": "xxxxx"}.
         classes (Sequence[str], optional): Specify classes to load.
             If is None, ``cls.CLASSES`` will be used. Default: None.
         data_root (str, optional): Data root for ``ann_file``,
@@ -37,30 +55,27 @@ class ImageInstanceSegmentationCocoDataset(Dataset):
         'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush')

     def __init__(self,
-                 ann_file,
+                 split_config: dict,
+                 preprocessor=None,
                  classes=None,
-                 data_root=None,
-                 img_prefix='',
                  seg_prefix=None,
                  test_mode=False,
-                 filter_empty_gt=True):
-        self.ann_file = ann_file
-        self.data_root = data_root
-        self.img_prefix = img_prefix
+                 filter_empty_gt=True,
+                 **kwargs):
+        self.data_root = next(iter(split_config.values()))
+        self.split = next(iter(split_config.keys()))
+        self.preprocessor = preprocessor
+        self.ann_file = osp.join(self.data_root,
+                                 DATASET_STRUCTURE[self.split]['annotation'])
+        self.img_prefix = osp.join(self.data_root,
+                                   DATASET_STRUCTURE[self.split]['images'])
         self.seg_prefix = seg_prefix
         self.test_mode = test_mode
         self.filter_empty_gt = filter_empty_gt
         self.CLASSES = self.get_classes(classes)

-        # join paths if data_root is specified
-        if self.data_root is not None:
-            if not osp.isabs(self.ann_file):
-                self.ann_file = osp.join(self.data_root, self.ann_file)
-            if not (self.img_prefix is None or osp.isabs(self.img_prefix)):
-                self.img_prefix = osp.join(self.data_root, self.img_prefix)
-            if not (self.seg_prefix is None or osp.isabs(self.seg_prefix)):
-                self.seg_prefix = osp.join(self.data_root, self.seg_prefix)
-
         # load annotations
         self.data_infos = self.load_annotations(self.ann_file)
@@ -71,8 +86,6 @@ class ImageInstanceSegmentationCocoDataset(Dataset): | |||||
# set group flag for the sampler | # set group flag for the sampler | ||||
self._set_group_flag() | self._set_group_flag() | ||||
self.preprocessor = None | |||||
def __len__(self): | def __len__(self): | ||||
"""Total number of samples of data.""" | """Total number of samples of data.""" | ||||
return len(self.data_infos) | return len(self.data_infos) | ||||
@@ -326,7 +339,3 @@ class ImageInstanceSegmentationCocoDataset(Dataset):
             raise ValueError(f'Unsupported type {type(classes)} of classes.')

         return class_names
-
-    def to_torch_dataset(self, preprocessors=None):
-        self.preprocessor = preprocessors
-        return self
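The dataset now resolves its annotation file and image prefix from a single `split_config` entry plus the fixed `DATASET_STRUCTURE` layout, instead of taking `ann_file`/`img_prefix` explicitly. A small sketch of that path resolution, using an invented extraction directory:

```python
import os.path as osp

# Invented location; in practice this is where the split's zip archive was extracted.
split_config = {'train': '/cache/modelscope/pets_small/extracted/train'}

split = next(iter(split_config.keys()))        # 'train'
data_root = next(iter(split_config.values()))  # '/cache/modelscope/pets_small/extracted/train'

ann_file = osp.join(data_root, 'annotations/instances_train.json')  # DATASET_STRUCTURE['train']['annotation']
img_prefix = osp.join(data_root, 'images/train')                    # DATASET_STRUCTURE['train']['images']
```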
@@ -8,6 +8,7 @@ from datasets.info import DatasetInfo
 from datasets.packaged_modules import csv
 from datasets.utils.filelock import FileLock

+from modelscope.utils.constant import DownloadMode
 from modelscope.utils.logger import get_logger

 logger = get_logger()
@@ -26,11 +27,11 @@ class MsCsvDatasetBuilder(csv.Csv):
         zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None,
         **config_kwargs,
     ):
+        self.namespace = namespace
         super().__init__(
             cache_dir=cache_dir,
             name=subset_name,
             hash=hash,
-            namespace=namespace,
             data_files=meta_data_files,
             **config_kwargs)
@@ -56,6 +57,25 @@ class MsCsvDatasetBuilder(csv.Csv):
                 os.rmdir(self._cache_dir)
         self.zip_data_files = zip_data_files

+    def _relative_data_dir(self, with_version=True, with_hash=True) -> str:
+        """Relative path of this dataset in cache_dir:
+        Will be:
+            self.name/self.config.version/self.hash/
+        or if a namespace has been specified:
+            self.namespace___self.name/self.config.version/self.hash/
+        """
+        builder_data_dir = self.name if self.namespace is None else f'{self.namespace}___{self.name}'
+        builder_config = self.config
+        hash = self.hash
+        if builder_config:
+            builder_data_dir = os.path.join(builder_data_dir, self.config_id)
+        if with_version:
+            builder_data_dir = os.path.join(builder_data_dir,
+                                            str(self.config.version))
+        if with_hash and hash and isinstance(hash, str):
+            builder_data_dir = os.path.join(builder_data_dir, hash)
+        return builder_data_dir
+
     def _build_cache_dir(self):
         builder_data_dir = os.path.join(
             self._cache_dir_root,
@@ -77,8 +97,15 @@ class MsCsvDatasetBuilder(csv.Csv):
                 datasets.SplitGenerator(
                     name=split_name,
                     gen_kwargs={
-                        'files': dl_manager.iter_files(files),
-                        'base_dir': zip_data_files.get(split_name)
+                        'files':
+                        dl_manager.iter_files(files),
+                        'base_dir':
+                        os.path.join(
+                            zip_data_files.get(split_name),
+                            os.path.splitext(
+                                self.zip_data_files.get(split_name))[0])
+                        if self.zip_data_files.get(split_name) else
+                        zip_data_files.get(split_name)
                     }))
         return splits
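For splits that ship a zip archive, `base_dir` now points inside the extracted archive (the extraction directory joined with the archive name minus its extension) rather than at the extraction directory itself. With invented paths, and assuming the zip entry is a bare file name:

```python
import os

extracted_dir = '/cache/downloads/extracted/9f3a'  # zip_data_files.get('train') after extraction (invented)
zip_entry = 'train.zip'                            # self.zip_data_files.get('train') (assumed bare name)

base_dir = os.path.join(extracted_dir, os.path.splitext(zip_entry)[0])
print(base_dir)  # /cache/downloads/extracted/9f3a/train
```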
@@ -111,3 +138,65 @@ class MsCsvDatasetBuilder(csv.Csv):
                 logger.error(
                     f"Failed to read file '{file}' with error {type(e)}: {e}")
                 raise
+
+
+class TaskSpecificDatasetBuilder(MsCsvDatasetBuilder):
+
+    def __init__(
+        self,
+        dataset_name: str,
+        cache_dir: str,
+        namespace: str,
+        subset_name: str,
+        hash: str,
+        meta_data_files: Mapping[str, Union[str, Sequence[str]]],
+        zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None,
+        **config_kwargs,
+    ):
+        self.name = dataset_name
+        self.subset_name = subset_name
+        self.namespace = namespace
+        self.hash = hash
+        self.data_files = meta_data_files
+        self.zip_data_files = zip_data_files
+        self.split_path_dict = None
+        self.config = None
+        self._cache_dir_root = os.path.expanduser(cache_dir)
+        self._cache_dir = self._build_cache_dir()
+        self._config_kwargs = config_kwargs
+
+    def download_and_prepare(self, download_mode, dl_manager,
+                             **download_kwargs):
+        # Prevent parallel disk operations
+        lock_path = os.path.join(
+            self._cache_dir_root,
+            self._cache_dir.replace(os.sep, '_') + '.lock')
+        with FileLock(lock_path):
+            data_exists = os.path.exists(self._cache_dir)
+            if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS:
+                logger.warning(
+                    f'Reusing dataset {self.name} ({self._cache_dir})')
+                return
+            logger.info(f'Generating dataset {self.name} ({self._cache_dir})')
+            self._download_and_prepare(dl_manager=dl_manager)
+
+    def _download_and_prepare(self, dl_manager):
+        split_path_dict = dl_manager.download_and_extract(self.zip_data_files)
+        self.split_path_dict = {
+            k: os.path.join(v,
+                            os.path.splitext(self.zip_data_files[k])[0])
+            for k, v in split_path_dict.items()
+        }
+
+    def as_dataset(self):
+        return ExternalDataset(self.split_path_dict, self._config_kwargs)
+
+
+class ExternalDataset(object):
+
+    def __init__(self, split_path_dict, config_kwargs):
+        config_kwargs.update({'split_config': split_path_dict})
+        self.config_kwargs = config_kwargs
+
+    def __len__(self):
+        return len(self.config_kwargs['split_config'])
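`TaskSpecificDatasetBuilder` never builds an Arrow table: it only downloads and extracts the per-split archives, records where each split landed, and `as_dataset()` wraps that mapping together with the caller's kwargs in an `ExternalDataset`. A rough sketch of the resulting object, with invented paths:

```python
from modelscope.msdatasets.utils.dataset_builder import ExternalDataset

# Paths are invented for illustration.
split_path_dict = {
    'train': '/cache/extracted/ab12/train',
    'validation': '/cache/extracted/cd34/val',
}
external = ExternalDataset(split_path_dict, {'classes': ('Cat', 'Dog')})

print(external.config_kwargs)
# {'classes': ('Cat', 'Dog'),
#  'split_config': {'train': '/cache/extracted/ab12/train',
#                   'validation': '/cache/extracted/cd34/val'}}
print(len(external))  # 2, one entry per split
```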
@@ -6,7 +6,7 @@ from datasets.builder import DatasetBuilder

 from modelscope.utils.constant import DEFAULT_DATASET_REVISION
 from modelscope.utils.logger import get_logger
-from .dataset_builder import MsCsvDatasetBuilder
+from .dataset_builder import MsCsvDatasetBuilder, TaskSpecificDatasetBuilder

 logger = get_logger()
@@ -87,7 +87,7 @@ def get_dataset_files(subset_split_into: dict,
     modelscope_api = HubApi()
     for split, info in subset_split_into.items():
         meta_map[split] = modelscope_api.get_dataset_file_url(
-            info['meta'], dataset_name, namespace, revision)
+            info.get('meta', ''), dataset_name, namespace, revision)
         if info.get('file'):
             file_map[split] = info['file']
     return meta_map, file_map
@@ -99,15 +99,32 @@ def load_dataset_builder(dataset_name: str, subset_name: str, namespace: str,
                          zip_data_files: Mapping[str, Union[str,
                                                             Sequence[str]]],
                          cache_dir: str, version: Optional[Union[str]],
-                         split: Sequence[str]) -> DatasetBuilder:
+                         split: Sequence[str],
+                         **config_kwargs) -> DatasetBuilder:
     sub_dir = os.path.join(version, '_'.join(split))
-    builder_instance = MsCsvDatasetBuilder(
-        dataset_name=dataset_name,
-        namespace=namespace,
-        cache_dir=cache_dir,
-        subset_name=subset_name,
-        meta_data_files=meta_data_files,
-        zip_data_files=zip_data_files,
-        hash=sub_dir)
+    meta_data_file = next(iter(meta_data_files.values()))
+    if not meta_data_file:
+        builder_instance = TaskSpecificDatasetBuilder(
+            dataset_name=dataset_name,
+            namespace=namespace,
+            cache_dir=cache_dir,
+            subset_name=subset_name,
+            meta_data_files=meta_data_files,
+            zip_data_files=zip_data_files,
+            hash=sub_dir,
+            **config_kwargs)
+    elif meta_data_file.endswith('.csv'):
+        builder_instance = MsCsvDatasetBuilder(
+            dataset_name=dataset_name,
+            namespace=namespace,
+            cache_dir=cache_dir,
+            subset_name=subset_name,
+            meta_data_files=meta_data_files,
+            zip_data_files=zip_data_files,
+            hash=sub_dir)
+    else:
+        raise NotImplementedError(
+            f'Dataset meta file extension "{os.path.splitext(meta_data_file)[-1]}" is not implemented yet'
+        )

     return builder_instance
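Builder selection now keys off the first meta entry returned by `get_dataset_files`: an empty string (the dataset ships only raw archives) routes to `TaskSpecificDatasetBuilder`, a `.csv` meta keeps the existing csv path, and anything else raises. A condensed restatement, purely for a quick mental model (the helper name and URL are invented):

```python
def pick_builder(meta_data_files: dict) -> str:
    """Illustrative only: mirrors the dispatch in load_dataset_builder above."""
    meta_data_file = next(iter(meta_data_files.values()))
    if not meta_data_file:
        return 'TaskSpecificDatasetBuilder'   # zip-only dataset, e.g. a COCO-style layout
    if meta_data_file.endswith('.csv'):
        return 'MsCsvDatasetBuilder'
    raise NotImplementedError(meta_data_file)

assert pick_builder({'train': ''}) == 'TaskSpecificDatasetBuilder'
assert pick_builder({'train': 'https://example.com/train.csv'}) == 'MsCsvDatasetBuilder'
```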
@@ -22,7 +22,3 @@ class ImageInstanceSegmentationTrainer(EpochBasedTrainer):

     def prediction_step(self, model, inputs):
         pass
-
-    def to_task_dataset(self, datasets, mode, preprocessor=None):
-        # wait for dataset interface to become stable...
-        return datasets.to_torch_dataset(preprocessor)
@@ -202,7 +202,7 @@ class VecoTrainer(NlpEpochBasedTrainer):
         """Veco evaluates the datasets one by one.
         """
-        from modelscope.task_datasets import VecoDataset
+        from modelscope.msdatasets.task_datasets import VecoDataset
         self.model.eval()
         self._mode = ModeKeys.EVAL
         metric_values = {}
@@ -21,11 +21,12 @@ from modelscope.metainfo import Trainers
 from modelscope.metrics import build_metric, task_default_metrics
 from modelscope.models.base import Model, TorchModel
 from modelscope.msdatasets.ms_dataset import MsDataset
+from modelscope.msdatasets.task_datasets.builder import build_task_dataset
+from modelscope.msdatasets.task_datasets.torch_base_dataset import \
+    TorchTaskDataset
 from modelscope.preprocessors.base import Preprocessor
 from modelscope.preprocessors.builder import build_preprocessor
 from modelscope.preprocessors.common import Compose
-from modelscope.task_datasets.builder import build_task_dataset
-from modelscope.task_datasets.torch_base_dataset import TorchTaskDataset
 from modelscope.trainers.hooks.builder import HOOKS
 from modelscope.trainers.hooks.priority import Priority, get_priority
 from modelscope.trainers.lrscheduler.builder import build_lr_scheduler
@@ -288,14 +289,21 @@ class EpochBasedTrainer(BaseTrainer):
             if isinstance(datasets, TorchTaskDataset):
                 return datasets
             elif isinstance(datasets, MsDataset):
-                datasets = datasets.to_torch_dataset(
+                cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \
+                    else ConfigDict(type=None, mode=mode)
+                return datasets.to_torch_dataset(
+                    task_data_config=cfg,
+                    task_name=self.cfg.task,
                     preprocessors=preprocessor)
-                return datasets
             elif isinstance(datasets, List) and isinstance(
                     datasets[0], MsDataset):
+                cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \
+                    else ConfigDict(type=None, mode=mode)
                 datasets = [
-                    d.to_torch_dataset(preprocessor=preprocessor)
-                    for d in datasets
+                    d.to_torch_dataset(
+                        task_data_config=cfg,
+                        task_name=self.cfg.task,
+                        preprocessors=preprocessor) for d in datasets
                 ]
                 cfg = ConfigDict(
                     type=self.cfg.task, mode=mode, datasets=datasets)
@@ -585,8 +593,13 @@ class EpochBasedTrainer(BaseTrainer):
             subset_name=data_cfg.subset_name if hasattr(
                 data_cfg, 'subset_name') else None,
             hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope,
+            **data_cfg,
         )
-        torch_dataset = dataset.to_torch_dataset(preprocessors=preprocessor)
+        cfg = ConfigDict(type=self.cfg.model.type, mode=mode)
+        torch_dataset = dataset.to_torch_dataset(
+            task_data_config=cfg,
+            task_name=self.cfg.task,
+            preprocessors=self.preprocessor)
         dataset = self.to_task_dataset(torch_dataset, mode)
         return dataset
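With `**data_cfg` forwarded into `MsDataset.load`, any extra keys in the configuration's dataset section become the `config_kwargs` of the task dataset. A hypothetical `cfg.dataset.train` entry, mirroring the toy config used in the segmentation trainer test further down:

```python
from modelscope.utils.config import ConfigDict

# Hypothetical configuration fragment; keys other than name/split/subset_name/hub
# are passed through **data_cfg -> MsDataset.load(**config_kwargs) -> the task dataset.
train_data_cfg = ConfigDict(
    name='pets_small',
    split='train',
    classes=('Cat', 'Dog'),   # consumed by ImageInstanceSegmentationCocoDataset
    test_mode=False)
```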
@@ -30,8 +30,8 @@ MODELSCOPE_PATH = '/'.join(os.path.dirname(__file__).split('/')[:-1])
 REGISTER_MODULE = 'register_module'
 IGNORED_PACKAGES = ['modelscope', '.']
 SCAN_SUB_FOLDERS = [
-    'models', 'metrics', 'pipelines', 'preprocessors', 'task_datasets',
-    'trainers'
+    'models', 'metrics', 'pipelines', 'preprocessors',
+    'msdatasets/task_datasets', 'trainers'
 ]
 INDEXER_FILE = 'ast_indexer'
 DECORATOR_KEY = 'decorators'
@@ -1,6 +1,5 @@
 addict
-#version above 2.1.0 introduces backward-compatability issue which is being resolved
-datasets==2.1.0
+datasets
 easydict
 einops
 filelock>=3.3.0
@@ -4,6 +4,7 @@ from modelscope.models import Model
 from modelscope.msdatasets import MsDataset
 from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.preprocessors.base import Preprocessor
+from modelscope.utils.constant import DownloadMode
 from modelscope.utils.test_utils import require_tf, require_torch, test_level
@@ -30,6 +31,16 @@ class ImgPreprocessor(Preprocessor):

 class MsDatasetTest(unittest.TestCase):

+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_coco(self):
+        ms_ds_train = MsDataset.load(
+            'pets_small',
+            namespace='modelscope',
+            split='train',
+            download_mode=DownloadMode.FORCE_REDOWNLOAD,
+            classes=('1', '2'))
+        print(ms_ds_train._hf_ds.config_kwargs)
+
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_ms_csv_basic(self):
         ms_ds_train = MsDataset.load(
@@ -2,7 +2,7 @@
 import unittest

-from modelscope.task_datasets.veco_dataset import VecoDataset
+from modelscope.msdatasets.task_datasets.veco_dataset import VecoDataset
 from modelscope.utils.test_utils import test_level
@@ -8,10 +8,13 @@ from functools import partial

 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.metainfo import Trainers
-from modelscope.models.cv.image_instance_segmentation import (
-    CascadeMaskRCNNSwinModel, ImageInstanceSegmentationCocoDataset)
+from modelscope.models.cv.image_instance_segmentation import \
+    CascadeMaskRCNNSwinModel
+from modelscope.msdatasets import MsDataset
+from modelscope.msdatasets.task_datasets import \
+    ImageInstanceSegmentationCocoDataset
 from modelscope.trainers import build_trainer
-from modelscope.utils.config import Config
+from modelscope.utils.config import Config, ConfigDict
 from modelscope.utils.constant import ModelFile
 from modelscope.utils.test_utils import test_level
@@ -27,34 +30,47 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase):
         config_path = os.path.join(cache_path, ModelFile.CONFIGURATION)
         cfg = Config.from_file(config_path)

-        data_root = cfg.dataset.data_root
-        classes = tuple(cfg.dataset.classes)
         max_epochs = cfg.train.max_epochs
         samples_per_gpu = cfg.train.dataloader.batch_size_per_gpu

-        if data_root is None:
+        try:
+            train_data_cfg = cfg.dataset.train
+            val_data_cfg = cfg.dataset.val
+        except Exception:
+            train_data_cfg = None
+            val_data_cfg = None
+
+        if train_data_cfg is None:
             # use default toy data
-            dataset_path = os.path.join(cache_path, 'toydata.zip')
-            with zipfile.ZipFile(dataset_path, 'r') as zipf:
-                zipf.extractall(cache_path)
-            data_root = cache_path + '/toydata/'
-            classes = ('Cat', 'Dog')
-
-        self.train_dataset = ImageInstanceSegmentationCocoDataset(
-            data_root + 'annotations/instances_train.json',
-            classes=classes,
-            data_root=data_root,
-            img_prefix=data_root + 'images/train/',
-            seg_prefix=None,
-            test_mode=False)
-
-        self.eval_dataset = ImageInstanceSegmentationCocoDataset(
-            data_root + 'annotations/instances_val.json',
-            classes=classes,
-            data_root=data_root,
-            img_prefix=data_root + 'images/val/',
-            seg_prefix=None,
-            test_mode=True)
+            train_data_cfg = ConfigDict(
+                name='pets_small',
+                split='train',
+                classes=('Cat', 'Dog'),
+                test_mode=False)
+        if val_data_cfg is None:
+            val_data_cfg = ConfigDict(
+                name='pets_small',
+                split='validation',
+                classes=('Cat', 'Dog'),
+                test_mode=True)
+
+        self.train_dataset = MsDataset.load(
+            dataset_name=train_data_cfg.name,
+            split=train_data_cfg.split,
+            classes=train_data_cfg.classes,
+            test_mode=train_data_cfg.test_mode)
+        assert self.train_dataset.config_kwargs[
+            'classes'] == train_data_cfg.classes
+        assert next(
+            iter(self.train_dataset.config_kwargs['split_config'].values()))
+
+        self.eval_dataset = MsDataset.load(
+            dataset_name=val_data_cfg.name,
+            split=val_data_cfg.split,
+            classes=val_data_cfg.classes,
+            test_mode=val_data_cfg.test_mode)
+        assert self.eval_dataset.config_kwargs[
+            'classes'] == val_data_cfg.classes
+        assert next(
+            iter(self.eval_dataset.config_kwargs['split_config'].values()))

         from mmcv.parallel import collate