msdataset: add coco dataset, unify task dataset and ms dataset, fix hf datasets
@@ -362,8 +362,10 @@ class HubApi: | |||
dataset_name: str, | |||
namespace: str, | |||
revision: Optional[str] = DEFAULT_DATASET_REVISION): | |||
return f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ | |||
f'Revision={revision}&FilePath={file_name}' | |||
if file_name.endswith('.csv'): | |||
file_name = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ | |||
f'Revision={revision}&FilePath={file_name}' | |||
return file_name | |||
def get_dataset_access_config( | |||
self, | |||
@@ -7,13 +7,11 @@ if TYPE_CHECKING: | |||
from .cascade_mask_rcnn_swin import CascadeMaskRCNNSwin | |||
from .model import CascadeMaskRCNNSwinModel | |||
from .postprocess_utils import get_img_ins_seg_result | |||
from .datasets import ImageInstanceSegmentationCocoDataset | |||
else: | |||
_import_structure = { | |||
'cascade_mask_rcnn_swin': ['CascadeMaskRCNNSwin'], | |||
'model': ['CascadeMaskRCNNSwinModel'], | |||
'postprocess_utils': ['get_img_ins_seg_result'], | |||
'datasets': ['ImageInstanceSegmentationCocoDataset'] | |||
} | |||
import sys | |||
@@ -1,2 +1 @@ | |||
from .dataset import ImageInstanceSegmentationCocoDataset | |||
from .transforms import build_preprocess_transform |
@@ -13,9 +13,12 @@ from datasets.utils.file_utils import (is_relative_path, | |||
relative_to_absolute_path) | |||
from modelscope.msdatasets.config import MS_DATASETS_CACHE | |||
from modelscope.utils.config import ConfigDict | |||
from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, | |||
DatasetFormations, DownloadMode, Hubs) | |||
from modelscope.utils.logger import get_logger | |||
from .task_datasets.builder import build_task_dataset | |||
from .utils.dataset_builder import ExternalDataset | |||
from .utils.dataset_utils import (get_dataset_files, | |||
get_target_dataset_structure, | |||
load_dataset_builder) | |||
@@ -67,9 +70,16 @@ class MsDataset: | |||
def __len__(self): | |||
return len(self._hf_ds) | |||
@property | |||
def config_kwargs(self): | |||
if isinstance(self._hf_ds, ExternalDataset): | |||
return self._hf_ds.config_kwargs | |||
else: | |||
return None | |||
@classmethod | |||
def from_hf_dataset(cls, | |||
hf_ds: Union[Dataset, DatasetDict], | |||
hf_ds: Union[Dataset, DatasetDict, ExternalDataset], | |||
target: str = None) -> Union[dict, 'MsDataset']: | |||
if isinstance(hf_ds, Dataset): | |||
return cls(hf_ds, target) | |||
@@ -77,6 +87,8 @@ class MsDataset: | |||
if len(hf_ds.keys()) == 1: | |||
return cls(next(iter(hf_ds.values())), target) | |||
return {k: cls(v, target) for k, v in hf_ds.items()} | |||
elif isinstance(hf_ds, ExternalDataset): | |||
return cls(hf_ds) | |||
else: | |||
raise TypeError( | |||
f'"hf_ds" must be a Dataset or DatasetDict, but got {type(hf_ds)}' | |||
@@ -96,7 +108,8 @@ class MsDataset: | |||
Mapping[str, Union[str, | |||
Sequence[str]]]]] = None, | |||
download_mode: Optional[DownloadMode] = DownloadMode. | |||
REUSE_DATASET_IF_EXISTS | |||
REUSE_DATASET_IF_EXISTS, | |||
**config_kwargs, | |||
) -> Union[dict, 'MsDataset']: | |||
"""Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset. | |||
Args: | |||
@@ -113,6 +126,7 @@ class MsDataset: | |||
hub (Hubs or str, optional): When loading from a remote hub, where it is from. default Hubs.modelscope | |||
download_mode (DownloadMode or str, optional): How to treat existing datasets. default | |||
DownloadMode.REUSE_DATASET_IF_EXISTS | |||
**config_kwargs (additional keyword arguments): Keyword arguments to be passed to the dataset builder.
Returns: | |||
MsDataset (obj:`MsDataset`): MsDataset object for a certain dataset. | |||
@@ -128,7 +142,8 @@ class MsDataset: | |||
split=split, | |||
data_dir=data_dir, | |||
data_files=data_files, | |||
download_mode=download_mode.value) | |||
download_mode=download_mode.value, | |||
**config_kwargs) | |||
return MsDataset.from_hf_dataset(dataset, target=target) | |||
elif hub == Hubs.modelscope: | |||
return MsDataset._load_ms_dataset( | |||
@@ -140,22 +155,22 @@ class MsDataset: | |||
split=split, | |||
data_dir=data_dir, | |||
data_files=data_files, | |||
download_mode=download_mode) | |||
download_mode=download_mode, | |||
**config_kwargs) | |||
@staticmethod | |||
def _load_ms_dataset( | |||
dataset_name: Union[str, list], | |||
namespace: Optional[str] = None, | |||
target: Optional[str] = None, | |||
version: Optional[str] = DEFAULT_DATASET_REVISION, | |||
subset_name: Optional[str] = None, | |||
split: Optional[str] = None, | |||
data_dir: Optional[str] = None, | |||
data_files: Optional[Union[str, Sequence[str], | |||
Mapping[str, Union[str, | |||
Sequence[str]]]]] = None, | |||
download_mode: Optional[DownloadMode] = None | |||
) -> Union[dict, 'MsDataset']: | |||
def _load_ms_dataset(dataset_name: Union[str, list], | |||
namespace: Optional[str] = None, | |||
target: Optional[str] = None, | |||
version: Optional[str] = DEFAULT_DATASET_REVISION, | |||
subset_name: Optional[str] = None, | |||
split: Optional[str] = None, | |||
data_dir: Optional[str] = None, | |||
data_files: Optional[Union[ | |||
str, Sequence[str], | |||
Mapping[str, Union[str, Sequence[str]]]]] = None, | |||
download_mode: Optional[DownloadMode] = None, | |||
**config_kwargs) -> Union[dict, 'MsDataset']: | |||
if isinstance(dataset_name, str): | |||
dataset_formation = DatasetFormations.native | |||
if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \ | |||
@@ -184,7 +199,8 @@ class MsDataset: | |||
data_dir=data_dir, | |||
data_files=data_files, | |||
cache_dir=MS_DATASETS_CACHE, | |||
download_mode=download_mode.value) | |||
download_mode=download_mode.value, | |||
**config_kwargs) | |||
else: | |||
dataset = MsDataset._load_from_ms( | |||
dataset_name, | |||
@@ -195,7 +211,7 @@ class MsDataset: | |||
subset_name=subset_name, | |||
split=split, | |||
download_mode=download_mode, | |||
) | |||
**config_kwargs) | |||
elif isinstance(dataset_name, list): | |||
if target is None: | |||
target = 'target' | |||
@@ -206,16 +222,15 @@ class MsDataset: | |||
return MsDataset.from_hf_dataset(dataset, target=target) | |||
@staticmethod | |||
def _load_from_ms( | |||
dataset_name: str, | |||
dataset_files: dict, | |||
download_dir: str, | |||
namespace: Optional[str] = None, | |||
version: Optional[str] = DEFAULT_DATASET_REVISION, | |||
subset_name: Optional[str] = None, | |||
split: Optional[str] = None, | |||
download_mode: Optional[DownloadMode] = None, | |||
) -> Union[Dataset, DatasetDict]: | |||
def _load_from_ms(dataset_name: str, | |||
dataset_files: dict, | |||
download_dir: str, | |||
namespace: Optional[str] = None, | |||
version: Optional[str] = DEFAULT_DATASET_REVISION, | |||
subset_name: Optional[str] = None, | |||
split: Optional[str] = None, | |||
download_mode: Optional[DownloadMode] = None, | |||
**config_kwargs) -> Union[Dataset, DatasetDict]: | |||
for json_path in dataset_files['.json']: | |||
if json_path.endswith(f'{dataset_name}.json'): | |||
with open(json_path, encoding='utf-8') as dataset_json_file: | |||
@@ -226,7 +241,6 @@ class MsDataset: | |||
meta_map, file_map = get_dataset_files(target_dataset_structure, | |||
dataset_name, namespace, | |||
version) | |||
builder = load_dataset_builder( | |||
dataset_name, | |||
subset_name, | |||
@@ -235,7 +249,8 @@ class MsDataset: | |||
zip_data_files=file_map, | |||
cache_dir=MS_DATASETS_CACHE, | |||
version=version, | |||
split=list(target_dataset_structure.keys())) | |||
split=list(target_dataset_structure.keys()), | |||
**config_kwargs) | |||
download_config = DownloadConfig( | |||
cache_dir=download_dir, | |||
@@ -253,7 +268,6 @@ class MsDataset: | |||
data_dir=download_dir, | |||
) | |||
builder.download_and_prepare( | |||
download_config=download_config, | |||
dl_manager=dl_manager, | |||
download_mode=download_mode.value, | |||
try_from_hf_gcs=False) | |||
@@ -338,6 +352,8 @@ class MsDataset: | |||
self, | |||
columns: Union[str, List[str]] = None, | |||
preprocessors: Union[Callable, List[Callable]] = None, | |||
task_name: str = None, | |||
task_data_config: ConfigDict = None, | |||
**format_kwargs, | |||
): | |||
"""Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to | |||
@@ -350,6 +366,8 @@ class MsDataset: | |||
columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only). If the | |||
preprocessor is None, the arg columns must have at least one column. If the `preprocessors` is not None, | |||
the output fields of processors will also be added. | |||
task_name (str, default None): task name, refer to :obj:`Tasks` for more details | |||
task_data_config (ConfigDict, default None): config dict used to build the task-specific dataset; its `type` is typically the model type.
format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`. | |||
Returns: | |||
@@ -360,6 +378,10 @@ class MsDataset: | |||
raise ImportError( | |||
'The function to_torch_dataset requires pytorch to be installed' | |||
) | |||
if isinstance(self._hf_ds, ExternalDataset): | |||
task_data_config.update({'preprocessor': preprocessors}) | |||
return build_task_dataset(task_data_config, task_name, | |||
self._hf_ds.config_kwargs) | |||
if preprocessors is not None: | |||
return self.to_torch_dataset_with_processors( | |||
preprocessors, columns=columns) | |||
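The end-to-end flow of the new **config_kwargs plumbing can be summarised in a short sketch. It is adapted from the tests added in this PR ('pets_small' under the 'modelscope' namespace, classes taken from the trainer test); it assumes the dataset can be downloaded from the ModelScope hub, and mode='train' is only an illustrative extra key.

from modelscope.metainfo import Models
from modelscope.msdatasets import MsDataset
from modelscope.utils.config import ConfigDict
from modelscope.utils.constant import DownloadMode, Tasks

# 'pets_small' ships no csv meta file, so the loaded object wraps an
# ExternalDataset and the extra keyword arguments (classes=...) are kept
# in config_kwargs instead of being parsed into a hf Dataset.
ms_ds_train = MsDataset.load(
    'pets_small',
    namespace='modelscope',
    split='train',
    download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
    classes=('Cat', 'Dog'))
print(ms_ds_train.config_kwargs)  # includes 'classes' and 'split_config'

# For an ExternalDataset, to_torch_dataset() dispatches to the task dataset
# registered for the given task (here the coco-style dataset registered
# under Tasks.image_segmentation / Models.cascade_mask_rcnn_swin).
task_ds = ms_ds_train.to_torch_dataset(
    task_name=Tasks.image_segmentation,
    task_data_config=ConfigDict(
        type=Models.cascade_mask_rcnn_swin, mode='train'))
print(len(task_ds))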
@@ -8,6 +8,7 @@ if TYPE_CHECKING: | |||
from .builder import TASK_DATASETS, build_task_dataset | |||
from .torch_base_dataset import TorchTaskDataset | |||
from .veco_dataset import VecoDataset | |||
from .image_instance_segmentation_coco_dataset import ImageInstanceSegmentationCocoDataset | |||
else: | |||
_import_structure = { | |||
@@ -15,6 +16,8 @@ else: | |||
'builder': ['TASK_DATASETS', 'build_task_dataset'], | |||
'torch_base_dataset': ['TorchTaskDataset'], | |||
'veco_dataset': ['VecoDataset'], | |||
'image_instance_segmentation_coco_dataset': | |||
['ImageInstanceSegmentationCocoDataset'] | |||
} | |||
import sys | |||
@@ -2,14 +2,32 @@ import os.path as osp | |||
import numpy as np | |||
from pycocotools.coco import COCO | |||
from torch.utils.data import Dataset | |||
class ImageInstanceSegmentationCocoDataset(Dataset): | |||
from modelscope.metainfo import Models | |||
from modelscope.utils.constant import Tasks | |||
from .builder import TASK_DATASETS | |||
from .torch_base_dataset import TorchTaskDataset | |||
DATASET_STRUCTURE = { | |||
'train': { | |||
'annotation': 'annotations/instances_train.json', | |||
'images': 'images/train' | |||
}, | |||
'validation': { | |||
'annotation': 'annotations/instances_val.json', | |||
'images': 'images/val' | |||
} | |||
} | |||
@TASK_DATASETS.register_module( | |||
module_name=Models.cascade_mask_rcnn_swin, | |||
group_key=Tasks.image_segmentation) | |||
class ImageInstanceSegmentationCocoDataset(TorchTaskDataset): | |||
"""Coco-style dataset for image instance segmentation. | |||
Args: | |||
ann_file (str): Annotation file path. | |||
split_config (dict): Mapping from split name to the root of the extracted data, e.g. {"train": "xxxxx"}.
classes (Sequence[str], optional): Specify classes to load. | |||
If is None, ``cls.CLASSES`` will be used. Default: None. | |||
data_root (str, optional): Data root for ``ann_file``, | |||
@@ -37,30 +55,27 @@ class ImageInstanceSegmentationCocoDataset(Dataset): | |||
'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush') | |||
def __init__(self, | |||
ann_file, | |||
split_config: dict, | |||
preprocessor=None, | |||
classes=None, | |||
data_root=None, | |||
img_prefix='', | |||
seg_prefix=None, | |||
test_mode=False, | |||
filter_empty_gt=True): | |||
self.ann_file = ann_file | |||
self.data_root = data_root | |||
self.img_prefix = img_prefix | |||
filter_empty_gt=True, | |||
**kwargs): | |||
self.data_root = next(iter(split_config.values())) | |||
self.split = next(iter(split_config.keys())) | |||
self.preprocessor = preprocessor | |||
self.ann_file = osp.join(self.data_root, | |||
DATASET_STRUCTURE[self.split]['annotation']) | |||
self.img_prefix = osp.join(self.data_root, | |||
DATASET_STRUCTURE[self.split]['images']) | |||
self.seg_prefix = seg_prefix | |||
self.test_mode = test_mode | |||
self.filter_empty_gt = filter_empty_gt | |||
self.CLASSES = self.get_classes(classes) | |||
# join paths if data_root is specified | |||
if self.data_root is not None: | |||
if not osp.isabs(self.ann_file): | |||
self.ann_file = osp.join(self.data_root, self.ann_file) | |||
if not (self.img_prefix is None or osp.isabs(self.img_prefix)): | |||
self.img_prefix = osp.join(self.data_root, self.img_prefix) | |||
if not (self.seg_prefix is None or osp.isabs(self.seg_prefix)): | |||
self.seg_prefix = osp.join(self.data_root, self.seg_prefix) | |||
# load annotations | |||
self.data_infos = self.load_annotations(self.ann_file) | |||
@@ -71,8 +86,6 @@ class ImageInstanceSegmentationCocoDataset(Dataset): | |||
# set group flag for the sampler | |||
self._set_group_flag() | |||
self.preprocessor = None | |||
def __len__(self): | |||
"""Total number of samples of data.""" | |||
return len(self.data_infos) | |||
@@ -326,7 +339,3 @@ class ImageInstanceSegmentationCocoDataset(Dataset): | |||
raise ValueError(f'Unsupported type {type(classes)} of classes.') | |||
return class_names | |||
def to_torch_dataset(self, preprocessors=None): | |||
self.preprocessor = preprocessors | |||
return self |
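Since path resolution moved from explicit ann_file/img_prefix arguments to split_config plus DATASET_STRUCTURE, the dataset can now be built directly from a split-to-directory mapping. A minimal sketch, where '/path/to/extracted/train' is a placeholder that is assumed to follow the DATASET_STRUCTURE layout (annotations/instances_train.json and images/train):

from modelscope.msdatasets.task_datasets import ImageInstanceSegmentationCocoDataset

# split_config maps one split name to the root of its extracted archive;
# ann_file and img_prefix are derived from DATASET_STRUCTURE at __init__ time.
train_ds = ImageInstanceSegmentationCocoDataset(
    split_config={'train': '/path/to/extracted/train'},
    classes=('Cat', 'Dog'),
    test_mode=False)
print(train_ds.ann_file)    # .../annotations/instances_train.json
print(train_ds.img_prefix)  # .../images/train
print(len(train_ds))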
@@ -8,6 +8,7 @@ from datasets.info import DatasetInfo | |||
from datasets.packaged_modules import csv | |||
from datasets.utils.filelock import FileLock | |||
from modelscope.utils.constant import DownloadMode | |||
from modelscope.utils.logger import get_logger | |||
logger = get_logger() | |||
@@ -26,11 +27,11 @@ class MsCsvDatasetBuilder(csv.Csv): | |||
zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None, | |||
**config_kwargs, | |||
): | |||
self.namespace = namespace | |||
super().__init__( | |||
cache_dir=cache_dir, | |||
name=subset_name, | |||
hash=hash, | |||
namespace=namespace, | |||
data_files=meta_data_files, | |||
**config_kwargs) | |||
@@ -56,6 +57,25 @@ class MsCsvDatasetBuilder(csv.Csv): | |||
os.rmdir(self._cache_dir) | |||
self.zip_data_files = zip_data_files | |||
def _relative_data_dir(self, with_version=True, with_hash=True) -> str: | |||
"""Relative path of this dataset in cache_dir: | |||
Will be: | |||
self.name/self.config.version/self.hash/ | |||
or if a namespace has been specified: | |||
self.namespace___self.name/self.config.version/self.hash/ | |||
""" | |||
builder_data_dir = self.name if self.namespace is None else f'{self.namespace}___{self.name}' | |||
builder_config = self.config | |||
hash = self.hash | |||
if builder_config: | |||
builder_data_dir = os.path.join(builder_data_dir, self.config_id) | |||
if with_version: | |||
builder_data_dir = os.path.join(builder_data_dir, | |||
str(self.config.version)) | |||
if with_hash and hash and isinstance(hash, str): | |||
builder_data_dir = os.path.join(builder_data_dir, hash) | |||
return builder_data_dir | |||
def _build_cache_dir(self): | |||
builder_data_dir = os.path.join( | |||
self._cache_dir_root, | |||
@@ -77,8 +97,15 @@ class MsCsvDatasetBuilder(csv.Csv): | |||
datasets.SplitGenerator( | |||
name=split_name, | |||
gen_kwargs={ | |||
'files': dl_manager.iter_files(files), | |||
'base_dir': zip_data_files.get(split_name) | |||
'files': | |||
dl_manager.iter_files(files), | |||
'base_dir': | |||
os.path.join( | |||
zip_data_files.get(split_name), | |||
os.path.splitext( | |||
self.zip_data_files.get(split_name))[0]) | |||
if self.zip_data_files.get(split_name) else | |||
zip_data_files.get(split_name) | |||
})) | |||
return splits | |||
@@ -111,3 +138,65 @@ class MsCsvDatasetBuilder(csv.Csv): | |||
logger.error( | |||
f"Failed to read file '{file}' with error {type(e)}: {e}") | |||
raise | |||
class TaskSpecificDatasetBuilder(MsCsvDatasetBuilder): | |||
def __init__( | |||
self, | |||
dataset_name: str, | |||
cache_dir: str, | |||
namespace: str, | |||
subset_name: str, | |||
hash: str, | |||
meta_data_files: Mapping[str, Union[str, Sequence[str]]], | |||
zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None, | |||
**config_kwargs, | |||
): | |||
self.name = dataset_name | |||
self.subset_name = subset_name | |||
self.namespace = namespace | |||
self.hash = hash | |||
self.data_files = meta_data_files | |||
self.zip_data_files = zip_data_files | |||
self.split_path_dict = None | |||
self.config = None | |||
self._cache_dir_root = os.path.expanduser(cache_dir) | |||
self._cache_dir = self._build_cache_dir() | |||
self._config_kwargs = config_kwargs | |||
def download_and_prepare(self, download_mode, dl_manager, | |||
**download_kwargs): | |||
# Prevent parallel disk operations | |||
lock_path = os.path.join( | |||
self._cache_dir_root, | |||
self._cache_dir.replace(os.sep, '_') + '.lock') | |||
with FileLock(lock_path): | |||
data_exists = os.path.exists(self._cache_dir) | |||
if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS: | |||
logger.warning( | |||
f'Reusing dataset {self.name} ({self._cache_dir})') | |||
return | |||
logger.info(f'Generating dataset {self.name} ({self._cache_dir})') | |||
self._download_and_prepare(dl_manager=dl_manager) | |||
def _download_and_prepare(self, dl_manager): | |||
split_path_dict = dl_manager.download_and_extract(self.zip_data_files) | |||
self.split_path_dict = { | |||
k: os.path.join(v, | |||
os.path.splitext(self.zip_data_files[k])[0]) | |||
for k, v in split_path_dict.items() | |||
} | |||
def as_dataset(self): | |||
return ExternalDataset(self.split_path_dict, self._config_kwargs) | |||
class ExternalDataset(object): | |||
def __init__(self, split_path_dict, config_kwargs): | |||
config_kwargs.update({'split_config': split_path_dict}) | |||
self.config_kwargs = config_kwargs | |||
def __len__(self): | |||
return len(self.config_kwargs['split_config']) |
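The TaskSpecificDatasetBuilder skips Arrow generation entirely: download_and_prepare only downloads and extracts the zip archives, and as_dataset() returns an ExternalDataset that merely carries the split directories together with the user's config_kwargs. A small self-contained sketch of that container (paths are illustrative):

from modelscope.msdatasets.utils.dataset_builder import ExternalDataset

split_path_dict = {
    'train': '/cache/pets_small/train',        # extracted train archive
    'validation': '/cache/pets_small/val',     # extracted validation archive
}
ext_ds = ExternalDataset(split_path_dict, {'classes': ('Cat', 'Dog')})

print(len(ext_ds))            # number of splits: 2
print(ext_ds.config_kwargs)   # {'classes': (...), 'split_config': {...}}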
@@ -6,7 +6,7 @@ from datasets.builder import DatasetBuilder | |||
from modelscope.utils.constant import DEFAULT_DATASET_REVISION | |||
from modelscope.utils.logger import get_logger | |||
from .dataset_builder import MsCsvDatasetBuilder | |||
from .dataset_builder import MsCsvDatasetBuilder, TaskSpecificDatasetBuilder | |||
logger = get_logger() | |||
@@ -87,7 +87,7 @@ def get_dataset_files(subset_split_into: dict, | |||
modelscope_api = HubApi() | |||
for split, info in subset_split_into.items(): | |||
meta_map[split] = modelscope_api.get_dataset_file_url( | |||
info['meta'], dataset_name, namespace, revision) | |||
info.get('meta', ''), dataset_name, namespace, revision) | |||
if info.get('file'): | |||
file_map[split] = info['file'] | |||
return meta_map, file_map | |||
@@ -99,15 +99,32 @@ def load_dataset_builder(dataset_name: str, subset_name: str, namespace: str, | |||
zip_data_files: Mapping[str, Union[str, | |||
Sequence[str]]], | |||
cache_dir: str, version: Optional[Union[str]], | |||
split: Sequence[str]) -> DatasetBuilder: | |||
split: Sequence[str], | |||
**config_kwargs) -> DatasetBuilder: | |||
sub_dir = os.path.join(version, '_'.join(split)) | |||
builder_instance = MsCsvDatasetBuilder( | |||
dataset_name=dataset_name, | |||
namespace=namespace, | |||
cache_dir=cache_dir, | |||
subset_name=subset_name, | |||
meta_data_files=meta_data_files, | |||
zip_data_files=zip_data_files, | |||
hash=sub_dir) | |||
meta_data_file = next(iter(meta_data_files.values())) | |||
if not meta_data_file: | |||
builder_instance = TaskSpecificDatasetBuilder( | |||
dataset_name=dataset_name, | |||
namespace=namespace, | |||
cache_dir=cache_dir, | |||
subset_name=subset_name, | |||
meta_data_files=meta_data_files, | |||
zip_data_files=zip_data_files, | |||
hash=sub_dir, | |||
**config_kwargs) | |||
elif meta_data_file.endswith('.csv'): | |||
builder_instance = MsCsvDatasetBuilder( | |||
dataset_name=dataset_name, | |||
namespace=namespace, | |||
cache_dir=cache_dir, | |||
subset_name=subset_name, | |||
meta_data_files=meta_data_files, | |||
zip_data_files=zip_data_files, | |||
hash=sub_dir) | |||
else: | |||
raise NotImplementedError( | |||
f'Dataset meta file extension "{os.path.splitext(meta_data_file)[-1]}" is not implemented yet'
) | |||
return builder_instance |
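load_dataset_builder now inspects the meta file of the first split to choose a builder: an empty meta entry selects TaskSpecificDatasetBuilder, a '.csv' meta keeps the previous MsCsvDatasetBuilder, and anything else raises NotImplementedError. A hedged sketch of the dispatch (dataset name, file maps and cache directory are illustrative placeholders):

from modelscope.msdatasets.utils.dataset_utils import load_dataset_builder

builder = load_dataset_builder(
    dataset_name='pets_small',
    subset_name=None,
    namespace='modelscope',
    meta_data_files={'train': ''},           # no csv meta -> task-specific builder
    zip_data_files={'train': 'train.zip'},
    cache_dir='/tmp/ms_datasets_cache',
    version='master',
    split=['train'],
    classes=('Cat', 'Dog'))                  # forwarded as **config_kwargs
print(type(builder).__name__)                # TaskSpecificDatasetBuilder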
@@ -22,7 +22,3 @@ class ImageInstanceSegmentationTrainer(EpochBasedTrainer): | |||
def prediction_step(self, model, inputs): | |||
pass | |||
def to_task_dataset(self, datasets, mode, preprocessor=None): | |||
# wait for dataset interface to become stable... | |||
return datasets.to_torch_dataset(preprocessor) |
@@ -202,7 +202,7 @@ class VecoTrainer(NlpEpochBasedTrainer): | |||
"""Veco evaluates the datasets one by one. | |||
""" | |||
from modelscope.task_datasets import VecoDataset | |||
from modelscope.msdatasets.task_datasets import VecoDataset | |||
self.model.eval() | |||
self._mode = ModeKeys.EVAL | |||
metric_values = {} | |||
@@ -21,11 +21,12 @@ from modelscope.metainfo import Trainers | |||
from modelscope.metrics import build_metric, task_default_metrics | |||
from modelscope.models.base import Model, TorchModel | |||
from modelscope.msdatasets.ms_dataset import MsDataset | |||
from modelscope.msdatasets.task_datasets.builder import build_task_dataset | |||
from modelscope.msdatasets.task_datasets.torch_base_dataset import \ | |||
TorchTaskDataset | |||
from modelscope.preprocessors.base import Preprocessor | |||
from modelscope.preprocessors.builder import build_preprocessor | |||
from modelscope.preprocessors.common import Compose | |||
from modelscope.task_datasets.builder import build_task_dataset | |||
from modelscope.task_datasets.torch_base_dataset import TorchTaskDataset | |||
from modelscope.trainers.hooks.builder import HOOKS | |||
from modelscope.trainers.hooks.priority import Priority, get_priority | |||
from modelscope.trainers.lrscheduler.builder import build_lr_scheduler | |||
@@ -288,14 +289,21 @@ class EpochBasedTrainer(BaseTrainer): | |||
if isinstance(datasets, TorchTaskDataset): | |||
return datasets | |||
elif isinstance(datasets, MsDataset): | |||
datasets = datasets.to_torch_dataset( | |||
cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \ | |||
else ConfigDict(type=None, mode=mode) | |||
return datasets.to_torch_dataset( | |||
task_data_config=cfg, | |||
task_name=self.cfg.task, | |||
preprocessors=preprocessor) | |||
return datasets | |||
elif isinstance(datasets, List) and isinstance( | |||
datasets[0], MsDataset): | |||
cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \ | |||
else ConfigDict(type=None, mode=mode) | |||
datasets = [ | |||
d.to_torch_dataset(preprocessor=preprocessor) | |||
for d in datasets | |||
d.to_torch_dataset( | |||
task_data_config=cfg, | |||
task_name=self.cfg.task, | |||
preprocessors=preprocessor) for d in datasets | |||
] | |||
cfg = ConfigDict( | |||
type=self.cfg.task, mode=mode, datasets=datasets) | |||
@@ -585,8 +593,13 @@ class EpochBasedTrainer(BaseTrainer): | |||
subset_name=data_cfg.subset_name if hasattr( | |||
data_cfg, 'subset_name') else None, | |||
hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope, | |||
**data_cfg, | |||
) | |||
torch_dataset = dataset.to_torch_dataset(preprocessors=preprocessor) | |||
cfg = ConfigDict(type=self.cfg.model.type, mode=mode) | |||
torch_dataset = dataset.to_torch_dataset( | |||
task_data_config=cfg, | |||
task_name=self.cfg.task, | |||
preprocessors=self.preprocessor) | |||
dataset = self.to_task_dataset(torch_dataset, mode) | |||
return dataset | |||
@@ -30,8 +30,8 @@ MODELSCOPE_PATH = '/'.join(os.path.dirname(__file__).split('/')[:-1]) | |||
REGISTER_MODULE = 'register_module' | |||
IGNORED_PACKAGES = ['modelscope', '.'] | |||
SCAN_SUB_FOLDERS = [ | |||
'models', 'metrics', 'pipelines', 'preprocessors', 'task_datasets', | |||
'trainers' | |||
'models', 'metrics', 'pipelines', 'preprocessors', | |||
'msdatasets/task_datasets', 'trainers' | |||
] | |||
INDEXER_FILE = 'ast_indexer' | |||
DECORATOR_KEY = 'decorators' | |||
@@ -1,6 +1,5 @@ | |||
addict | |||
# version above 2.1.0 introduces a backward-compatibility issue which is being resolved
datasets==2.1.0 | |||
datasets | |||
easydict | |||
einops | |||
filelock>=3.3.0 | |||
@@ -4,6 +4,7 @@ from modelscope.models import Model | |||
from modelscope.msdatasets import MsDataset | |||
from modelscope.preprocessors import SequenceClassificationPreprocessor | |||
from modelscope.preprocessors.base import Preprocessor | |||
from modelscope.utils.constant import DownloadMode | |||
from modelscope.utils.test_utils import require_tf, require_torch, test_level | |||
@@ -30,6 +31,16 @@ class ImgPreprocessor(Preprocessor): | |||
class MsDatasetTest(unittest.TestCase): | |||
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level') | |||
def test_coco(self): | |||
ms_ds_train = MsDataset.load( | |||
'pets_small', | |||
namespace='modelscope', | |||
split='train', | |||
download_mode=DownloadMode.FORCE_REDOWNLOAD, | |||
classes=('1', '2')) | |||
print(ms_ds_train._hf_ds.config_kwargs) | |||
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
def test_ms_csv_basic(self): | |||
ms_ds_train = MsDataset.load( | |||
@@ -2,7 +2,7 @@ | |||
import unittest | |||
from modelscope.task_datasets.veco_dataset import VecoDataset | |||
from modelscope.msdatasets.task_datasets.veco_dataset import VecoDataset | |||
from modelscope.utils.test_utils import test_level | |||
@@ -8,10 +8,13 @@ from functools import partial | |||
from modelscope.hub.snapshot_download import snapshot_download | |||
from modelscope.metainfo import Trainers | |||
from modelscope.models.cv.image_instance_segmentation import ( | |||
CascadeMaskRCNNSwinModel, ImageInstanceSegmentationCocoDataset) | |||
from modelscope.models.cv.image_instance_segmentation import \ | |||
CascadeMaskRCNNSwinModel | |||
from modelscope.msdatasets import MsDataset | |||
from modelscope.msdatasets.task_datasets import \ | |||
ImageInstanceSegmentationCocoDataset | |||
from modelscope.trainers import build_trainer | |||
from modelscope.utils.config import Config | |||
from modelscope.utils.config import Config, ConfigDict | |||
from modelscope.utils.constant import ModelFile | |||
from modelscope.utils.test_utils import test_level | |||
@@ -27,34 +30,47 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase): | |||
config_path = os.path.join(cache_path, ModelFile.CONFIGURATION) | |||
cfg = Config.from_file(config_path) | |||
data_root = cfg.dataset.data_root | |||
classes = tuple(cfg.dataset.classes) | |||
max_epochs = cfg.train.max_epochs | |||
samples_per_gpu = cfg.train.dataloader.batch_size_per_gpu | |||
if data_root is None: | |||
try: | |||
train_data_cfg = cfg.dataset.train | |||
val_data_cfg = cfg.dataset.val | |||
except Exception: | |||
train_data_cfg = None | |||
val_data_cfg = None | |||
if train_data_cfg is None: | |||
# use default toy data | |||
dataset_path = os.path.join(cache_path, 'toydata.zip') | |||
with zipfile.ZipFile(dataset_path, 'r') as zipf: | |||
zipf.extractall(cache_path) | |||
data_root = cache_path + '/toydata/' | |||
classes = ('Cat', 'Dog') | |||
self.train_dataset = ImageInstanceSegmentationCocoDataset( | |||
data_root + 'annotations/instances_train.json', | |||
classes=classes, | |||
data_root=data_root, | |||
img_prefix=data_root + 'images/train/', | |||
seg_prefix=None, | |||
test_mode=False) | |||
self.eval_dataset = ImageInstanceSegmentationCocoDataset( | |||
data_root + 'annotations/instances_val.json', | |||
classes=classes, | |||
data_root=data_root, | |||
img_prefix=data_root + 'images/val/', | |||
seg_prefix=None, | |||
test_mode=True) | |||
train_data_cfg = ConfigDict( | |||
name='pets_small', | |||
split='train', | |||
classes=('Cat', 'Dog'), | |||
test_mode=False) | |||
if val_data_cfg is None: | |||
val_data_cfg = ConfigDict( | |||
name='pets_small', | |||
split='validation', | |||
classes=('Cat', 'Dog'), | |||
test_mode=True) | |||
self.train_dataset = MsDataset.load( | |||
dataset_name=train_data_cfg.name, | |||
split=train_data_cfg.split, | |||
classes=train_data_cfg.classes, | |||
test_mode=train_data_cfg.test_mode) | |||
assert self.train_dataset.config_kwargs[ | |||
'classes'] == train_data_cfg.classes | |||
assert next( | |||
iter(self.train_dataset.config_kwargs['split_config'].values())) | |||
self.eval_dataset = MsDataset.load( | |||
dataset_name=val_data_cfg.name, | |||
split=val_data_cfg.split, | |||
classes=val_data_cfg.classes, | |||
test_mode=val_data_cfg.test_mode) | |||
assert self.eval_dataset.config_kwargs[ | |||
'classes'] == val_data_cfg.classes | |||
assert next( | |||
iter(self.eval_dataset.config_kwargs['split_config'].values())) | |||
from mmcv.parallel import collate | |||