From 2b64cf2bb6f22f55f99abf9b700fa05a4844cdbf Mon Sep 17 00:00:00 2001
From: "feiwu.yfw"
Date: Tue, 30 Aug 2022 15:15:15 +0800
Subject: [PATCH] [to #42322933] Support reading parameters from the dataset
 json file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Support reading args from the dataset json file
---
 modelscope/msdatasets/ms_dataset.py           |  6 ++--
 ...mage_instance_segmentation_coco_dataset.py |  8 ++---
 modelscope/msdatasets/utils/dataset_utils.py  | 16 ++++++----
 tests/msdatasets/test_ms_dataset.py           |  5 ++--
 tests/trainers/test_finetune_mplug.py         |  1 -
 ...est_image_instance_segmentation_trainer.py | 30 ++++++-------------
 6 files changed, 29 insertions(+), 37 deletions(-)

diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py
index 454044a4..b5527734 100644
--- a/modelscope/msdatasets/ms_dataset.py
+++ b/modelscope/msdatasets/ms_dataset.py
@@ -248,15 +248,15 @@ class MsDataset:
                     break
             target_subset_name, target_dataset_structure = get_target_dataset_structure(
                 dataset_json, subset_name, split)
-            meta_map, file_map = get_dataset_files(target_dataset_structure,
-                                                   dataset_name, namespace,
-                                                   version)
+            meta_map, file_map, args_map = get_dataset_files(
+                target_dataset_structure, dataset_name, namespace, version)
             builder = load_dataset_builder(
                 dataset_name,
                 subset_name,
                 namespace,
                 meta_data_files=meta_map,
                 zip_data_files=file_map,
+                args_map=args_map,
                 cache_dir=MS_DATASETS_CACHE,
                 version=version,
                 split=list(target_dataset_structure.keys()),
diff --git a/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py b/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py
index a001fe36..10cf7bfb 100644
--- a/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py
+++ b/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py
@@ -60,6 +60,8 @@ class ImageInstanceSegmentationCocoDataset(TorchTaskDataset):
                  classes=None,
                  seg_prefix=None,
                  folder_name=None,
+                 ann_file=None,
+                 img_prefix=None,
                  test_mode=False,
                  filter_empty_gt=True,
                  **kwargs):
@@ -69,11 +71,9 @@ class ImageInstanceSegmentationCocoDataset(TorchTaskDataset):
         self.split = next(iter(split_config.keys()))
         self.preprocessor = preprocessor
 
-        self.ann_file = osp.join(self.data_root,
-                                 DATASET_STRUCTURE[self.split]['annotation'])
+        self.ann_file = osp.join(self.data_root, ann_file)
 
-        self.img_prefix = osp.join(self.data_root,
-                                   DATASET_STRUCTURE[self.split]['images'])
+        self.img_prefix = osp.join(self.data_root, img_prefix)
         self.seg_prefix = seg_prefix
         self.test_mode = test_mode
         self.filter_empty_gt = filter_empty_gt
diff --git a/modelscope/msdatasets/utils/dataset_utils.py b/modelscope/msdatasets/utils/dataset_utils.py
index 08a6de84..769bed93 100644
--- a/modelscope/msdatasets/utils/dataset_utils.py
+++ b/modelscope/msdatasets/utils/dataset_utils.py
@@ -1,6 +1,6 @@
 import os
 from collections import defaultdict
-from typing import Mapping, Optional, Sequence, Union
+from typing import Any, Mapping, Optional, Sequence, Union
 
 from datasets.builder import DatasetBuilder
 
@@ -92,6 +92,7 @@ def get_dataset_files(subset_split_into: dict,
     """
     meta_map = defaultdict(dict)
     file_map = defaultdict(dict)
+    args_map = defaultdict(dict)
     from modelscope.hub.api import HubApi
     modelscope_api = HubApi()
     for split, info in subset_split_into.items():
@@ -99,7 +100,8 @@
             info.get('meta', ''), dataset_name, namespace, revision)
         if info.get('file'):
             file_map[split] = info['file']
-    return meta_map, file_map
+        args_map[split] = info.get('args')
+    return meta_map, file_map, args_map
 
 
 def load_dataset_builder(dataset_name: str, subset_name: str, namespace: str,
@@ -107,12 +109,16 @@
                                                              Sequence[str]]],
                          zip_data_files: Mapping[str, Union[str,
                                                             Sequence[str]]],
-                         cache_dir: str, version: Optional[Union[str]],
-                         split: Sequence[str],
+                         args_map: Mapping[str, Any], cache_dir: str,
+                         version: Optional[Union[str]], split: Sequence[str],
                          **config_kwargs) -> DatasetBuilder:
     sub_dir = os.path.join(version, '_'.join(split))
     meta_data_file = next(iter(meta_data_files.values()))
     if not meta_data_file:
+        args_map = next(iter(args_map.values()))
+        if args_map is None:
+            args_map = {}
+        args_map.update(config_kwargs)
         builder_instance = TaskSpecificDatasetBuilder(
             dataset_name=dataset_name,
             namespace=namespace,
@@ -121,7 +127,7 @@
             meta_data_files=meta_data_files,
             zip_data_files=zip_data_files,
             hash=sub_dir,
-            **config_kwargs)
+            **args_map)
     elif meta_data_file.endswith('.csv'):
         builder_instance = MsCsvDatasetBuilder(
             dataset_name=dataset_name,
diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py
index 1d62d2d1..ed07def7 100644
--- a/tests/msdatasets/test_ms_dataset.py
+++ b/tests/msdatasets/test_ms_dataset.py
@@ -36,9 +36,8 @@ class MsDatasetTest(unittest.TestCase):
         ms_ds_train = MsDataset.load(
             'pets_small',
             namespace=DEFAULT_DATASET_NAMESPACE,
-            split='train',
-            classes=('1', '2'),
-            folder_name='Pets')
+            download_mode=DownloadMode.FORCE_REDOWNLOAD,
+            split='train')
         print(ms_ds_train.config_kwargs)
         assert next(iter(ms_ds_train.config_kwargs['split_config'].values()))
 
diff --git a/tests/trainers/test_finetune_mplug.py b/tests/trainers/test_finetune_mplug.py
index 1298f1cd..351600c6 100644
--- a/tests/trainers/test_finetune_mplug.py
+++ b/tests/trainers/test_finetune_mplug.py
@@ -20,7 +20,6 @@ class TestFinetuneMPlug(unittest.TestCase):
         self.tmp_dir = tempfile.TemporaryDirectory().name
         if not os.path.exists(self.tmp_dir):
             os.makedirs(self.tmp_dir)
-        from modelscope.utils.constant import DownloadMode
 
         datadict = MsDataset.load(
             'coco_captions_small_slice',
diff --git a/tests/trainers/test_image_instance_segmentation_trainer.py b/tests/trainers/test_image_instance_segmentation_trainer.py
index 774f8fa8..03f7eea3 100644
--- a/tests/trainers/test_image_instance_segmentation_trainer.py
+++ b/tests/trainers/test_image_instance_segmentation_trainer.py
@@ -15,7 +15,7 @@ from modelscope.msdatasets.task_datasets import \
     ImageInstanceSegmentationCocoDataset
 from modelscope.trainers import build_trainer
 from modelscope.utils.config import Config, ConfigDict
-from modelscope.utils.constant import ModelFile
+from modelscope.utils.constant import DownloadMode, ModelFile
 from modelscope.utils.test_utils import test_level
 
 
@@ -41,38 +41,26 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase):
         if train_data_cfg is None:
             # use default toy data
             train_data_cfg = ConfigDict(
-                name='pets_small',
-                split='train',
-                classes=('Cat', 'Dog'),
-                folder_name='Pets',
-                test_mode=False)
+                name='pets_small', split='train', test_mode=False)
 
         if val_data_cfg is None:
             val_data_cfg = ConfigDict(
-                name='pets_small',
-                split='validation',
-                classes=('Cat', 'Dog'),
-                folder_name='Pets',
-                test_mode=True)
+                name='pets_small', split='validation', test_mode=True)
 
         self.train_dataset = MsDataset.load(
             dataset_name=train_data_cfg.name,
             split=train_data_cfg.split,
-            classes=train_data_cfg.classes,
-            folder_name=train_data_cfg.folder_name,
-            test_mode=train_data_cfg.test_mode)
-        assert self.train_dataset.config_kwargs[
-            'classes'] == train_data_cfg.classes
+            test_mode=train_data_cfg.test_mode,
+            download_mode=DownloadMode.FORCE_REDOWNLOAD)
+        assert self.train_dataset.config_kwargs['classes']
         assert next(
             iter(self.train_dataset.config_kwargs['split_config'].values()))
 
         self.eval_dataset = MsDataset.load(
             dataset_name=val_data_cfg.name,
             split=val_data_cfg.split,
-            classes=val_data_cfg.classes,
-            folder_name=val_data_cfg.folder_name,
-            test_mode=val_data_cfg.test_mode)
-        assert self.eval_dataset.config_kwargs[
-            'classes'] == val_data_cfg.classes
+            test_mode=val_data_cfg.test_mode,
+            download_mode=DownloadMode.FORCE_REDOWNLOAD)
+        assert self.eval_dataset.config_kwargs['classes']
         assert next(
            iter(self.eval_dataset.config_kwargs['split_config'].values()))
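
As a sketch of the flow this patch introduces: a dataset's json description
can now carry an 'args' mapping next to 'file' for each split, and
get_dataset_files() forwards it so load_dataset_builder() can expand it into
builder kwargs when no meta file exists. The json layout and values below are
hypothetical, and the helper is a stripped-down stand-in for the patched
dataset_utils.py (the real implementation resolves meta files through HubApi):

    from collections import defaultdict

    # Hypothetical per-split dataset description; field names ('file',
    # 'args') follow this diff, the concrete paths and classes do not
    # come from a real ModelScope dataset.
    dataset_json = {
        'train': {
            'file': 'Pets/train.zip',
            'args': {
                'classes': ('Cat', 'Dog'),
                'ann_file': 'Pets/annotations/instances_train.json',
                'img_prefix': 'Pets/images/train',
            },
        },
    }

    def get_dataset_files(subset_split_into):
        # Mirrors the patched get_dataset_files(), minus the HubApi calls
        # that resolve meta-file URLs: 'args' is now collected per split.
        meta_map, file_map, args_map = (defaultdict(dict) for _ in range(3))
        for split, info in subset_split_into.items():
            meta_map[split] = info.get('meta', '')
            if info.get('file'):
                file_map[split] = info['file']
            args_map[split] = info.get('args')
        return meta_map, file_map, args_map

    meta_map, file_map, args_map = get_dataset_files(dataset_json)
    # With no meta file, load_dataset_builder() takes the args of the first
    # split, merges any extra **config_kwargs on top, and passes the result
    # to TaskSpecificDatasetBuilder as **args_map.
    builder_kwargs = next(iter(args_map.values())) or {}
    print(builder_kwargs['classes'])  # ('Cat', 'Dog')

This is also why the tests above stop passing classes/folder_name to
MsDataset.load() and only assert that config_kwargs['classes'] is populated:
those values now come from the dataset's own json file rather than from the
caller.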