# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import shutil
import tempfile
import unittest
import zipfile

from modelscope.msdatasets import MsDataset
from modelscope.msdatasets.utils.dataset_utils import list_dataset_objects
from modelscope.utils import logger as logging
from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DownloadMode,
                                       ModelFile)
from modelscope.utils.test_utils import test_level

logger = logging.get_logger(__name__)

KEY_EXTRACTED = 'extracted'


class DatasetUploadTest(unittest.TestCase):

    def setUp(self):
        self.old_dir = os.getcwd()
        self.dataset_name = 'small_coco_for_test'
        self.dataset_file_name = self.dataset_name
        self.prepared_dataset_name = 'pets_small'
        self.token = os.getenv('TEST_UPLOAD_MS_TOKEN')
        error_msg = 'The ModelScope token cannot be empty, please set the env variable: TEST_UPLOAD_MS_TOKEN'
        self.assertIsNotNone(self.token, msg=error_msg)
        from modelscope.hub.api import HubApi, ModelScopeConfig
        self.api = HubApi()
        self.api.login(self.token)

        # Get the user info of the logged-in account.
        self.namespace, _ = ModelScopeConfig.get_user_info()

        self.temp_dir = tempfile.mkdtemp()
        self.test_work_dir = os.path.join(self.temp_dir, self.dataset_name)
        self.test_meta_dir = os.path.join(self.test_work_dir, 'meta')
        if not os.path.exists(self.test_work_dir):
            os.makedirs(self.test_work_dir)

    def tearDown(self):
        os.chdir(self.old_dir)
        shutil.rmtree(self.temp_dir, ignore_errors=True)
        logger.info(
            f'Temporary directory {self.temp_dir} successfully removed!')

    @staticmethod
    def get_raw_downloaded_file_path(extracted_path):
        """Return the raw downloaded zip file that sits next to the 'extracted' cache dir."""
        raw_downloaded_file_path = ''
        raw_data_dir = os.path.abspath(
            os.path.join(extracted_path, '../../..'))
        for root, dirs, files in os.walk(raw_data_dir):
            if KEY_EXTRACTED in dirs:
                for file in files:
                    curr_file_path = os.path.join(root, file)
                    if zipfile.is_zipfile(curr_file_path):
                        raw_downloaded_file_path = curr_file_path
        return raw_downloaded_file_path

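    # Upload a single zip file: the raw archive downloaded for the prepared dataset.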
    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_ds_upload(self):
        # Get the prepared data from the hub, using the default modelscope namespace.
        ms_ds_train = MsDataset.load(self.prepared_dataset_name, split='train')
        config_res = ms_ds_train._hf_ds.config_kwargs
        extracted_path = config_res.get('split_config').get('train')
        raw_zipfile_path = self.get_raw_downloaded_file_path(extracted_path)

        MsDataset.upload(
            object_name=self.dataset_file_name + '.zip',
            local_file_path=raw_zipfile_path,
            dataset_name=self.dataset_name,
            namespace=self.namespace)

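    # Upload the extracted train/val image directories as separate objects,
    # then list the objects that now exist in the dataset repo.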
    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_ds_upload_dir(self):
        ms_ds_train = MsDataset.load(self.prepared_dataset_name, split='train')
        config_train = ms_ds_train._hf_ds.config_kwargs
        extracted_path_train = config_train.get('split_config').get('train')

        MsDataset.upload(
            object_name='train',
            local_file_path=os.path.join(extracted_path_train,
                                         'Pets/images/train'),
            dataset_name=self.dataset_name,
            namespace=self.namespace)
        MsDataset.upload(
            object_name='val',
            local_file_path=os.path.join(extracted_path_train,
                                         'Pets/images/val'),
            dataset_name=self.dataset_name,
            namespace=self.namespace)

        objects = list_dataset_objects(
            hub_api=self.api,
            max_limit=-1,
            is_recursive=True,
            dataset_name=self.dataset_name,
            namespace=self.namespace,
            version=DEFAULT_DATASET_REVISION)

        logger.info(f'{len(objects)} objects have been uploaded: {objects}')

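    # Force a re-download of the uploaded dataset and check that its split config is present.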
    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_ds_download_dir(self):
        test_ds = MsDataset.load(
            self.dataset_name,
            namespace=self.namespace,
            download_mode=DownloadMode.FORCE_REDOWNLOAD)
        assert test_ds.config_kwargs['split_config'].values()

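    # Clone the dataset's meta repository into the temporary meta directory.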
    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_ds_clone_meta(self):
        MsDataset.clone_meta(
            dataset_work_dir=self.test_meta_dir,
            dataset_id=os.path.join(self.namespace, self.dataset_name))

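    # Clone the meta repo, append a line to the README, then upload the modified meta.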
    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_ds_upload_meta(self):
        # Clone the dataset meta repo first.
        MsDataset.clone_meta(
            dataset_work_dir=self.test_meta_dir,
            dataset_id=os.path.join(self.namespace, self.dataset_name))

        with open(os.path.join(self.test_meta_dir, ModelFile.README),
                  'a') as f:
            f.write('\nThis is a line for unit test.')

        MsDataset.upload_meta(
            dataset_work_dir=self.test_meta_dir,
            commit_message='Update for unit test.')


if __name__ == '__main__':
    unittest.main()