1. Add: MsDataset.delete(), supporting deletion of a dataset file or directory.
2. Add: upload mode, e.g. MsDataset.upload(xx, upload_mode=UploadMode.OVERWRITE) or MsDataset.upload(xx, upload_mode=UploadMode.APPEND).
   With UploadMode.APPEND, an object is skipped if it already exists on the remote; UploadMode.OVERWRITE (the default) uploads all local objects and may overwrite existing remote ones. A usage sketch follows below.
3. Add: reload the STS token automatically to avoid expiration (current expiration: 24h).
4. Fix: add cookies in api.py for downloading private datasets.
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10524449
master
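
A minimal usage sketch of the new interfaces, based on the signatures added in this diff; the dataset name, namespace and file paths below are placeholders:

    from modelscope.msdatasets import MsDataset
    from modelscope.utils.constant import UploadMode

    # Upload a local file; with UploadMode.APPEND, objects that already exist on the remote are skipped.
    MsDataset.upload(
        object_name='your-dataset-name.zip',
        local_file_path='/path/to/your-dataset-name.zip',
        dataset_name='your-dataset-name',
        namespace='your-namespace',
        upload_mode=UploadMode.APPEND)

    # Delete a single file, or a whole directory by ending the object name with '/'.
    MsDataset.delete(
        object_name='train/',
        dataset_name='your-dataset-name',
        namespace='your-namespace')

upload_mode defaults to UploadMode.OVERWRITE, which keeps the previous behaviour of uploading everything and possibly overwriting existing remote objects.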
@@ -506,13 +506,14 @@ class HubApi:
             shutil.rmtree(cache_dir)
         os.makedirs(cache_dir, exist_ok=True)
         datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
-        r = requests.get(datahub_url)
+        cookies = ModelScopeConfig.get_cookies()
+        r = requests.get(datahub_url, cookies=cookies)
         resp = r.json()
         datahub_raise_on_error(datahub_url, resp)
         dataset_id = resp['Data']['Id']
         dataset_type = resp['Data']['Type']
         datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}'
-        r = requests.get(datahub_url, headers=self.headers)
+        r = requests.get(datahub_url, cookies=cookies, headers=self.headers)
         resp = r.json()
         datahub_raise_on_error(datahub_url, resp)
         file_list = resp['Data']
@@ -531,7 +532,7 @@ class HubApi:
             if extension in dataset_meta_format:
                 datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
                               f'Revision={revision}&FilePath={file_path}'
-                r = requests.get(datahub_url)
+                r = requests.get(datahub_url, cookies=cookies)
                 raise_for_http_status(r)
                 local_path = os.path.join(cache_dir, file_path)
                 if os.path.exists(local_path):
@@ -576,9 +577,7 @@ class HubApi:
         datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
                       f'ststoken?Revision={revision}'
-        cookies = requests.utils.dict_from_cookiejar(cookies)
-        r = requests.get(
-            url=datahub_url, cookies=cookies, headers=self.headers)
+        r = requests.get(url=datahub_url, cookies=cookies, headers=self.headers)
         resp = r.json()
         raise_on_error(resp)
         return resp['Data']
@@ -589,9 +588,6 @@ class HubApi:
               f'MaxLimit={max_limit}&Revision={revision}&Recursive={is_recursive}&FilterDir={is_filter_dir}'
         cookies = ModelScopeConfig.get_cookies()
-        if cookies:
-            cookies = requests.utils.dict_from_cookiejar(cookies)
         resp = requests.get(url=url, cookies=cookies)
         resp = resp.json()
         raise_on_error(resp)
@@ -600,17 +596,48 @@ class HubApi:
     def on_dataset_download(self, dataset_name: str, namespace: str) -> None:
         url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase'
-        r = requests.post(url, headers=self.headers)
+        cookies = ModelScopeConfig.get_cookies()
+        r = requests.post(url, cookies=cookies, headers=self.headers)
         raise_for_http_status(r)

+    def delete_oss_dataset_object(self, object_name: str, dataset_name: str,
+                                  namespace: str, revision: str) -> str:
+        if not object_name or not dataset_name or not namespace or not revision:
+            raise ValueError('Args cannot be empty!')
+
+        url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/oss?Path={object_name}&Revision={revision}'
+
+        cookies = self.check_local_cookies(use_cookies=True)
+        resp = requests.delete(url=url, cookies=cookies)
+        resp = resp.json()
+        raise_on_error(resp)
+        resp = resp['Message']
+        return resp
+
+    def delete_oss_dataset_dir(self, object_name: str, dataset_name: str,
+                               namespace: str, revision: str) -> str:
+        if not object_name or not dataset_name or not namespace or not revision:
+            raise ValueError('Args cannot be empty!')
+
+        url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/oss/prefix?Prefix={object_name}/' \
+              f'&Revision={revision}'
+
+        cookies = self.check_local_cookies(use_cookies=True)
+        resp = requests.delete(url=url, cookies=cookies)
+        resp = resp.json()
+        raise_on_error(resp)
+        resp = resp['Message']
+        return resp
+
     @staticmethod
     def datahub_remote_call(url):
-        r = requests.get(url, headers={'user-agent': ModelScopeConfig.get_user_agent()})
+        cookies = ModelScopeConfig.get_cookies()
+        r = requests.get(url, cookies=cookies, headers={'user-agent': ModelScopeConfig.get_user_agent()})
         resp = r.json()
         datahub_raise_on_error(url, resp)
         return resp['Data']

-    def check_cookies_upload_data(self, use_cookies) -> CookieJar:
+    def check_local_cookies(self, use_cookies) -> CookieJar:
         return self._check_cookie(use_cookies=use_cookies)
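
For reference, a sketch of calling the new HubApi delete endpoints directly; MsDataset.delete() wraps them via DatasetDeleteManager. The token, names and revision below are placeholders:

    from modelscope.hub.api import HubApi

    api = HubApi()
    api.login('<your-modelscope-token>')  # placeholder token

    # Delete a single object from the dataset's OSS storage.
    msg = api.delete_oss_dataset_object(
        object_name='train/001/img_001.png',
        dataset_name='your-dataset-name',
        namespace='your-namespace',
        revision='master')

    # Delete a directory (prefix) of objects.
    msg = api.delete_oss_dataset_dir(
        object_name='train',
        dataset_name='your-dataset-name',
        namespace='your-namespace',
        revision='master')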
@@ -20,13 +20,15 @@ from modelscope.msdatasets.task_datasets.builder import build_task_dataset
 from modelscope.msdatasets.utils.dataset_builder import ExternalDataset
 from modelscope.msdatasets.utils.dataset_utils import (
     get_dataset_files, get_target_dataset_structure, load_dataset_builder)
+from modelscope.msdatasets.utils.delete_utils import DatasetDeleteManager
 from modelscope.msdatasets.utils.download_utils import DatasetDownloadManager
 from modelscope.msdatasets.utils.upload_utils import DatasetUploadManager
 from modelscope.utils.config import ConfigDict
 from modelscope.utils.config_ds import MS_DATASETS_CACHE
 from modelscope.utils.constant import (DEFAULT_DATASET_NAMESPACE,
                                        DEFAULT_DATASET_REVISION,
-                                       DatasetFormations, DownloadMode, Hubs)
+                                       DatasetFormations, DownloadMode, Hubs,
+                                       UploadMode)
 from modelscope.utils.logger import get_logger

 logger = get_logger()
@@ -576,15 +578,17 @@ class MsDataset:
         return self._hf_ds.rename_columns(column_mapping)

     @staticmethod
-    def upload(object_name: str,
-               local_file_path: str,
-               dataset_name: str,
-               namespace: Optional[str] = DEFAULT_DATASET_NAMESPACE,
-               version: Optional[str] = DEFAULT_DATASET_REVISION,
-               num_processes: Optional[int] = None,
-               chunksize: Optional[int] = 1,
-               filter_hidden_files: Optional[bool] = True) -> None:
-        """Upload dataset file or directory to the ModelScope Hub. Please login to the ModelScope Hub first.
+    def upload(
+            object_name: str,
+            local_file_path: str,
+            dataset_name: str,
+            namespace: Optional[str] = DEFAULT_DATASET_NAMESPACE,
+            version: Optional[str] = DEFAULT_DATASET_REVISION,
+            num_processes: Optional[int] = None,
+            chunksize: Optional[int] = 1,
+            filter_hidden_files: Optional[bool] = True,
+            upload_mode: Optional[UploadMode] = UploadMode.OVERWRITE) -> None:
+        """Upload dataset file or directory to the ModelScope Hub. Please log in to the ModelScope Hub first.

         Args:
             object_name (str): The object name on ModelScope, in the form of your-dataset-name.zip or your-dataset-name
@@ -592,7 +596,7 @@ class MsDataset:
             dataset_name (str): Name of the dataset
             namespace(str, optional): Namespace of the dataset
             version: Optional[str]: Version of the dataset
-            num_processes: Optional[int]: The number of processes used for multi-process uploading.
+            num_processes: Optional[int]: The number of processes used for multiprocess uploading.
                 This is only applicable when local_file_path is a directory, and we are uploading mutliple-files
                 insided the directory. When None provided, the number returned by os.cpu_count() is used as default.
             chunksize: Optional[int]: The chunksize of objects to upload.
@@ -600,24 +604,34 @@ class MsDataset:
                 using the default value of 1. Available if local_file_path is a directory.
             filter_hidden_files: Optional[bool]: Whether to filter hidden files.
                 Available if local_file_path is a directory.
+            upload_mode: Optional[UploadMode]: How to upload objects from local. Default: UploadMode.OVERWRITE, upload
+                all objects from local, existing remote objects may be overwritten.

         Returns:
             None

         """
+        if not object_name:
+            raise ValueError('object_name cannot be empty!')
+
         _upload_manager = DatasetUploadManager(
             dataset_name=dataset_name, namespace=namespace, version=version)

+        upload_mode = UploadMode(upload_mode or UploadMode.OVERWRITE)
+
         if os.path.isfile(local_file_path):
             _upload_manager.upload(
-                object_name=object_name, local_file_path=local_file_path)
+                object_name=object_name,
+                local_file_path=local_file_path,
+                upload_mode=upload_mode)
         elif os.path.isdir(local_file_path):
             _upload_manager.upload_dir(
                 object_dir_name=object_name,
                 local_dir_path=local_file_path,
                 num_processes=num_processes,
                 chunksize=chunksize,
-                filter_hidden_files=filter_hidden_files)
+                filter_hidden_files=filter_hidden_files,
+                upload_mode=upload_mode)
         else:
             raise ValueError(
                 f'{local_file_path} is not a valid file path or directory')
@@ -672,7 +686,7 @@ class MsDataset:
                 revision of the model you want to clone from. Can be any of a branch, tag or commit hash
             auth_token(`Optional[str]`):
                 token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter
-                as the token is already saved when you login the first time, if None, we will use saved token.
+                as the token is already saved when you log in the first time, if None, we will use saved token.
             git_path:(`Optional[str]`):
                 The git command line path, if None, we use 'git'
             force (Optional[bool]): whether to use forced-push.
@@ -687,8 +701,29 @@ class MsDataset:
             revision=revision,
             auth_token=auth_token,
             git_path=git_path)
-        _repo.push(
-            commit_message=commit_message,
-            local_branch=revision,
-            remote_branch=revision,
-            force=force)
+        _repo.push(commit_message=commit_message, branch=revision, force=force)
+
+    @staticmethod
+    def delete(object_name: str,
+               dataset_name: str,
+               namespace: Optional[str] = DEFAULT_DATASET_NAMESPACE,
+               version: Optional[str] = DEFAULT_DATASET_REVISION) -> str:
+        """ Delete object of dataset. Please log in first and make sure you have permission to manage the dataset.
+
+        Args:
+            object_name (str): The object name of dataset to be deleted. Could be a name of file or directory. If it's
+                directory, then ends with `/`.
+                For example: your-data-name.zip, train/001/img_001.png, train/, ...
+            dataset_name (str): Path or name of the dataset.
+            namespace(str, optional): Namespace of the dataset.
+            version (str, optional): Version of the dataset.
+
+        Returns:
+            res_msg (str): Response message.
+        """
+        _delete_manager = DatasetDeleteManager(
+            dataset_name=dataset_name, namespace=namespace, version=version)
+        resp_msg = _delete_manager.delete(object_name=object_name)
+        logger.info(f'Object {object_name} successfully removed!')
+        return resp_msg
@@ -82,7 +82,7 @@ def list_dataset_objects(hub_api: HubApi, max_limit: int, is_recursive: bool,
                          dataset_name: str, namespace: str,
                          version: str) -> list:
     """
-    List all of objects for specific dataset.
+    List all objects for specific dataset.

     Args:
         hub_api (class HubApi): HubApi instance.
@@ -0,0 +1,32 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from modelscope.hub.api import HubApi
+
+
+class DatasetDeleteManager(object):
+
+    def __init__(self, dataset_name: str, namespace: str, version: str):
+        self.api = HubApi()
+        self.dataset_name = dataset_name
+        self.namespace = namespace
+        self.version = version
+
+    def delete(self, object_name: str) -> str:
+        # single object
+        if not object_name.endswith('/'):
+            resp_msg = self.api.delete_oss_dataset_object(
+                object_name=object_name,
+                dataset_name=self.dataset_name,
+                namespace=self.namespace,
+                revision=self.version)
+        else:
+            # multiple objects
+            object_name = object_name.strip('/')
+            resp_msg = self.api.delete_oss_dataset_dir(
+                object_name=object_name,
+                dataset_name=self.dataset_name,
+                namespace=self.namespace,
+                revision=self.version)
+
+        return resp_msg
@@ -27,7 +27,11 @@ class DatasetDownloadManager(DownloadManager):
         oss_config = api.get_dataset_access_config(self._dataset_name,
                                                    self._namespace,
                                                    self._version)
-        self.oss_utilities = OssUtilities(oss_config)
+        self.oss_utilities = OssUtilities(
+            oss_config=oss_config,
+            dataset_name=self._dataset_name,
+            namespace=self._namespace,
+            revision=self._version)

     def _download(self, url_or_filename: str,
                   download_config: DownloadConfig) -> str:
@@ -6,19 +6,28 @@ import os
 import oss2
 from datasets.utils.file_utils import hash_url_to_filename

+from modelscope.hub.api import HubApi
+from modelscope.utils.constant import UploadMode
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+ACCESS_ID = 'AccessId'
+ACCESS_SECRET = 'AccessSecret'
+SECURITY_TOKEN = 'SecurityToken'
+BUCKET = 'Bucket'
+BACK_DIR = 'BackupDir'
+DIR = 'Dir'
+

 class OssUtilities:

-    def __init__(self, oss_config):
-        self.key = oss_config['AccessId']
-        self.secret = oss_config['AccessSecret']
-        self.token = oss_config['SecurityToken']
-        self.endpoint = f"https://{oss_config['Region']}.aliyuncs.com"
-        self.bucket_name = oss_config['Bucket']
-        auth = oss2.StsAuth(self.key, self.secret, self.token)
-        self.bucket = oss2.Bucket(auth, self.endpoint, self.bucket_name)
-        self.oss_dir = oss_config['Dir']
-        self.oss_backup_dir = oss_config['BackupDir']
+    def __init__(self, oss_config, dataset_name, namespace, revision):
+        self._do_init(oss_config=oss_config)
+        self.dataset_name = dataset_name
+        self.namespace = namespace
+        self.revision = revision
         self.upload_resumable_tmp_store = '/tmp/modelscope/tmp_dataset'
         self.upload_multipart_threshold = 50 * 1024 * 1024
@@ -26,6 +35,28 @@ class OssUtilities:
         self.upload_num_threads = 4
         self.upload_max_retries = 3
+        self.api = HubApi()
+
+    def _do_init(self, oss_config):
+        self.key = oss_config[ACCESS_ID]
+        self.secret = oss_config[ACCESS_SECRET]
+        self.token = oss_config[SECURITY_TOKEN]
+        self.endpoint = f"https://{oss_config['Region']}.aliyuncs.com"
+        self.bucket_name = oss_config[BUCKET]
+        auth = oss2.StsAuth(self.key, self.secret, self.token)
+        self.bucket = oss2.Bucket(auth, self.endpoint, self.bucket_name)
+        self.oss_dir = oss_config[DIR]
+        self.oss_backup_dir = oss_config[BACK_DIR]
+
+    def _reload_sts(self):
+        cookies = self.api.check_local_cookies(use_cookies=True)
+        oss_config_refresh = self.api.get_dataset_access_config_session(
+            cookies=cookies,
+            dataset_name=self.dataset_name,
+            namespace=self.namespace,
+            revision=self.revision)
+        self._do_init(oss_config_refresh)

     @staticmethod
     def _percentage(consumed_bytes, total_bytes):
         if total_bytes:
@@ -51,7 +82,8 @@ class OssUtilities:
         return local_path

     def upload(self, oss_object_name: str, local_file_path: str,
-               indicate_individual_progress: bool) -> str:
+               indicate_individual_progress: bool,
+               upload_mode: UploadMode) -> str:
         retry_count = 0
         object_key = os.path.join(self.oss_dir, oss_object_name)
         resumable_store = oss2.ResumableStore(
@@ -64,6 +96,13 @@ class OssUtilities:
         while True:
             try:
                 retry_count += 1
+
+                exist = self.bucket.object_exists(object_key)
+                if upload_mode == UploadMode.APPEND and exist:
+                    logger.info(
+                        f'Skip {oss_object_name} in case of {upload_mode.value} mode.'
+                    )
+                    break
                 oss2.resumable_upload(
                     self.bucket,
                     object_key,
@@ -74,7 +113,9 @@ class OssUtilities:
                     progress_callback=progress_callback,
                     num_threads=self.upload_num_threads)
                 break
-            except Exception:
+            except Exception as e:
+                if e.__getattribute__('status') == 403:
+                    self._reload_sts()
                 if retry_count >= self.upload_max_retries:
                     raise
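
The automatic STS reload added above follows a refresh-and-retry pattern: a 403 from OSS is treated as an expired token, credentials are re-fetched via get_dataset_access_config_session, and the resumable upload is retried. A simplified standalone sketch of that pattern (the function names here are illustrative, not the library's API):

    # Hypothetical illustration of the refresh-and-retry loop used in OssUtilities.upload.
    def upload_with_refresh(do_upload, refresh_credentials, max_retries=3):
        retry_count = 0
        while True:
            try:
                retry_count += 1
                do_upload()
                break
            except Exception as e:
                # A 403 from OSS typically means the STS token has expired.
                if getattr(e, 'status', None) == 403:
                    refresh_credentials()
                if retry_count >= max_retries:
                    raise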
@@ -5,6 +5,7 @@ from multiprocessing.dummy import Pool as ThreadPool
 from tqdm import tqdm

+from modelscope.utils.constant import UploadMode
 from .oss_utils import OssUtilities
@@ -13,38 +14,45 @@ class DatasetUploadManager(object):
     def __init__(self, dataset_name: str, namespace: str, version: str):
         from modelscope.hub.api import HubApi
         _hub_api = HubApi()
-        _cookies = _hub_api.check_cookies_upload_data(use_cookies=True)
+        _cookies = _hub_api.check_local_cookies(use_cookies=True)
         _oss_config = _hub_api.get_dataset_access_config_session(
             cookies=_cookies,
             dataset_name=dataset_name,
             namespace=namespace,
             revision=version)
-        self.oss_utilities = OssUtilities(_oss_config)
+        self.oss_utilities = OssUtilities(
+            oss_config=_oss_config,
+            dataset_name=dataset_name,
+            namespace=namespace,
+            revision=version)

-    def upload(self, object_name: str, local_file_path: str) -> str:
+    def upload(self, object_name: str, local_file_path: str,
+               upload_mode: UploadMode) -> str:
         object_key = self.oss_utilities.upload(
             oss_object_name=object_name,
             local_file_path=local_file_path,
-            indicate_individual_progress=True)
+            indicate_individual_progress=True,
+            upload_mode=upload_mode)
         return object_key

     def upload_dir(self, object_dir_name: str, local_dir_path: str,
                    num_processes: int, chunksize: int,
-                   filter_hidden_files: bool) -> int:
+                   filter_hidden_files: bool, upload_mode: UploadMode) -> int:

         def run_upload(args):
             self.oss_utilities.upload(
                 oss_object_name=args[0],
                 local_file_path=args[1],
-                indicate_individual_progress=False)
+                indicate_individual_progress=False,
+                upload_mode=upload_mode)

         files_list = []
         for root, dirs, files in os.walk(local_dir_path):
             for file_name in files:
                 if filter_hidden_files and file_name.startswith('.'):
                     continue
-                # Concatenate directory name and relative path into a oss object key. e.g., train/001/1_1230.png
+                # Concatenate directory name and relative path into oss object key. e.g., train/001/1_1230.png
                 object_name = os.path.join(
                     object_dir_name,
                     root.replace(local_dir_path, '', 1).strip('/'), file_name)
@@ -238,6 +238,15 @@ class DownloadMode(enum.Enum):
     FORCE_REDOWNLOAD = 'force_redownload'


+class UploadMode(enum.Enum):
+    """ How to upload object to remote.
+    """
+    # Upload all objects from local, existing remote objects may be overwritten. (Default)
+    OVERWRITE = 'overwrite'
+    # Upload local objects in append mode, skipping all existing remote objects.
+    APPEND = 'append'
+
+
 class DatasetFormations(enum.Enum):
     """ How a dataset is organized and interpreted
     """
@@ -0,0 +1,112 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+import shutil
+import tempfile
+import unittest
+import zipfile
+
+from modelscope.msdatasets import MsDataset
+from modelscope.utils import logger as logging
+from modelscope.utils.test_utils import test_level
+
+logger = logging.get_logger(__name__)
+
+KEY_EXTRACTED = 'extracted'
+EXPECTED_MSG = 'success'
+
+
+class DatasetDeleteTest(unittest.TestCase):
+
+    def setUp(self):
+        self.old_dir = os.getcwd()
+        self.dataset_name = 'small_coco_for_test'
+        self.dataset_file_name = self.dataset_name
+        self.prepared_dataset_name = 'pets_small'
+        self.token = os.getenv('TEST_UPLOAD_MS_TOKEN')
+        error_msg = 'The modelscope token can not be empty, please set env variable: TEST_UPLOAD_MS_TOKEN'
+        self.assertIsNotNone(self.token, msg=error_msg)
+
+        from modelscope.hub.api import HubApi
+        from modelscope.hub.api import ModelScopeConfig
+        self.api = HubApi()
+        self.api.login(self.token)
+
+        # get user info
+        self.namespace, _ = ModelScopeConfig.get_user_info()
+
+        self.temp_dir = tempfile.mkdtemp()
+        self.test_work_dir = os.path.join(self.temp_dir, self.dataset_name)
+        if not os.path.exists(self.test_work_dir):
+            os.makedirs(self.test_work_dir)
+
+    def tearDown(self):
+        os.chdir(self.old_dir)
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+        logger.info(
+            f'Temporary directory {self.temp_dir} successfully removed!')
+
+    @staticmethod
+    def get_raw_downloaded_file_path(extracted_path):
+        raw_downloaded_file_path = ''
+        raw_data_dir = os.path.abspath(
+            os.path.join(extracted_path, '../../..'))
+        for root, dirs, files in os.walk(raw_data_dir):
+            if KEY_EXTRACTED in dirs:
+                for file in files:
+                    curr_file_path = os.path.join(root, file)
+                    if zipfile.is_zipfile(curr_file_path):
+                        raw_downloaded_file_path = curr_file_path
+        return raw_downloaded_file_path
+
+    def upload_test_file(self):
+        # Get the prepared data from hub, using default modelscope namespace
+        ms_ds_train = MsDataset.load(self.prepared_dataset_name, split='train')
+        config_res = ms_ds_train._hf_ds.config_kwargs
+        extracted_path = config_res.get('split_config').get('train')
+        raw_zipfile_path = self.get_raw_downloaded_file_path(extracted_path)
+
+        object_name = self.dataset_file_name + '_for_del.zip'
+        MsDataset.upload(
+            object_name=object_name,
+            local_file_path=raw_zipfile_path,
+            dataset_name=self.dataset_name,
+            namespace=self.namespace)
+
+        return object_name
+
+    def upload_test_dir(self):
+        ms_ds_train = MsDataset.load(self.prepared_dataset_name, split='train')
+        config_train = ms_ds_train._hf_ds.config_kwargs
+        extracted_path_train = config_train.get('split_config').get('train')
+
+        object_name = 'train_for_del'
+        MsDataset.upload(
+            object_name=object_name,
+            local_file_path=os.path.join(extracted_path_train,
+                                         'Pets/images/train'),
+            dataset_name=self.dataset_name,
+            namespace=self.namespace)
+
+        return object_name + '/'
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_ds_delete_object(self):
+        # upload prepared data
+        file_name = self.upload_test_file()
+        dir_name = self.upload_test_dir()
+
+        # delete object
+        del_file_msg = MsDataset.delete(
+            object_name=file_name,
+            dataset_name=self.dataset_name,
+            namespace=self.namespace)
+        del_dir_msg = MsDataset.delete(
+            object_name=dir_name,
+            dataset_name=self.dataset_name,
+            namespace=self.namespace)
+
+        assert all([del_file_msg == EXPECTED_MSG, del_dir_msg == EXPECTED_MSG])
+
+
+if __name__ == '__main__':
+    unittest.main()