@@ -26,18 +26,15 @@ from modelscope.utils.logger import get_logger
 from .errors import (InvalidParameter, NotExistError, RequestError,
                      datahub_raise_on_error, handle_http_post_error,
                      handle_http_response, is_ok, raise_on_error)
-from .utils.utils import (get_dataset_hub_endpoint, get_endpoint,
-                          model_id_to_group_owner_name)
+from .utils.utils import get_endpoint, model_id_to_group_owner_name

 logger = get_logger()


 class HubApi:

-    def __init__(self, endpoint=None, dataset_endpoint=None):
+    def __init__(self, endpoint=None):
         self.endpoint = endpoint if endpoint is not None else get_endpoint()
-        self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else get_dataset_hub_endpoint(
-        )

     def login(
             self,
@@ -288,7 +285,7 @@ class HubApi:
         return files

     def list_datasets(self):
-        path = f'{self.dataset_endpoint}/api/v1/datasets'
+        path = f'{self.endpoint}/api/v1/datasets'
        headers = None
         params = {}
         r = requests.get(path, params=params, headers=headers)
@@ -315,13 +312,13 @@
                 cache_dir):
             shutil.rmtree(cache_dir)
         os.makedirs(cache_dir, exist_ok=True)
-        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
+        datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
         r = requests.get(datahub_url)
         resp = r.json()
         datahub_raise_on_error(datahub_url, resp)
         dataset_id = resp['Data']['Id']
         dataset_type = resp['Data']['Type']
-        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}'
+        datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}'
         r = requests.get(datahub_url)
         resp = r.json()
         datahub_raise_on_error(datahub_url, resp)
@@ -339,7 +336,7 @@
             file_path = file_info['Path']
             extension = os.path.splitext(file_path)[-1]
             if extension in dataset_meta_format:
-                datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
+                datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
                               f'Revision={revision}&FilePath={file_path}'
                 r = requests.get(datahub_url)
                 r.raise_for_status()
@@ -363,7 +360,7 @@
                              namespace: str,
                              revision: Optional[str] = DEFAULT_DATASET_REVISION):
         if file_name.endswith('.csv'):
-            file_name = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
+            file_name = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
                         f'Revision={revision}&FilePath={file_name}'
         return file_name
@@ -372,7 +369,7 @@
                                  dataset_name: str,
                                  namespace: str,
                                  revision: Optional[str] = DEFAULT_DATASET_REVISION):
-        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
+        datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
                       f'ststoken?Revision={revision}'
         return self.datahub_remote_call(datahub_url)
@@ -383,7 +380,7 @@
                                          namespace: str,
                                          revision: Optional[str] = DEFAULT_DATASET_REVISION):
-        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
+        datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
                       f'ststoken?Revision={revision}'
         cookies = requests.utils.dict_from_cookiejar(cookies)

@@ -392,6 +389,19 @@
         raise_on_error(resp)
         return resp['Data']

+    def list_oss_dataset_objects(self, dataset_name, namespace, max_limit,
+                                 is_recursive, is_filter_dir, revision,
+                                 cookies):
+        url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/oss/tree/?' \
+              f'MaxLimit={max_limit}&Revision={revision}&Recursive={is_recursive}&FilterDir={is_filter_dir}'
+        cookies = requests.utils.dict_from_cookiejar(cookies)
+        resp = requests.get(url=url, cookies=cookies)
+        resp = resp.json()
+        raise_on_error(resp)
+        resp = resp['Data']
+        return resp
+
     def on_dataset_download(self, dataset_name: str, namespace: str) -> None:
         url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase'
         r = requests.post(url)
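
For orientation, a minimal sketch of calling the new list_oss_dataset_objects wrapper directly; the dataset name and namespace are placeholders, and fetching the cookie jar via check_cookies_upload_data assumes the account was previously authenticated (e.g. via api.login) on the same environment:

from modelscope.hub.api import HubApi
from modelscope.utils.constant import DEFAULT_DATASET_REVISION

api = HubApi()
# Assumes a prior login, so that a session cookie jar can be retrieved.
cookies = api.check_cookies_upload_data(use_cookies=True)

objects = api.list_oss_dataset_objects(
    dataset_name='your-dataset-name',   # placeholder
    namespace='your-namespace',         # placeholder
    max_limit=1000,
    is_recursive=True,
    is_filter_dir=True,
    revision=DEFAULT_DATASET_REVISION,
    cookies=cookies)

# Each returned entry carries its object key, e.g. 'train/images/001.png'.
print([item.get('Key') for item in objects])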
@@ -4,8 +4,7 @@ import hashlib
 import os
 from typing import Optional

-from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DATA_ENDPOINT,
-                                      DEFAULT_MODELSCOPE_DOMAIN,
+from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN,
                                       DEFAULT_MODELSCOPE_GROUP,
                                       MODEL_ID_SEPARATOR,
                                       MODELSCOPE_URL_SCHEME)
@@ -44,11 +43,6 @@ def get_endpoint():
     return MODELSCOPE_URL_SCHEME + modelscope_domain


-def get_dataset_hub_endpoint():
-    return os.environ.get('HUB_DATASET_ENDPOINT',
-                          DEFAULT_MODELSCOPE_DATA_ENDPOINT)
-
-
 def compute_hash(file_path):
     BUFFER_SIZE = 1024 * 64  # 64k buffer size
     sha256_hash = hashlib.sha256()
@@ -1,6 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import math
 import os
 from typing import (Any, Callable, Dict, Iterable, List, Mapping, Optional,
                     Sequence, Union)
@@ -17,19 +16,18 @@ from datasets.utils.file_utils import (is_relative_path,
                                        relative_to_absolute_path)

 from modelscope.hub.repository import DatasetRepository
+from modelscope.msdatasets.task_datasets.builder import build_task_dataset
+from modelscope.msdatasets.utils.dataset_builder import ExternalDataset
+from modelscope.msdatasets.utils.dataset_utils import (
+    get_dataset_files, get_target_dataset_structure, load_dataset_builder)
+from modelscope.msdatasets.utils.download_utils import DatasetDownloadManager
+from modelscope.msdatasets.utils.upload_utils import DatasetUploadManager
 from modelscope.utils.config import ConfigDict
 from modelscope.utils.config_ds import MS_DATASETS_CACHE
 from modelscope.utils.constant import (DEFAULT_DATASET_NAMESPACE,
                                        DEFAULT_DATASET_REVISION,
                                        DatasetFormations, DownloadMode, Hubs)
 from modelscope.utils.logger import get_logger
-from .task_datasets.builder import build_task_dataset
-from .utils.dataset_builder import ExternalDataset
-from .utils.dataset_utils import (get_dataset_files,
-                                  get_target_dataset_structure,
-                                  load_dataset_builder)
-from .utils.download_utils import DatasetDownloadManager
-from .utils.upload_utils import DatasetUploadManager

 logger = get_logger()
@@ -234,7 +232,6 @@ class MsDataset:
             # dataset organized to be compatible with hf format
             if dataset_formation == DatasetFormations.hf_compatible:
                 dataset_name = dataset_scripts['.py'][0]
-                download_dataset = dataset_name
             else:
                 raise FileNotFoundError(
                     f"Couldn't find a dataset script at {relative_to_absolute_path(dataset_name)} "
@@ -270,7 +267,8 @@
             raise TypeError('path must be a str or a list, but got'
                             f' {type(dataset_name)}')

-        if download_dataset:
+        is_ci_test = os.getenv('CI_TEST') == 'True'
+        if download_dataset and not is_ci_test:
             try:
                 api.on_dataset_download(
                     dataset_name=download_dataset, namespace=namespace)
@@ -570,15 +568,26 @@
                local_file_path: str,
                dataset_name: str,
                namespace: Optional[str] = DEFAULT_DATASET_NAMESPACE,
-               version: Optional[str] = DEFAULT_DATASET_REVISION) -> None:
-        """Upload dataset file to the ModelScope Hub. Please login to the ModelScope Hub first.
+               version: Optional[str] = DEFAULT_DATASET_REVISION,
+               num_processes: Optional[int] = None,
+               chunksize: Optional[int] = 1,
+               filter_hidden_files: Optional[bool] = True) -> None:
+        """Upload a dataset file or directory to the ModelScope Hub. Please log in to the ModelScope Hub first.

         Args:
-            object_name (str): The object name on ModelScope, in the form of your-dataset-name.zip
-            local_file_path (str): Local file to upload
+            object_name (str): The object name on ModelScope, in the form of your-dataset-name.zip or your-dataset-name
+            local_file_path (str): Local file or directory to upload
             dataset_name (str): Name of the dataset
             namespace(str, optional): Namespace of the dataset
             version: Optional[str]: Version of the dataset
+            num_processes: Optional[int]: The number of processes used for multi-process uploading.
+                Only applicable when local_file_path is a directory and multiple files inside it
+                are uploaded. When None is provided, the number returned by os.cpu_count() is used as the default.
+            chunksize: Optional[int]: The chunksize of objects to upload.
+                For very long iterables, a large chunksize can make the job complete much faster than
+                the default value of 1. Only applicable when local_file_path is a directory.
+            filter_hidden_files: Optional[bool]: Whether to filter out hidden files.
+                Only applicable when local_file_path is a directory.

         Returns:
             None
@@ -586,7 +595,20 @@
         """
         _upload_manager = DatasetUploadManager(
             dataset_name=dataset_name, namespace=namespace, version=version)
-        _upload_manager.upload(object_name, local_file_path)
+
+        if os.path.isfile(local_file_path):
+            _upload_manager.upload(
+                object_name=object_name, local_file_path=local_file_path)
+        elif os.path.isdir(local_file_path):
+            _upload_manager.upload_dir(
+                object_dir_name=object_name,
+                local_dir_path=local_file_path,
+                num_processes=num_processes,
+                chunksize=chunksize,
+                filter_hidden_files=filter_hidden_files)
+        else:
+            raise ValueError(
+                f'{local_file_path} is not a valid file path or directory')

     @staticmethod
     def clone_meta(dataset_work_dir: str,
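
A usage sketch of the extended MsDataset.upload (dataset name, namespace and local paths below are placeholders): a single file keeps the previous single-object behaviour, while a directory is walked and every non-hidden file is uploaded with object_name as its key prefix.

from modelscope.msdatasets import MsDataset

# Single file: object_name is the target object key, e.g. a zip archive.
MsDataset.upload(
    object_name='your-dataset-name.zip',
    local_file_path='/path/to/your-dataset-name.zip',
    dataset_name='your-dataset-name',
    namespace='your-namespace')

# Directory: files under /path/to/images/train become 'train/<relative-path>'
# objects, uploaded by a pool of workers.
MsDataset.upload(
    object_name='train',
    local_file_path='/path/to/images/train',
    dataset_name='your-dataset-name',
    namespace='your-namespace',
    num_processes=8,
    chunksize=1,
    filter_hidden_files=True)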
@@ -6,7 +6,8 @@ from typing import Any, Mapping, Optional, Sequence, Union

 from datasets.builder import DatasetBuilder

-from modelscope.utils.constant import DEFAULT_DATASET_REVISION
+from modelscope.hub.api import HubApi
+from modelscope.utils.constant import DEFAULT_DATASET_REVISION, DownloadParams
 from modelscope.utils.logger import get_logger

 from .dataset_builder import MsCsvDatasetBuilder, TaskSpecificDatasetBuilder
@@ -77,6 +78,81 @@
     return target_subset_name, target_dataset_structure


+def list_dataset_objects(hub_api: HubApi, max_limit: int, is_recursive: bool,
+                         dataset_name: str, namespace: str,
+                         version: str) -> list:
+    """
+    List all objects for a specific dataset.
+
+    Args:
+        hub_api (class HubApi): HubApi instance.
+        max_limit (int): Max number of objects.
+        is_recursive (bool): Whether to list objects recursively.
+        dataset_name (str): Dataset name.
+        namespace (str): Namespace.
+        version (str): Dataset version.
+
+    Returns:
+        res (list): List of objects, e.g. ['train/images/001.png', 'train/images/002.png',
+            'val/images/001.png', ...]
+    """
+    res = []
+    cookies = hub_api.check_cookies_upload_data(use_cookies=True)
+    objects = hub_api.list_oss_dataset_objects(
+        dataset_name=dataset_name,
+        namespace=namespace,
+        max_limit=max_limit,
+        is_recursive=is_recursive,
+        is_filter_dir=True,
+        revision=version,
+        cookies=cookies)
+
+    for item in objects:
+        object_key = item.get('Key')
+        res.append(object_key)
+
+    return res
+
+
+def contains_dir(file_map) -> bool:
+    """
+    Check whether the input contains at least one directory.
+
+    Args:
+        file_map (dict): Structure of data files, e.g. {'train': 'train.zip', 'validation': 'val.zip'}
+
+    Returns:
+        True if the input contains at least one directory, False otherwise.
+    """
+    res = False
+    for k, v in file_map.items():
+        if isinstance(v, str) and not v.endswith('.zip'):
+            res = True
+            break
+    return res
+
+
+def get_split_objects_map(file_map, objects):
+    """
+    Get the mapping between dataset splits and oss objects.
+
+    Args:
+        file_map (dict): Structure of data files, e.g. {'train': 'train', 'validation': 'val'},
+            where both 'train' and 'val' are directories.
+        objects (list): List of oss objects, e.g. ['train/001/1_123.png', 'train/001/1_124.png',
+            'val/003/3_38.png']
+
+    Returns:
+        A split-to-objects map, e.g. {'train': ['train/001/1_123.png', 'train/001/1_124.png'],
+            'validation': ['val/003/3_38.png']}
+    """
+    res = {}
+    for k, v in file_map.items():
+        res[k] = []
+
+    for obj_key in objects:
+        for k, v in file_map.items():
+            if obj_key.startswith(v):
+                res[k].append(obj_key)
+
+    return res
+
+
 def get_dataset_files(subset_split_into: dict,
                       dataset_name: str,
                       namespace: str,

@@ -95,14 +171,24 @@ def get_dataset_files(subset_split_into: dict,
     meta_map = defaultdict(dict)
     file_map = defaultdict(dict)
     args_map = defaultdict(dict)
-    from modelscope.hub.api import HubApi
     modelscope_api = HubApi()
+    objects = list_dataset_objects(
+        hub_api=modelscope_api,
+        max_limit=DownloadParams.MAX_LIST_OBJECTS_NUM.value,
+        is_recursive=True,
+        dataset_name=dataset_name,
+        namespace=namespace,
+        version=revision)
+
     for split, info in subset_split_into.items():
         meta_map[split] = modelscope_api.get_dataset_file_url(
             info.get('meta', ''), dataset_name, namespace, revision)
         if info.get('file'):
             file_map[split] = info['file']
             args_map[split] = info.get('args')
+
+    if contains_dir(file_map):
+        file_map = get_split_objects_map(file_map, objects)
     return meta_map, file_map, args_map
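
To make the directory handling above concrete, a small self-contained example (the object keys are made up; the import path follows the test-side import of list_dataset_objects from modelscope.msdatasets.utils.dataset_utils): when a split's file entry has no .zip suffix it is treated as a directory prefix, and the split is expanded into the matching OSS object keys.

from modelscope.msdatasets.utils.dataset_utils import (contains_dir,
                                                       get_split_objects_map)

file_map = {'train': 'train', 'validation': 'val'}  # directory prefixes, no .zip
objects = ['train/001/1_123.png', 'train/001/1_124.png', 'val/003/3_38.png']

assert contains_dir(file_map)  # at least one value is not a .zip archive
print(get_split_objects_map(file_map, objects))
# -> {'train': ['train/001/1_123.png', 'train/001/1_124.png'],
#     'validation': ['val/003/3_38.png']}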
@@ -10,16 +10,14 @@ from .oss_utils import OssUtilities

 class DatasetDownloadManager(DownloadManager):

-    def __init__(
-        self,
-        dataset_name: str,
-        namespace: str,
-        version: str,
-        data_dir: Optional[str] = None,
-        download_config: Optional[DownloadConfig] = None,
-        base_path: Optional[str] = None,
-        record_checksums=True,
-    ):
+    def __init__(self,
+                 dataset_name: str,
+                 namespace: str,
+                 version: str,
+                 data_dir: Optional[str] = None,
+                 download_config: Optional[DownloadConfig] = None,
+                 base_path: Optional[str] = None,
+                 record_checksums=True):
         super().__init__(dataset_name, data_dir, download_config, base_path,
                          record_checksums)
         self._namespace = namespace
@@ -50,11 +50,16 @@
             progress_callback=self._percentage)
         return local_path

-    def upload(self, oss_object_name: str, local_file_path: str) -> str:
+    def upload(self, oss_object_name: str, local_file_path: str,
+               indicate_individual_progress: bool) -> str:
         retry_count = 0
         object_key = os.path.join(self.oss_dir, oss_object_name)
         resumable_store = oss2.ResumableStore(
             root=self.upload_resumable_tmp_store)
+        if indicate_individual_progress:
+            progress_callback = self._percentage
+        else:
+            progress_callback = None

         while True:
             try:
@@ -66,7 +71,7 @@
                     store=resumable_store,
                     multipart_threshold=self.upload_multipart_threshold,
                     part_size=self.upload_part_size,
-                    progress_callback=self._percentage,
+                    progress_callback=progress_callback,
                     num_threads=self.upload_num_threads)
                 break
             except Exception:
@@ -1,5 +1,10 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from multiprocessing.dummy import Pool as ThreadPool
+
+from tqdm import tqdm
+
 from .oss_utils import OssUtilities
@@ -19,5 +24,38 @@

     def upload(self, object_name: str, local_file_path: str) -> str:
         object_key = self.oss_utilities.upload(
-            oss_object_name=object_name, local_file_path=local_file_path)
+            oss_object_name=object_name,
+            local_file_path=local_file_path,
+            indicate_individual_progress=True)
         return object_key
+
+    def upload_dir(self, object_dir_name: str, local_dir_path: str,
+                   num_processes: int, chunksize: int,
+                   filter_hidden_files: bool) -> int:
+
+        def run_upload(args):
+            self.oss_utilities.upload(
+                oss_object_name=args[0],
+                local_file_path=args[1],
+                indicate_individual_progress=False)
+
+        files_list = []
+        for root, dirs, files in os.walk(local_dir_path):
+            for file_name in files:
+                if filter_hidden_files and file_name.startswith('.'):
+                    continue
+                # Concatenate directory name and relative path into an oss object key, e.g. train/001/1_1230.png
+                object_name = os.path.join(
+                    object_dir_name,
+                    root.replace(local_dir_path, '', 1).strip('/'), file_name)
+                local_file_path = os.path.join(root, file_name)
+                files_list.append((object_name, local_file_path))
+
+        with ThreadPool(processes=num_processes) as pool:
+            result = list(
+                tqdm(
+                    pool.imap(run_upload, files_list, chunksize=chunksize),
+                    total=len(files_list)))
+
+        return len(result)
@@ -227,6 +227,13 @@ class DownloadMode(enum.Enum):
     FORCE_REDOWNLOAD = 'force_redownload'


+class DownloadParams(enum.Enum):
+    """
+    Parameters for downloading datasets.
+    """
+    MAX_LIST_OBJECTS_NUM = 50000
+
+
 class DatasetFormations(enum.Enum):
     """ How a dataset is organized and interpreted
     """
@@ -6,9 +6,13 @@
 import zipfile

 from modelscope.msdatasets import MsDataset
-from modelscope.utils.constant import ModelFile
+from modelscope.msdatasets.utils.dataset_utils import list_dataset_objects
+from modelscope.utils import logger as logging
+from modelscope.utils.constant import DEFAULT_DATASET_REVISION, ModelFile
 from modelscope.utils.test_utils import test_level

+logger = logging.get_logger(__name__)
+
 KEY_EXTRACTED = 'extracted'
@@ -39,7 +43,8 @@
     def tearDown(self):
         os.chdir(self.old_dir)
         shutil.rmtree(self.temp_dir, ignore_errors=True)
-        print('The test dir successfully removed!')
+        logger.info(
+            f'Temporary directory {self.temp_dir} successfully removed!')

     @staticmethod
     def get_raw_downloaded_file_path(extracted_path):
@@ -68,6 +73,40 @@
             dataset_name=self.dataset_name,
             namespace=self.namespace)

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_ds_upload_dir(self):
+        ms_ds_train = MsDataset.load(self.prepared_dataset_name, split='train')
+        config_train = ms_ds_train._hf_ds.config_kwargs
+        extracted_path_train = config_train.get('split_config').get('train')
+
+        MsDataset.upload(
+            object_name='train',
+            local_file_path=os.path.join(extracted_path_train,
+                                         'Pets/images/train'),
+            dataset_name=self.dataset_name,
+            namespace=self.namespace)
+
+        MsDataset.upload(
+            object_name='val',
+            local_file_path=os.path.join(extracted_path_train,
+                                         'Pets/images/val'),
+            dataset_name=self.dataset_name,
+            namespace=self.namespace)
+
+        objects = list_dataset_objects(
+            hub_api=self.api,
+            max_limit=-1,
+            is_recursive=True,
+            dataset_name=self.dataset_name,
+            namespace=self.namespace,
+            version=DEFAULT_DATASET_REVISION)
+        logger.info(f'{len(objects)} objects have been uploaded: {objects}')
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_ds_download_dir(self):
+        test_ds = MsDataset.load(self.dataset_name, self.namespace)
+        assert test_ds.config_kwargs['split_config'].values()
+
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_ds_clone_meta(self):
         MsDataset.clone_meta(