Browse Source

[to #42322933]数据集断点续传下载+修复数据集命名存在大写字母导致加载失败的问题

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9973942

* fix MsDataset loading failure when the dataset name contains uppercase letters
* add resumable (breakpoint-resume) download for dataset files
master
feiwu.yfw 3 years ago
parent
commit
3f97278564
2 changed files with 11 additions and 5 deletions
  1. modelscope/msdatasets/utils/dataset_builder.py (+5, -3)
  2. modelscope/msdatasets/utils/oss_utils.py (+6, -2)

+ 5
- 3
modelscope/msdatasets/utils/dataset_builder.py View File

@@ -5,6 +5,7 @@ import datasets
import pandas as pd import pandas as pd
import pyarrow as pa import pyarrow as pa
from datasets.info import DatasetInfo from datasets.info import DatasetInfo
from datasets.naming import camelcase_to_snakecase
from datasets.packaged_modules import csv from datasets.packaged_modules import csv
from datasets.utils.filelock import FileLock from datasets.utils.filelock import FileLock


@@ -34,8 +35,8 @@ class MsCsvDatasetBuilder(csv.Csv):
data_files=meta_data_files, data_files=meta_data_files,
**config_kwargs) **config_kwargs)


self.name = dataset_name
self.info.builder_name = self.name
self.name = camelcase_to_snakecase(dataset_name)
self.info.builder_name = dataset_name
self._cache_dir = self._build_cache_dir(namespace=namespace) self._cache_dir = self._build_cache_dir(namespace=namespace)
lock_path = os.path.join( lock_path = os.path.join(
self._cache_dir_root, self._cache_dir_root,
@@ -65,7 +66,7 @@ class MsCsvDatasetBuilder(csv.Csv):
or if a namespace has been specified: or if a namespace has been specified:
self.namespace___self.name/self.config.version/self.hash/ self.namespace___self.name/self.config.version/self.hash/
""" """
builder_data_dir = self.name if namespace is None else f'{namespace}___{self.name}'
builder_data_dir = self.info.builder_name if namespace is None else f'{namespace}___{self.info.builder_name}'
builder_config = self.config builder_config = self.config
hash = self.hash hash = self.hash
if builder_config: if builder_config:
@@ -156,6 +157,7 @@ class TaskSpecificDatasetBuilder(MsCsvDatasetBuilder):
self.zip_data_files = zip_data_files self.zip_data_files = zip_data_files
self.split_path_dict = None self.split_path_dict = None
self.config = None self.config = None
self.info = DatasetInfo.from_dict({'builder_name': dataset_name})
self._cache_dir_root = os.path.expanduser(cache_dir) self._cache_dir_root = os.path.expanduser(cache_dir)
self._cache_dir = self._build_cache_dir() self._cache_dir = self._build_cache_dir()
self._config_kwargs = config_kwargs self._config_kwargs = config_kwargs


+ 6
- 2
modelscope/msdatasets/utils/oss_utils.py View File

@@ -34,8 +34,12 @@ class OssUtilities:
local_path = os.path.join(cache_dir, filename) local_path = os.path.join(cache_dir, filename)


if download_config.force_download or not os.path.exists(local_path): if download_config.force_download or not os.path.exists(local_path):
self.bucket.get_object_to_file(
file_oss_key, local_path, progress_callback=self._percentage)
oss2.resumable_download(
self.bucket,
file_oss_key,
local_path,
multiget_threshold=0,
progress_callback=self._percentage)
return local_path return local_path


def upload(self, oss_file_name: str, local_file_path: str) -> str: def upload(self, oss_file_name: str, local_file_path: str) -> str:


Loading…
Cancel
Save