feiwu.yfw, 3 years ago — commit 5da470fd5d
8 changed files with 121 additions and 45 deletions
  1. modelscope/hub/errors.py  (+15, -0)
  2. modelscope/msdatasets/config.py  (+1, -1)
  3. modelscope/msdatasets/ms_dataset.py  (+41, -15)
  4. modelscope/msdatasets/utils/ms_api.py  (+33, -15)
  5. modelscope/utils/constant.py  (+9, -1)
  6. tests/msdatasets/test_ms_dataset.py  (+14, -10)
  7. tests/pipelines/test_image_matting.py  (+2, -1)
  8. tests/pipelines/test_text_classification.py  (+6, -2)

modelscope/hub/errors.py  (+15, -0)

@@ -32,3 +32,18 @@ def raise_on_error(rsp):
         return True
     else:
         raise RequestError(rsp['Message'])
+
+
+# TODO: use raise_on_error instead once the modelhub and datahub responses have a uniform structure.
+def datahub_raise_on_error(url, rsp):
+    """If the response indicates an error, raise an exception.
+
+    Args:
+        rsp (dict): The server response.
+    """
+    if rsp.get('Code') == 200:
+        return True
+    else:
+        raise RequestError(
+            f"Url = {url}, Status = {rsp.get('status')}, error = {rsp.get('error')}, message = {rsp.get('message')}"
+        )
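As a quick illustration (not part of the commit), here is a minimal sketch of how the new datahub_raise_on_error helper behaves. The response dicts and the url are hypothetical and only use the keys the helper reads ('Code', 'status', 'error', 'message'); RequestError is assumed to be exported from the same module, as the diff suggests.

from modelscope.hub.errors import RequestError, datahub_raise_on_error

url = 'http://datahub.example/api/v1/datasets/damotest/squad'  # hypothetical url

# A response whose 'Code' is 200 is treated as success and returns True.
assert datahub_raise_on_error(url, {'Code': 200, 'Data': {'Id': 1}})

# Any other response raises RequestError carrying the url and the error fields.
try:
    datahub_raise_on_error(url, {'Code': 404, 'status': 404,
                                 'error': 'NotFound', 'message': 'dataset missing'})
except RequestError as err:
    print(err)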

modelscope/msdatasets/config.py  (+1, -1)

@@ -19,4 +19,4 @@ DOWNLOADED_DATASETS_PATH = Path(
     os.getenv('DOWNLOADED_DATASETS_PATH', DEFAULT_DOWNLOADED_DATASETS_PATH))

 MS_HUB_ENDPOINT = os.environ.get('MS_HUB_ENDPOINT',
-                                 'http://101.201.119.157:31752')
+                                 'http://123.57.189.90:31752')
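Since MS_HUB_ENDPOINT is still read from the environment first, the hard-coded address is only a fallback. A small sketch of overriding it (the endpoint below is a placeholder):

import os

# Override the datahub endpoint before the config module is imported;
# the IP baked into config.py is only the default.
os.environ['MS_HUB_ENDPOINT'] = 'http://my-datahub.example.com:31752'

from modelscope.msdatasets.config import MS_HUB_ENDPOINT
print(MS_HUB_ENDPOINT)  # -> http://my-datahub.example.com:31752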

modelscope/msdatasets/ms_dataset.py  (+41, -15)

@@ -3,7 +3,7 @@ from typing import (Any, Callable, Dict, Iterable, List, Mapping, Optional,
                     Sequence, Union)

 import numpy as np
-from datasets import Dataset
+from datasets import Dataset, DatasetDict
 from datasets import load_dataset as hf_load_dataset
 from datasets.config import TF_AVAILABLE, TORCH_AVAILABLE
 from datasets.packaged_modules import _PACKAGED_DATASETS_MODULES
@@ -12,7 +12,7 @@ from datasets.utils.file_utils import (is_relative_path,

 from modelscope.msdatasets.config import MS_DATASETS_CACHE
 from modelscope.msdatasets.utils.ms_api import MsApi
-from modelscope.utils.constant import Hubs
+from modelscope.utils.constant import DownloadMode, Hubs
 from modelscope.utils.logger import get_logger

 logger = get_logger()
@@ -34,6 +34,10 @@ class MsDataset:

     def __init__(self, hf_ds: Dataset, target: Optional[str] = None):
         self._hf_ds = hf_ds
+        if target is not None and target not in self._hf_ds.features:
+            raise TypeError(
+                f'"target" must be a column of the dataset ({list(self._hf_ds.features.keys())}), but got {target}'
+            )
         self.target = target

     def __iter__(self):
@@ -48,17 +52,23 @@ class MsDataset:

     @classmethod
     def from_hf_dataset(cls,
-                        hf_ds: Dataset,
+                        hf_ds: Union[Dataset, DatasetDict],
                         target: str = None) -> Union[dict, 'MsDataset']:
         if isinstance(hf_ds, Dataset):
             return cls(hf_ds, target)
-        if len(hf_ds.keys()) == 1:
-            return cls(next(iter(hf_ds.values())), target)
-        return {k: cls(v, target) for k, v in hf_ds.items()}
+        elif isinstance(hf_ds, DatasetDict):
+            if len(hf_ds.keys()) == 1:
+                return cls(next(iter(hf_ds.values())), target)
+            return {k: cls(v, target) for k, v in hf_ds.items()}
+        else:
+            raise TypeError(
+                f'"hf_ds" must be a Dataset or DatasetDict, but got {type(hf_ds)}'
+            )

     @staticmethod
     def load(
             dataset_name: Union[str, list],
+            namespace: Optional[str] = None,
             target: Optional[str] = None,
             version: Optional[str] = None,
             hub: Optional[Hubs] = Hubs.modelscope,
@@ -67,23 +77,32 @@ class MsDataset:
             data_dir: Optional[str] = None,
             data_files: Optional[Union[str, Sequence[str],
                                        Mapping[str, Union[str,
-                                                          Sequence[str]]]]] = None
+                                                          Sequence[str]]]]] = None,
+            download_mode: Optional[DownloadMode] = DownloadMode.
+            REUSE_DATASET_IF_EXISTS
     ) -> Union[dict, 'MsDataset']:
         """Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
         Args:

             dataset_name (str): Path or name of the dataset.
+            namespace (str, optional): Namespace of the dataset. It should not be None if you load a remote dataset
+                from Hubs.modelscope.
             target (str, optional): Name of the column to output.
             version (str, optional): Version of the dataset script to load:
             subset_name (str, optional): Defining the subset_name of the dataset.
             data_dir (str, optional): Defining the data_dir of the dataset configuration. I
             data_files (str or Sequence or Mapping, optional): Path(s) to source data file(s).
             split (str, optional): Which split of the data to load.
-            hub (Hubs, optional): When loading from a remote hub, where it is from
+            hub (Hubs or str, optional): When loading from a remote hub, where it is from. Default: Hubs.modelscope
+            download_mode (DownloadMode or str, optional): How to treat existing datasets. Default:
+                DownloadMode.REUSE_DATASET_IF_EXISTS

         Returns:
             MsDataset (obj:`MsDataset`): MsDataset object for a certain dataset.
         """
+        download_mode = DownloadMode(download_mode
+                                     or DownloadMode.REUSE_DATASET_IF_EXISTS)
+        hub = Hubs(hub or Hubs.modelscope)
         if hub == Hubs.huggingface:
             dataset = hf_load_dataset(
                 dataset_name,
@@ -91,21 +110,25 @@ class MsDataset:
                 revision=version,
                 split=split,
                 data_dir=data_dir,
-                data_files=data_files)
+                data_files=data_files,
+                download_mode=download_mode.value)
             return MsDataset.from_hf_dataset(dataset, target=target)
-        else:
+        elif hub == Hubs.modelscope:
             return MsDataset._load_ms_dataset(
                 dataset_name,
+                namespace=namespace,
                 target=target,
                 subset_name=subset_name,
                 version=version,
                 split=split,
                 data_dir=data_dir,
-                data_files=data_files)
+                data_files=data_files,
+                download_mode=download_mode)

     @staticmethod
     def _load_ms_dataset(
             dataset_name: Union[str, list],
+            namespace: Optional[str] = None,
             target: Optional[str] = None,
             version: Optional[str] = None,
             subset_name: Optional[str] = None,
@@ -113,17 +136,19 @@ class MsDataset:
             data_dir: Optional[str] = None,
             data_files: Optional[Union[str, Sequence[str],
                                        Mapping[str, Union[str,
-                                                          Sequence[str]]]]] = None
+                                                          Sequence[str]]]]] = None,
+            download_mode: Optional[DownloadMode] = None
     ) -> Union[dict, 'MsDataset']:
         if isinstance(dataset_name, str):
             use_hf = False
             if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \
                     (os.path.isfile(dataset_name) and dataset_name.endswith('.py')):
                 use_hf = True
-            elif is_relative_path(dataset_name):
+            elif is_relative_path(dataset_name) and dataset_name.count(
+                    '/') == 0:
                 ms_api = MsApi()
                 dataset_scripts = ms_api.fetch_dataset_scripts(
-                    dataset_name, version)
+                    dataset_name, namespace, download_mode, version)
                 if 'py' in dataset_scripts:  # dataset copied from hf datasets
                     dataset_name = dataset_scripts['py'][0]
                     use_hf = True
@@ -140,7 +165,8 @@ class MsDataset:
                 split=split,
                 data_dir=data_dir,
                 data_files=data_files,
-                cache_dir=MS_DATASETS_CACHE)
+                cache_dir=MS_DATASETS_CACHE,
+                download_mode=download_mode.value)
         else:
             # TODO load from ms datahub
             raise NotImplementedError(
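For orientation, a hedged usage sketch of the extended load() signature. The 'squad' dataset under the 'damotest' namespace is the one used by the tests in this commit; the import path is assumed to match the rest of the repo.

from modelscope.msdatasets import MsDataset
from modelscope.utils.constant import DownloadMode, Hubs

# namespace is now expected for datasets hosted on the ModelScope hub;
# hub and download_mode accept either the enum member or its string value,
# since both are coerced through their Enum constructors above.
ds = MsDataset.load(
    'squad',
    namespace='damotest',
    split='train',
    hub=Hubs.modelscope,
    download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)
print(next(iter(ds)))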


modelscope/msdatasets/utils/ms_api.py  (+33, -15)

@@ -1,11 +1,14 @@
 import os
+import shutil
 from collections import defaultdict
 from typing import Optional

 import requests

+from modelscope.hub.errors import NotExistError, datahub_raise_on_error
 from modelscope.msdatasets.config import (DOWNLOADED_DATASETS_PATH,
                                           MS_HUB_ENDPOINT)
+from modelscope.utils.constant import DownloadMode
 from modelscope.utils.logger import get_logger

 logger = get_logger()
@@ -27,23 +30,38 @@ class MsApi:

     def fetch_dataset_scripts(self,
                               dataset_name: str,
-                              version: Optional[str] = 'master',
-                              force_download=False):
-        datahub_url = f'{self.endpoint}/api/v1/datasets?Query={dataset_name}'
-        r = requests.get(datahub_url)
-        r.raise_for_status()
-        dataset_list = r.json()['Data']
-        if len(dataset_list) == 0:
-            return None
-        dataset_id = dataset_list[0]['Id']
+                              namespace: str,
+                              download_mode: Optional[DownloadMode],
+                              version: Optional[str] = 'master'):
+        if namespace is None:
+            raise ValueError(
+                f'Dataset from Hubs.modelscope should have a valid "namespace", but got {namespace}'
+            )
         version = version or 'master'
-        datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={version}'
-        r = requests.get(datahub_url)
-        r.raise_for_status()
-        file_list = r.json()['Data']['Files']
         cache_dir = os.path.join(DOWNLOADED_DATASETS_PATH, dataset_name,
-                                 version)
+                                 namespace, version)
+        download_mode = DownloadMode(download_mode
+                                     or DownloadMode.REUSE_DATASET_IF_EXISTS)
+        if download_mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists(
+                cache_dir):
+            shutil.rmtree(cache_dir)
         os.makedirs(cache_dir, exist_ok=True)
+        datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
+        r = requests.get(datahub_url)
+        resp = r.json()
+        datahub_raise_on_error(datahub_url, resp)
+        dataset_id = resp['Data']['Id']
+        datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={version}'
+        r = requests.get(datahub_url)
+        resp = r.json()
+        datahub_raise_on_error(datahub_url, resp)
+        file_list = resp['Data']
+        if file_list is None:
+            raise NotExistError(
+                f'The modelscope dataset [dataset_name = {dataset_name}, namespace = {namespace}, '
+                f'version = {version}] does not exist')
+
+        file_list = file_list['Files']
         local_paths = defaultdict(list)
         for file_info in file_list:
             file_path = file_info['Path']
@@ -54,7 +72,7 @@ class MsApi:
             r.raise_for_status()
             content = r.json()['Data']['Content']
             local_path = os.path.join(cache_dir, file_path)
-            if os.path.exists(local_path) and not force_download:
+            if os.path.exists(local_path):
                 logger.warning(
                     f"Reusing dataset {dataset_name}'s python file ({local_path})"
                 )
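Putting the pieces together, the rewritten fetch_dataset_scripts resolves {namespace}/{dataset_name} to a dataset id, lists the repo tree at the requested revision, and caches any loading scripts under DOWNLOADED_DATASETS_PATH/{dataset_name}/{namespace}/{version}. A hedged sketch of a direct call, using the namespace and dataset from the tests in this commit:

from modelscope.msdatasets.utils.ms_api import MsApi
from modelscope.utils.constant import DownloadMode

ms_api = MsApi()

# FORCE_REDOWNLOAD wipes the per-version cache dir before fetching; the
# result maps file extensions to local paths, e.g. local_paths['py']
# holds the downloaded dataset script(s).
local_paths = ms_api.fetch_dataset_scripts(
    'squad', 'damotest', DownloadMode.FORCE_REDOWNLOAD, version='master')
print(local_paths.get('py'))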


modelscope/utils/constant.py  (+9, -1)

@@ -1,4 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import enum


 class Fields(object):
@@ -69,13 +70,20 @@ class InputFields(object):
     audio = 'audio'


-class Hubs(object):
+class Hubs(enum.Enum):
     """ Source from which an entity (such as a Dataset or Model) is stored
     """
     modelscope = 'modelscope'
     huggingface = 'huggingface'


+class DownloadMode(enum.Enum):
+    """ How to treat existing datasets
+    """
+    REUSE_DATASET_IF_EXISTS = 'reuse_dataset_if_exists'
+    FORCE_REDOWNLOAD = 'force_redownload'
+
+
 class ModelFile(object):
     CONFIGURATION = 'configuration.json'
     README = 'README.md'
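Because both enums carry string values, callers can pass either the member or its string form. A minimal sketch of the coercion pattern that MsDataset.load and MsApi now rely on:

from modelscope.utils.constant import DownloadMode, Hubs

# The Enum constructor maps a string value (or an existing member) back to
# the canonical member, so both spellings below are equivalent.
assert DownloadMode('force_redownload') is DownloadMode.FORCE_REDOWNLOAD
assert Hubs('huggingface') is Hubs.huggingface

# None falls back to the default, mirroring the pattern used in the diff.
mode = DownloadMode(None or DownloadMode.REUSE_DATASET_IF_EXISTS)
print(mode.value)  # reuse_dataset_if_exists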


tests/msdatasets/test_ms_dataset.py  (+14, -10)

@@ -32,11 +32,12 @@ class ImgPreprocessor(Preprocessor):

 class MsDatasetTest(unittest.TestCase):

-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_ds_basic(self):
-        ms_ds_full = MsDataset.load('squad')
+        ms_ds_full = MsDataset.load('squad', namespace='damotest')
         ms_ds_full_hf = hfdata.load_dataset('squad')
-        ms_ds_train = MsDataset.load('squad', split='train')
+        ms_ds_train = MsDataset.load(
+            'squad', namespace='damotest', split='train')
         ms_ds_train_hf = hfdata.load_dataset('squad', split='train')
         ms_image_train = MsDataset.from_hf_dataset(
             hfdata.load_dataset('beans', split='train'))
@@ -48,7 +49,7 @@ class MsDatasetTest(unittest.TestCase):
         print(next(iter(ms_ds_train)))
         print(next(iter(ms_image_train)))

-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     @require_torch
     def test_to_torch_dataset_text(self):
         model_id = 'damo/bert-base-sst2'
@@ -57,13 +58,14 @@ class MsDatasetTest(unittest.TestCase):
             nlp_model.model_dir,
             first_sequence='context',
             second_sequence=None)
-        ms_ds_train = MsDataset.load('squad', split='train')
+        ms_ds_train = MsDataset.load(
+            'squad', namespace='damotest', split='train')
         pt_dataset = ms_ds_train.to_torch_dataset(preprocessors=preprocessor)
         import torch
         dataloader = torch.utils.data.DataLoader(pt_dataset, batch_size=5)
         print(next(iter(dataloader)))

-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     @require_tf
     def test_to_tf_dataset_text(self):
         import tensorflow as tf
@@ -74,7 +76,8 @@ class MsDatasetTest(unittest.TestCase):
             nlp_model.model_dir,
             first_sequence='context',
             second_sequence=None)
-        ms_ds_train = MsDataset.load('squad', split='train')
+        ms_ds_train = MsDataset.load(
+            'squad', namespace='damotest', split='train')
         tf_dataset = ms_ds_train.to_tf_dataset(
             batch_size=5,
             shuffle=True,
@@ -85,8 +88,8 @@ class MsDatasetTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     @require_torch
     def test_to_torch_dataset_img(self):
-        ms_image_train = MsDataset.from_hf_dataset(
-            hfdata.load_dataset('beans', split='train'))
+        ms_image_train = MsDataset.load(
+            'beans', namespace='damotest', split='train')
         pt_dataset = ms_image_train.to_torch_dataset(
             preprocessors=ImgPreprocessor(
                 image_path='image_file_path', label='labels'))
@@ -99,7 +102,8 @@ class MsDatasetTest(unittest.TestCase):
     def test_to_tf_dataset_img(self):
         import tensorflow as tf
         tf.compat.v1.enable_eager_execution()
-        ms_image_train = MsDataset.load('beans', split='train')
+        ms_image_train = MsDataset.load(
+            'beans', namespace='damotest', split='train')
         tf_dataset = ms_image_train.to_tf_dataset(
             batch_size=5,
             shuffle=True,


tests/pipelines/test_image_matting.py  (+2, -1)

@@ -62,7 +62,8 @@ class ImageMattingTest(unittest.TestCase):

     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_modelscope_dataset(self):
-        dataset = MsDataset.load('beans', split='train', target='image')
+        dataset = MsDataset.load(
+            'beans', namespace='damotest', split='train', target='image')
         img_matting = pipeline(Tasks.image_matting, model=self.model_id)
         result = img_matting(dataset)
         for i in range(10):


tests/pipelines/test_text_classification.py  (+6, -2)

@@ -87,12 +87,16 @@ class SequenceClassificationTest(unittest.TestCase):
         result = text_classification(dataset)
         self.printDataset(result)

-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_modelscope_dataset(self):
         text_classification = pipeline(task=Tasks.text_classification)
         # loaded from modelscope dataset
         dataset = MsDataset.load(
-            'squad', split='train', target='context', hub=Hubs.modelscope)
+            'squad',
+            namespace='damotest',
+            split='train',
+            target='context',
+            hub=Hubs.modelscope)
         result = text_classification(dataset)
         self.printDataset(result)



