Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9165402master
@@ -1,7 +1,7 @@
-modelscope.pydatasets package
+modelscope.msdatasets package
 =============================

-.. automodule:: modelscope.pydatasets
+.. automodule:: modelscope.msdatasets
    :members:
    :undoc-members:
    :show-inheritance:
@@ -9,10 +9,10 @@ modelscope.pydatasets package
 Submodules
 ----------

-modelscope.pydatasets.py\_dataset module
+modelscope.msdatasets.ms\_dataset module
 ----------------------------------------

-.. automodule:: modelscope.pydatasets.py_dataset
+.. automodule:: modelscope.msdatasets.ms_dataset
    :members:
    :undoc-members:
    :show-inheritance:
@@ -16,7 +16,7 @@ Subpackages
    modelscope.models
    modelscope.pipelines
    modelscope.preprocessors
-   modelscope.pydatasets
+   modelscope.msdatasets
    modelscope.trainers
    modelscope.utils
@@ -3,7 +3,7 @@
 ## Python environment setup
 First, refer to the [documentation](https://docs.anaconda.com/anaconda/install/) to install and configure Anaconda.
-After installation, run the following commands to create a Python environment for the maas library.
+After installation, run the following commands to create a Python environment for the modelscope library.
 ```shell
 conda create -n modelscope python=3.6
 conda activate modelscope
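A quick, purely illustrative check that the new environment is the one in use (this assumes the commands above succeeded and the `modelscope` env is activated):

```python
# Run inside the activated "modelscope" conda env; confirms the interpreter
# matches the python=3.6 requested above.
import sys

print(sys.executable)
assert sys.version_info[:2] == (3, 6), f'unexpected Python: {sys.version}'
```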
@@ -105,15 +105,15 @@ import cv2
 import os.path as osp
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
-from modelscope.pydatasets import PyDataset
+from modelscope.msdatasets import MsDataset

-# Build a PyDataset from image urls; a local folder can also be used via input_location = '/dir/to/images'
+# Build an MsDataset from image urls; a local folder can also be used via input_location = '/dir/to/images'
 input_location = [
     'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
 ]
-dataset = PyDataset.load(input_location, target='image')
+dataset = MsDataset.load(input_location, target='image')
 img_matting = pipeline(Tasks.image_matting, model='damo/image-matting-person')
-# When the input is a PyDataset, the result is an iterator
+# When the input is an MsDataset, the result is an iterator
 result = img_matting(dataset)
 cv2.imwrite('result.png', next(result)['output_png'])
 print(f'Output written to {osp.abspath("result.png")}')
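For reference, the local-folder variant mentioned in the comment above would look roughly like this; the directory path is a placeholder, and the output key mirrors the snippet above:

```python
import cv2

from modelscope.msdatasets import MsDataset
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Placeholder path; any local directory of images should work here.
input_location = '/dir/to/images'
dataset = MsDataset.load(input_location, target='image')
img_matting = pipeline(Tasks.image_matting, model='damo/image-matting-person')

# Dataset input yields a generator, so iterate and write every matting result.
for idx, item in enumerate(img_matting(dataset)):
    cv2.imwrite(f'result_{idx}.png', item['output_png'])
```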
@@ -187,7 +187,7 @@ def get_file_download_url(model_id: str, file_path: str, revision: str):
     """
     Format file download url according to `model_id`, `revision` and `file_path`.
     e.g., Given `model_id=john/bert`, `revision=master`, `file_path=README.md`,
-    the resulted download url is: https://maas.co/api/v1/models/john/bert/repo?Revision=master&FilePath=README.md
+    the resulted download url is: https://modelscope.co/api/v1/models/john/bert/repo?Revision=master&FilePath=README.md
     """
     download_url_template = '{endpoint}/api/v1/models/{model_id}/repo?Revision={revision}&FilePath={file_path}'
     return download_url_template.format(
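Plugging the docstring's example values into the template above yields the renamed endpoint; a minimal illustration (the endpoint string is taken from the updated docstring example, not from config):

```python
download_url_template = '{endpoint}/api/v1/models/{model_id}/repo?Revision={revision}&FilePath={file_path}'
url = download_url_template.format(
    endpoint='https://modelscope.co',  # value taken from the docstring example above
    model_id='john/bert',
    revision='master',
    file_path='README.md')
print(url)
# https://modelscope.co/api/v1/models/john/bert/repo?Revision=master&FilePath=README.md
```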
@@ -0,0 +1 @@
+from .ms_dataset import MsDataset
@@ -10,8 +10,8 @@ from datasets.packaged_modules import _PACKAGED_DATASETS_MODULES
 from datasets.utils.file_utils import (is_relative_path,
                                        relative_to_absolute_path)
-from modelscope.pydatasets.config import MS_DATASETS_CACHE
-from modelscope.pydatasets.utils.ms_api import MsApi
+from modelscope.msdatasets.config import MS_DATASETS_CACHE
+from modelscope.msdatasets.utils.ms_api import MsApi
 from modelscope.utils.constant import Hubs
 from modelscope.utils.logger import get_logger
@@ -28,9 +28,9 @@ def format_list(para) -> List:
     return para


-class PyDataset:
+class MsDataset:
     _hf_ds = None  # holds the underlying HuggingFace Dataset
-    """A PyDataset backed by hugging face Dataset."""
+    """A MsDataset backed by hugging face Dataset."""

     def __init__(self, hf_ds: Dataset, target: Optional[str] = None):
         self._hf_ds = hf_ds
@@ -49,7 +49,7 @@ class PyDataset:
     @classmethod
     def from_hf_dataset(cls,
                         hf_ds: Dataset,
-                        target: str = None) -> Union[dict, 'PyDataset']:
+                        target: str = None) -> Union[dict, 'MsDataset']:
         if isinstance(hf_ds, Dataset):
             return cls(hf_ds, target)
         if len(hf_ds.keys()) == 1:
@@ -68,8 +68,8 @@ class PyDataset:
             data_files: Optional[Union[str, Sequence[str],
                                        Mapping[str, Union[str,
                                                           Sequence[str]]]]] = None
-    ) -> Union[dict, 'PyDataset']:
-        """Load a PyDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
+    ) -> Union[dict, 'MsDataset']:
+        """Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.

         Args:
             dataset_name (str): Path or name of the dataset.
@@ -82,7 +82,7 @@ class PyDataset:
             hub (Hubs, optional): When loading from a remote hub, where it is from

         Returns:
-            PyDataset (obj:`PyDataset`): PyDataset object for a certain dataset.
+            MsDataset (obj:`MsDataset`): MsDataset object for a certain dataset.
         """
         if hub == Hubs.huggingface:
             dataset = hf_load_dataset(
@@ -92,9 +92,9 @@ class PyDataset:
                 split=split,
                 data_dir=data_dir,
                 data_files=data_files)
-            return PyDataset.from_hf_dataset(dataset, target=target)
+            return MsDataset.from_hf_dataset(dataset, target=target)
         else:
-            return PyDataset._load_ms_dataset(
+            return MsDataset._load_ms_dataset(
                 dataset_name,
                 target=target,
                 subset_name=subset_name,
@@ -114,7 +114,7 @@ class PyDataset:
             data_files: Optional[Union[str, Sequence[str],
                                        Mapping[str, Union[str,
                                                           Sequence[str]]]]] = None
-    ) -> Union[dict, 'PyDataset']:
+    ) -> Union[dict, 'MsDataset']:
         if isinstance(dataset_name, str):
             use_hf = False
             if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \
@@ -153,7 +153,7 @@ class PyDataset:
         else:
             raise TypeError('path must be a str or a list, but got'
                             f' {type(dataset_name)}')
-        return PyDataset.from_hf_dataset(dataset, target=target)
+        return MsDataset.from_hf_dataset(dataset, target=target)

     def to_torch_dataset_with_processors(
             self,
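A rough usage sketch of the load dispatch above, reusing dataset names that appear in the tests later in this review (the dict-of-splits behaviour is per the docstring and from_hf_dataset, not verified here):

```python
from modelscope.msdatasets import MsDataset
from modelscope.utils.constant import Hubs

# Explicit Hugging Face hub load; a single split returns one MsDataset,
# while omitting split may return a dict keyed by split name.
ms_ds_train = MsDataset.load('squad', split='train', hub=Hubs.huggingface)

# ModelScope-side branch of the dispatch, exposing one column as the target
# so the dataset can feed a pipeline directly.
ms_ds_ctx = MsDataset.load(
    'squad', split='train', target='context', hub=Hubs.modelscope)
```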
@@ -4,7 +4,7 @@ from typing import Optional

 import requests

-from modelscope.pydatasets.config import (DOWNLOADED_DATASETS_PATH,
+from modelscope.msdatasets.config import (DOWNLOADED_DATASETS_PATH,
                                           MS_HUB_ENDPOINT)
 from modelscope.utils.logger import get_logger
@@ -6,15 +6,15 @@ from typing import Any, Dict, Generator, List, Union
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models.base import Model
+from modelscope.msdatasets import MsDataset
 from modelscope.preprocessors import Preprocessor
-from modelscope.pydatasets import PyDataset
 from modelscope.utils.config import Config
 from modelscope.utils.logger import get_logger
 from .outputs import TASK_OUTPUTS
 from .util import is_model, is_official_hub_path

 Tensor = Union['torch.Tensor', 'tf.Tensor']
-Input = Union[str, tuple, PyDataset, 'PIL.Image.Image', 'numpy.ndarray']
+Input = Union[str, tuple, MsDataset, 'PIL.Image.Image', 'numpy.ndarray']
 InputModel = Union[str, Model]

 output_keys = [
@@ -85,7 +85,7 @@ class Pipeline(ABC):
             for ele in input:
                 output.append(self._process_single(ele, *args, **post_kwargs))
-        elif isinstance(input, PyDataset):
+        elif isinstance(input, MsDataset):
             return self._process_iterator(input, *args, **post_kwargs)
         else:
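The branch above is what lets a pipeline accept a single sample, a list, or an MsDataset; a compact, illustrative sketch of the three calling forms (model id and URL reuse values from earlier hunks):

```python
from modelscope.msdatasets import MsDataset
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

img_matting = pipeline(Tasks.image_matting, model='damo/image-matting-person')
url = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'

single = img_matting(url)         # single input  -> one result dict
batch = img_matting([url, url])   # list input    -> list of result dicts
stream = img_matting(MsDataset.load([url], target='image'))  # MsDataset -> generator
first = next(stream)
```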
@@ -1 +0,0 @@
-from .py_dataset import PyDataset
@@ -3,10 +3,9 @@ import unittest
 import datasets as hfdata

 from modelscope.models import Model
+from modelscope.msdatasets import MsDataset
 from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.preprocessors.base import Preprocessor
-from modelscope.pydatasets import PyDataset
 from modelscope.utils.constant import Hubs
 from modelscope.utils.test_utils import require_tf, require_torch, test_level
@@ -31,15 +30,15 @@ class ImgPreprocessor(Preprocessor):
         }


-class PyDatasetTest(unittest.TestCase):
+class MsDatasetTest(unittest.TestCase):

     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_ds_basic(self):
-        ms_ds_full = PyDataset.load('squad')
+        ms_ds_full = MsDataset.load('squad')
         ms_ds_full_hf = hfdata.load_dataset('squad')
-        ms_ds_train = PyDataset.load('squad', split='train')
+        ms_ds_train = MsDataset.load('squad', split='train')
         ms_ds_train_hf = hfdata.load_dataset('squad', split='train')
-        ms_image_train = PyDataset.from_hf_dataset(
+        ms_image_train = MsDataset.from_hf_dataset(
             hfdata.load_dataset('beans', split='train'))
         self.assertEqual(ms_ds_full['train'][0], ms_ds_full_hf['train'][0])
         self.assertEqual(ms_ds_full['validation'][0],
@@ -58,7 +57,7 @@ class PyDatasetTest(unittest.TestCase):
             nlp_model.model_dir,
             first_sequence='context',
             second_sequence=None)
-        ms_ds_train = PyDataset.load('squad', split='train')
+        ms_ds_train = MsDataset.load('squad', split='train')
         pt_dataset = ms_ds_train.to_torch_dataset(preprocessors=preprocessor)
         import torch
         dataloader = torch.utils.data.DataLoader(pt_dataset, batch_size=5)
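As a reading aid for the test above, consuming the resulting DataLoader might look like this (illustrative only; it assumes the preprocessor emits tensors that the default collate function can stack):

```python
import torch

# Continuing from pt_dataset above; each batch is a dict of stacked tensors
# keyed by whatever fields the preprocessor produced.
dataloader = torch.utils.data.DataLoader(pt_dataset, batch_size=5)
batch = next(iter(dataloader))
for name, tensor in batch.items():
    print(name, tuple(tensor.shape))
```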
@@ -75,7 +74,7 @@ class PyDatasetTest(unittest.TestCase):
             nlp_model.model_dir,
             first_sequence='context',
             second_sequence=None)
-        ms_ds_train = PyDataset.load('squad', split='train')
+        ms_ds_train = MsDataset.load('squad', split='train')
         tf_dataset = ms_ds_train.to_tf_dataset(
             batch_size=5,
             shuffle=True,
@@ -86,7 +85,7 @@ class PyDatasetTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     @require_torch
     def test_to_torch_dataset_img(self):
-        ms_image_train = PyDataset.from_hf_dataset(
+        ms_image_train = MsDataset.from_hf_dataset(
             hfdata.load_dataset('beans', split='train'))
         pt_dataset = ms_image_train.to_torch_dataset(
             preprocessors=ImgPreprocessor(
@@ -100,7 +99,7 @@ class PyDatasetTest(unittest.TestCase):
     def test_to_tf_dataset_img(self):
         import tensorflow as tf
         tf.compat.v1.enable_eager_execution()
-        ms_image_train = PyDataset.load('beans', split='train')
+        ms_image_train = MsDataset.load('beans', split='train')
         tf_dataset = ms_image_train.to_tf_dataset(
             batch_size=5,
             shuffle=True,
@@ -8,8 +8,8 @@ import unittest
 import cv2

 from modelscope.fileio import File
+from modelscope.msdatasets import MsDataset
 from modelscope.pipelines import pipeline
-from modelscope.pydatasets import PyDataset
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.test_utils import test_level
@@ -7,8 +7,8 @@ import unittest
 import cv2

 from modelscope.fileio import File
+from modelscope.msdatasets import MsDataset
 from modelscope.pipelines import pipeline
-from modelscope.pydatasets import PyDataset
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.test_utils import test_level
@@ -37,7 +37,7 @@ class ImageMattingTest(unittest.TestCase):
         # alternatively:
         # input_location = '/dir/to/images'

-        dataset = PyDataset.load(input_location, target='image')
+        dataset = MsDataset.load(input_location, target='image')
         img_matting = pipeline(Tasks.image_matting, model=self.model_id)
         # note that for dataset output, the inference-output is a Generator that can be iterated.
         result = img_matting(dataset)
@@ -62,7 +62,7 @@ class ImageMattingTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_modelscope_dataset(self):
-        dataset = PyDataset.load('beans', split='train', target='image')
+        dataset = MsDataset.load('beans', split='train', target='image')
         img_matting = pipeline(Tasks.image_matting, model=self.model_id)
         result = img_matting(dataset)
         for i in range(10):
@@ -3,9 +3,9 @@ import shutil
 import unittest

 from modelscope.models import Model
+from modelscope.msdatasets import MsDataset
 from modelscope.pipelines import SequenceClassificationPipeline, pipeline
 from modelscope.preprocessors import SequenceClassificationPreprocessor
-from modelscope.pydatasets import PyDataset
 from modelscope.utils.constant import Hubs, Tasks
 from modelscope.utils.test_utils import test_level
@@ -28,7 +28,7 @@ class SequenceClassificationTest(unittest.TestCase):
         print(data)

-    def printDataset(self, dataset: PyDataset):
+    def printDataset(self, dataset: MsDataset):
         for i, r in enumerate(dataset):
             if i > 10:
                 break
@@ -50,7 +50,7 @@ class SequenceClassificationTest(unittest.TestCase):
         text_classification = pipeline(
             task=Tasks.text_classification, model=self.model_id)
         result = text_classification(
-            PyDataset.load(
+            MsDataset.load(
                 'glue',
                 subset_name='sst2',
                 split='train',
@@ -62,7 +62,7 @@ class SequenceClassificationTest(unittest.TestCase):
     def test_run_with_default_model(self):
         text_classification = pipeline(task=Tasks.text_classification)
         result = text_classification(
-            PyDataset.load(
+            MsDataset.load(
                 'glue',
                 subset_name='sst2',
                 split='train',
@@ -78,7 +78,7 @@ class SequenceClassificationTest(unittest.TestCase):
         text_classification = pipeline(
             Tasks.text_classification, model=model, preprocessor=preprocessor)
         # loaded from huggingface dataset
-        dataset = PyDataset.load(
+        dataset = MsDataset.load(
             'glue',
             subset_name='sst2',
             split='train',
@@ -91,7 +91,7 @@ class SequenceClassificationTest(unittest.TestCase):
     def test_run_with_modelscope_dataset(self):
         text_classification = pipeline(task=Tasks.text_classification)
         # loaded from modelscope dataset
-        dataset = PyDataset.load(
+        dataset = MsDataset.load(
             'squad', split='train', target='context', hub=Hubs.modelscope)
         result = text_classification(dataset)
         self.printDataset(result)
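Taken together, the rename leaves the end-to-end flow these tests exercise unchanged; a minimal sketch mirroring the last test (dataset, column, and hub values are copied from it, and the loop mirrors printDataset):

```python
from modelscope.msdatasets import MsDataset
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Hubs, Tasks

text_classification = pipeline(task=Tasks.text_classification)
dataset = MsDataset.load(
    'squad', split='train', target='context', hub=Hubs.modelscope)

# Dataset input yields a generator of per-sample results; print the first few.
for i, r in enumerate(text_classification(dataset)):
    if i > 10:
        break
    print(r)
```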