Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9165402 (master)
@@ -1,7 +1,7 @@
-modelscope.pydatasets package
+modelscope.msdatasets package
 =============================
-.. automodule:: modelscope.pydatasets
+.. automodule:: modelscope.msdatasets
    :members:
    :undoc-members:
    :show-inheritance:
@@ -9,10 +9,10 @@ modelscope.pydatasets package
 Submodules
 ----------
-modelscope.pydatasets.py\_dataset module
+modelscope.msdatasets.ms\_dataset module
 ----------------------------------------
-.. automodule:: modelscope.pydatasets.py_dataset
+.. automodule:: modelscope.msdatasets.ms_dataset
    :members:
    :undoc-members:
    :show-inheritance:
@@ -16,7 +16,7 @@ Subpackages
    modelscope.models
    modelscope.pipelines
    modelscope.preprocessors
-   modelscope.pydatasets
+   modelscope.msdatasets
    modelscope.trainers
    modelscope.utils
@@ -3,7 +3,7 @@
 ## Python environment setup
 First, follow the [documentation](https://docs.anaconda.com/anaconda/install/) to install and configure Anaconda.
-After installation, run the following commands to create a Python environment for the maas library.
+After installation, run the following commands to create a Python environment for the modelscope library.
 ```shell
 conda create -n modelscope python=3.6
 conda activate modelscope
@@ -105,15 +105,15 @@ import cv2
 import os.path as osp
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
-from modelscope.pydatasets import PyDataset
+from modelscope.msdatasets import MsDataset
-# Build a PyDataset from image urls; a local folder can also be used here via input_location = '/dir/to/images'
+# Build a MsDataset from image urls; a local folder can also be used here via input_location = '/dir/to/images'
 input_location = [
     'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
 ]
-dataset = PyDataset.load(input_location, target='image')
+dataset = MsDataset.load(input_location, target='image')
 img_matting = pipeline(Tasks.image_matting, model='damo/image-matting-person')
-# When the input is a PyDataset, the output is an iterator
+# When the input is a MsDataset, the output is an iterator
 result = img_matting(dataset)
 cv2.imwrite('result.png', next(result)['output_png'])
 print(f'Output written to {osp.abspath("result.png")}')
@@ -187,7 +187,7 @@ def get_file_download_url(model_id: str, file_path: str, revision: str):
     """
     Format file download url according to `model_id`, `revision` and `file_path`.
    e.g., Given `model_id=john/bert`, `revision=master`, `file_path=README.md`,
-    the resulted download url is: https://maas.co/api/v1/models/john/bert/repo?Revision=master&FilePath=README.md
+    the resulted download url is: https://modelscope.co/api/v1/models/john/bert/repo?Revision=master&FilePath=README.md
     """
     download_url_template = '{endpoint}/api/v1/models/{model_id}/repo?Revision={revision}&FilePath={file_path}'
     return download_url_template.format(
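For context, a small sketch (not part of the change) of how the template in this hunk expands into the docstring's example URL; the endpoint value is an assumption taken from that example, not from this diff.

```python
# Sketch only: expand the download-url template shown above with the
# docstring's example values. The endpoint string is assumed from that example.
endpoint = 'https://modelscope.co'
template = ('{endpoint}/api/v1/models/{model_id}/repo'
            '?Revision={revision}&FilePath={file_path}')
url = template.format(
    endpoint=endpoint,
    model_id='john/bert',
    revision='master',
    file_path='README.md')
print(url)
# -> https://modelscope.co/api/v1/models/john/bert/repo?Revision=master&FilePath=README.md
```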
@@ -0,0 +1 @@
+from .ms_dataset import MsDataset
@@ -10,8 +10,8 @@ from datasets.packaged_modules import _PACKAGED_DATASETS_MODULES
 from datasets.utils.file_utils import (is_relative_path,
                                        relative_to_absolute_path)
-from modelscope.pydatasets.config import MS_DATASETS_CACHE
-from modelscope.pydatasets.utils.ms_api import MsApi
+from modelscope.msdatasets.config import MS_DATASETS_CACHE
+from modelscope.msdatasets.utils.ms_api import MsApi
 from modelscope.utils.constant import Hubs
 from modelscope.utils.logger import get_logger
@@ -28,9 +28,9 @@ def format_list(para) -> List:
     return para
-class PyDataset:
+class MsDataset:
     _hf_ds = None  # holds the underlying HuggingFace Dataset
-    """A PyDataset backed by hugging face Dataset."""
+    """A MsDataset backed by hugging face Dataset."""
     def __init__(self, hf_ds: Dataset, target: Optional[str] = None):
         self._hf_ds = hf_ds
@@ -49,7 +49,7 @@ class PyDataset:
     @classmethod
     def from_hf_dataset(cls,
                         hf_ds: Dataset,
-                        target: str = None) -> Union[dict, 'PyDataset']:
+                        target: str = None) -> Union[dict, 'MsDataset']:
         if isinstance(hf_ds, Dataset):
             return cls(hf_ds, target)
         if len(hf_ds.keys()) == 1:
@@ -68,8 +68,8 @@ class PyDataset:
             data_files: Optional[Union[str, Sequence[str],
                                        Mapping[str, Union[str,
                                                           Sequence[str]]]]] = None
-    ) -> Union[dict, 'PyDataset']:
-        """Load a PyDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
+    ) -> Union[dict, 'MsDataset']:
+        """Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
         Args:
             dataset_name (str): Path or name of the dataset.
@@ -82,7 +82,7 @@ class PyDataset:
             hub (Hubs, optional): When loading from a remote hub, where it is from
         Returns:
-            PyDataset (obj:`PyDataset`): PyDataset object for a certain dataset.
+            MsDataset (obj:`MsDataset`): MsDataset object for a certain dataset.
         """
         if hub == Hubs.huggingface:
             dataset = hf_load_dataset(
@@ -92,9 +92,9 @@ class PyDataset:
                 split=split,
                 data_dir=data_dir,
                 data_files=data_files)
-            return PyDataset.from_hf_dataset(dataset, target=target)
+            return MsDataset.from_hf_dataset(dataset, target=target)
         else:
-            return PyDataset._load_ms_dataset(
+            return MsDataset._load_ms_dataset(
                 dataset_name,
                 target=target,
                 subset_name=subset_name,
@@ -114,7 +114,7 @@ class PyDataset:
             data_files: Optional[Union[str, Sequence[str],
                                        Mapping[str, Union[str,
                                                           Sequence[str]]]]] = None
-    ) -> Union[dict, 'PyDataset']:
+    ) -> Union[dict, 'MsDataset']:
         if isinstance(dataset_name, str):
             use_hf = False
             if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \
@@ -153,7 +153,7 @@ class PyDataset:
         else:
             raise TypeError('path must be a str or a list, but got'
                             f' {type(dataset_name)}')
-        return PyDataset.from_hf_dataset(dataset, target=target)
+        return MsDataset.from_hf_dataset(dataset, target=target)
     def to_torch_dataset_with_processors(
             self,
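For reviewers skimming the rename, here is a minimal usage sketch of the renamed class, assembled from calls that appear elsewhere in this diff (README and tests); the dataset names and arguments are illustrative only, not new API surface.

```python
# Usage sketch of MsDataset after the rename; assembled from calls that appear
# elsewhere in this diff (tests and README), not an authoritative spec.
import datasets as hfdata

from modelscope.msdatasets import MsDataset
from modelscope.utils.constant import Hubs

# Load a named dataset (optionally a single split).
ms_ds_train = MsDataset.load('squad', split='train')

# Load from the ModelScope hub and select a target column.
ds = MsDataset.load('squad', split='train', target='context', hub=Hubs.modelscope)

# Wrap an existing Hugging Face dataset.
ms_image_train = MsDataset.from_hf_dataset(
    hfdata.load_dataset('beans', split='train'))
```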
@@ -4,7 +4,7 @@ from typing import Optional
 import requests
-from modelscope.pydatasets.config import (DOWNLOADED_DATASETS_PATH,
+from modelscope.msdatasets.config import (DOWNLOADED_DATASETS_PATH,
                                           MS_HUB_ENDPOINT)
 from modelscope.utils.logger import get_logger
@@ -6,15 +6,15 @@ from typing import Any, Dict, Generator, List, Union
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models.base import Model
+from modelscope.msdatasets import MsDataset
 from modelscope.preprocessors import Preprocessor
-from modelscope.pydatasets import PyDataset
 from modelscope.utils.config import Config
 from modelscope.utils.logger import get_logger
 from .outputs import TASK_OUTPUTS
 from .util import is_model, is_official_hub_path
 Tensor = Union['torch.Tensor', 'tf.Tensor']
-Input = Union[str, tuple, PyDataset, 'PIL.Image.Image', 'numpy.ndarray']
+Input = Union[str, tuple, MsDataset, 'PIL.Image.Image', 'numpy.ndarray']
 InputModel = Union[str, Model]
 output_keys = [
@@ -85,7 +85,7 @@ class Pipeline(ABC):
             for ele in input:
                 output.append(self._process_single(ele, *args, **post_kwargs))
-        elif isinstance(input, PyDataset):
+        elif isinstance(input, MsDataset):
             return self._process_iterator(input, *args, **post_kwargs)
         else:
@@ -1 +0,0 @@
-from .py_dataset import PyDataset
@@ -3,10 +3,9 @@ import unittest
 import datasets as hfdata
 from modelscope.models import Model
+from modelscope.msdatasets import MsDataset
 from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.preprocessors.base import Preprocessor
-from modelscope.pydatasets import PyDataset
-from modelscope.utils.constant import Hubs
 from modelscope.utils.test_utils import require_tf, require_torch, test_level
@@ -31,15 +30,15 @@ class ImgPreprocessor(Preprocessor):
         }
-class PyDatasetTest(unittest.TestCase):
+class MsDatasetTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_ds_basic(self):
-        ms_ds_full = PyDataset.load('squad')
+        ms_ds_full = MsDataset.load('squad')
         ms_ds_full_hf = hfdata.load_dataset('squad')
-        ms_ds_train = PyDataset.load('squad', split='train')
+        ms_ds_train = MsDataset.load('squad', split='train')
         ms_ds_train_hf = hfdata.load_dataset('squad', split='train')
-        ms_image_train = PyDataset.from_hf_dataset(
+        ms_image_train = MsDataset.from_hf_dataset(
             hfdata.load_dataset('beans', split='train'))
         self.assertEqual(ms_ds_full['train'][0], ms_ds_full_hf['train'][0])
         self.assertEqual(ms_ds_full['validation'][0],
@@ -58,7 +57,7 @@ class PyDatasetTest(unittest.TestCase):
             nlp_model.model_dir,
             first_sequence='context',
             second_sequence=None)
-        ms_ds_train = PyDataset.load('squad', split='train')
+        ms_ds_train = MsDataset.load('squad', split='train')
         pt_dataset = ms_ds_train.to_torch_dataset(preprocessors=preprocessor)
         import torch
         dataloader = torch.utils.data.DataLoader(pt_dataset, batch_size=5)
@@ -75,7 +74,7 @@ class PyDatasetTest(unittest.TestCase):
             nlp_model.model_dir,
             first_sequence='context',
             second_sequence=None)
-        ms_ds_train = PyDataset.load('squad', split='train')
+        ms_ds_train = MsDataset.load('squad', split='train')
         tf_dataset = ms_ds_train.to_tf_dataset(
             batch_size=5,
             shuffle=True,
@@ -86,7 +85,7 @@ class PyDatasetTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     @require_torch
     def test_to_torch_dataset_img(self):
-        ms_image_train = PyDataset.from_hf_dataset(
+        ms_image_train = MsDataset.from_hf_dataset(
             hfdata.load_dataset('beans', split='train'))
         pt_dataset = ms_image_train.to_torch_dataset(
             preprocessors=ImgPreprocessor(
@@ -100,7 +99,7 @@ class PyDatasetTest(unittest.TestCase):
     def test_to_tf_dataset_img(self):
         import tensorflow as tf
         tf.compat.v1.enable_eager_execution()
-        ms_image_train = PyDataset.load('beans', split='train')
+        ms_image_train = MsDataset.load('beans', split='train')
         tf_dataset = ms_image_train.to_tf_dataset(
             batch_size=5,
             shuffle=True,
@@ -8,8 +8,8 @@ import unittest
 import cv2
 from modelscope.fileio import File
+from modelscope.msdatasets import MsDataset
 from modelscope.pipelines import pipeline
-from modelscope.pydatasets import PyDataset
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.test_utils import test_level
@@ -7,8 +7,8 @@ import unittest
 import cv2
 from modelscope.fileio import File
+from modelscope.msdatasets import MsDataset
 from modelscope.pipelines import pipeline
-from modelscope.pydatasets import PyDataset
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.test_utils import test_level
@@ -37,7 +37,7 @@ class ImageMattingTest(unittest.TestCase):
         # alternatively:
         # input_location = '/dir/to/images'
-        dataset = PyDataset.load(input_location, target='image')
+        dataset = MsDataset.load(input_location, target='image')
         img_matting = pipeline(Tasks.image_matting, model=self.model_id)
         # note that for dataset output, the inference-output is a Generator that can be iterated.
         result = img_matting(dataset)
@@ -62,7 +62,7 @@ class ImageMattingTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_modelscope_dataset(self):
-        dataset = PyDataset.load('beans', split='train', target='image')
+        dataset = MsDataset.load('beans', split='train', target='image')
         img_matting = pipeline(Tasks.image_matting, model=self.model_id)
         result = img_matting(dataset)
         for i in range(10):
@@ -3,9 +3,9 @@ import shutil
 import unittest
 from modelscope.models import Model
+from modelscope.msdatasets import MsDataset
 from modelscope.pipelines import SequenceClassificationPipeline, pipeline
 from modelscope.preprocessors import SequenceClassificationPreprocessor
-from modelscope.pydatasets import PyDataset
 from modelscope.utils.constant import Hubs, Tasks
 from modelscope.utils.test_utils import test_level
@@ -28,7 +28,7 @@ class SequenceClassificationTest(unittest.TestCase):
         print(data)
-    def printDataset(self, dataset: PyDataset):
+    def printDataset(self, dataset: MsDataset):
         for i, r in enumerate(dataset):
             if i > 10:
                 break
@@ -50,7 +50,7 @@ class SequenceClassificationTest(unittest.TestCase):
         text_classification = pipeline(
             task=Tasks.text_classification, model=self.model_id)
         result = text_classification(
-            PyDataset.load(
+            MsDataset.load(
                 'glue',
                 subset_name='sst2',
                 split='train',
@@ -62,7 +62,7 @@ class SequenceClassificationTest(unittest.TestCase):
     def test_run_with_default_model(self):
         text_classification = pipeline(task=Tasks.text_classification)
         result = text_classification(
-            PyDataset.load(
+            MsDataset.load(
                 'glue',
                 subset_name='sst2',
                 split='train',
@@ -78,7 +78,7 @@ class SequenceClassificationTest(unittest.TestCase):
         text_classification = pipeline(
             Tasks.text_classification, model=model, preprocessor=preprocessor)
         # loaded from huggingface dataset
-        dataset = PyDataset.load(
+        dataset = MsDataset.load(
             'glue',
             subset_name='sst2',
             split='train',
@@ -91,7 +91,7 @@ class SequenceClassificationTest(unittest.TestCase):
     def test_run_with_modelscope_dataset(self):
         text_classification = pipeline(task=Tasks.text_classification)
         # loaded from modelscope dataset
-        dataset = PyDataset.load(
+        dataset = MsDataset.load(
             'squad', split='train', target='context', hub=Hubs.modelscope)
         result = text_classification(dataset)
         self.printDataset(result)