@@ -1,7 +1,7 @@
-modelscope.pydatasets package
+modelscope.datasets package
 =============================
-.. automodule:: modelscope.pydatasets
+.. automodule:: modelscope.datasets
    :members:
    :undoc-members:
    :show-inheritance:
@@ -9,10 +9,10 @@ modelscope.pydatasets package
 Submodules
 ----------
-modelscope.pydatasets.py\_dataset module
+modelscope.datasets.ms\_dataset module
 ----------------------------------------
-.. automodule:: modelscope.pydatasets.py_dataset
+.. automodule:: modelscope.datasets.ms_dataset
    :members:
    :undoc-members:
    :show-inheritance:
@@ -16,7 +16,7 @@ Subpackages
    modelscope.models
    modelscope.pipelines
    modelscope.preprocessors
-   modelscope.pydatasets
+   modelscope.datasets
    modelscope.trainers
    modelscope.utils
@@ -3,7 +3,7 @@
 ## Python environment setup
 First, install and configure Anaconda by following the [documentation](https://docs.anaconda.com/anaconda/install/).
-After installation, run the following commands to create a Python environment for the maas library.
+After installation, run the following commands to create a Python environment for the modelscope library.
 ```shell
 conda create -n modelscope python=3.6
 conda activate modelscope
@@ -105,15 +105,15 @@ import cv2
 import os.path as osp
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
-from modelscope.pydatasets import PyDataset
+from modelscope.datasets import MsDataset
-# Build a PyDataset from image URLs; a local folder can also be used via input_location = '/dir/to/images'
+# Build an MsDataset from image URLs; a local folder can also be used via input_location = '/dir/to/images'
 input_location = [
     'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
 ]
-dataset = PyDataset.load(input_location, target='image')
+dataset = MsDataset.load(input_location, target='image')
 img_matting = pipeline(Tasks.image_matting, model='damo/image-matting-person')
-# When the input is a PyDataset, the result is an iterator
+# When the input is an MsDataset, the result is an iterator
 result = img_matting(dataset)
 cv2.imwrite('result.png', next(result)['output_png'])
 print(f'Output written to {osp.abspath("result.png")}')
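Because dataset input makes the pipeline return an iterator, processing several images in one pass is a natural extension of the example above. A minimal sketch, assuming the same `img_matting` and `dataset` objects (the output file names are illustrative):

```python
# Consume the whole result iterator, writing one file per input image.
for i, rec in enumerate(img_matting(dataset)):
    cv2.imwrite(f'result_{i}.png', rec['output_png'])
```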
@@ -0,0 +1 @@
+from .ms_dataset import MsDataset
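The new `modelscope/datasets/__init__.py` re-exports `MsDataset` from the renamed `ms_dataset` module, so callers only need to update the import path. A before/after sketch:

```python
# Old import path (the pydatasets __init__ is deleted later in this diff):
# from modelscope.pydatasets import PyDataset
# New import path:
from modelscope.datasets import MsDataset
```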
@@ -10,8 +10,8 @@ from datasets.packaged_modules import _PACKAGED_DATASETS_MODULES
 from datasets.utils.file_utils import (is_relative_path,
                                        relative_to_absolute_path)
-from modelscope.pydatasets.config import MS_DATASETS_CACHE
-from modelscope.pydatasets.utils.ms_api import MsApi
+from modelscope.datasets.config import MS_DATASETS_CACHE
+from modelscope.datasets.utils.ms_api import MsApi
 from modelscope.utils.constant import Hubs
 from modelscope.utils.logger import get_logger
@@ -28,9 +28,9 @@ def format_list(para) -> List:
     return para
-class PyDataset:
+class MsDataset:
     _hf_ds = None  # holds the underlying HuggingFace Dataset
-    """A PyDataset backed by hugging face Dataset."""
+    """An MsDataset backed by a Hugging Face Dataset."""
     def __init__(self, hf_ds: Dataset, target: Optional[str] = None):
         self._hf_ds = hf_ds
@@ -49,7 +49,7 @@ class PyDataset:
     @classmethod
     def from_hf_dataset(cls,
                         hf_ds: Dataset,
-                        target: str = None) -> Union[dict, 'PyDataset']:
+                        target: str = None) -> Union[dict, 'MsDataset']:
         if isinstance(hf_ds, Dataset):
             return cls(hf_ds, target)
         if len(hf_ds.keys()) == 1:
@@ -68,8 +68,8 @@ class PyDataset:
             data_files: Optional[Union[str, Sequence[str],
                                        Mapping[str, Union[str,
                                                           Sequence[str]]]]] = None
-    ) -> Union[dict, 'PyDataset']:
+    ) -> Union[dict, 'MsDataset']:
-        """Load a PyDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
+        """Load an MsDataset from the ModelScope Hub, Hugging Face Hub, URLs, or a local dataset.
         Args:
             dataset_name (str): Path or name of the dataset.
@@ -82,7 +82,7 @@ class PyDataset:
             hub (Hubs, optional): When loading from a remote hub, where it is from
         Returns:
-            PyDataset (obj:`PyDataset`): PyDataset object for a certain dataset.
+            MsDataset (obj:`MsDataset`): MsDataset object for a certain dataset.
         """
         if hub == Hubs.huggingface:
             dataset = hf_load_dataset(
@@ -92,9 +92,9 @@ class PyDataset:
                 split=split,
                 data_dir=data_dir,
                 data_files=data_files)
-            return PyDataset.from_hf_dataset(dataset, target=target)
+            return MsDataset.from_hf_dataset(dataset, target=target)
         else:
-            return PyDataset._load_ms_dataset(
+            return MsDataset._load_ms_dataset(
                 dataset_name,
                 target=target,
                 subset_name=subset_name,
@@ -114,7 +114,7 @@ class PyDataset:
             data_files: Optional[Union[str, Sequence[str],
                                        Mapping[str, Union[str,
                                                           Sequence[str]]]]] = None
-    ) -> Union[dict, 'PyDataset']:
+    ) -> Union[dict, 'MsDataset']:
         if isinstance(dataset_name, str):
             use_hf = False
             if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \
@@ -153,7 +153,7 @@ class PyDataset:
         else:
             raise TypeError('path must be a str or a list, but got'
                             f' {type(dataset_name)}')
-        return PyDataset.from_hf_dataset(dataset, target=target)
+        return MsDataset.from_hf_dataset(dataset, target=target)
     def to_torch_dataset_with_processors(
             self,
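As the `load` docstring above states, a dataset can come from the ModelScope Hub, the Hugging Face Hub, URLs, or local files, with `hub` selecting the dispatch branch. A small usage sketch of the renamed entry point, assuming the `squad` dataset used in the tests below is reachable from both hubs:

```python
from modelscope.datasets import MsDataset
from modelscope.utils.constant import Hubs

# hub=Hubs.huggingface takes the hf_load_dataset branch above ...
ds_hf = MsDataset.load('squad', split='train', hub=Hubs.huggingface)
# ... while anything else falls through to MsDataset._load_ms_dataset.
ds_ms = MsDataset.load('squad', split='train', hub=Hubs.modelscope)
```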
@@ -4,8 +4,8 @@ from typing import Optional
 import requests
-from modelscope.pydatasets.config import (DOWNLOADED_DATASETS_PATH,
-                                          MS_HUB_ENDPOINT)
+from modelscope.datasets.config import (DOWNLOADED_DATASETS_PATH,
+                                        MS_HUB_ENDPOINT)
 from modelscope.utils.logger import get_logger
 logger = get_logger()
@@ -187,7 +187,7 @@ def get_file_download_url(model_id: str, file_path: str, revision: str):
     """
     Format file download url according to `model_id`, `revision` and `file_path`.
     e.g., Given `model_id=john/bert`, `revision=master`, `file_path=README.md`,
-    the resulted download url is: https://maas.co/api/v1/models/john/bert/repo?Revision=master&FilePath=README.md
+    the resulting download url is: https://modelscope.co/api/v1/models/john/bert/repo?Revision=master&FilePath=README.md
     """
     download_url_template = '{endpoint}/api/v1/models/{model_id}/repo?Revision={revision}&FilePath={file_path}'
     return download_url_template.format(
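For reference, the template reproduces the docstring's example URL directly; a quick sketch (the endpoint literal is an assumption standing in for `MS_HUB_ENDPOINT`):

```python
# Reproduce the documented example URL from the template.
template = ('{endpoint}/api/v1/models/{model_id}/repo'
            '?Revision={revision}&FilePath={file_path}')
url = template.format(
    endpoint='https://modelscope.co',  # assumed stand-in for MS_HUB_ENDPOINT
    model_id='john/bert',
    revision='master',
    file_path='README.md')
print(url)
# https://modelscope.co/api/v1/models/john/bert/repo?Revision=master&FilePath=README.md
```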
@@ -4,17 +4,17 @@ import os.path as osp
 from abc import ABC, abstractmethod
 from typing import Any, Dict, Generator, List, Union
+from modelscope.datasets import MsDataset
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models.base import Model
 from modelscope.preprocessors import Preprocessor
-from modelscope.pydatasets import PyDataset
 from modelscope.utils.config import Config
 from modelscope.utils.logger import get_logger
 from .outputs import TASK_OUTPUTS
 from .util import is_model, is_official_hub_path
 Tensor = Union['torch.Tensor', 'tf.Tensor']
-Input = Union[str, tuple, PyDataset, 'PIL.Image.Image', 'numpy.ndarray']
+Input = Union[str, tuple, MsDataset, 'PIL.Image.Image', 'numpy.ndarray']
 InputModel = Union[str, Model]
 output_keys = [
@@ -85,7 +85,7 @@ class Pipeline(ABC):
             for ele in input:
                 output.append(self._process_single(ele, *args, **post_kwargs))
-        elif isinstance(input, PyDataset):
+        elif isinstance(input, MsDataset):
             return self._process_iterator(input, *args, **post_kwargs)
         else:
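The branch above makes the pipeline's return type depend on its input: a list is processed element by element into a list, while an `MsDataset` goes through `_process_iterator` and comes back as a generator. A hedged sketch of the two call styles, reusing the image-matting setup from the README example earlier in this diff:

```python
from modelscope.datasets import MsDataset
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

img_matting = pipeline(Tasks.image_matting, model='damo/image-matting-person')
urls = ['http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png']

# List input: each element goes through _process_single; a list comes back.
results = img_matting(urls)

# MsDataset input: results arrive lazily from the generator.
for rec in img_matting(MsDataset.load(urls, target='image')):
    print(rec.keys())
```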
@@ -1 +0,0 @@
-from .py_dataset import PyDataset
@@ -2,11 +2,10 @@ import unittest
 import datasets as hfdata
+from modelscope.datasets import MsDataset
 from modelscope.models import Model
 from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.preprocessors.base import Preprocessor
-from modelscope.pydatasets import PyDataset
 from modelscope.utils.constant import Hubs
 from modelscope.utils.test_utils import require_tf, require_torch, test_level
@@ -31,15 +30,15 @@ class ImgPreprocessor(Preprocessor):
     }
-class PyDatasetTest(unittest.TestCase):
+class MsDatasetTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_ds_basic(self):
-        ms_ds_full = PyDataset.load('squad')
+        ms_ds_full = MsDataset.load('squad')
         ms_ds_full_hf = hfdata.load_dataset('squad')
-        ms_ds_train = PyDataset.load('squad', split='train')
+        ms_ds_train = MsDataset.load('squad', split='train')
         ms_ds_train_hf = hfdata.load_dataset('squad', split='train')
-        ms_image_train = PyDataset.from_hf_dataset(
+        ms_image_train = MsDataset.from_hf_dataset(
             hfdata.load_dataset('beans', split='train'))
         self.assertEqual(ms_ds_full['train'][0], ms_ds_full_hf['train'][0])
         self.assertEqual(ms_ds_full['validation'][0],
@@ -58,7 +57,7 @@ class PyDatasetTest(unittest.TestCase):
             nlp_model.model_dir,
             first_sequence='context',
             second_sequence=None)
-        ms_ds_train = PyDataset.load('squad', split='train')
+        ms_ds_train = MsDataset.load('squad', split='train')
         pt_dataset = ms_ds_train.to_torch_dataset(preprocessors=preprocessor)
         import torch
         dataloader = torch.utils.data.DataLoader(pt_dataset, batch_size=5)
@@ -75,7 +74,7 @@ class PyDatasetTest(unittest.TestCase):
             nlp_model.model_dir,
             first_sequence='context',
             second_sequence=None)
-        ms_ds_train = PyDataset.load('squad', split='train')
+        ms_ds_train = MsDataset.load('squad', split='train')
         tf_dataset = ms_ds_train.to_tf_dataset(
             batch_size=5,
             shuffle=True,
@@ -86,7 +85,7 @@ class PyDatasetTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     @require_torch
     def test_to_torch_dataset_img(self):
-        ms_image_train = PyDataset.from_hf_dataset(
+        ms_image_train = MsDataset.from_hf_dataset(
             hfdata.load_dataset('beans', split='train'))
         pt_dataset = ms_image_train.to_torch_dataset(
             preprocessors=ImgPreprocessor(
@@ -100,7 +99,7 @@ class PyDatasetTest(unittest.TestCase):
     def test_to_tf_dataset_img(self):
         import tensorflow as tf
         tf.compat.v1.enable_eager_execution()
-        ms_image_train = PyDataset.load('beans', split='train')
+        ms_image_train = MsDataset.load('beans', split='train')
         tf_dataset = ms_image_train.to_tf_dataset(
             batch_size=5,
             shuffle=True,
@@ -7,9 +7,9 @@ import unittest
 import cv2
+from modelscope.datasets import MsDataset
 from modelscope.fileio import File
 from modelscope.pipelines import pipeline
-from modelscope.pydatasets import PyDataset
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.test_utils import test_level
@@ -6,9 +6,9 @@ import unittest
 import cv2
+from modelscope.datasets import MsDataset
 from modelscope.fileio import File
 from modelscope.pipelines import pipeline
-from modelscope.pydatasets import PyDataset
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.test_utils import test_level
@@ -37,7 +37,7 @@ class ImageMattingTest(unittest.TestCase):
         # alternatively:
         # input_location = '/dir/to/images'
-        dataset = PyDataset.load(input_location, target='image')
+        dataset = MsDataset.load(input_location, target='image')
         img_matting = pipeline(Tasks.image_matting, model=self.model_id)
         # note that for dataset output, the inference-output is a Generator that can be iterated.
         result = img_matting(dataset)
@@ -62,7 +62,7 @@ class ImageMattingTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_modelscope_dataset(self):
-        dataset = PyDataset.load('beans', split='train', target='image')
+        dataset = MsDataset.load('beans', split='train', target='image')
         img_matting = pipeline(Tasks.image_matting, model=self.model_id)
         result = img_matting(dataset)
         for i in range(10):
@@ -2,10 +2,10 @@
 import shutil
 import unittest
+from modelscope.datasets import MsDataset
 from modelscope.models import Model
 from modelscope.pipelines import SequenceClassificationPipeline, pipeline
 from modelscope.preprocessors import SequenceClassificationPreprocessor
-from modelscope.pydatasets import PyDataset
 from modelscope.utils.constant import Hubs, Tasks
 from modelscope.utils.test_utils import test_level
@@ -28,7 +28,7 @@ class SequenceClassificationTest(unittest.TestCase):
         print(data)
-    def printDataset(self, dataset: PyDataset):
+    def printDataset(self, dataset: MsDataset):
         for i, r in enumerate(dataset):
             if i > 10:
                 break
@@ -50,7 +50,7 @@ class SequenceClassificationTest(unittest.TestCase):
         text_classification = pipeline(
             task=Tasks.text_classification, model=self.model_id)
         result = text_classification(
-            PyDataset.load(
+            MsDataset.load(
                 'glue',
                 subset_name='sst2',
                 split='train',
@@ -62,7 +62,7 @@ class SequenceClassificationTest(unittest.TestCase):
     def test_run_with_default_model(self):
         text_classification = pipeline(task=Tasks.text_classification)
         result = text_classification(
-            PyDataset.load(
+            MsDataset.load(
                 'glue',
                 subset_name='sst2',
                 split='train',
@@ -78,7 +78,7 @@ class SequenceClassificationTest(unittest.TestCase):
         text_classification = pipeline(
             Tasks.text_classification, model=model, preprocessor=preprocessor)
         # loaded from huggingface dataset
-        dataset = PyDataset.load(
+        dataset = MsDataset.load(
             'glue',
             subset_name='sst2',
             split='train',
@@ -91,7 +91,7 @@ class SequenceClassificationTest(unittest.TestCase):
     def test_run_with_modelscope_dataset(self):
         text_classification = pipeline(task=Tasks.text_classification)
         # loaded from modelscope dataset
-        dataset = PyDataset.load(
+        dataset = MsDataset.load(
             'squad', split='train', target='context', hub=Hubs.modelscope)
         result = text_classification(dataset)
         self.printDataset(result)