diff --git a/docs/source/api/modelscope.pydatasets.rst b/docs/source/api/modelscope.pydatasets.rst
index 33f2fab5..2508a91f 100644
--- a/docs/source/api/modelscope.pydatasets.rst
+++ b/docs/source/api/modelscope.pydatasets.rst
@@ -1,7 +1,7 @@
-modelscope.datasets package
+modelscope.pydatasets package
 =============================
 
-.. automodule:: modelscope.datasets
+.. automodule:: modelscope.pydatasets
    :members:
    :undoc-members:
    :show-inheritance:
@@ -9,10 +9,10 @@ modelscope.datasets package
 Submodules
 ----------
 
-modelscope.datasets.py\_dataset module
+modelscope.pydatasets.py\_dataset module
 ----------------------------------------
 
-.. automodule:: modelscope.datasets.ms_dataset
+.. automodule:: modelscope.pydatasets.py_dataset
    :members:
    :undoc-members:
    :show-inheritance:
diff --git a/docs/source/api/modelscope.rst b/docs/source/api/modelscope.rst
index f1389717..efab568b 100644
--- a/docs/source/api/modelscope.rst
+++ b/docs/source/api/modelscope.rst
@@ -16,7 +16,7 @@ Subpackages
    modelscope.models
    modelscope.pipelines
    modelscope.preprocessors
-   modelscope.datasets
+   modelscope.pydatasets
    modelscope.trainers
    modelscope.utils
diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md
index 91509fa4..7148f27f 100644
--- a/docs/source/quick_start.md
+++ b/docs/source/quick_start.md
@@ -3,7 +3,7 @@ ## Python environment setup
 
 First, install and configure Anaconda by following the [documentation](https://docs.anaconda.com/anaconda/install/).
-After installation, run the following commands to create a Python environment for the modelscope library.
+After installation, run the following commands to create a Python environment for the maas library.
 ```shell
 conda create -n modelscope python=3.6
 conda activate modelscope
 ```
@@ -105,15 +105,15 @@ import cv2
 import os.path as osp
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
-from modelscope.datasets import MsDataset
+from modelscope.pydatasets import PyDataset
 
-# Build an MsDataset from image URLs; a local folder can also be used via input_location = '/dir/to/images'
+# Build a PyDataset from image URLs; a local folder can also be used via input_location = '/dir/to/images'
 input_location = [
     'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
 ]
-dataset = MsDataset.load(input_location, target='image')
+dataset = PyDataset.load(input_location, target='image')
 img_matting = pipeline(Tasks.image_matting, model='damo/image-matting-person')
-# When the input is an MsDataset, the result is an iterator
+# When the input is a PyDataset, the result is an iterator
 result = img_matting(dataset)
 cv2.imwrite('result.png', next(result)['output_png'])
 print(f'Output written to {osp.abspath("result.png")}')
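The quick-start hunk above only consumes the first element with `next(result)`. A minimal sketch of draining the whole generator, assuming the same environment, model id, and `output_png` key shown in that hunk (the output file names here are illustrative):

```python
# Sketch only: iterate every result from a pipeline fed a PyDataset.
# Model id, task, and the 'output_png' key come from quick_start.md above.
import cv2

from modelscope.pipelines import pipeline
from modelscope.pydatasets import PyDataset
from modelscope.utils.constant import Tasks

input_location = '/dir/to/images'  # or a list of image URLs, as above
dataset = PyDataset.load(input_location, target='image')
img_matting = pipeline(Tasks.image_matting, model='damo/image-matting-person')

# The generator is lazy: each iteration runs inference on the next image.
for i, output in enumerate(img_matting(dataset)):
    cv2.imwrite(f'result_{i}.png', output['output_png'])
```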
diff --git a/modelscope/datasets/__init__.py b/modelscope/datasets/__init__.py
deleted file mode 100644
index 8e0647bb..00000000
--- a/modelscope/datasets/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .ms_dataset import MsDataset
diff --git a/modelscope/hub/file_download.py b/modelscope/hub/file_download.py
index b92bf89c..e5c64f1c 100644
--- a/modelscope/hub/file_download.py
+++ b/modelscope/hub/file_download.py
@@ -187,7 +187,7 @@ def get_file_download_url(model_id: str, file_path: str, revision: str):
     """
     Format file download url according to `model_id`, `revision` and `file_path`.
     e.g., Given `model_id=john/bert`, `revision=master`, `file_path=README.md`,
-    the resulted download url is: https://modelscope.co/api/v1/models/john/bert/repo?Revision=master&FilePath=README.md
+    the resulting download url is: https://maas.co/api/v1/models/john/bert/repo?Revision=master&FilePath=README.md
     """
     download_url_template = '{endpoint}/api/v1/models/{model_id}/repo?Revision={revision}&FilePath={file_path}'
     return download_url_template.format(
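The docstring's example can be checked directly against the template in the same hunk; a quick sketch, where the `endpoint` value is an assumption taken from the docstring example rather than from the hub config:

```python
# Verify the download-URL template from get_file_download_url against
# the docstring example; 'https://maas.co' is assumed as the endpoint.
download_url_template = ('{endpoint}/api/v1/models/{model_id}/repo'
                         '?Revision={revision}&FilePath={file_path}')

url = download_url_template.format(
    endpoint='https://maas.co',
    model_id='john/bert',
    revision='master',
    file_path='README.md')
assert url == ('https://maas.co/api/v1/models/john/bert/repo'
               '?Revision=master&FilePath=README.md')
```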
diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py
index cf4ce8fd..7e32f543 100644
--- a/modelscope/pipelines/base.py
+++ b/modelscope/pipelines/base.py
@@ -4,17 +4,17 @@ import os.path as osp
 from abc import ABC, abstractmethod
 from typing import Any, Dict, Generator, List, Union
 
-from modelscope.datasets import MsDataset
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models.base import Model
 from modelscope.preprocessors import Preprocessor
+from modelscope.pydatasets import PyDataset
 from modelscope.utils.config import Config
 from modelscope.utils.logger import get_logger
 from .outputs import TASK_OUTPUTS
 from .util import is_model, is_official_hub_path
 
 Tensor = Union['torch.Tensor', 'tf.Tensor']
-Input = Union[str, tuple, MsDataset, 'PIL.Image.Image', 'numpy.ndarray']
+Input = Union[str, tuple, PyDataset, 'PIL.Image.Image', 'numpy.ndarray']
 InputModel = Union[str, Model]
 
 output_keys = [
@@ -85,7 +85,7 @@ class Pipeline(ABC):
             for ele in input:
                 output.append(self._process_single(ele, *args, **post_kwargs))
 
-        elif isinstance(input, MsDataset):
+        elif isinstance(input, PyDataset):
             return self._process_iterator(input, *args, **post_kwargs)
 
         else:
diff --git a/modelscope/pydatasets/__init__.py b/modelscope/pydatasets/__init__.py
new file mode 100644
index 00000000..a1ed1d93
--- /dev/null
+++ b/modelscope/pydatasets/__init__.py
@@ -0,0 +1 @@
+from .py_dataset import PyDataset
diff --git a/modelscope/datasets/config.py b/modelscope/pydatasets/config.py
similarity index 100%
rename from modelscope/datasets/config.py
rename to modelscope/pydatasets/config.py
diff --git a/modelscope/datasets/ms_dataset.py b/modelscope/pydatasets/py_dataset.py
similarity index 96%
rename from modelscope/datasets/ms_dataset.py
rename to modelscope/pydatasets/py_dataset.py
index 80ffc77a..49137253 100644
--- a/modelscope/datasets/ms_dataset.py
+++ b/modelscope/pydatasets/py_dataset.py
@@ -10,8 +10,8 @@ from datasets.packaged_modules import _PACKAGED_DATASETS_MODULES
 from datasets.utils.file_utils import (is_relative_path,
                                        relative_to_absolute_path)
 
-from modelscope.datasets.config import MS_DATASETS_CACHE
-from modelscope.datasets.utils.ms_api import MsApi
+from modelscope.pydatasets.config import MS_DATASETS_CACHE
+from modelscope.pydatasets.utils.ms_api import MsApi
 from modelscope.utils.constant import Hubs
 from modelscope.utils.logger import get_logger
 
@@ -28,9 +28,9 @@ def format_list(para) -> List:
     return para
 
 
-class MsDataset:
+class PyDataset:
     _hf_ds = None  # holds the underlying HuggingFace Dataset
-    """A MsDataset backed by hugging face Dataset."""
+    """A PyDataset backed by a Hugging Face Dataset."""
 
     def __init__(self, hf_ds: Dataset, target: Optional[str] = None):
         self._hf_ds = hf_ds
@@ -49,7 +49,7 @@ class MsDataset:
     @classmethod
     def from_hf_dataset(cls,
                         hf_ds: Dataset,
-                        target: str = None) -> Union[dict, 'MsDataset']:
+                        target: str = None) -> Union[dict, 'PyDataset']:
         if isinstance(hf_ds, Dataset):
             return cls(hf_ds, target)
         if len(hf_ds.keys()) == 1:
@@ -68,8 +68,8 @@ class MsDataset:
             data_files: Optional[Union[str, Sequence[str],
                                        Mapping[str, Union[str,
                                                           Sequence[str]]]]] = None
-    ) -> Union[dict, 'MsDataset']:
-        """Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
+    ) -> Union[dict, 'PyDataset']:
+        """Load a PyDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
         Args:
 
             dataset_name (str): Path or name of the dataset.
@@ -82,7 +82,7 @@ class MsDataset:
             hub (Hubs, optional): When loading from a remote hub, where it is from
 
         Returns:
-            MsDataset (obj:`MsDataset`): MsDataset object for a certain dataset.
+            PyDataset (obj:`PyDataset`): PyDataset object for a certain dataset.
         """
         if hub == Hubs.huggingface:
             dataset = hf_load_dataset(
@@ -92,9 +92,9 @@ class MsDataset:
                 split=split,
                 data_dir=data_dir,
                 data_files=data_files)
-            return MsDataset.from_hf_dataset(dataset, target=target)
+            return PyDataset.from_hf_dataset(dataset, target=target)
         else:
-            return MsDataset._load_ms_dataset(
+            return PyDataset._load_ms_dataset(
                 dataset_name,
                 target=target,
                 subset_name=subset_name,
@@ -114,7 +114,7 @@ class MsDataset:
             data_files: Optional[Union[str, Sequence[str],
                                        Mapping[str, Union[str,
                                                           Sequence[str]]]]] = None
-    ) -> Union[dict, 'MsDataset']:
+    ) -> Union[dict, 'PyDataset']:
         if isinstance(dataset_name, str):
             use_hf = False
             if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \
@@ -153,7 +153,7 @@ class MsDataset:
         else:
             raise TypeError('path must be a str or a list, but got'
                             f' {type(dataset_name)}')
-        return MsDataset.from_hf_dataset(dataset, target=target)
+        return PyDataset.from_hf_dataset(dataset, target=target)
 
     def to_torch_dataset_with_processors(
             self,
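Since `load` is the main entry point being renamed, a usage sketch may help reviewers. Every dataset name and argument below is taken from the docs and tests elsewhere in this diff, not newly invented:

```python
# Usage sketch for the renamed PyDataset.load entry point.
from modelscope.pydatasets import PyDataset
from modelscope.utils.constant import Hubs

# From the ModelScope hub, selecting a single column as the target:
squad_train = PyDataset.load(
    'squad', split='train', target='context', hub=Hubs.modelscope)

# From the Hugging Face hub, with a subset (configuration) name:
sst2_train = PyDataset.load(
    'glue', subset_name='sst2', split='train', hub=Hubs.huggingface)

# From a local folder or a list of URLs, as in quick_start.md:
images = PyDataset.load('/dir/to/images', target='image')
```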
diff --git a/modelscope/datasets/utils/__init__.py b/modelscope/pydatasets/utils/__init__.py
similarity index 100%
rename from modelscope/datasets/utils/__init__.py
rename to modelscope/pydatasets/utils/__init__.py
diff --git a/modelscope/datasets/utils/ms_api.py b/modelscope/pydatasets/utils/ms_api.py
similarity index 95%
rename from modelscope/datasets/utils/ms_api.py
rename to modelscope/pydatasets/utils/ms_api.py
index a478766f..04052cc4 100644
--- a/modelscope/datasets/utils/ms_api.py
+++ b/modelscope/pydatasets/utils/ms_api.py
@@ -4,8 +4,8 @@ from typing import Optional
 
 import requests
 
-from modelscope.datasets.config import (DOWNLOADED_DATASETS_PATH,
-                                        MS_HUB_ENDPOINT)
+from modelscope.pydatasets.config import (DOWNLOADED_DATASETS_PATH,
+                                          MS_HUB_ENDPOINT)
 from modelscope.utils.logger import get_logger
 
 logger = get_logger()
diff --git a/tests/pipelines/test_action_recognition.py b/tests/pipelines/test_action_recognition.py
index 7bb3bb90..b524ca18 100644
--- a/tests/pipelines/test_action_recognition.py
+++ b/tests/pipelines/test_action_recognition.py
@@ -7,9 +7,9 @@ import unittest
 
 import cv2
 
-from modelscope.datasets import MsDataset
 from modelscope.fileio import File
 from modelscope.pipelines import pipeline
+from modelscope.pydatasets import PyDataset
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.test_utils import test_level
diff --git a/tests/pipelines/test_image_matting.py b/tests/pipelines/test_image_matting.py
index 13576d44..1b547e14 100644
--- a/tests/pipelines/test_image_matting.py
+++ b/tests/pipelines/test_image_matting.py
@@ -6,9 +6,9 @@ import unittest
 
 import cv2
 
-from modelscope.datasets import MsDataset
 from modelscope.fileio import File
 from modelscope.pipelines import pipeline
+from modelscope.pydatasets import PyDataset
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.test_utils import test_level
@@ -37,7 +37,7 @@ class ImageMattingTest(unittest.TestCase):
 
         # alternatively:
         # input_location = '/dir/to/images'
-        dataset = MsDataset.load(input_location, target='image')
+        dataset = PyDataset.load(input_location, target='image')
         img_matting = pipeline(Tasks.image_matting, model=self.model_id)
         # note that for dataset output, the inference-output is a Generator that can be iterated.
         result = img_matting(dataset)
@@ -62,7 +62,7 @@ class ImageMattingTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_modelscope_dataset(self):
-        dataset = MsDataset.load('beans', split='train', target='image')
+        dataset = PyDataset.load('beans', split='train', target='image')
         img_matting = pipeline(Tasks.image_matting, model=self.model_id)
         result = img_matting(dataset)
         for i in range(10):
diff --git a/tests/pipelines/test_text_classification.py b/tests/pipelines/test_text_classification.py
index bf6de28e..9e5f15b9 100644
--- a/tests/pipelines/test_text_classification.py
+++ b/tests/pipelines/test_text_classification.py
@@ -2,10 +2,10 @@
 import shutil
 import unittest
 
-from modelscope.datasets import MsDataset
 from modelscope.models import Model
 from modelscope.pipelines import SequenceClassificationPipeline, pipeline
 from modelscope.preprocessors import SequenceClassificationPreprocessor
+from modelscope.pydatasets import PyDataset
 from modelscope.utils.constant import Hubs, Tasks
 from modelscope.utils.test_utils import test_level
 
@@ -28,7 +28,7 @@ class SequenceClassificationTest(unittest.TestCase):
 
             print(data)
 
-    def printDataset(self, dataset: MsDataset):
+    def printDataset(self, dataset: PyDataset):
         for i, r in enumerate(dataset):
             if i > 10:
                 break
@@ -50,7 +50,7 @@ class SequenceClassificationTest(unittest.TestCase):
         text_classification = pipeline(
             task=Tasks.text_classification, model=self.model_id)
         result = text_classification(
-            MsDataset.load(
+            PyDataset.load(
                 'glue',
                 subset_name='sst2',
                 split='train',
@@ -62,7 +62,7 @@ class SequenceClassificationTest(unittest.TestCase):
     def test_run_with_default_model(self):
         text_classification = pipeline(task=Tasks.text_classification)
         result = text_classification(
-            MsDataset.load(
+            PyDataset.load(
                 'glue',
                 subset_name='sst2',
                 split='train',
@@ -78,7 +78,7 @@ class SequenceClassificationTest(unittest.TestCase):
         text_classification = pipeline(
             Tasks.text_classification, model=model, preprocessor=preprocessor)
         # loaded from huggingface dataset
-        dataset = MsDataset.load(
+        dataset = PyDataset.load(
             'glue',
             subset_name='sst2',
             split='train',
@@ -91,7 +91,7 @@ class SequenceClassificationTest(unittest.TestCase):
     def test_run_with_modelscope_dataset(self):
         text_classification = pipeline(task=Tasks.text_classification)
         # loaded from modelscope dataset
-        dataset = MsDataset.load(
+        dataset = PyDataset.load(
             'squad', split='train', target='context', hub=Hubs.modelscope)
         result = text_classification(dataset)
         self.printDataset(result)
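The tests above iterate the pipeline result like a dataset; that works because of the dispatch added in `modelscope/pipelines/base.py` earlier in this diff. A simplified paraphrase of that control flow, with an illustrative class name and a stand-in `_process_single` (the real one runs model inference):

```python
# Paraphrase of the Pipeline.__call__ dispatch from the base.py hunk
# above; SketchPipeline is illustrative, not the actual class.
from typing import Any, Generator, List, Union

from modelscope.pydatasets import PyDataset


class SketchPipeline:

    def __call__(self, input: Any, **kwargs) -> Union[List, Generator, Any]:
        if isinstance(input, list):
            # a list of inputs yields a list of outputs
            return [self._process_single(e, **kwargs) for e in input]
        elif isinstance(input, PyDataset):
            # a PyDataset yields a lazy generator, which the tests iterate
            return self._process_iterator(input, **kwargs)
        else:
            return self._process_single(input, **kwargs)

    def _process_iterator(self, dataset: PyDataset, **kwargs) -> Generator:
        for ele in dataset:
            yield self._process_single(ele, **kwargs)

    def _process_single(self, ele: Any, **kwargs) -> Any:
        # stand-in; real pipelines preprocess, forward, and postprocess here
        return {'output': ele}
```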
diff --git a/tests/datasets/__init__.py b/tests/pydatasets/__init__.py
similarity index 100%
rename from tests/datasets/__init__.py
rename to tests/pydatasets/__init__.py
diff --git a/tests/datasets/test_ms_dataset.py b/tests/pydatasets/test_py_dataset.py
similarity index 88%
rename from tests/datasets/test_ms_dataset.py
rename to tests/pydatasets/test_py_dataset.py
index d08258ac..e84f240a 100644
--- a/tests/datasets/test_ms_dataset.py
+++ b/tests/pydatasets/test_py_dataset.py
@@ -2,10 +2,11 @@
 import unittest
 
 import datasets as hfdata
 
-from modelscope.datasets import MsDataset
 from modelscope.models import Model
 from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.preprocessors.base import Preprocessor
+from modelscope.pydatasets import PyDataset
+from modelscope.utils.constant import Hubs
 from modelscope.utils.test_utils import require_tf, require_torch, test_level
 
@@ -30,15 +31,15 @@ class ImgPreprocessor(Preprocessor):
     }
 
 
-class MsDatasetTest(unittest.TestCase):
+class PyDatasetTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_ds_basic(self):
-        ms_ds_full = MsDataset.load('squad')
+        ms_ds_full = PyDataset.load('squad')
         ms_ds_full_hf = hfdata.load_dataset('squad')
-        ms_ds_train = MsDataset.load('squad', split='train')
+        ms_ds_train = PyDataset.load('squad', split='train')
         ms_ds_train_hf = hfdata.load_dataset('squad', split='train')
-        ms_image_train = MsDataset.from_hf_dataset(
+        ms_image_train = PyDataset.from_hf_dataset(
             hfdata.load_dataset('beans', split='train'))
         self.assertEqual(ms_ds_full['train'][0], ms_ds_full_hf['train'][0])
         self.assertEqual(ms_ds_full['validation'][0],
@@ -57,7 +58,7 @@ class MsDatasetTest(unittest.TestCase):
             nlp_model.model_dir,
             first_sequence='context',
             second_sequence=None)
-        ms_ds_train = MsDataset.load('squad', split='train')
+        ms_ds_train = PyDataset.load('squad', split='train')
         pt_dataset = ms_ds_train.to_torch_dataset(preprocessors=preprocessor)
         import torch
         dataloader = torch.utils.data.DataLoader(pt_dataset, batch_size=5)
@@ -74,7 +75,7 @@ class MsDatasetTest(unittest.TestCase):
             nlp_model.model_dir,
             first_sequence='context',
             second_sequence=None)
-        ms_ds_train = MsDataset.load('squad', split='train')
+        ms_ds_train = PyDataset.load('squad', split='train')
         tf_dataset = ms_ds_train.to_tf_dataset(
             batch_size=5,
             shuffle=True,
@@ -85,7 +86,7 @@ class MsDatasetTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     @require_torch
     def test_to_torch_dataset_img(self):
-        ms_image_train = MsDataset.from_hf_dataset(
+        ms_image_train = PyDataset.from_hf_dataset(
             hfdata.load_dataset('beans', split='train'))
         pt_dataset = ms_image_train.to_torch_dataset(
             preprocessors=ImgPreprocessor(
@@ -99,7 +100,7 @@ class MsDatasetTest(unittest.TestCase):
     def test_to_tf_dataset_img(self):
         import tensorflow as tf
         tf.compat.v1.enable_eager_execution()
-        ms_image_train = MsDataset.load('beans', split='train')
+        ms_image_train = PyDataset.load('beans', split='train')
         tf_dataset = ms_image_train.to_tf_dataset(
             batch_size=5,
             shuffle=True,
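The renamed tests also exercise the framework adapters. A condensed sketch of the `to_torch_dataset` flow they cover, with a toy preprocessor in the style of the `ImgPreprocessor` defined in the test file; it assumes the base `Preprocessor` can be subclassed with a bare `__call__`, as that test class does, and the field name `context` comes from the `squad` records used above:

```python
# Condensed sketch of the torch-adapter flow from PyDatasetTest;
# ContextLenPreprocessor is a toy stand-in, not part of the library.
import torch

from modelscope.preprocessors.base import Preprocessor
from modelscope.pydatasets import PyDataset


class ContextLenPreprocessor(Preprocessor):
    """Toy preprocessor: maps each record to the length of its context."""

    def __call__(self, record):
        return {'len': torch.tensor(len(record['context']))}


ms_ds_train = PyDataset.load('squad', split='train')
pt_dataset = ms_ds_train.to_torch_dataset(
    preprocessors=ContextLenPreprocessor())
dataloader = torch.utils.data.DataLoader(pt_dataset, batch_size=5)
print(next(iter(dataloader)))  # one batch of preprocessed tensors
```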