yingda.chen · 3 years ago · commit 6702b29e21
16 changed files with 47 additions and 48 deletions
  1. +4 -4   docs/source/api/modelscope.pydatasets.rst
  2. +1 -1   docs/source/api/modelscope.rst
  3. +5 -5   docs/source/quick_start.md
  4. +1 -1   modelscope/hub/file_download.py
  5. +1 -0   modelscope/msdatasets/__init__.py
  6. +0 -0   modelscope/msdatasets/config.py
  7. +12 -12 modelscope/msdatasets/ms_dataset.py
  8. +0 -0   modelscope/msdatasets/utils/__init__.py
  9. +1 -1   modelscope/msdatasets/utils/ms_api.py
  10. +3 -3  modelscope/pipelines/base.py
  11. +0 -1  modelscope/pydatasets/__init__.py
  12. +0 -0  tests/msdatasets/__init__.py
  13. +9 -10 tests/msdatasets/test_ms_dataset.py
  14. +1 -1  tests/pipelines/test_action_recognition.py
  15. +3 -3  tests/pipelines/test_image_matting.py
  16. +6 -6  tests/pipelines/test_text_classification.py
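
Functionally this commit is a pure rename: the modelscope.pydatasets package becomes modelscope.msdatasets and the PyDataset class becomes MsDataset, with call signatures unchanged. For downstream code the migration is a one-line import change; a minimal sketch (the 'squad' dataset name is just the example used in the tests below):

```python
# Before this commit:
#   from modelscope.pydatasets import PyDataset
#   ds = PyDataset.load('squad', split='train')

# After this commit (same API, new package and class names):
from modelscope.msdatasets import MsDataset

ds = MsDataset.load('squad', split='train')
```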

+ 4 - 4   docs/source/api/modelscope.pydatasets.rst

@@ -1,7 +1,7 @@
-modelscope.pydatasets package
+modelscope.msdatasets package
 =============================

-.. automodule:: modelscope.pydatasets
+.. automodule:: modelscope.msdatasets
    :members:
    :undoc-members:
    :show-inheritance:
@@ -9,10 +9,10 @@ modelscope.pydatasets package
 Submodules
 ----------

-modelscope.pydatasets.py\_dataset module
+modelscope.msdatasets.ms\_dataset module
 ----------------------------------------

-.. automodule:: modelscope.pydatasets.py_dataset
+.. automodule:: modelscope.msdatasets.ms_dataset
    :members:
    :undoc-members:
    :show-inheritance:

+ 1 - 1   docs/source/api/modelscope.rst

@@ -16,7 +16,7 @@ Subpackages
    modelscope.models
    modelscope.pipelines
    modelscope.preprocessors
-   modelscope.pydatasets
+   modelscope.msdatasets
    modelscope.trainers
    modelscope.utils

+ 5 - 5   docs/source/quick_start.md

@@ -3,7 +3,7 @@
 ## Python environment setup
 First, follow the [documentation](https://docs.anaconda.com/anaconda/install/) to install and configure an Anaconda environment.

-After installation, run the following commands to create a Python environment for the maas library.
+After installation, run the following commands to create a Python environment for the modelscope library.
 ```shell
 conda create -n modelscope python=3.6
 conda activate modelscope
@@ -105,15 +105,15 @@ import cv2
 import os.path as osp
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
-from modelscope.pydatasets import PyDataset
+from modelscope.msdatasets import MsDataset

-# Build a PyDataset from image URLs; a local folder can also be used via input_location = '/dir/to/images'
+# Build an MsDataset from image URLs; a local folder can also be used via input_location = '/dir/to/images'
 input_location = [
     'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
 ]
-dataset = PyDataset.load(input_location, target='image')
+dataset = MsDataset.load(input_location, target='image')
 img_matting = pipeline(Tasks.image_matting, model='damo/image-matting-person')
-# When the input is a PyDataset, the output is an iterator
+# When the input is an MsDataset, the output is an iterator
 result = img_matting(dataset)
 cv2.imwrite('result.png', next(result)['output_png'])
 print(f'Output written to {osp.abspath("result.png")}')


+ 1 - 1   modelscope/hub/file_download.py

@@ -187,7 +187,7 @@ def get_file_download_url(model_id: str, file_path: str, revision: str):
     """
     Format file download url according to `model_id`, `revision` and `file_path`.
     e.g., Given `model_id=john/bert`, `revision=master`, `file_path=README.md`,
-    the resulted download url is: https://maas.co/api/v1/models/john/bert/repo?Revision=master&FilePath=README.md
+    the resulted download url is: https://modelscope.co/api/v1/models/john/bert/repo?Revision=master&FilePath=README.md
     """
     download_url_template = '{endpoint}/api/v1/models/{model_id}/repo?Revision={revision}&FilePath={file_path}'
    return download_url_template.format(
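
Only the example hostname in the docstring changes here; the URL is still built from whatever endpoint the caller configures. A quick sketch of how the template above expands, using only the example values from the docstring (the endpoint is illustrative, not a hard-coded default):

```python
download_url_template = '{endpoint}/api/v1/models/{model_id}/repo?Revision={revision}&FilePath={file_path}'

# Example values taken from the docstring above.
url = download_url_template.format(
    endpoint='https://modelscope.co',
    model_id='john/bert',
    revision='master',
    file_path='README.md')
print(url)
# https://modelscope.co/api/v1/models/john/bert/repo?Revision=master&FilePath=README.md
```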


+ 1 - 0   modelscope/msdatasets/__init__.py

@@ -0,0 +1 @@
+from .ms_dataset import MsDataset

modelscope/pydatasets/config.py → modelscope/msdatasets/config.py


modelscope/pydatasets/py_dataset.py → modelscope/msdatasets/ms_dataset.py

@@ -10,8 +10,8 @@ from datasets.packaged_modules import _PACKAGED_DATASETS_MODULES
 from datasets.utils.file_utils import (is_relative_path,
                                        relative_to_absolute_path)

-from modelscope.pydatasets.config import MS_DATASETS_CACHE
-from modelscope.pydatasets.utils.ms_api import MsApi
+from modelscope.msdatasets.config import MS_DATASETS_CACHE
+from modelscope.msdatasets.utils.ms_api import MsApi
 from modelscope.utils.constant import Hubs
 from modelscope.utils.logger import get_logger
@@ -28,9 +28,9 @@ def format_list(para) -> List:
     return para


-class PyDataset:
+class MsDataset:
     _hf_ds = None  # holds the underlying HuggingFace Dataset
-    """A PyDataset backed by hugging face Dataset."""
+    """A MsDataset backed by hugging face Dataset."""

     def __init__(self, hf_ds: Dataset, target: Optional[str] = None):
         self._hf_ds = hf_ds
@@ -49,7 +49,7 @@ class PyDataset:
     @classmethod
     def from_hf_dataset(cls,
                         hf_ds: Dataset,
-                        target: str = None) -> Union[dict, 'PyDataset']:
+                        target: str = None) -> Union[dict, 'MsDataset']:
         if isinstance(hf_ds, Dataset):
             return cls(hf_ds, target)
         if len(hf_ds.keys()) == 1:
@@ -68,8 +68,8 @@ class PyDataset:
              data_files: Optional[Union[str, Sequence[str],
                                         Mapping[str, Union[str,
                                                            Sequence[str]]]]] = None
-             ) -> Union[dict, 'PyDataset']:
-        """Load a PyDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
+             ) -> Union[dict, 'MsDataset']:
+        """Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
         Args:

             dataset_name (str): Path or name of the dataset.
@@ -82,7 +82,7 @@ class PyDataset:
             hub (Hubs, optional): When loading from a remote hub, where it is from

         Returns:
-            PyDataset (obj:`PyDataset`): PyDataset object for a certain dataset.
+            MsDataset (obj:`MsDataset`): MsDataset object for a certain dataset.
         """
         if hub == Hubs.huggingface:
             dataset = hf_load_dataset(
@@ -92,9 +92,9 @@ class PyDataset:
                 split=split,
                 data_dir=data_dir,
                 data_files=data_files)
-            return PyDataset.from_hf_dataset(dataset, target=target)
+            return MsDataset.from_hf_dataset(dataset, target=target)
         else:
-            return PyDataset._load_ms_dataset(
+            return MsDataset._load_ms_dataset(
                 dataset_name,
                 target=target,
                 subset_name=subset_name,
@@ -114,7 +114,7 @@ class PyDataset:
                          data_files: Optional[Union[str, Sequence[str],
                                                     Mapping[str, Union[str,
                                                                        Sequence[str]]]]] = None
-                         ) -> Union[dict, 'PyDataset']:
+                         ) -> Union[dict, 'MsDataset']:
         if isinstance(dataset_name, str):
             use_hf = False
             if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \
@@ -153,7 +153,7 @@ class PyDataset:
         else:
             raise TypeError('path must be a str or a list, but got'
                             f' {type(dataset_name)}')
-        return PyDataset.from_hf_dataset(dataset, target=target)
+        return MsDataset.from_hf_dataset(dataset, target=target)

     def to_torch_dataset_with_processors(
             self,
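
Beyond the rename, the load dispatch shown above is unchanged: hub=Hubs.huggingface routes through datasets.load_dataset and from_hf_dataset, everything else goes through _load_ms_dataset. A short usage sketch, assembled only from calls that appear in the tests touched by this commit (dataset names are illustrative):

```python
from modelscope.msdatasets import MsDataset
from modelscope.utils.constant import Hubs

# Wraps the underlying Hugging Face dataset; with no split it can return a
# dict keyed by split (see the Union[dict, 'MsDataset'] return type above).
ms_ds_train = MsDataset.load('squad', split='train')

# `target` selects the column that iteration yields, e.g. for feeding a pipeline.
ms_image_train = MsDataset.load('beans', split='train', target='image')

# `hub` picks the source hub explicitly (here the ModelScope hub).
ms_ds_ctx = MsDataset.load('squad', split='train', target='context', hub=Hubs.modelscope)
```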

modelscope/pydatasets/utils/__init__.py → modelscope/msdatasets/utils/__init__.py


modelscope/pydatasets/utils/ms_api.py → modelscope/msdatasets/utils/ms_api.py

@@ -4,7 +4,7 @@ from typing import Optional

 import requests

-from modelscope.pydatasets.config import (DOWNLOADED_DATASETS_PATH,
+from modelscope.msdatasets.config import (DOWNLOADED_DATASETS_PATH,
                                           MS_HUB_ENDPOINT)
 from modelscope.utils.logger import get_logger



+ 3 - 3   modelscope/pipelines/base.py

@@ -6,15 +6,15 @@ from typing import Any, Dict, Generator, List, Union

 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models.base import Model
+from modelscope.msdatasets import MsDataset
 from modelscope.preprocessors import Preprocessor
-from modelscope.pydatasets import PyDataset
 from modelscope.utils.config import Config
 from modelscope.utils.logger import get_logger
 from .outputs import TASK_OUTPUTS
 from .util import is_model, is_official_hub_path

 Tensor = Union['torch.Tensor', 'tf.Tensor']
-Input = Union[str, tuple, PyDataset, 'PIL.Image.Image', 'numpy.ndarray']
+Input = Union[str, tuple, MsDataset, 'PIL.Image.Image', 'numpy.ndarray']
 InputModel = Union[str, Model]

 output_keys = [
@@ -85,7 +85,7 @@ class Pipeline(ABC):
             for ele in input:
                 output.append(self._process_single(ele, *args, **post_kwargs))

-        elif isinstance(input, PyDataset):
+        elif isinstance(input, MsDataset):
             return self._process_iterator(input, *args, **post_kwargs)

         else:
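
The second hunk is what lets the quick-start example above iterate over its result: when a pipeline receives an MsDataset, __call__ returns a generator from _process_iterator instead of a single output. A minimal sketch of that behavior, reusing the image-matting example from quick_start.md (model id and URL are the ones shown there):

```python
from modelscope.msdatasets import MsDataset
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

input_location = [
    'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
]
dataset = MsDataset.load(input_location, target='image')
img_matting = pipeline(Tasks.image_matting, model='damo/image-matting-person')

# MsDataset input hits the isinstance(input, MsDataset) branch above,
# so the call yields one result dict per dataset element.
for output in img_matting(dataset):
    print(output.keys())
```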


+ 0 - 1   modelscope/pydatasets/__init__.py

@@ -1 +0,0 @@
-from .py_dataset import PyDataset

tests/pydatasets/__init__.py → tests/msdatasets/__init__.py


tests/pydatasets/test_py_dataset.py → tests/msdatasets/test_ms_dataset.py

@@ -3,10 +3,9 @@ import unittest
 import datasets as hfdata

 from modelscope.models import Model
+from modelscope.msdatasets import MsDataset
 from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.preprocessors.base import Preprocessor
-from modelscope.pydatasets import PyDataset
-from modelscope.utils.constant import Hubs
 from modelscope.utils.test_utils import require_tf, require_torch, test_level

@@ -31,15 +30,15 @@ class ImgPreprocessor(Preprocessor):
         }


-class PyDatasetTest(unittest.TestCase):
+class MsDatasetTest(unittest.TestCase):

     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_ds_basic(self):
-        ms_ds_full = PyDataset.load('squad')
+        ms_ds_full = MsDataset.load('squad')
         ms_ds_full_hf = hfdata.load_dataset('squad')
-        ms_ds_train = PyDataset.load('squad', split='train')
+        ms_ds_train = MsDataset.load('squad', split='train')
         ms_ds_train_hf = hfdata.load_dataset('squad', split='train')
-        ms_image_train = PyDataset.from_hf_dataset(
+        ms_image_train = MsDataset.from_hf_dataset(
             hfdata.load_dataset('beans', split='train'))
         self.assertEqual(ms_ds_full['train'][0], ms_ds_full_hf['train'][0])
         self.assertEqual(ms_ds_full['validation'][0],
@@ -58,7 +57,7 @@ class PyDatasetTest(unittest.TestCase):
             nlp_model.model_dir,
             first_sequence='context',
             second_sequence=None)
-        ms_ds_train = PyDataset.load('squad', split='train')
+        ms_ds_train = MsDataset.load('squad', split='train')
         pt_dataset = ms_ds_train.to_torch_dataset(preprocessors=preprocessor)
         import torch
         dataloader = torch.utils.data.DataLoader(pt_dataset, batch_size=5)
@@ -75,7 +74,7 @@ class PyDatasetTest(unittest.TestCase):
             nlp_model.model_dir,
             first_sequence='context',
             second_sequence=None)
-        ms_ds_train = PyDataset.load('squad', split='train')
+        ms_ds_train = MsDataset.load('squad', split='train')
         tf_dataset = ms_ds_train.to_tf_dataset(
             batch_size=5,
             shuffle=True,
@@ -86,7 +85,7 @@ class PyDatasetTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     @require_torch
     def test_to_torch_dataset_img(self):
-        ms_image_train = PyDataset.from_hf_dataset(
+        ms_image_train = MsDataset.from_hf_dataset(
             hfdata.load_dataset('beans', split='train'))
         pt_dataset = ms_image_train.to_torch_dataset(
             preprocessors=ImgPreprocessor(
@@ -100,7 +99,7 @@ class PyDatasetTest(unittest.TestCase):
     def test_to_tf_dataset_img(self):
         import tensorflow as tf
         tf.compat.v1.enable_eager_execution()
-        ms_image_train = PyDataset.load('beans', split='train')
+        ms_image_train = MsDataset.load('beans', split='train')
         tf_dataset = ms_image_train.to_tf_dataset(
             batch_size=5,
             shuffle=True,

+ 1 - 1   tests/pipelines/test_action_recognition.py

@@ -8,8 +8,8 @@ import unittest
 import cv2

 from modelscope.fileio import File
+from modelscope.msdatasets import MsDataset
 from modelscope.pipelines import pipeline
-from modelscope.pydatasets import PyDataset
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.test_utils import test_level




+ 3 - 3   tests/pipelines/test_image_matting.py

@@ -7,8 +7,8 @@ import unittest
 import cv2

 from modelscope.fileio import File
+from modelscope.msdatasets import MsDataset
 from modelscope.pipelines import pipeline
-from modelscope.pydatasets import PyDataset
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.test_utils import test_level

@@ -37,7 +37,7 @@ class ImageMattingTest(unittest.TestCase):
         # alternatively:
         # input_location = '/dir/to/images'

-        dataset = PyDataset.load(input_location, target='image')
+        dataset = MsDataset.load(input_location, target='image')
         img_matting = pipeline(Tasks.image_matting, model=self.model_id)
         # note that for dataset output, the inference-output is a Generator that can be iterated.
         result = img_matting(dataset)
@@ -62,7 +62,7 @@ class ImageMattingTest(unittest.TestCase):

     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_modelscope_dataset(self):
-        dataset = PyDataset.load('beans', split='train', target='image')
+        dataset = MsDataset.load('beans', split='train', target='image')
         img_matting = pipeline(Tasks.image_matting, model=self.model_id)
         result = img_matting(dataset)
         for i in range(10):


+ 6 - 6   tests/pipelines/test_text_classification.py

@@ -3,9 +3,9 @@ import shutil
 import unittest

 from modelscope.models import Model
+from modelscope.msdatasets import MsDataset
 from modelscope.pipelines import SequenceClassificationPipeline, pipeline
 from modelscope.preprocessors import SequenceClassificationPreprocessor
-from modelscope.pydatasets import PyDataset
 from modelscope.utils.constant import Hubs, Tasks
 from modelscope.utils.test_utils import test_level

@@ -28,7 +28,7 @@ class SequenceClassificationTest(unittest.TestCase):

         print(data)

-    def printDataset(self, dataset: PyDataset):
+    def printDataset(self, dataset: MsDataset):
         for i, r in enumerate(dataset):
             if i > 10:
                 break
@@ -50,7 +50,7 @@ class SequenceClassificationTest(unittest.TestCase):
         text_classification = pipeline(
             task=Tasks.text_classification, model=self.model_id)
         result = text_classification(
-            PyDataset.load(
+            MsDataset.load(
                 'glue',
                 subset_name='sst2',
                 split='train',
@@ -62,7 +62,7 @@ class SequenceClassificationTest(unittest.TestCase):
     def test_run_with_default_model(self):
         text_classification = pipeline(task=Tasks.text_classification)
         result = text_classification(
-            PyDataset.load(
+            MsDataset.load(
                 'glue',
                 subset_name='sst2',
                 split='train',
@@ -78,7 +78,7 @@ class SequenceClassificationTest(unittest.TestCase):
         text_classification = pipeline(
             Tasks.text_classification, model=model, preprocessor=preprocessor)
         # loaded from huggingface dataset
-        dataset = PyDataset.load(
+        dataset = MsDataset.load(
             'glue',
             subset_name='sst2',
             split='train',
@@ -91,7 +91,7 @@ class SequenceClassificationTest(unittest.TestCase):
     def test_run_with_modelscope_dataset(self):
         text_classification = pipeline(task=Tasks.text_classification)
         # loaded from modelscope dataset
-        dataset = PyDataset.load(
+        dataset = MsDataset.load(
             'squad', split='train', target='context', hub=Hubs.modelscope)
         result = text_classification(dataset)
         self.printDataset(result)

