Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9005038 (target branch: master)
@@ -1,9 +1,9 @@
 import logging
 from typing import (Any, Callable, Dict, List, Mapping, Optional, Sequence,
                     Union)

 from datasets import Dataset, load_dataset

+from modelscope.utils.constant import Hubs
 from modelscope.utils.logger import get_logger

 logger = get_logger()
@@ -41,17 +41,17 @@ class PyDataset:
         return dataset

     @staticmethod
-    def load(
-        path: Union[str, list],
-        target: Optional[str] = None,
-        version: Optional[str] = None,
-        name: Optional[str] = None,
-        split: Optional[str] = None,
-        data_dir: Optional[str] = None,
-        data_files: Optional[Union[str, Sequence[str],
-                                   Mapping[str, Union[str,
-                                                      Sequence[str]]]]] = None
-    ) -> 'PyDataset':
+    def load(path: Union[str, list],
+             target: Optional[str] = None,
+             version: Optional[str] = None,
+             name: Optional[str] = None,
+             split: Optional[str] = None,
+             data_dir: Optional[str] = None,
+             data_files: Optional[Union[str, Sequence[str],
+                                        Mapping[str,
+                                                Union[str,
+                                                      Sequence[str]]]]] = None,
+             hub: Optional[Hubs] = None) -> 'PyDataset':
         """Load a PyDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.

         Args:
@@ -62,10 +62,15 @@ class PyDataset:
             data_dir (str, optional): Defining the data_dir of the dataset configuration.
             data_files (str or Sequence or Mapping, optional): Path(s) to source data file(s).
             split (str, optional): Which split of the data to load.
+            hub (Hubs, optional): Which remote hub the dataset is loaded from.

         Returns:
             PyDataset (obj:`PyDataset`): PyDataset object for a certain dataset.
         """
+        if Hubs.modelscope == hub:
+            # TODO: parse data meta information from modelscope hub
+            # and possibly download data files to local (and update path)
+            print('getting data from modelscope hub')
         if isinstance(path, str):
             dataset = load_dataset(
                 path,
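
Usage note (not part of the diff): a minimal sketch of calling load() with the new hub argument, mirroring the test updates below.

    from modelscope.pydatasets import PyDataset
    from modelscope.utils.constant import Hubs

    # Explicitly route the load through the Hugging Face hub backend.
    # As of this review, passing hub=Hubs.modelscope only prints a
    # placeholder message; the actual load still goes through
    # datasets.load_dataset (see the TODO above).
    dataset = PyDataset.load('glue', name='sst2', target='sentence',
                             hub=Hubs.huggingface)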
@@ -57,13 +57,20 @@ class Tasks(object):
 class InputFields(object):
-    """ Names for input data fileds in the input data for pipelines
+    """ Names for input data fields in the input data for pipelines
     """
     img = 'img'
     text = 'text'
     audio = 'audio'
+
+
+class Hubs(object):
+    """ Source from which an entity (such as a Dataset or Model) is stored
+    """
+    modelscope = 'modelscope'
+    huggingface = 'huggingface'

 # configuration filename
 # in order to avoid conflict with huggingface
 # config file we use maas_config instead
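
Design note (an observation, not part of the diff): the Hubs values are bare strings rather than enum.Enum members, so the check in PyDataset.load is plain string equality, and a raw 'modelscope' literal would pass it too; a real Enum would reject mistyped hub names at attribute lookup instead.

    from modelscope.utils.constant import Hubs

    hub = Hubs.modelscope
    # Both sides are the string 'modelscope', so this holds:
    assert Hubs.modelscope == hub == 'modelscope'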
@@ -10,7 +10,7 @@ from modelscope.models.nlp import BertForSequenceClassification
 from modelscope.pipelines import SequenceClassificationPipeline, pipeline
 from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.pydatasets import PyDataset
-from modelscope.utils.constant import Tasks
+from modelscope.utils.constant import Hubs, Tasks
 from modelscope.utils.hub import get_model_cache_dir
@@ -81,13 +81,15 @@ class SequenceClassificationTest(unittest.TestCase):
         text_classification = pipeline(
             task=Tasks.text_classification, model=self.model_id)
         result = text_classification(
-            PyDataset.load('glue', name='sst2', target='sentence'))
+            PyDataset.load(
+                'glue', name='sst2', target='sentence', hub=Hubs.huggingface))
         self.printDataset(result)

     def test_run_with_default_model(self):
         text_classification = pipeline(task=Tasks.text_classification)
         result = text_classification(
-            PyDataset.load('glue', name='sst2', target='sentence'))
+            PyDataset.load(
+                'glue', name='sst2', target='sentence', hub=Hubs.huggingface))
         self.printDataset(result)

     def test_run_with_dataset(self):
@@ -97,9 +99,9 @@ class SequenceClassificationTest(unittest.TestCase):
         text_classification = pipeline(
             Tasks.text_classification, model=model, preprocessor=preprocessor)
-        # loaded from huggingface dataset
-        # TODO: add load_from parameter (an enum) LOAD_FROM.hugging_face
-        dataset = PyDataset.load('glue', name='sst2', target='sentence')
+        # TODO: rename parameter as dataset_name and subset_name
+        dataset = PyDataset.load(
+            'glue', name='sst2', target='sentence', hub=Hubs.huggingface)
         result = text_classification(dataset)
         self.printDataset(result)
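
Taken together, the updated tests exercise this end-to-end flow (a condensed sketch using only names that appear in the test file above):

    from modelscope.pipelines import pipeline
    from modelscope.pydatasets import PyDataset
    from modelscope.utils.constant import Hubs, Tasks

    # Build a default text-classification pipeline, then feed it a
    # PyDataset loaded explicitly from the Hugging Face hub.
    text_classification = pipeline(task=Tasks.text_classification)
    dataset = PyDataset.load('glue', name='sst2', target='sentence',
                             hub=Hubs.huggingface)
    result = text_classification(dataset)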