@@ -128,7 +128,7 @@ class TorchModelExporter(Exporter): | |||
args_list = list(args) | |||
else: | |||
args_list = [args] | |||
if isinstance(args_list[-1], dict): | |||
if isinstance(args_list[-1], Mapping): | |||
args_dict = args_list[-1] | |||
args_list = args_list[:-1] | |||
n_nonkeyword = len(args_list) | |||
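The switch from dict to Mapping widens the check to dict-like containers that do not subclass dict (for example collections.UserDict derivatives, which is what typical tokenizer outputs are). A minimal sketch of the difference, using only the standard library; the DummyEncoding class is invented for illustration:

from collections import UserDict
from collections.abc import Mapping

class DummyEncoding(UserDict):
    """Stand-in for a dict-like container that does not subclass dict."""

enc = DummyEncoding({'input_ids': [1, 2, 3]})
print(isinstance(enc, dict))     # False -> the old check would treat it as a positional arg
print(isinstance(enc, Mapping))  # True  -> the new check recognises it as keyword arguments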
@@ -284,9 +284,8 @@ class TorchModelExporter(Exporter): | |||
'Model property dummy_inputs must be set.') | |||
dummy_inputs = collate_fn(dummy_inputs, device) | |||
if isinstance(dummy_inputs, Mapping): | |||
dummy_inputs = self._decide_input_format(model, dummy_inputs) | |||
dummy_inputs_filter = [] | |||
for _input in dummy_inputs: | |||
for _input in self._decide_input_format(model, dummy_inputs): | |||
if _input is not None: | |||
dummy_inputs_filter.append(_input) | |||
else: | |||
@@ -23,7 +23,8 @@ from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA, | |||
API_RESPONSE_FIELD_MESSAGE, | |||
API_RESPONSE_FIELD_USERNAME, | |||
DEFAULT_CREDENTIALS_PATH, | |||
MODELSCOPE_ENVIRONMENT, ONE_YEAR_SECONDS, | |||
MODELSCOPE_ENVIRONMENT, | |||
MODELSCOPE_USERNAME, ONE_YEAR_SECONDS, | |||
Licenses, ModelVisibility) | |||
from modelscope.hub.errors import (InvalidParameter, NotExistError, | |||
NotLoginException, NoValidRevisionError, | |||
@@ -38,8 +39,8 @@ from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, | |||
DEFAULT_MODEL_REVISION, | |||
DEFAULT_REPOSITORY_REVISION, | |||
MASTER_MODEL_BRANCH, DatasetFormations, | |||
DatasetMetaFormats, DownloadMode, | |||
ModelFile) | |||
DatasetMetaFormats, DownloadChannel, | |||
DownloadMode, ModelFile) | |||
from modelscope.utils.logger import get_logger | |||
from .utils.utils import (get_endpoint, get_release_datetime, | |||
model_id_to_group_owner_name) | |||
@@ -645,6 +646,25 @@ class HubApi: | |||
def check_local_cookies(self, use_cookies) -> CookieJar: | |||
return self._check_cookie(use_cookies=use_cookies) | |||
def dataset_download_uv(self, dataset_name: str, namespace: str): | |||
if not dataset_name or not namespace: | |||
raise ValueError('dataset_name or namespace cannot be empty!') | |||
# get channel and user_name | |||
channel = DownloadChannel.LOCAL.value | |||
user_name = '' | |||
if MODELSCOPE_ENVIRONMENT in os.environ: | |||
channel = os.environ[MODELSCOPE_ENVIRONMENT] | |||
if MODELSCOPE_USERNAME in os.environ: | |||
user_name = os.environ[MODELSCOPE_USERNAME] | |||
url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/uv/{channel}?user={user_name}' | |||
cookies = ModelScopeConfig.get_cookies() | |||
r = requests.post(url, cookies=cookies, headers=self.headers) | |||
resp = r.json() | |||
raise_on_error(resp) | |||
return resp['Message'] | |||
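A rough usage sketch of the new uv-reporting hook; only dataset_download_uv itself comes from this diff, while the surrounding calls and the dataset/namespace values (borrowed from the OFA trainer test further below) are illustrative:

import os
from modelscope.hub.api import HubApi

os.environ['MODELSCOPE_ENVIRONMENT'] = 'dsw'   # optional: report a non-local channel
os.environ['MODELSCOPE_USERNAME'] = 'alice'    # optional: attach a user name

api = HubApi()
# Posts one uv event to /api/v1/datasets/{namespace}/{name}/download/uv/{channel}?user=...
message = api.dataset_download_uv(dataset_name='ocr_fudanvi_zh', namespace='modelscope')
print(message)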
class ModelScopeConfig: | |||
path_credential = expanduser(DEFAULT_CREDENTIALS_PATH) | |||
@@ -760,14 +780,18 @@ class ModelScopeConfig: | |||
env = 'custom' | |||
if MODELSCOPE_ENVIRONMENT in os.environ: | |||
env = os.environ[MODELSCOPE_ENVIRONMENT] | |||
user_name = 'unknown' | |||
if MODELSCOPE_USERNAME in os.environ: | |||
user_name = os.environ[MODELSCOPE_USERNAME] | |||
ua = 'modelscope/%s; python/%s; session_id/%s; platform/%s; processor/%s; env/%s' % ( | |||
ua = 'modelscope/%s; python/%s; session_id/%s; platform/%s; processor/%s; env/%s; user/%s' % ( | |||
__version__, | |||
platform.python_version(), | |||
ModelScopeConfig.get_user_session_id(), | |||
platform.platform(), | |||
platform.processor(), | |||
env, | |||
user_name, | |||
) | |||
if isinstance(user_agent, dict): | |||
ua = '; '.join(f'{k}/{v}' for k, v in user_agent.items()) | |||
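For reference, an illustrative user-agent produced by the extended format string (every field value below is made up):

# modelscope/1.0.0; python/3.8.13; session_id/8f2c...; platform/Linux-5.10-x86_64; processor/x86_64; env/dsw; user/alice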
@@ -18,6 +18,7 @@ API_RESPONSE_FIELD_EMAIL = 'Email' | |||
API_RESPONSE_FIELD_MESSAGE = 'Message' | |||
MODELSCOPE_ENVIRONMENT = 'MODELSCOPE_ENVIRONMENT' | |||
MODELSCOPE_SDK_DEBUG = 'MODELSCOPE_SDK_DEBUG' | |||
MODELSCOPE_USERNAME = 'MODELSCOPE_USERNAME' | |||
ONE_YEAR_SECONDS = 24 * 365 * 60 * 60 | |||
@@ -349,11 +349,13 @@ class CLIP(nn.Module): | |||
text_num_hidden_layers: int, | |||
text_type_vocab_size: int, | |||
tokenizer: FullTokenizer, | |||
# vision_head_width: added so that ViT-H, whose attention head width is not the default 64, can be configured
vision_head_width: int = 64, | |||
): | |||
super().__init__() | |||
if isinstance(vision_layers, (tuple, list)): | |||
vision_heads = vision_width * 32 // 64 | |||
vision_heads = vision_width * 32 // vision_head_width | |||
self.visual = ModifiedResNet( | |||
layers=vision_layers, | |||
output_dim=embed_dim, | |||
@@ -361,7 +363,7 @@ class CLIP(nn.Module): | |||
input_resolution=image_resolution, | |||
width=vision_width) | |||
else: | |||
vision_heads = vision_width // 64 | |||
vision_heads = vision_width // vision_head_width | |||
self.visual = VisualTransformer( | |||
input_resolution=image_resolution, | |||
patch_size=vision_patch_size, | |||
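A quick sanity check of the head arithmetic the new vision_head_width parameter enables; the ViT-L numbers match the widely published CLIP configuration, and the ViT-H numbers are an assumption rather than something stated in this diff:

# ResNet branch: vision_heads = vision_width * 32 // vision_head_width
# ViT branch:    vision_heads = vision_width // vision_head_width
# ViT-L/14: vision_width=1024, vision_head_width=64 -> 1024 // 64 = 16 heads
# ViT-H/14: vision_width=1280, vision_head_width=80 -> 1280 // 80 = 16 heads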
@@ -0,0 +1,3 @@ | |||
# The Uni-Fold implementation is also open-sourced by the authors under the Apache-2.0 license,
# and is publicly available at https://github.com/dptech-corp/Uni-Fold. | |||
"""Unifold Modules.""" |
@@ -274,6 +274,8 @@ class MsDataset: | |||
try: | |||
api.on_dataset_download( | |||
dataset_name=download_dataset, namespace=namespace) | |||
api.dataset_download_uv( | |||
dataset_name=download_dataset, namespace=namespace) | |||
except Exception as e: | |||
logger.error(e) | |||
@@ -491,17 +491,8 @@ TASK_OUTPUTS = { | |||
# word segmentation result for single sample | |||
# { | |||
# "output": "今天 天气 不错 , 适合 出去 游玩" | |||
# "labels": [ | |||
# {'word': '今天', 'label': 'PROPN'}, | |||
# {'word': '天气', 'label': 'PROPN'}, | |||
# {'word': '不错', 'label': 'VERB'}, | |||
# {'word': ',', 'label': 'NUM'}, | |||
# {'word': '适合', 'label': 'NOUN'}, | |||
# {'word': '出去', 'label': 'PART'}, | |||
# {'word': '游玩', 'label': 'ADV'}, | |||
# ] | |||
# } | |||
Tasks.word_segmentation: [OutputKeys.OUTPUT, OutputKeys.LABELS], | |||
Tasks.word_segmentation: [OutputKeys.OUTPUT], | |||
# TODO @wenmeng.zwm support list of result check | |||
# named entity recognition result for single sample | |||
@@ -93,9 +93,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { | |||
'damo/cv_resnet50_live-category'), | |||
Tasks.video_category: (Pipelines.video_category, | |||
'damo/cv_resnet50_video-category'), | |||
Tasks.multi_modal_embedding: | |||
(Pipelines.multi_modal_embedding, | |||
'damo/multi-modal_clip-vit-large-patch14_zh'), | |||
Tasks.multi_modal_embedding: (Pipelines.multi_modal_embedding, | |||
'damo/multi-modal_clip-vit-base-patch16_zh'), | |||
Tasks.generative_multi_modal_embedding: | |||
(Pipelines.generative_multi_modal_embedding, | |||
'damo/multi-modal_gemm-vit-large-patch14_generative-multi-modal-embedding' | |||
@@ -109,13 +109,13 @@ class TokenClassificationPipeline(Pipeline): | |||
chunk['span'] = text[chunk['start']:chunk['end']] | |||
chunks.append(chunk) | |||
# for cws output | |||
# for cws outputs | |||
if len(chunks) > 0 and chunks[0]['type'] == 'cws': | |||
spans = [ | |||
chunk['span'] for chunk in chunks if chunk['span'].strip() | |||
] | |||
seg_result = ' '.join(spans) | |||
outputs = {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []} | |||
outputs = {OutputKeys.OUTPUT: seg_result} | |||
# for ner outputs | |||
else: | |||
@@ -115,15 +115,15 @@ class WordSegmentationPipeline(Pipeline): | |||
chunk['span'] = text[chunk['start']:chunk['end']] | |||
chunks.append(chunk) | |||
# for cws output | |||
# for cws outputs | |||
if len(chunks) > 0 and chunks[0]['type'] == 'cws': | |||
spans = [ | |||
chunk['span'] for chunk in chunks if chunk['span'].strip() | |||
] | |||
seg_result = ' '.join(spans) | |||
outputs = {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []} | |||
outputs = {OutputKeys.OUTPUT: seg_result} | |||
# for ner outpus | |||
# for ner output | |||
else: | |||
outputs = {OutputKeys.OUTPUT: chunks} | |||
return outputs |
@@ -96,7 +96,6 @@ class OfaPreprocessor(Preprocessor): | |||
data = input | |||
else: | |||
data = self._build_dict(input) | |||
data = self._ofa_input_compatibility_conversion(data) | |||
sample = self.preprocess(data) | |||
str_data = dict() | |||
for k, v in data.items(): | |||
@@ -34,6 +34,7 @@ class NLPBasePreprocessor(Preprocessor, ABC): | |||
label=None, | |||
label2id=None, | |||
mode=ModeKeys.INFERENCE, | |||
use_fast=None, | |||
**kwargs): | |||
"""The NLP preprocessor base class. | |||
@@ -45,14 +46,18 @@ class NLPBasePreprocessor(Preprocessor, ABC): | |||
label2id: An optional label2id mapping, the class will try to call utils.parse_label_mapping | |||
if this mapping is not supplied. | |||
mode: Run this preprocessor in either 'train'/'eval'/'inference' mode | |||
use_fast: whether to use the fast tokenizer implementation
""" | |||
self.model_dir = model_dir | |||
self.first_sequence = first_sequence | |||
self.second_sequence = second_sequence | |||
self.label = label | |||
self.use_fast = kwargs.pop('use_fast', None) | |||
if self.use_fast is None and os.path.isfile( | |||
self.use_fast = use_fast | |||
if self.use_fast is None and model_dir is None: | |||
self.use_fast = False | |||
elif self.use_fast is None and os.path.isfile( | |||
os.path.join(model_dir, 'tokenizer_config.json')): | |||
with open(os.path.join(model_dir, 'tokenizer_config.json'), | |||
'r') as f: | |||
@@ -61,8 +66,8 @@ class NLPBasePreprocessor(Preprocessor, ABC): | |||
self.use_fast = False if self.use_fast is None else self.use_fast | |||
self.label2id = label2id | |||
if self.label2id is None: | |||
self.label2id = parse_label_mapping(self.model_dir) | |||
if self.label2id is None and model_dir is not None: | |||
self.label2id = parse_label_mapping(model_dir) | |||
super().__init__(mode, **kwargs) | |||
@property | |||
@@ -106,6 +111,7 @@ class NLPTokenizerPreprocessorBase(NLPBasePreprocessor): | |||
label: str = 'label', | |||
label2id: dict = None, | |||
mode: str = ModeKeys.INFERENCE, | |||
use_fast: bool = None, | |||
**kwargs): | |||
"""The NLP tokenizer preprocessor base class. | |||
@@ -122,11 +128,12 @@ class NLPTokenizerPreprocessorBase(NLPBasePreprocessor): | |||
- config.json label2id/id2label | |||
- label_mapping.json | |||
mode: Run this preprocessor in either 'train'/'eval'/'inference' mode, the behavior may be different. | |||
use_fast: whether to use the fast tokenizer implementation
kwargs: These kwargs will be directly fed into the tokenizer. | |||
""" | |||
super().__init__(model_dir, first_sequence, second_sequence, label, | |||
label2id, mode) | |||
label2id, mode, use_fast, **kwargs) | |||
self.model_dir = model_dir | |||
self.tokenize_kwargs = kwargs | |||
self.tokenizer = self.build_tokenizer(model_dir) | |||
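A minimal sketch of how the new use_fast flag reaches a concrete preprocessor; the model id is illustrative, and the from_pretrained call mirrors the trainer change further below:

from modelscope.preprocessors import Preprocessor

# Explicit request for the fast tokenizer; when use_fast is left as None the base
# class falls back to tokenizer_config.json (if present) and finally to False.
preprocessor = Preprocessor.from_pretrained(
    'damo/nlp_structbert_sentence-similarity_chinese-base',  # illustrative model id
    use_fast=True)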
@@ -2,6 +2,7 @@ | |||
from typing import Any, Dict, Tuple, Union | |||
import numpy as np | |||
import torch | |||
from modelscope.metainfo import Preprocessors | |||
@@ -20,9 +21,7 @@ class WordSegmentationBlankSetToLabelPreprocessor(NLPBasePreprocessor): | |||
""" | |||
def __init__(self, **kwargs): | |||
super().__init__(**kwargs) | |||
self.first_sequence: str = kwargs.pop('first_sequence', | |||
'first_sequence') | |||
self.first_sequence: str = kwargs.pop('first_sequence', 'tokens') | |||
self.label = kwargs.pop('label', OutputKeys.LABELS) | |||
def __call__(self, data: str) -> Union[Dict[str, Any], Tuple]: | |||
@@ -80,10 +79,9 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): | |||
'is_split_into_words', False) | |||
if 'label2id' in kwargs: | |||
kwargs.pop('label2id') | |||
self.tokenize_kwargs = kwargs | |||
@type_assert(object, str) | |||
def __call__(self, data: str) -> Dict[str, Any]: | |||
@type_assert(object, (str, dict)) | |||
def __call__(self, data: Union[dict, str]) -> Dict[str, Any]: | |||
"""process the raw input data | |||
Args: | |||
@@ -99,18 +97,24 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): | |||
text = None | |||
labels_list = None | |||
if isinstance(data, str): | |||
# for inference inputs without label | |||
text = data | |||
self.tokenize_kwargs['add_special_tokens'] = False | |||
elif isinstance(data, dict): | |||
# for finetune inputs with label | |||
text = data.get(self.first_sequence) | |||
labels_list = data.get(self.label) | |||
if isinstance(text, list): | |||
self.tokenize_kwargs['is_split_into_words'] = True | |||
input_ids = [] | |||
label_mask = [] | |||
offset_mapping = [] | |||
if self.is_split_into_words: | |||
for offset, token in enumerate(list(data)): | |||
subtoken_ids = self.tokenizer.encode( | |||
token, add_special_tokens=False) | |||
token_type_ids = [] | |||
if self.is_split_into_words and self._mode == ModeKeys.INFERENCE: | |||
for offset, token in enumerate(list(text)): | |||
subtoken_ids = self.tokenizer.encode(token, | |||
**self.tokenize_kwargs) | |||
if len(subtoken_ids) == 0: | |||
subtoken_ids = [self.tokenizer.unk_token_id] | |||
input_ids.extend(subtoken_ids) | |||
@@ -119,10 +123,9 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): | |||
else: | |||
if self.tokenizer.is_fast: | |||
encodings = self.tokenizer( | |||
text, | |||
add_special_tokens=False, | |||
return_offsets_mapping=True, | |||
**self.tokenize_kwargs) | |||
text, return_offsets_mapping=True, **self.tokenize_kwargs) | |||
attention_mask = encodings['attention_mask'] | |||
token_type_ids = encodings['token_type_ids'] | |||
input_ids = encodings['input_ids'] | |||
word_ids = encodings.word_ids() | |||
for i in range(len(word_ids)): | |||
@@ -137,75 +140,85 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): | |||
label_mask.append(1) | |||
offset_mapping.append(encodings['offset_mapping'][i]) | |||
else: | |||
encodings = self.tokenizer( | |||
text, add_special_tokens=False, **self.tokenize_kwargs) | |||
encodings = self.tokenizer(text, **self.tokenize_kwargs) | |||
input_ids = encodings['input_ids'] | |||
label_mask, offset_mapping = self.get_label_mask_and_offset_mapping( | |||
text) | |||
if len(input_ids) >= self.sequence_length - 2: | |||
input_ids = input_ids[:self.sequence_length - 2] | |||
label_mask = label_mask[:self.sequence_length - 2] | |||
input_ids = [self.tokenizer.cls_token_id | |||
] + input_ids + [self.tokenizer.sep_token_id] | |||
label_mask = [0] + label_mask + [0] | |||
attention_mask = [1] * len(input_ids) | |||
offset_mapping = offset_mapping[:sum(label_mask)] | |||
if self._mode == ModeKeys.INFERENCE: | |||
if len(input_ids) >= self.sequence_length - 2: | |||
input_ids = input_ids[:self.sequence_length - 2] | |||
label_mask = label_mask[:self.sequence_length - 2] | |||
input_ids = [self.tokenizer.cls_token_id | |||
] + input_ids + [self.tokenizer.sep_token_id] | |||
label_mask = [0] + label_mask + [0] | |||
attention_mask = [1] * len(input_ids) | |||
offset_mapping = offset_mapping[:sum(label_mask)] | |||
if not self.is_transformer_based_model: | |||
input_ids = input_ids[1:-1] | |||
attention_mask = attention_mask[1:-1] | |||
label_mask = label_mask[1:-1] | |||
if not self.is_transformer_based_model: | |||
input_ids = input_ids[1:-1] | |||
attention_mask = attention_mask[1:-1] | |||
label_mask = label_mask[1:-1] | |||
if self._mode == ModeKeys.INFERENCE: | |||
input_ids = torch.tensor(input_ids).unsqueeze(0) | |||
attention_mask = torch.tensor(attention_mask).unsqueeze(0) | |||
label_mask = torch.tensor( | |||
label_mask, dtype=torch.bool).unsqueeze(0) | |||
# the token classification | |||
output = { | |||
'text': text, | |||
'input_ids': input_ids, | |||
'attention_mask': attention_mask, | |||
'label_mask': label_mask, | |||
'offset_mapping': offset_mapping | |||
} | |||
# align the labels with tokenized text | |||
if labels_list is not None: | |||
assert self.label2id is not None | |||
# Map that sends B-Xxx label to its I-Xxx counterpart | |||
b_to_i_label = [] | |||
label_enumerate_values = [ | |||
k for k, v in sorted( | |||
self.label2id.items(), key=lambda item: item[1]) | |||
] | |||
for idx, label in enumerate(label_enumerate_values): | |||
if label.startswith('B-') and label.replace( | |||
'B-', 'I-') in label_enumerate_values: | |||
b_to_i_label.append( | |||
label_enumerate_values.index( | |||
label.replace('B-', 'I-'))) | |||
else: | |||
b_to_i_label.append(idx) | |||
# the token classification | |||
output = { | |||
'text': text, | |||
'input_ids': input_ids, | |||
'attention_mask': attention_mask, | |||
'label_mask': label_mask, | |||
'offset_mapping': offset_mapping | |||
} | |||
else: | |||
output = { | |||
'input_ids': input_ids, | |||
'token_type_ids': token_type_ids, | |||
'attention_mask': attention_mask, | |||
'label_mask': label_mask, | |||
} | |||
label_row = [self.label2id[lb] for lb in labels_list] | |||
previous_word_idx = None | |||
label_ids = [] | |||
for word_idx in word_ids: | |||
if word_idx is None: | |||
label_ids.append(-100) | |||
elif word_idx != previous_word_idx: | |||
label_ids.append(label_row[word_idx]) | |||
else: | |||
if self.label_all_tokens: | |||
label_ids.append(b_to_i_label[label_row[word_idx]]) | |||
# align the labels with tokenized text | |||
if labels_list is not None: | |||
assert self.label2id is not None | |||
# Map that sends B-Xxx label to its I-Xxx counterpart | |||
b_to_i_label = [] | |||
label_enumerate_values = [ | |||
k for k, v in sorted( | |||
self.label2id.items(), key=lambda item: item[1]) | |||
] | |||
for idx, label in enumerate(label_enumerate_values): | |||
if label.startswith('B-') and label.replace( | |||
'B-', 'I-') in label_enumerate_values: | |||
b_to_i_label.append( | |||
label_enumerate_values.index( | |||
label.replace('B-', 'I-'))) | |||
else: | |||
b_to_i_label.append(idx) | |||
label_row = [self.label2id[lb] for lb in labels_list] | |||
previous_word_idx = None | |||
label_ids = [] | |||
for word_idx in word_ids: | |||
if word_idx is None: | |||
label_ids.append(-100) | |||
previous_word_idx = word_idx | |||
labels = label_ids | |||
output['labels'] = labels | |||
elif word_idx != previous_word_idx: | |||
label_ids.append(label_row[word_idx]) | |||
else: | |||
if self.label_all_tokens: | |||
label_ids.append(b_to_i_label[label_row[word_idx]]) | |||
else: | |||
label_ids.append(-100) | |||
previous_word_idx = word_idx | |||
labels = label_ids | |||
output['labels'] = labels | |||
output = { | |||
k: np.array(v) if isinstance(v, list) else v | |||
for k, v in output.items() | |||
} | |||
return output | |||
def get_tokenizer_class(self): | |||
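To make the label-alignment branch above concrete, a small worked example under an assumed label map; the tags, sentence and sub-token split are invented for illustration:

# label2id   = {'O': 0, 'B-LOC': 1, 'I-LOC': 2}  ->  b_to_i_label = [0, 2, 2]
# tokens     = ['New', 'York', 'is', 'big'],  labels_list = ['B-LOC', 'I-LOC', 'O', 'O']
# word_ids() = [None, 0, 0, 1, 2, 3, None]    # CLS, 'New' split into two sub-tokens, ..., SEP
# label_ids  = [-100, 1, 2, 2, 0, 0, -100]    # with label_all_tokens: repeated sub-token gets the I- tag
# label_ids  = [-100, 1, -100, 2, 0, 0, -100] # without label_all_tokens: repeated sub-token is masked out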
@@ -2,12 +2,12 @@ | |||
from typing import Any, Dict | |||
import torch | |||
from PIL import Image | |||
import unicodedata2 | |||
from torchvision import transforms | |||
from torchvision.transforms import InterpolationMode | |||
from torchvision.transforms import functional as F | |||
from zhconv import convert | |||
from modelscope.preprocessors.image import load_image | |||
from modelscope.utils.constant import ModeKeys | |||
from .base import OfaBasePreprocessor | |||
@@ -98,8 +98,7 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): | |||
def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: | |||
sample = self._build_infer_sample(data) | |||
target = data[self.column_map['text']] | |||
target = target.translate(self.transtab).strip() | |||
target = sample['label'] | |||
target_token_list = target.strip().split() | |||
target = ' '.join(target_token_list[:self.max_tgt_length]) | |||
sample['target'] = self.tokenize_text(target, add_bos=False) | |||
@@ -119,5 +118,7 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): | |||
'patch_mask': torch.tensor([True]) | |||
} | |||
if 'text' in self.column_map and self.column_map['text'] in data: | |||
sample['label'] = data[self.column_map['text']] | |||
target = data[self.column_map['text']] | |||
target = unicodedata2.normalize('NFKC', convert(target, 'zh-hans')) | |||
sample['label'] = target | |||
return sample |
@@ -18,7 +18,7 @@ class TextGenerationTrainer(NlpEpochBasedTrainer): | |||
return tokenizer.decode(tokens.tolist(), skip_special_tokens=True) | |||
def evaluation_step(self, data): | |||
model = self.model | |||
model = self.model.module if self._dist else self.model | |||
model.eval() | |||
with torch.no_grad(): | |||
@@ -586,14 +586,16 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): | |||
preprocessor_mode=ModeKeys.TRAIN, | |||
**model_args, | |||
**self.train_keys, | |||
mode=ModeKeys.TRAIN) | |||
mode=ModeKeys.TRAIN, | |||
use_fast=True) | |||
eval_preprocessor = Preprocessor.from_pretrained( | |||
self.model_dir, | |||
cfg_dict=self.cfg, | |||
preprocessor_mode=ModeKeys.EVAL, | |||
**model_args, | |||
**self.eval_keys, | |||
mode=ModeKeys.EVAL) | |||
mode=ModeKeys.EVAL, | |||
use_fast=True) | |||
return train_preprocessor, eval_preprocessor | |||
@@ -876,7 +876,7 @@ class EpochBasedTrainer(BaseTrainer): | |||
Subclass and override to inject custom behavior. | |||
""" | |||
model = self.model | |||
model = self.model.module if self._dist else self.model | |||
model.eval() | |||
if is_parallel(model): | |||
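The unwrap follows the usual DistributedDataParallel pattern; a minimal standalone sketch, assuming torch DDP (or DataParallel) is the wrapper in play:

import torch.nn as nn

def unwrap_model(model: nn.Module) -> nn.Module:
    # DDP and DataParallel expose the wrapped user model as .module;
    # evaluation code usually needs the underlying model.
    wrappers = (nn.parallel.DistributedDataParallel, nn.DataParallel)
    return model.module if isinstance(model, wrappers) else model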
@@ -238,6 +238,14 @@ class DownloadMode(enum.Enum): | |||
FORCE_REDOWNLOAD = 'force_redownload' | |||
class DownloadChannel(enum.Enum): | |||
""" Channels of datasets downloading for uv/pv counting. | |||
""" | |||
LOCAL = 'local' | |||
DSW = 'dsw' | |||
EAIS = 'eais' | |||
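For context, the enum values line up with the environment-based resolution in dataset_download_uv above; a short sketch (the MODELSCOPE_ENVIRONMENT constant value comes from the constants change above):

import os
from modelscope.utils.constant import DownloadChannel

channel = os.environ.get('MODELSCOPE_ENVIRONMENT', DownloadChannel.LOCAL.value)  # 'dsw', 'eais' or 'local'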
class UploadMode(enum.Enum): | |||
""" How to upload object to remote. | |||
""" | |||
@@ -1,6 +1,7 @@ | |||
addict | |||
attrs | |||
datasets | |||
# versions beyond 2.5.2 introduce a compatibility issue that is being resolved
datasets<=2.5.2 | |||
easydict | |||
einops | |||
filelock>=3.3.0 | |||
@@ -11,3 +11,5 @@ timm | |||
tokenizers | |||
torchvision | |||
transformers>=4.12.0 | |||
unicodedata2 | |||
zhconv |
@@ -1,4 +1,6 @@ | |||
biopython | |||
iopath | |||
ipdb | |||
lmdb | |||
ml_collections | |||
scipy | |||
@@ -8,7 +8,8 @@ import zipfile | |||
from modelscope.msdatasets import MsDataset | |||
from modelscope.msdatasets.utils.dataset_utils import list_dataset_objects | |||
from modelscope.utils import logger as logging | |||
from modelscope.utils.constant import DEFAULT_DATASET_REVISION, ModelFile | |||
from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DownloadMode, | |||
ModelFile) | |||
from modelscope.utils.test_utils import test_level | |||
logger = logging.get_logger(__name__) | |||
@@ -104,7 +105,10 @@ class DatasetUploadTest(unittest.TestCase): | |||
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
def test_ds_download_dir(self): | |||
test_ds = MsDataset.load(self.dataset_name, self.namespace) | |||
test_ds = MsDataset.load( | |||
self.dataset_name, | |||
namespace=self.namespace, | |||
download_mode=DownloadMode.FORCE_REDOWNLOAD) | |||
assert test_ds.config_kwargs['split_config'].values() | |||
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
@@ -21,9 +21,10 @@ class TestModelOutput(unittest.TestCase): | |||
self.assertEqual(outputs['logits'], torch.Tensor([1])) | |||
self.assertEqual(outputs[0], torch.Tensor([1])) | |||
self.assertEqual(outputs.logits, torch.Tensor([1])) | |||
outputs.loss = torch.Tensor([2]) | |||
logits, loss = outputs | |||
self.assertEqual(logits, torch.Tensor([1])) | |||
self.assertTrue(loss is None) | |||
self.assertTrue(loss is not None) | |||
if __name__ == '__main__': | |||
@@ -19,9 +19,11 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): | |||
self.task = Tasks.named_entity_recognition | |||
self.model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news' | |||
english_model_id = 'damo/nlp_raner_named-entity-recognition_english-large-ecom' | |||
tcrf_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news' | |||
lcrf_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-news' | |||
sentence = '这与温岭市新河镇的一个神秘的传说有关。' | |||
sentence_en = 'pizza shovel' | |||
@unittest.skipUnless(test_level() >= 2, 'skip test in current test level') | |||
def test_run_tcrf_by_direct_model_download(self): | |||
@@ -89,6 +91,12 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): | |||
task=Tasks.named_entity_recognition, model=self.lcrf_model_id) | |||
print(pipeline_ins(input=self.sentence)) | |||
@unittest.skipUnless(test_level() >= 2, 'skip test in current test level') | |||
def test_run_english_with_model_name(self): | |||
pipeline_ins = pipeline( | |||
task=Tasks.named_entity_recognition, model=self.english_model_id) | |||
print(pipeline_ins(input='pizza shovel')) | |||
@unittest.skipUnless(test_level() >= 2, 'skip test in current test level') | |||
def test_run_with_default_model(self): | |||
pipeline_ins = pipeline(task=Tasks.named_entity_recognition) | |||
@@ -19,7 +19,7 @@ class UnifoldProteinStructureTest(unittest.TestCase, DemoCompatibilityCheck): | |||
self.protein_multimer = 'GAMGLPEEPSSPQESTLKALSLYEAHLSSYIMYLQTFLVKTKQKVNNKNYPEFTLFDTSKLKKDQTLKSIKT' + \ | |||
'NIAALKNHIDKIKPIAMQIYKKYSKNIP' | |||
@unittest.skipUnless(test_level() >= 2, 'skip test in current test level') | |||
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level') | |||
def test_run_by_direct_model_download(self): | |||
model_dir = snapshot_download(self.model_id) | |||
mono_pipeline_ins = pipeline(task=self.task, model=model_dir) | |||
@@ -87,7 +87,7 @@ class TestFinetuneTokenClassification(unittest.TestCase): | |||
cfg['dataset'] = { | |||
'train': { | |||
'labels': label_enumerate_values, | |||
'first_sequence': 'first_sequence', | |||
'first_sequence': 'tokens', | |||
'label': 'labels', | |||
} | |||
} | |||
@@ -85,7 +85,7 @@ class TestOfaTrainer(unittest.TestCase): | |||
'ocr_fudanvi_zh', | |||
subset_name='scene', | |||
namespace='modelscope', | |||
split='train[:200]', | |||
split='train[800:900]', | |||
download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS), | |||
eval_dataset=MsDataset.load( | |||
'ocr_fudanvi_zh', | |||