diff --git a/.dev_scripts/dockerci.sh b/.dev_scripts/dockerci.sh
index c502175b..07ea947a 100644
--- a/.dev_scripts/dockerci.sh
+++ b/.dev_scripts/dockerci.sh
@@ -37,6 +37,7 @@ do
     -e TEST_ACCESS_TOKEN_CITEST=$TEST_ACCESS_TOKEN_CITEST \
     -e TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV \
     -e TEST_LEVEL=$TEST_LEVEL \
+    -e MODELSCOPE_ENVIRONMENT='ci' \
     -e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \
     -e MODEL_TAG_URL=$MODEL_TAG_URL \
     --workdir=$CODE_DIR_IN_CONTAINER \
@@ -59,6 +60,7 @@ do
     -e TEST_ACCESS_TOKEN_CITEST=$TEST_ACCESS_TOKEN_CITEST \
     -e TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV \
     -e TEST_LEVEL=$TEST_LEVEL \
+    -e MODELSCOPE_ENVIRONMENT='ci' \
     -e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \
     -e MODEL_TAG_URL=$MODEL_TAG_URL \
     --workdir=$CODE_DIR_IN_CONTAINER \
diff --git a/README.md b/README.md
index 1da48ef2..3d90c7ef 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,10 @@
 # Introduction
 
-[ModelScope]( https://www.modelscope.cn) is a “Model-as-a-Service” (MaaS) platform that seeks to bringing together most advanced machine learning models from the AI community, and to streamlining the process of leveraging and applying AI models . The core ModelScope library enables developers to perform model inference, training and evaluation, through rich layers of API designs that facilitate a unified experience across state-of-the-art models from different AI domains.
+[ModelScope](https://www.modelscope.cn) is a “Model-as-a-Service” (MaaS) platform that seeks to bring together the most advanced machine learning models from the AI community, and to streamline the process of leveraging AI models in real applications. The core ModelScope library enables developers to perform inference, training and evaluation, through rich layers of API designs that facilitate a unified experience across state-of-the-art models from different AI domains.
 
-The Python library offers the layered-APIs necessary for model contributors to integrate models from CV, NLP, Speech, Multi-Modality, as well as Scientific-computation, into the ModelScope ecosystem. Implementations for all these different models are encapsulated within the library in a way that allows easy and unified access. With such integration, model inference, finetuning, and evaluations can be done within only a few lines of codes. In the meantime, flexibilities are provided so that different components in the model applications can be customized as well, where necessary.
+The Python library offers the layered APIs necessary for model contributors to integrate models from CV, NLP, Speech, Multi-Modality, as well as Scientific-computation, into the ModelScope ecosystem. Implementations for all these different models are encapsulated within the library in a way that allows easy and unified access. With such integration, model inference, finetuning, and evaluation can be done with only a few lines of code. In the meantime, flexibility is provided so that different components in the model applications can be customized where necessary.
 
-Apart from harboring implementations of various models, ModelScope library also enables the necessary interactions with the backend services of ModelScope, particularly with the Model-Hub and Dataset-Hub. Such interactions facilitate various entity (models and datasets) management to be performed seamlessly under-the-hood, such as entity lookup, version control, and cache management.
+Apart from harboring implementations of various models, the ModelScope library also enables the necessary interactions with ModelScope backend services, particularly with the Model-Hub and Dataset-Hub. Such interactions allow management of various entities (models and datasets) to be performed seamlessly under the hood, including entity lookup, version control, cache management, and many others.
 
 # Installation
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 8c9964b8..c7c3e729 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -299,6 +299,7 @@ class Trainers(object):
     # multi-modal trainers
     clip_multi_modal_embedding = 'clip-multi-modal-embedding'
     ofa = 'ofa'
+    mplug = 'mplug'
 
     # cv trainers
     image_instance_segmentation = 'image-instance-segmentation'
@@ -402,6 +403,7 @@ class Metrics(object):
     # accuracy
     accuracy = 'accuracy'
+    multi_average_precision = 'mAP'
     audio_noise_metric = 'audio-noise-metric'
 
     # text gen
diff --git a/modelscope/metrics/accuracy_metric.py b/modelscope/metrics/accuracy_metric.py
index 953ece4c..fe040177 100644
--- a/modelscope/metrics/accuracy_metric.py
+++ b/modelscope/metrics/accuracy_metric.py
@@ -6,6 +6,7 @@ import numpy as np
 
 from modelscope.metainfo import Metrics
 from modelscope.outputs import OutputKeys
+from modelscope.utils.chinese_utils import remove_space_between_chinese_chars
 from modelscope.utils.registry import default_group
 from .base import Metric
 from .builder import METRICS, MetricKeys
@@ -26,10 +27,10 @@ class AccuracyMetric(Metric):
     def add(self, outputs: Dict, inputs: Dict):
         label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS
         ground_truths = inputs[label_name]
-        eval_results = outputs[label_name]
+        eval_results = None
         for key in [
                 OutputKeys.CAPTION, OutputKeys.TEXT, OutputKeys.BOXES,
-                OutputKeys.LABELS, OutputKeys.SCORES
+                OutputKeys.LABEL, OutputKeys.LABELS, OutputKeys.SCORES
         ]:
             if key in outputs and outputs[key] is not None:
                 eval_results = outputs[key]
@@ -39,7 +40,7 @@ class AccuracyMetric(Metric):
             self.labels.append(truth)
         for result in eval_results:
             if isinstance(truth, str):
-                self.preds.append(result.strip().replace(' ', ''))
+                self.preds.append(remove_space_between_chinese_chars(result))
             else:
                 self.preds.append(result)
diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py
index b9e402c5..03d4c324 100644
--- a/modelscope/metrics/builder.py
+++ b/modelscope/metrics/builder.py
@@ -24,6 +24,7 @@ class MetricKeys(object):
     ROUGE_1 = 'rouge-1'
     ROUGE_L = 'rouge-l'
     NED = 'ned'  # ocr metric
+    mAP = 'mAP'
     BatchAcc = 'inbatch_t2i_recall_at_1'
 
 
@@ -40,8 +41,8 @@ task_default_metrics = {
     Tasks.image_portrait_enhancement:
     [Metrics.image_portrait_enhancement_metric],
     Tasks.video_summarization: [Metrics.video_summarization_metric],
-    Tasks.image_captioning: [Metrics.text_gen_metric],
-    Tasks.visual_question_answering: [Metrics.text_gen_metric],
+    Tasks.image_captioning: [Metrics.accuracy],
+    Tasks.visual_question_answering: [Metrics.accuracy],
     Tasks.movie_scene_segmentation: [Metrics.movie_scene_segmentation_metric],
     Tasks.image_inpainting: [Metrics.image_inpainting_metric],
     Tasks.referring_video_object_segmentation:
diff --git a/modelscope/metrics/map_metric.py b/modelscope/metrics/map_metric.py
new file mode 100644
index 00000000..aac76f22
--- /dev/null
+++ b/modelscope/metrics/map_metric.py
@@ -0,0 +1,67 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+ +from typing import Dict + +import numpy as np + +from modelscope.metainfo import Metrics +from modelscope.outputs import OutputKeys +from modelscope.utils.registry import default_group +from .base import Metric +from .builder import METRICS, MetricKeys + + +@METRICS.register_module( + group_key=default_group, module_name=Metrics.multi_average_precision) +class AveragePrecisionMetric(Metric): + """The metric computation class for multi avarage precision classes. + + This metric class calculates multi avarage precision for the whole input batches. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.preds = [] + self.labels = [] + self.thresh = kwargs.get('threshold', 0.5) + + def add(self, outputs: Dict, inputs: Dict): + label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS + ground_truths = inputs[label_name] + eval_results = outputs[label_name] + for key in [ + OutputKeys.CAPTION, OutputKeys.TEXT, OutputKeys.BOXES, + OutputKeys.LABELS, OutputKeys.SCORES + ]: + if key in outputs and outputs[key] is not None: + eval_results = outputs[key] + break + assert type(ground_truths) == type(eval_results) + for truth in ground_truths: + self.labels.append(truth) + for result in eval_results: + if isinstance(truth, str): + self.preds.append(result.strip().replace(' ', '')) + else: + self.preds.append(result) + + def evaluate(self): + assert len(self.preds) == len(self.labels) + scores = self._calculate_ap_score(self.preds, self.labels, self.thresh) + return {MetricKeys.mAP: scores.mean().item()} + + def _calculate_ap_score(self, preds, labels, thresh=0.5): + hyps = np.array(preds) + refs = np.array(labels) + a = np.where(hyps[:, :2] < refs[:, :2], refs[:, :2], hyps[:, :2]) + b = np.where(hyps[:, 2:] < refs[:, 2:], hyps[:, 2:], refs[:, 2:]) + interacts = np.concatenate([a, b], axis=1) + area_predictions = (hyps[:, 2] - hyps[:, 0]) * ( + hyps[:, 3] - hyps[:, 1]) + area_targets = (refs[:, 2] - refs[:, 0]) * (refs[:, 3] - refs[:, 1]) + interacts_w = interacts[:, 2] - interacts[:, 0] + interacts_h = interacts[:, 3] - interacts[:, 1] + area_interacts = interacts_w * interacts_h + ious = area_interacts / ( + area_predictions + area_targets - area_interacts + 1e-6) + return (ious >= thresh) & (interacts_w > 0) & (interacts_h > 0) diff --git a/modelscope/metrics/text_generation_metric.py b/modelscope/metrics/text_generation_metric.py index c2d9c6a8..08df5235 100644 --- a/modelscope/metrics/text_generation_metric.py +++ b/modelscope/metrics/text_generation_metric.py @@ -8,6 +8,7 @@ from rouge import Rouge from modelscope.metainfo import Metrics from modelscope.metrics.base import Metric from modelscope.metrics.builder import METRICS, MetricKeys +from modelscope.utils.chinese_utils import rebuild_chinese_str from modelscope.utils.registry import default_group @@ -24,25 +25,13 @@ class TextGenerationMetric(Metric): self.tgts: List[str] = [] self.rouge = Rouge() - @staticmethod - def is_chinese_char(char: str): - # the length of char must be 1 - return '\u4e00' <= char <= '\u9fa5' - - # add space for each chinese char - def rebuild_str(self, string: str): - return ' '.join(''.join([ - f' {char} ' if self.is_chinese_char(char) else char - for char in string - ]).split()) - def add(self, outputs: Dict[str, List[str]], inputs: Dict[str, List[str]]): ground_truths = inputs['tgts'] eval_results = outputs['preds'] for truth in ground_truths: - self.tgts.append(self.rebuild_str(truth)) + self.tgts.append(rebuild_chinese_str(truth)) for result in 
eval_results: - self.preds.append(self.rebuild_str(result)) + self.preds.append(rebuild_chinese_str(result)) def _check(self, pred: str, tgt: str) -> bool: diff --git a/modelscope/models/audio/kws/farfield/model.py b/modelscope/models/audio/kws/farfield/model.py index d63d1e2a..af1c0a27 100644 --- a/modelscope/models/audio/kws/farfield/model.py +++ b/modelscope/models/audio/kws/farfield/model.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os +import tempfile from typing import Dict, Optional from modelscope.metainfo import Models @@ -36,12 +37,15 @@ class FSMNSeleNetV2Decorator(TorchModel): else: sc_config_file = os.path.join(model_dir, self.SC_CONFIG) model_txt_file = os.path.join(model_dir, self.MODEL_TXT) + self.tmp_dir = tempfile.TemporaryDirectory() + new_config_file = os.path.join(self.tmp_dir.name, self.SC_CONFIG) + self._sc = None if os.path.exists(model_txt_file): conf_dict = dict(mode=56542, kws_model=model_txt_file) - update_conf(sc_config_file, sc_config_file, conf_dict) + update_conf(sc_config_file, new_config_file, conf_dict) import py_sound_connect - self._sc = py_sound_connect.SoundConnect(sc_config_file) + self._sc = py_sound_connect.SoundConnect(new_config_file) self.size_in = self._sc.bytesPerBlockIn() self.size_out = self._sc.bytesPerBlockOut() else: @@ -49,6 +53,9 @@ class FSMNSeleNetV2Decorator(TorchModel): f'Invalid model directory! Failed to load model file: {model_txt_file}.' ) + def __del__(self): + self.tmp_dir.cleanup() + def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: return self.model.forward(input) diff --git a/modelscope/models/multi_modal/mplug_for_all_tasks.py b/modelscope/models/multi_modal/mplug_for_all_tasks.py index 64a7dd7b..7de8d291 100644 --- a/modelscope/models/multi_modal/mplug_for_all_tasks.py +++ b/modelscope/models/multi_modal/mplug_for_all_tasks.py @@ -45,10 +45,6 @@ class MPlugForAllTasks(TorchModel): } """ - replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''), - ('[unused1]', ''), (r' +', ' '), ('[SEP]', ''), - ('[unused2]', ''), ('[CLS]', ''), ('[UNK]', '')) - # get task from config file task = Config.from_file( osp.join(self.model_dir, ModelFile.CONFIGURATION)).task @@ -60,10 +56,7 @@ class MPlugForAllTasks(TorchModel): return {OutputKeys.SCORES: output[0].tolist()} topk_ids, _ = output pred_string: List[str] = \ - self.tokenizer.decode(topk_ids[0][0]) - for _old, _new in replace_tokens_bert: - pred_string = pred_string.replace(_old, _new) - pred_string = pred_string.strip() + self.tokenizer.decode(topk_ids[0][0], skip_special_tokens=True) output_key = OutputKeys.CAPTION \ if task == Tasks.image_captioning else OutputKeys.TEXT return {output_key: pred_string} @@ -87,19 +80,4 @@ class MPlugForAllTasks(TorchModel): # evaluate topk_ids, _ = output - preds: List[str] = [ - self.tokenizer.decode(batch[0]) for batch in topk_ids - ] - for i in range(len(preds)): - for _old, _new in replace_tokens_bert: - preds[i] = preds[i].replace(_old, _new) - preds[i] = preds[i].strip() - tgts: List[str] = [ - self.tokenizer.decode(batch) - for batch in input['answer_input_ids'].cpu().numpy().tolist() - ] - for i in range(len(tgts)): - for _old, _new in replace_tokens_bert: - tgts[i] = tgts[i].replace(_old, _new) - preds[i] = preds[i].strip() - return {'preds': preds, 'tgts': tgts} + return {'sequences': [list_tensor[0] for list_tensor in topk_ids]} diff --git a/modelscope/models/nlp/task_models/text_generation.py b/modelscope/models/nlp/task_models/text_generation.py index cd8e20cf..b886f124 100644 --- 
a/modelscope/models/nlp/task_models/text_generation.py +++ b/modelscope/models/nlp/task_models/text_generation.py @@ -2,7 +2,7 @@ from typing import Any, Dict import numpy as np -from transformers.modeling_utils import PreTrainedModel +from transformers.modeling_utils import GenerationMixin from modelscope.metainfo import TaskModels from modelscope.models.builder import MODELS @@ -17,7 +17,8 @@ __all__ = ['TaskModelForTextGeneration'] @MODELS.register_module( Tasks.text_generation, module_name=TaskModels.text_generation) -class TaskModelForTextGeneration(SingleBackboneTaskModelBase, PreTrainedModel): +class TaskModelForTextGeneration(SingleBackboneTaskModelBase, GenerationMixin): + main_input_name = 'input_ids' def __init__(self, model_dir: str, *args, **kwargs): """initialize the text generation model from the `model_dir` path. diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index 68010012..7a8bfd14 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -366,6 +366,7 @@ class DistributedPipeline(Pipeline): master_port=master_port, **self.cfg.model, **kwargs), ranks) + self.models = [] def __del__(self): if hasattr(self, 'model_pool') and self.model_pool is not None: diff --git a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py index 8522ceff..d113fb3c 100644 --- a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py @@ -132,8 +132,8 @@ class Body3DKeypointsPipeline(Pipeline): device='gpu' if torch.cuda.is_available() else 'cpu') def preprocess(self, input: Input) -> Dict[str, Any]: - video_url = input - video_frames = self.read_video_frames(video_url) + self.video_url = input + video_frames = self.read_video_frames(self.video_url) if 0 == len(video_frames): res = {'success': False, 'msg': 'get video frame failed.'} return res @@ -198,7 +198,7 @@ class Body3DKeypointsPipeline(Pipeline): } if not input['success']: - pass + res[OutputKeys.OUTPUT_VIDEO] = self.video_url else: poses = input[KeypointsTypes.POSES_CAMERA] pred_3d_pose = poses.data.cpu().numpy()[ diff --git a/modelscope/pipelines/nlp/text_classification_pipeline.py b/modelscope/pipelines/nlp/text_classification_pipeline.py index 9e00ad7f..771660a5 100644 --- a/modelscope/pipelines/nlp/text_classification_pipeline.py +++ b/modelscope/pipelines/nlp/text_classification_pipeline.py @@ -3,14 +3,13 @@ from typing import Any, Dict, Union import numpy as np -from modelscope.metainfo import Pipelines +from modelscope.metainfo import Pipelines, Preprocessors from modelscope.models.base import Model -from modelscope.models.multi_modal import OfaForAllTasks from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import OfaPreprocessor, Preprocessor -from modelscope.utils.constant import Tasks +from modelscope.preprocessors import Preprocessor +from modelscope.utils.constant import Fields, Tasks @PIPELINES.register_module( @@ -58,8 +57,11 @@ class TextClassificationPipeline(Pipeline): str) else model if preprocessor is None: - if isinstance(model, OfaForAllTasks): - preprocessor = OfaPreprocessor(model_dir=model.model_dir) + if model.__class__.__name__ == 'OfaForAllTasks': + preprocessor = Preprocessor.from_pretrained( + model_name_or_path=model.model_dir, + type=Preprocessors.ofa_tasks_preprocessor, + field=Fields.multi_modal) else: first_sequence = 
kwargs.pop('first_sequence', 'first_sequence') second_sequence = kwargs.pop('second_sequence', None) @@ -76,7 +78,7 @@ class TextClassificationPipeline(Pipeline): def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: - if isinstance(self.model, OfaForAllTasks): + if self.model.__class__.__name__ == 'OfaForAllTasks': return super().forward(inputs, **forward_params) return self.model(**inputs, **forward_params) @@ -95,7 +97,7 @@ class TextClassificationPipeline(Pipeline): labels: The real labels. Label at index 0 is the smallest probability. """ - if isinstance(self.model, OfaForAllTasks): + if self.model.__class__.__name__ == 'OfaForAllTasks': return inputs else: assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py index fdde5f25..0490c8e7 100644 --- a/modelscope/pipelines/nlp/text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text_generation_pipeline.py @@ -10,6 +10,7 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import Preprocessor, build_preprocessor +from modelscope.utils.chinese_utils import remove_space_between_chinese_chars from modelscope.utils.constant import Fields, Tasks from modelscope.utils.hub import read_config @@ -78,28 +79,6 @@ class TextGenerationPipeline(Pipeline): with torch.no_grad(): return self.model.generate(inputs, **forward_params) - def _is_chinese_char(self, word: str): - chinese_punctuations = (',', '。', ';', ':' '!', '?', '《', '》') - return len(word) == 1 \ - and ('\u4e00' <= word <= '\u9fa5' or word in chinese_punctuations) - - def _remove_space_between_chinese_chars(self, decoded: str): - old_word_list = decoded.split(' ') - new_word_list = [] - start = -1 - for i, word in enumerate(old_word_list): - if self._is_chinese_char(word): - if start == -1: - start = i - else: - if start != -1: - new_word_list.append(''.join(old_word_list[start:i])) - start = -1 - new_word_list.append(word) - if start != -1: - new_word_list.append(''.join(old_word_list[start:])) - return ' '.join(new_word_list) - def decode(self, inputs) -> str: tokenizer = self.preprocessor.tokenizer return tokenizer.decode(inputs.tolist(), skip_special_tokens=True) @@ -128,5 +107,5 @@ class TextGenerationPipeline(Pipeline): if isinstance(inputs, list) or len(inputs.shape) > 1: inputs = inputs[0] decoded = getattr(self, self.postprocessor)(inputs) - text = self._remove_space_between_chinese_chars(decoded) + text = remove_space_between_chinese_chars(decoded) return {OutputKeys.TEXT: text} diff --git a/modelscope/preprocessors/base.py b/modelscope/preprocessors/base.py index be62ebb4..38500561 100644 --- a/modelscope/preprocessors/base.py +++ b/modelscope/preprocessors/base.py @@ -205,10 +205,12 @@ class Preprocessor(ABC): if 'task' in kwargs: task = kwargs.pop('task') field_name = Tasks.find_field_by_task(task) + if 'field' in kwargs: + field_name = kwargs.pop('field') sub_key = 'train' if preprocessor_mode == ModeKeys.TRAIN else 'val' - if not hasattr(cfg, 'preprocessor'): - logger.error('No preprocessor field found in cfg.') + if not hasattr(cfg, 'preprocessor') or len(cfg.preprocessor) == 0: + logger.warn('No preprocessor field found in cfg.') preprocessor_cfg = ConfigDict() else: preprocessor_cfg = cfg.preprocessor @@ -217,9 +219,8 @@ class Preprocessor(ABC): 
if sub_key in preprocessor_cfg: sub_cfg = getattr(preprocessor_cfg, sub_key) else: - logger.error( - f'No {sub_key} key and type key found in ' - f'preprocessor domain of configuration.json file.') + logger.warn(f'No {sub_key} key and type key found in ' + f'preprocessor domain of configuration.json file.') sub_cfg = preprocessor_cfg else: sub_cfg = preprocessor_cfg @@ -235,7 +236,7 @@ class Preprocessor(ABC): preprocessor = build_preprocessor(sub_cfg, field_name) else: - logger.error( + logger.warn( f'Cannot find available config to build preprocessor at mode {preprocessor_mode}, ' f'current config: {sub_cfg}. trying to build by task and model information.' ) @@ -243,13 +244,13 @@ class Preprocessor(ABC): model_type = model_cfg.type if hasattr( model_cfg, 'type') else getattr(model_cfg, 'model_type', None) if task is None or model_type is None: - logger.error( + logger.warn( f'Find task: {task}, model type: {model_type}. ' f'Insufficient information to build preprocessor, skip building preprocessor' ) return None if (model_type, task) not in PREPROCESSOR_MAP: - logger.error( + logger.warn( f'No preprocessor key {(model_type, task)} found in PREPROCESSOR_MAP, ' f'skip building preprocessor.') return None diff --git a/modelscope/preprocessors/nlp/token_classification_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_preprocessor.py index 92b7c46b..a7616736 100644 --- a/modelscope/preprocessors/nlp/token_classification_preprocessor.py +++ b/modelscope/preprocessors/nlp/token_classification_preprocessor.py @@ -73,10 +73,12 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): super().__init__(model_dir, mode=mode, **kwargs) if 'is_split_into_words' in kwargs: - self.is_split_into_words = kwargs.pop('is_split_into_words') + self.tokenize_kwargs['is_split_into_words'] = kwargs.pop( + 'is_split_into_words') else: - self.is_split_into_words = self.tokenizer.init_kwargs.get( - 'is_split_into_words', False) + self.tokenize_kwargs[ + 'is_split_into_words'] = self.tokenizer.init_kwargs.get( + 'is_split_into_words', False) if 'label2id' in kwargs: kwargs.pop('label2id') @@ -99,7 +101,6 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): if isinstance(data, str): # for inference inputs without label text = data - self.tokenize_kwargs['add_special_tokens'] = False elif isinstance(data, dict): # for finetune inputs with label text = data.get(self.first_sequence) @@ -107,11 +108,15 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): if isinstance(text, list): self.tokenize_kwargs['is_split_into_words'] = True + if self._mode == ModeKeys.INFERENCE: + self.tokenize_kwargs['add_special_tokens'] = False + input_ids = [] label_mask = [] offset_mapping = [] token_type_ids = [] - if self.is_split_into_words and self._mode == ModeKeys.INFERENCE: + if self.tokenize_kwargs[ + 'is_split_into_words'] and self._mode == ModeKeys.INFERENCE: for offset, token in enumerate(list(text)): subtoken_ids = self.tokenizer.encode(token, **self.tokenize_kwargs) @@ -125,7 +130,8 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): encodings = self.tokenizer( text, return_offsets_mapping=True, **self.tokenize_kwargs) attention_mask = encodings['attention_mask'] - token_type_ids = encodings['token_type_ids'] + if 'token_type_ids' in encodings: + token_type_ids = encodings['token_type_ids'] input_ids = encodings['input_ids'] word_ids = encodings.word_ids() for i in range(len(word_ids)): diff --git 
a/modelscope/preprocessors/ofa/image_captioning.py b/modelscope/preprocessors/ofa/image_captioning.py index af623297..5fb83908 100644 --- a/modelscope/preprocessors/ofa/image_captioning.py +++ b/modelscope/preprocessors/ofa/image_captioning.py @@ -43,7 +43,7 @@ class OfaImageCaptioningPreprocessor(OfaBasePreprocessor): def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: sample = self._build_infer_sample(data) - target = data[self.column_map['text']] + target = sample['label'] target = target.translate(self.transtab).strip() target_token_list = target.strip().split() target = ' '.join(target_token_list[:self.max_tgt_length]) diff --git a/modelscope/preprocessors/ofa/image_classification.py b/modelscope/preprocessors/ofa/image_classification.py index 49968823..038a9e15 100644 --- a/modelscope/preprocessors/ofa/image_classification.py +++ b/modelscope/preprocessors/ofa/image_classification.py @@ -1,13 +1,20 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import functools from typing import Any, Dict import torch -from PIL import Image +from PIL import Image, ImageFile +from timm.data import create_transform from torchvision import transforms from modelscope.preprocessors.image import load_image from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor +from .utils.vision_helper import RandomAugment + +ImageFile.LOAD_TRUNCATED_IMAGES = True +ImageFile.MAX_IMAGE_PIXELS = None +Image.MAX_IMAGE_PIXELS = None class OfaImageClassificationPreprocessor(OfaBasePreprocessor): @@ -28,18 +35,77 @@ class OfaImageClassificationPreprocessor(OfaBasePreprocessor): super(OfaImageClassificationPreprocessor, self).__init__(cfg, model_dir, mode, *args, **kwargs) # Initialize transform - self.patch_resize_transform = transforms.Compose([ - lambda image: image.convert('RGB'), - transforms.Resize( - (self.patch_image_size, self.patch_image_size), - interpolation=transforms.InterpolationMode.BICUBIC), - transforms.ToTensor(), - transforms.Normalize(mean=self.mean, std=self.std), - ]) + if self.mode != ModeKeys.TRAIN: + self.patch_resize_transform = transforms.Compose([ + lambda image: image.convert('RGB'), + transforms.Resize( + (self.patch_image_size, self.patch_image_size), + interpolation=transforms.InterpolationMode.BICUBIC), + transforms.ToTensor(), + transforms.Normalize(mean=self.mean, std=self.std), + ]) + else: + self.patch_resize_transform = create_transform( + input_size=self.patch_image_size, + is_training=True, + color_jitter=0.4, + auto_augment='rand-m9-mstd0.5-inc1', + interpolation='bicubic', + re_prob=0.25, + re_mode='pixel', + re_count=1, + mean=self.mean, + std=self.std) + self.patch_resize_transform = transforms.Compose( + functools.reduce(lambda x, y: x + y, [ + [ + lambda image: image.convert('RGB'), + ], + self.patch_resize_transform.transforms[:2], + [self.patch_resize_transform.transforms[2]], + [ + RandomAugment( + 2, + 7, + isPIL=True, + augs=[ + 'Identity', 'AutoContrast', 'Equalize', + 'Brightness', 'Sharpness', 'ShearX', 'ShearY', + 'TranslateX', 'TranslateY', 'Rotate' + ]), + ], + self.patch_resize_transform.transforms[3:], + ])) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: - image = data['image'] if isinstance( - data['image'], Image.Image) else load_image(data['image']) + if self.mode == ModeKeys.TRAIN: + return self._build_train_sample(data) + else: + return self._build_infer_sample(data) + + def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + sample = self._build_infer_sample(data) + target = ' 
{}'.format(sample['label']) + sample['ref_dict'] = {sample['label']: 1.0} + sample['target'] = self.tokenize_text(target, add_bos=False) + sample['prev_output_tokens'] = torch.cat( + [self.bos_item, sample['target'][:-1]]) + + if self.constraint_trie is not None: + constraint_mask = torch.zeros((len(sample['prev_output_tokens']), + len(self.tgt_dict))).bool() + for i in range(len(sample['prev_output_tokens'])): + constraint_prefix_token = sample[ + 'prev_output_tokens'][:i + 1].tolist() + constraint_nodes = self.constraint_trie.get_next_layer( + constraint_prefix_token) + constraint_mask[i][constraint_nodes] = True + sample['constraint_mask'] = constraint_mask + + return sample + + def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + image = self.get_img_pil(data[self.column_map['image']]) patch_image = self.patch_resize_transform(image) prompt = self.cfg.model.get('prompt', ' what does the image describe?') inputs = self.tokenize_text(prompt) @@ -48,4 +114,6 @@ class OfaImageClassificationPreprocessor(OfaBasePreprocessor): 'patch_image': patch_image, 'patch_mask': torch.tensor([True]) } + if 'text' in self.column_map and self.column_map['text'] in data: + sample['label'] = data[self.column_map['text']] return sample diff --git a/modelscope/preprocessors/ofa/ocr_recognition.py b/modelscope/preprocessors/ofa/ocr_recognition.py index 58e3ea6e..e15be93f 100644 --- a/modelscope/preprocessors/ofa/ocr_recognition.py +++ b/modelscope/preprocessors/ofa/ocr_recognition.py @@ -11,9 +11,6 @@ from zhconv import convert from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor -IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) -IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) - def ocr_resize(img, patch_image_size, is_document=False): img = img.convert('RGB') @@ -112,6 +109,6 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): } if 'text' in self.column_map and self.column_map['text'] in data: target = data[self.column_map['text']] - target = unicodedata2.normalize('NFKC', convert(target, 'zh-hans')) - sample['label'] = target + sample['label'] = unicodedata2.normalize( + 'NFKC', convert(target, 'zh-hans')) return sample diff --git a/modelscope/preprocessors/ofa/summarization.py b/modelscope/preprocessors/ofa/summarization.py index cfd3c23d..d33e9d25 100644 --- a/modelscope/preprocessors/ofa/summarization.py +++ b/modelscope/preprocessors/ofa/summarization.py @@ -1,6 +1,8 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import Any, Dict +import torch + from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor @@ -24,9 +26,26 @@ class OfaSummarizationPreprocessor(OfaBasePreprocessor): self).__init__(cfg, model_dir, mode, *args, **kwargs) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: + if self.mode == ModeKeys.TRAIN: + return self._build_train_sample(data) + else: + return self._build_infer_sample(data) + + def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + sample = self._build_infer_sample(data) + target_str = sample['label'].lower() + target = super().pre_caption(target_str, max_words=self.max_tgt_length) + target = target.replace('[unk]', 'unk').replace('', 'unk') + sample['target'] = self.tokenize_text(target, add_bos=False) + noise_target_item = self.add_noise_to_tgt( + sample['target'][:-1].clone()) + sample['prev_output_tokens'] = torch.cat( + [self.bos_item, noise_target_item]) + return sample + + def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: source = super().pre_caption( - data['text'], max_words=self.max_src_length) - source = source.strip()[:self.max_src_length] + data[self.column_map['text']], max_words=self.max_src_length) source = source.replace('[unk]', 'unk').replace('', 'unk') prompt = self.cfg.model.get( 'prompt', ' " {} " Summarize the article with a title: ') @@ -42,4 +61,17 @@ class OfaSummarizationPreprocessor(OfaBasePreprocessor): 'source': inputs, 'decoder_prompt': decoder_prompt, } + if 'summary' in self.column_map and self.column_map['summary'] in data: + sample['label'] = data[self.column_map['summary']] return sample + + def add_noise_to_tgt(self, target): + noise_indices = torch.FloatTensor( + target.size(0)).uniform_() < self.cfg.model.get( + 'noise_ratio', 0.0) + target[noise_indices] = torch.randint( + 4, + len(self.src_dict) - self.cfg.model.get('num_codes', 8192) + - self.cfg.model.get('num_bins', 1000), + size=(noise_indices.sum(), )) + return target diff --git a/modelscope/preprocessors/ofa/visual_entailment.py b/modelscope/preprocessors/ofa/visual_entailment.py index 61c3cc6a..fff5bbd3 100644 --- a/modelscope/preprocessors/ofa/visual_entailment.py +++ b/modelscope/preprocessors/ofa/visual_entailment.py @@ -38,18 +38,64 @@ class OfaVisualEntailmentPreprocessor(OfaBasePreprocessor): ]) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: - image = data['image'] if isinstance( - data['image'], Image.Image) else load_image(data['image']) + if self.mode == ModeKeys.TRAIN: + return self._build_train_sample(data) + else: + return self._build_infer_sample(data) + + def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + sample = self._build_infer_sample(data) + target = ' {}'.format(sample['label']) + sample['ref_dict'] = {sample['label']: 1.0} + tgt_item = self.tokenize_text(target, add_bos=False, add_eos=False) + + if self.prompt_type == 'none': + prev_output_item = torch.cat([self.bos_item, tgt_item]) + target_item = torch.cat([prev_output_item[1:], self.eos_item]) + elif self.prompt_type == 'src': + prev_output_item = torch.cat([sample['source'], tgt_item]) + target_item = torch.cat([prev_output_item[1:], self.eos_item]) + elif self.prompt_type == 'prev_output': + prev_output_item = torch.cat([sample['source'][:-1], tgt_item]) + target_item = torch.cat([prev_output_item[1:], self.eos_item]) + else: + raise NotImplementedError + + target_item[:-len(tgt_item) - 1] = self.tokenizer.pad_token_id + sample['target'] = target_item + 
sample['prev_output_tokens'] = prev_output_item + + if self.constraint_trie is not None: + constraint_mask = torch.zeros( + (len(target_item), len(self.tgt_dict))).bool() + start_idx = len(target_item) - len(tgt_item) - 1 + for i in range( + len(target_item) - len(tgt_item) - 1, len(target_item)): + constraint_prefix_token = [ + self.tgt_dict.bos() + ] + target_item[start_idx:i].tolist() + constraint_nodes = self.constraint_trie.get_next_layer( + constraint_prefix_token) + constraint_mask[i][constraint_nodes] = True + sample['constraint_mask'] = constraint_mask + + return sample + + def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + image = self.get_img_pil(data[self.column_map['image']]) patch_image = self.patch_resize_transform(image) if 'text2' not in data: - hypothesis = self.pre_caption(data['text'], self.max_src_length) + hypothesis = self.pre_caption(data[self.column_map['text']], + self.max_src_length) prompt = self.cfg.model.get('prompt', ' does the image describe " {} "?') text = prompt.format(hypothesis) else: assert 'text' in data, f'text must be in the input {data.keys()}' - caption = self.pre_caption(data['text2'], self.max_src_length) - hypothesis = self.pre_caption(data['text'], self.max_src_length) + caption = self.pre_caption(data[self.column_map['text2']], + self.max_src_length) + hypothesis = self.pre_caption(data[self.column_map['text']], + self.max_src_length) prompt = self.cfg.model.get( 'prompt', ' can image and text1 " {} " imply text2 " {} "?') text = prompt.format(caption, hypothesis) @@ -68,4 +114,7 @@ class OfaVisualEntailmentPreprocessor(OfaBasePreprocessor): 'patch_mask': torch.tensor([True]), 'decoder_prompt': decoder_prompt, } + if 'relation' in self.column_map and self.column_map[ + 'relation'] in data: + sample['label'] = data[self.column_map['relation']] return sample diff --git a/modelscope/preprocessors/ofa/visual_grounding.py b/modelscope/preprocessors/ofa/visual_grounding.py index 8b116463..2da79670 100644 --- a/modelscope/preprocessors/ofa/visual_grounding.py +++ b/modelscope/preprocessors/ofa/visual_grounding.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import Any, Dict +import numpy as np import torch from PIL import Image from torchvision import transforms @@ -8,6 +9,7 @@ from torchvision import transforms from modelscope.preprocessors.image import load_image from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor +from .utils import transforms as T class OfaVisualGroundingPreprocessor(OfaBasePreprocessor): @@ -27,24 +29,98 @@ class OfaVisualGroundingPreprocessor(OfaBasePreprocessor): """ super(OfaVisualGroundingPreprocessor, self).__init__(cfg, model_dir, mode, *args, **kwargs) - # Initialize transform - self.patch_resize_transform = transforms.Compose([ - lambda image: image.convert('RGB'), - transforms.Resize( - (self.patch_image_size, self.patch_image_size), - interpolation=transforms.InterpolationMode.BICUBIC), - transforms.ToTensor(), - transforms.Normalize(mean=self.mean, std=self.std), - ]) + + self.num_bins = self.cfg.model.get('num_bins', 1000) + if self.mode == ModeKeys.TRAIN: + # for positioning + self.positioning_transform = T.Compose([ + T.RandomResize([self.patch_image_size], + max_size=self.patch_image_size), + T.ToTensor(), + T.Normalize( + mean=self.mean, + std=self.std, + max_image_size=self.max_image_size) + ]) + else: + # Initialize transform + self.patch_resize_transform = transforms.Compose([ + lambda image: image.convert('RGB'), + transforms.Resize( + (self.patch_image_size, self.patch_image_size), + interpolation=transforms.InterpolationMode.BICUBIC), + transforms.ToTensor(), + transforms.Normalize(mean=self.mean, std=self.std), + ]) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: - image = data['image'] if isinstance( - data['image'], Image.Image) else load_image(data['image']) + if self.mode == ModeKeys.TRAIN: + return self._build_train_sample(data) + else: + return self._build_infer_sample(data) + + def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + image = self.get_img_pil(data[self.column_map['image']]) + w, h = image.size + boxes_target = { + 'boxes': [], + 'labels': [], + 'area': [], + 'size': torch.tensor([h, w]) + } + x0, y0, x1, y1 = data[self.column_map['region_coord']].strip().split( + ',') + region = torch.tensor([float(x0), float(y0), float(x1), float(y1)]) + boxes_target['boxes'] = torch.tensor( + [[float(x0), float(y0), float(x1), + float(y1)]]) + boxes_target['labels'] = np.array([0]) + area = [(float(x1) - float(x0)) * (float(y1) - float(y0))] + boxes_target['area'] = torch.tensor(area) + + patch_image, patch_boxes = self.positioning_transform( + image, boxes_target) + resize_h, resize_w = patch_boxes['size'][0], patch_boxes['size'][1] + quant_x0 = ''.format( + int((patch_boxes['boxes'][0][0] * (self.num_bins - 1)).round())) + quant_y0 = ''.format( + int((patch_boxes['boxes'][0][1] * (self.num_bins - 1)).round())) + quant_x1 = ''.format( + int((patch_boxes['boxes'][0][2] * (self.num_bins - 1)).round())) + quant_y1 = ''.format( + int((patch_boxes['boxes'][0][3] * (self.num_bins - 1)).round())) + region_coord = '{} {} {} {}'.format(quant_x0, quant_y0, quant_x1, + quant_y1) + src_caption = self.pre_caption(data[self.column_map['text']], + self.max_src_length) + prompt = self.cfg.model.get( + 'prompt', ' which region does the text " {} " describe?') + text = prompt.format(src_caption) + src_item = self.tokenize_text(text) + target_item = self.tokenize_text( + region_coord, add_bos=False) # !!! 
use_bpe=False + prev_output_item = torch.cat([self.bos_item, target_item[:-1]]) + + sample = { + 'source': src_item, + 'patch_image': patch_image, + 'patch_mask': torch.tensor([True]), + 'target': target_item, + 'prev_output_tokens': prev_output_item, + 'w_resize_ratio': resize_w / w, + 'h_resize_ratio': resize_h / h, + 'region_coord': region + } + return sample + + def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + image = self.get_img_pil(data[self.column_map['image']]) w, h = image.size patch_image = self.patch_resize_transform(image) w_resize_ratio = torch.tensor(self.patch_image_size / w) h_resize_ratio = torch.tensor(self.patch_image_size / h) - src_caption = self.pre_caption(data['text'], self.max_src_length) + src_caption = self.pre_caption(data[self.column_map['text']], + self.max_src_length) prompt = self.cfg.model.get( 'prompt', ' which region does the text " {} " describe?') text = prompt.format(src_caption) @@ -56,4 +132,10 @@ class OfaVisualGroundingPreprocessor(OfaBasePreprocessor): 'w_resize_ratio': w_resize_ratio, 'h_resize_ratio': h_resize_ratio, } + + if 'region_coord' in self.column_map and self.column_map[ + 'region_coord'] in data: + x0, y0, x1, y1 = data[ + self.column_map['region_coord']].strip().split(',') + sample['label'] = [float(x0), float(y0), float(x1), float(y1)] return sample diff --git a/modelscope/preprocessors/ofa/visual_question_answering.py b/modelscope/preprocessors/ofa/visual_question_answering.py index 11104e7e..b83cf935 100644 --- a/modelscope/preprocessors/ofa/visual_question_answering.py +++ b/modelscope/preprocessors/ofa/visual_question_answering.py @@ -38,10 +38,52 @@ class OfaVisualQuestionAnsweringPreprocessor(OfaBasePreprocessor): ]) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: - image = data['image'] if isinstance( - data['image'], Image.Image) else load_image(data['image']) + if self.mode == ModeKeys.TRAIN: + return self._build_train_sample(data) + else: + return self._build_infer_sample(data) + + def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + sample = self._build_infer_sample(data) + tgt_item = self.tokenize_text( + ' {}'.format(sample['label']), add_bos=False, add_eos=False) + + if self.prompt_type == 'none': + prev_output_item = torch.cat([self.bos_item, tgt_item]) + target_item = torch.cat([prev_output_item[1:], self.eos_item]) + elif self.prompt_type == 'src': + prev_output_item = torch.cat([sample['source'], tgt_item]) + target_item = torch.cat([prev_output_item[1:], self.eos_item]) + elif self.prompt_type == 'prev_output': + prev_output_item = torch.cat([sample['source'][:-1], tgt_item]) + target_item = torch.cat([prev_output_item[1:], self.eos_item]) + else: + raise NotImplementedError + target_item[:-len(tgt_item) - 1] = self.tokenizer.pad_token_id + + sample['prev_output_tokens'] = prev_output_item + sample['target'] = target_item + + if self.constraint_trie is not None: + constraint_mask = torch.zeros( + (len(target_item), len(self.tgt_dict))).bool() + start_idx = len(target_item) - len(tgt_item) - 1 + for i in range( + len(target_item) - len(tgt_item) - 1, len(target_item)): + constraint_prefix_token = [ + self.tgt_dict.bos() + ] + target_item[start_idx:i].tolist() + constraint_nodes = self.constraint_trie.get_next_layer( + constraint_prefix_token) + constraint_mask[i][constraint_nodes] = True + sample['constraint_mask'] = constraint_mask + + return sample + + def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + image = 
self.get_img_pil(data[self.column_map['image']]) patch_image = self.patch_resize_transform(image) - text = ' {}'.format(data['text']) + text = ' {}'.format(data[self.column_map['text']]) inputs = self.tokenize_text(text) if self.prompt_type == 'none': decoder_prompt = self.bos_item @@ -57,4 +99,6 @@ class OfaVisualQuestionAnsweringPreprocessor(OfaBasePreprocessor): 'patch_mask': torch.tensor([True]), 'decoder_prompt': decoder_prompt, } + if 'answer' in self.column_map and self.column_map['answer'] in data: + sample['label'] = data[self.column_map['answer']] return sample diff --git a/modelscope/trainers/audio/kws_farfield_trainer.py b/modelscope/trainers/audio/kws_farfield_trainer.py index a720ced5..85c1a496 100644 --- a/modelscope/trainers/audio/kws_farfield_trainer.py +++ b/modelscope/trainers/audio/kws_farfield_trainer.py @@ -69,11 +69,14 @@ class KWSFarfieldTrainer(BaseTrainer): super().__init__(cfg_file, arg_parse_fn) - self.model = self.build_model() - self.work_dir = work_dir # the number of model output dimension # should update config outside the trainer, if user need more wake word + num_syn = kwargs.get('num_syn', None) + if num_syn: + self.cfg.model.num_syn = num_syn self._num_classes = self.cfg.model.num_syn + self.model = self.build_model() + self.work_dir = work_dir if kwargs.get('launcher', None) is not None: init_dist(kwargs['launcher']) diff --git a/modelscope/trainers/multi_modal/__init__.py b/modelscope/trainers/multi_modal/__init__.py index 448f23a3..6840b573 100644 --- a/modelscope/trainers/multi_modal/__init__.py +++ b/modelscope/trainers/multi_modal/__init__.py @@ -6,11 +6,15 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .clip import CLIPTrainer from .team import TEAMImgClsTrainer + from .ofa import OFATrainer + from .mplug import MPlugTrainer else: _import_structure = { 'clip': ['CLIPTrainer'], - 'team': ['TEAMImgClsTrainer'] + 'team': ['TEAMImgClsTrainer'], + 'ofa': ['OFATrainer'], + 'mplug': ['MPlugTrainer'], } import sys diff --git a/modelscope/trainers/multi_modal/mplug/__init__.py b/modelscope/trainers/multi_modal/mplug/__init__.py new file mode 100644 index 00000000..caf7e3f0 --- /dev/null +++ b/modelscope/trainers/multi_modal/mplug/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from .mplug_trainer import MPlugTrainer diff --git a/modelscope/trainers/multi_modal/mplug/mplug_trainer.py b/modelscope/trainers/multi_modal/mplug/mplug_trainer.py new file mode 100644 index 00000000..def66220 --- /dev/null +++ b/modelscope/trainers/multi_modal/mplug/mplug_trainer.py @@ -0,0 +1,40 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +from collections.abc import Mapping + +import torch + +from modelscope.metainfo import Trainers +from modelscope.outputs import OutputKeys +from modelscope.trainers import NlpEpochBasedTrainer +from modelscope.trainers.builder import TRAINERS +from modelscope.utils.file_utils import func_receive_dict_inputs + + +@TRAINERS.register_module(module_name=Trainers.mplug) +class MPlugTrainer(NlpEpochBasedTrainer): + + def _decode(self, tokens): + tokenizer = self.eval_preprocessor.tokenizer + return tokenizer.decode(tokens, skip_special_tokens=True) + + def evaluation_step(self, data): + model = self.model.module if self._dist else self.model + model.eval() + + with torch.no_grad(): + if isinstance( + data, + Mapping) and not func_receive_dict_inputs(model.forward): + result = model.forward(**data) + else: + result = model.forward(data) + + result[OutputKeys.TEXT] = [ + self._decode(seq) for seq in result['sequences'] + ] + data[OutputKeys.LABELS] = [ + self._decode(seq) for seq in data['answer_input_ids'] + ] + + return result diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py index f8028c6c..71494768 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py @@ -34,6 +34,7 @@ class OFATrainer(EpochBasedTrainer): self, model: Optional[Union[TorchModel, nn.Module, str]] = None, cfg_file: Optional[str] = None, + cfg_modify_fn: Optional[Callable] = None, arg_parse_fn: Optional[Callable] = None, data_collator: Optional[Union[Callable, Dict[str, Callable]]] = None, @@ -49,7 +50,8 @@ class OFATrainer(EpochBasedTrainer): **kwargs): model = Model.from_pretrained(model, revision=model_revision) model_dir = model.model_dir - cfg = Config.from_file(cfg_file) + self.cfg_modify_fn = cfg_modify_fn + cfg = self.rebuild_config(Config.from_file(cfg_file)) if 'work_dir' not in kwargs or len(kwargs['work_dir']) == 0: work_dir = cfg.train.work_dir else: @@ -57,10 +59,12 @@ class OFATrainer(EpochBasedTrainer): tokenizer_files = { 'zh': [ 'tokenizer.json', 'tokenizer_config.json', 'vocab.txt', - 'config.json' + 'config.json', 'ans2label.json' + ], + 'en': [ + 'tokenizer.json', 'vocab.json', 'merges.txt', 'config.json', + 'ans2label.json' ], - 'en': - ['tokenizer.json', 'vocab.json', 'merges.txt', 'config.json'], } for filename in tokenizer_files[cfg.model.get('language', 'en')]: finetune_file = os.path.join(work_dir, filename) @@ -127,6 +131,11 @@ class OFATrainer(EpochBasedTrainer): **kwargs, ) + def rebuild_config(self, cfg: Config): + if self.cfg_modify_fn is not None: + cfg = self.cfg_modify_fn(cfg) + return cfg + def train_step(self, model, inputs): model.train() loss, sample_size, logging_output = self.criterion(model, inputs) diff --git a/modelscope/utils/chinese_utils.py b/modelscope/utils/chinese_utils.py new file mode 100644 index 00000000..e5fe7aa8 --- /dev/null +++ b/modelscope/utils/chinese_utils.py @@ -0,0 +1,35 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+
+
+def is_chinese_char(word: str):
+    chinese_punctuations = {
+        ',', '。', ';', ':',
+        '!', '?', '《', '》', '‘', '’', '“', '”', '(', ')', '【', '】'
+    }
+    return len(word) == 1 \
+        and ('\u4e00' <= word <= '\u9fa5' or word in chinese_punctuations)
+
+
+def remove_space_between_chinese_chars(decoded_str: str):
+    old_word_list = decoded_str.split(' ')
+    new_word_list = []
+    start = -1
+    for i, word in enumerate(old_word_list):
+        if is_chinese_char(word):
+            if start == -1:
+                start = i
+        else:
+            if start != -1:
+                new_word_list.append(''.join(old_word_list[start:i]))
+                start = -1
+            new_word_list.append(word)
+    if start != -1:
+        new_word_list.append(''.join(old_word_list[start:]))
+    return ' '.join(new_word_list).strip()
+
+
+# add space for each chinese char
+def rebuild_chinese_str(string: str):
+    return ' '.join(''.join([
+        f' {char} ' if is_chinese_char(char) else char for char in string
+    ]).split())
diff --git a/modelscope/utils/regress_test_utils.py b/modelscope/utils/regress_test_utils.py
index be983c6c..58b5b1a3 100644
--- a/modelscope/utils/regress_test_utils.py
+++ b/modelscope/utils/regress_test_utils.py
@@ -5,6 +5,7 @@ import hashlib
 import os
 import pickle
 import random
+import re
 import shutil
 import tempfile
 from collections import OrderedDict
@@ -759,3 +760,18 @@ def compare_cfg_and_optimizers(baseline_json,
                                state2, **kwargs) and match
     return match
+
+
+class IgnoreKeyFn:
+
+    def __init__(self, keys):
+        if isinstance(keys, str):
+            keys = [keys]
+        self.keys = keys if isinstance(keys, list) else []
+
+    def __call__(self, v1output, v2output, key, type):
+        for _key in self.keys:
+            pattern = re.compile(_key)
+            if key is not None and pattern.fullmatch(key):
+                return True
+        return None
diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py
index 35202b88..64833026 100644
--- a/tests/pipelines/test_fill_mask.py
+++ b/tests/pipelines/test_fill_mask.py
@@ -11,7 +11,7 @@ from modelscope.pipelines.nlp import FillMaskPipeline
 from modelscope.preprocessors import NLPPreprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.demo_utils import DemoCompatibilityCheck
-from modelscope.utils.regress_test_utils import MsRegressTool
+from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool
 from modelscope.utils.test_utils import test_level
@@ -109,7 +109,9 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck):
         pipeline_ins = pipeline(
             task=Tasks.fill_mask, model=model, preprocessor=preprocessor)
         with self.regress_tool.monitor_module_single_forward(
-                pipeline_ins.model, f'fill_mask_sbert_{language}'):
+                pipeline_ins.model,
+                f'fill_mask_sbert_{language}',
+                compare_fn=IgnoreKeyFn('.*intermediate_act_fn')):
             print(
                 f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: '
                 f'{pipeline_ins(self.test_inputs[language])}\n')
@@ -124,7 +126,9 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck):
         ori_text = self.ori_texts[language]
         test_input = self.test_inputs[language].replace('[MASK]', '')
         with self.regress_tool.monitor_module_single_forward(
-                pipeline_ins.model, f'fill_mask_veco_{language}'):
+                pipeline_ins.model,
+                f'fill_mask_veco_{language}',
+                compare_fn=IgnoreKeyFn('.*intermediate_act_fn')):
             print(
                 f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
                 f'{pipeline_ins(test_input)}\n')
diff --git a/tests/pipelines/test_multilingual_named_entity_recognition.py
b/tests/pipelines/test_multilingual_named_entity_recognition.py index 6f72c83c..cb2b32d6 100644 --- a/tests/pipelines/test_multilingual_named_entity_recognition.py +++ b/tests/pipelines/test_multilingual_named_entity_recognition.py @@ -27,6 +27,9 @@ class MultilingualNamedEntityRecognitionTest(unittest.TestCase, viet_tcrf_model_id = 'damo/nlp_xlmr_named-entity-recognition_viet-ecommerce-title' viet_sentence = 'Nón vành dễ thương cho bé gái' + multilingual_model_id = 'damo/nlp_raner_named-entity-recognition_multilingual-large-generic' + ml_stc = 'সমস্ত বেতন নিলামের সাধারণ ব্যবহারিক উদাহরণ বিভিন্ন পেনি নিলাম / বিডিং ফি নিলাম ওয়েবসাইটে পাওয়া যাবে।' + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_tcrf_by_direct_model_download_thai(self): cache_path = snapshot_download(self.thai_tcrf_model_id) @@ -60,6 +63,13 @@ class MultilingualNamedEntityRecognitionTest(unittest.TestCase, task=Tasks.named_entity_recognition, model=self.thai_tcrf_model_id) print(pipeline_ins(input=self.thai_sentence)) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_tcrf_with_model_name_multilingual(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, + model=self.multilingual_model_id) + print(pipeline_ins(input=self.ml_stc)) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_tcrf_by_direct_model_download_viet(self): cache_path = snapshot_download(self.viet_tcrf_model_id) diff --git a/tests/pipelines/test_named_entity_recognition.py b/tests/pipelines/test_named_entity_recognition.py index aef4aaed..0df44f5b 100644 --- a/tests/pipelines/test_named_entity_recognition.py +++ b/tests/pipelines/test_named_entity_recognition.py @@ -20,10 +20,12 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): self.model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news' english_model_id = 'damo/nlp_raner_named-entity-recognition_english-large-ecom' + chinese_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-large-generic' tcrf_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news' lcrf_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-news' sentence = '这与温岭市新河镇的一个神秘的传说有关。' sentence_en = 'pizza shovel' + sentence_zh = '他 继 续 与 貝 塞 斯 達 遊 戲 工 作 室 在 接 下 来 辐 射 4 游 戏 。' @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_tcrf_by_direct_model_download(self): @@ -91,11 +93,17 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): task=Tasks.named_entity_recognition, model=self.lcrf_model_id) print(pipeline_ins(input=self.sentence)) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_lcrf_with_chinese_model_name(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, model=self.chinese_model_id) + print(pipeline_ins(input=self.sentence_zh)) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_english_with_model_name(self): pipeline_ins = pipeline( task=Tasks.named_entity_recognition, model=self.english_model_id) - print(pipeline_ins(input='pizza shovel')) + print(pipeline_ins(input=self.sentence_en)) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): diff --git a/tests/pipelines/test_nli.py b/tests/pipelines/test_nli.py index 5f2dcb25..9e9fefea 100644 --- a/tests/pipelines/test_nli.py +++ b/tests/pipelines/test_nli.py @@ 
-3,13 +3,12 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import TextClassificationPipeline from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.regress_test_utils import MsRegressTool +from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool from modelscope.utils.test_utils import test_level @@ -48,7 +47,9 @@ class NLITest(unittest.TestCase, DemoCompatibilityCheck): def test_run_with_model_name(self): pipeline_ins = pipeline(task=Tasks.nli, model=self.model_id) with self.regress_tool.monitor_module_single_forward( - pipeline_ins.model, 'sbert_nli'): + pipeline_ins.model, + 'sbert_nli', + compare_fn=IgnoreKeyFn('.*intermediate_act_fn')): print(pipeline_ins(input=(self.sentence1, self.sentence2))) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') diff --git a/tests/pipelines/test_sentence_similarity.py b/tests/pipelines/test_sentence_similarity.py index 76db0a8f..904caea3 100644 --- a/tests/pipelines/test_sentence_similarity.py +++ b/tests/pipelines/test_sentence_similarity.py @@ -9,7 +9,7 @@ from modelscope.pipelines.nlp import TextClassificationPipeline from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.regress_test_utils import MsRegressTool +from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool from modelscope.utils.test_utils import test_level @@ -54,7 +54,9 @@ class SentenceSimilarityTest(unittest.TestCase, DemoCompatibilityCheck): pipeline_ins = pipeline( task=Tasks.sentence_similarity, model=self.model_id) with self.regress_tool.monitor_module_single_forward( - pipeline_ins.model, 'sbert_sen_sim'): + pipeline_ins.model, + 'sbert_sen_sim', + compare_fn=IgnoreKeyFn('.*intermediate_act_fn')): print(pipeline_ins(input=(self.sentence1, self.sentence2))) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') diff --git a/tests/pipelines/test_word_segmentation.py b/tests/pipelines/test_word_segmentation.py index cd01b98f..6969c0e6 100644 --- a/tests/pipelines/test_word_segmentation.py +++ b/tests/pipelines/test_word_segmentation.py @@ -9,7 +9,7 @@ from modelscope.pipelines.nlp import WordSegmentationPipeline from modelscope.preprocessors import TokenClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.regress_test_utils import MsRegressTool +from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool from modelscope.utils.test_utils import test_level @@ -48,10 +48,14 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck): pipeline_ins = pipeline( task=Tasks.word_segmentation, model=self.model_id) with self.regress_tool.monitor_module_single_forward( - pipeline_ins.model, 'sbert_ws_zh'): + pipeline_ins.model, + 'sbert_ws_zh', + compare_fn=IgnoreKeyFn('.*intermediate_act_fn')): print(pipeline_ins(input=self.sentence)) with self.regress_tool.monitor_module_single_forward( - pipeline_ins.model, 'sbert_ws_en'): + pipeline_ins.model, + 'sbert_ws_en', + 
diff --git a/tests/pipelines/test_word_segmentation.py b/tests/pipelines/test_word_segmentation.py
index cd01b98f..6969c0e6 100644
--- a/tests/pipelines/test_word_segmentation.py
+++ b/tests/pipelines/test_word_segmentation.py
@@ -9,7 +9,7 @@ from modelscope.pipelines.nlp import WordSegmentationPipeline
 from modelscope.preprocessors import TokenClassificationPreprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.demo_utils import DemoCompatibilityCheck
-from modelscope.utils.regress_test_utils import MsRegressTool
+from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool
 from modelscope.utils.test_utils import test_level
 
 
@@ -48,10 +48,14 @@ class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
         pipeline_ins = pipeline(
             task=Tasks.word_segmentation, model=self.model_id)
         with self.regress_tool.monitor_module_single_forward(
-                pipeline_ins.model, 'sbert_ws_zh'):
+                pipeline_ins.model,
+                'sbert_ws_zh',
+                compare_fn=IgnoreKeyFn('.*intermediate_act_fn')):
             print(pipeline_ins(input=self.sentence))
         with self.regress_tool.monitor_module_single_forward(
-                pipeline_ins.model, 'sbert_ws_en'):
+                pipeline_ins.model,
+                'sbert_ws_en',
+                compare_fn=IgnoreKeyFn('.*intermediate_act_fn')):
             print(pipeline_ins(input=self.sentence_eng))
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
diff --git a/tests/pipelines/test_zero_shot_classification.py b/tests/pipelines/test_zero_shot_classification.py
index 6a98132a..00789707 100644
--- a/tests/pipelines/test_zero_shot_classification.py
+++ b/tests/pipelines/test_zero_shot_classification.py
@@ -9,7 +9,7 @@ from modelscope.pipelines.nlp import ZeroShotClassificationPipeline
 from modelscope.preprocessors import ZeroShotClassificationPreprocessor
 from modelscope.utils.constant import Tasks
 from modelscope.utils.demo_utils import DemoCompatibilityCheck
-from modelscope.utils.regress_test_utils import MsRegressTool
+from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool
 from modelscope.utils.test_utils import test_level
 
 
@@ -65,7 +65,9 @@ class ZeroShotClassificationTest(unittest.TestCase, DemoCompatibilityCheck):
         pipeline_ins = pipeline(
             task=Tasks.zero_shot_classification, model=self.model_id)
         with self.regress_tool.monitor_module_single_forward(
-                pipeline_ins.model, 'sbert_zero_shot'):
+                pipeline_ins.model,
+                'sbert_zero_shot',
+                compare_fn=IgnoreKeyFn('.*intermediate_act_fn')):
             print(
                 pipeline_ins(
                     input=self.sentence, candidate_labels=self.labels))
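Similarly, a usage sketch for the zero-shot classification pipeline monitored above: `candidate_labels` is supplied at call time, exactly as in the test. The model id, input text, and labels are illustrative assumptions.

```python
# Usage sketch: zero-shot classification with candidate labels passed at
# call time. Model id, input text and labels are illustrative.
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

classifier = pipeline(
    task=Tasks.zero_shot_classification,
    model='damo/nlp_structbert_zero-shot-classification_chinese-base')
print(classifier(
    input='全新突破，运-20加油机曝光',
    candidate_labels=['军事', '体育', '娱乐', '文化']))
```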
diff --git a/tests/trainers/test_finetune_mplug.py b/tests/trainers/test_finetune_mplug.py
index 4972a731..46664114 100644
--- a/tests/trainers/test_finetune_mplug.py
+++ b/tests/trainers/test_finetune_mplug.py
@@ -20,10 +20,7 @@ class TestFinetuneMPlug(unittest.TestCase):
         self.tmp_dir = tempfile.TemporaryDirectory().name
         if not os.path.exists(self.tmp_dir):
             os.makedirs(self.tmp_dir)
-        from modelscope.utils.constant import DownloadMode
-        datadict = MsDataset.load(
-            'coco_captions_small_slice',
-            download_mode=DownloadMode.FORCE_REDOWNLOAD)
+        datadict = MsDataset.load('coco_captions_small_slice')
         self.train_dataset = MsDataset(
             datadict['train'].remap_columns({
                 'image:FILE': 'image',
@@ -40,18 +37,6 @@ class TestFinetuneMPlug(unittest.TestCase):
             shutil.rmtree(self.tmp_dir)
         super().tearDown()
 
-    def _cfg_modify_fn(self, cfg):
-        cfg.train.hooks = [{
-            'type': 'CheckpointHook',
-            'interval': self.max_epochs
-        }, {
-            'type': 'TextLoggerHook',
-            'interval': 1
-        }, {
-            'type': 'IterTimerHook'
-        }]
-        return cfg
-
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_trainer_with_caption(self):
         kwargs = dict(
@@ -59,11 +44,10 @@ class TestFinetuneMPlug(unittest.TestCase):
             train_dataset=self.train_dataset,
             eval_dataset=self.test_dataset,
             max_epochs=self.max_epochs,
-            work_dir=self.tmp_dir,
-            cfg_modify_fn=self._cfg_modify_fn)
+            work_dir=self.tmp_dir)
 
         trainer: EpochBasedTrainer = build_trainer(
-            name=Trainers.nlp_base_trainer, default_args=kwargs)
+            name=Trainers.mplug, default_args=kwargs)
         trainer.train()
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@@ -80,7 +64,7 @@ class TestFinetuneMPlug(unittest.TestCase):
             work_dir=self.tmp_dir)
 
         trainer: EpochBasedTrainer = build_trainer(
-            name=Trainers.nlp_base_trainer, default_args=kwargs)
+            name=Trainers.mplug, default_args=kwargs)
         trainer.train()
         results_files = os.listdir(self.tmp_dir)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
@@ -94,11 +78,10 @@ class TestFinetuneMPlug(unittest.TestCase):
             train_dataset=self.train_dataset,
             eval_dataset=self.test_dataset,
             max_epochs=self.max_epochs,
-            work_dir=self.tmp_dir,
-            cfg_modify_fn=self._cfg_modify_fn)
+            work_dir=self.tmp_dir)
 
         trainer: EpochBasedTrainer = build_trainer(
-            name=Trainers.nlp_base_trainer, default_args=kwargs)
+            name=Trainers.mplug, default_args=kwargs)
         trainer.train()
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@@ -115,7 +98,7 @@ class TestFinetuneMPlug(unittest.TestCase):
             work_dir=self.tmp_dir)
 
         trainer: EpochBasedTrainer = build_trainer(
-            name=Trainers.nlp_base_trainer, default_args=kwargs)
+            name=Trainers.mplug, default_args=kwargs)
         trainer.train()
         results_files = os.listdir(self.tmp_dir)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
@@ -129,11 +112,10 @@ class TestFinetuneMPlug(unittest.TestCase):
             train_dataset=self.train_dataset,
             eval_dataset=self.test_dataset,
             max_epochs=self.max_epochs,
-            work_dir=self.tmp_dir,
-            cfg_modify_fn=self._cfg_modify_fn)
+            work_dir=self.tmp_dir)
 
         trainer: EpochBasedTrainer = build_trainer(
-            name=Trainers.nlp_base_trainer, default_args=kwargs)
+            name=Trainers.mplug, default_args=kwargs)
         trainer.train()
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@@ -150,7 +132,7 @@ class TestFinetuneMPlug(unittest.TestCase):
             work_dir=self.tmp_dir)
 
         trainer: EpochBasedTrainer = build_trainer(
-            name=Trainers.nlp_base_trainer, default_args=kwargs)
+            name=Trainers.mplug, default_args=kwargs)
         trainer.train()
         results_files = os.listdir(self.tmp_dir)
         self.assertIn(f'{trainer.timestamp}.log.json', results_files)
diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py
index 85c21881..0516e569 100644
--- a/tests/trainers/test_ofa_trainer.py
+++ b/tests/trainers/test_ofa_trainer.py
@@ -9,6 +9,7 @@ from modelscope.metainfo import Trainers
 from modelscope.msdatasets import MsDataset
 from modelscope.trainers import build_trainer
 from modelscope.utils.constant import DownloadMode, ModelFile
+from modelscope.utils.hub import read_config
 from modelscope.utils.test_utils import test_level
 
 
@@ -78,6 +79,7 @@ class TestOfaTrainer(unittest.TestCase):
             json.dump(self.finetune_cfg, writer)
 
         pretrained_model = 'damo/ofa_ocr-recognition_scene_base_zh'
+
         args = dict(
             model=pretrained_model,
             work_dir=WORKSPACE,
diff --git a/tests/utils/test_ast.py b/tests/utils/test_ast.py
index c0624679..0243053e 100644
--- a/tests/utils/test_ast.py
+++ b/tests/utils/test_ast.py
@@ -41,7 +41,7 @@ class AstScaningTest(unittest.TestCase):
         self.assertIsInstance(from_imports, dict)
         self.assertIsInstance(decorators, list)
         self.assertListEqual(list(set(imports.keys()) - set(['torch'])), [])
-        self.assertEqual(len(from_imports.keys()), 9)
+        self.assertEqual(len(from_imports.keys()), 10)
         self.assertTrue(from_imports['modelscope.metainfo'] is not None)
         self.assertEqual(from_imports['modelscope.metainfo'], ['Pipelines'])
         self.assertEqual(decorators,