From 32c2adb650bc5080ebd09972afc024fc8865eb00 Mon Sep 17 00:00:00 2001
From: "yichang.zyc"
Date: Thu, 4 Aug 2022 17:24:44 +0800
Subject: [PATCH] [to #42322933] fix: adapt the vqa demo to work without a
 preprocessor, and fix related issues
 cr:https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9638497
 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9638497
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 modelscope/metainfo.py                        |  5 ++--
 modelscope/models/multi_modal/__init__.py     |  1 +
 .../models/multi_modal/ofa_for_all_tasks.py   |  8 +++++++
 .../cv/image_classification_pipeline.py       |  3 ++-
 .../multi_modal/image_captioning_pipeline.py  |  3 ++-
 .../multi_modal/visual_entailment_pipeline.py |  3 ++-
 .../multi_modal/visual_grounding_pipeline.py  |  3 ++-
 .../visual_question_answering_pipeline.py     | 13 +++++++----
 .../pipelines/nlp/summarization_pipeline.py   |  3 ++-
 .../nlp/text_classification_pipeline.py       |  3 ++-
 modelscope/preprocessors/multi_modal.py       |  4 +---
 tests/pipelines/test_ofa_tasks.py             | 23 ++++++-------------
 12 files changed, 40 insertions(+), 32 deletions(-)

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 9d9b255a..17102da0 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -201,9 +201,8 @@ class Preprocessors(object):
     wav_to_lists = 'wav-to-lists'
     wav_to_scp = 'wav-to-scp'
 
-    # multi-modal
-    ofa_image_caption = 'ofa-image-caption'
-    ofa_text_to_image_synthesis = 'ofa-text-to-image-synthesis'
+    # multi-modal preprocessor
+    ofa_tasks_preprocessor = 'ofa-tasks-preprocessor'
 
     mplug_visual_question_answering = 'mplug-visual-question-answering'
 
diff --git a/modelscope/models/multi_modal/__init__.py b/modelscope/models/multi_modal/__init__.py
index 0f9c9e85..9a0636ee 100644
--- a/modelscope/models/multi_modal/__init__.py
+++ b/modelscope/models/multi_modal/__init__.py
@@ -11,6 +11,7 @@ if TYPE_CHECKING:
     from .mmr import VideoCLIPForMultiModalEmbedding
     from .mplug_for_visual_question_answering import \
         MPlugForVisualQuestionAnswering
+    from .ofa_for_all_tasks import OfaForAllTasks
 
 else:
     _import_structure = {
diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py
index 363d552d..860b68d3 100644
--- a/modelscope/models/multi_modal/ofa_for_all_tasks.py
+++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py
@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import math
+import string
 from os import path as osp
 from typing import Any, Dict
 
@@ -58,6 +59,9 @@ class OfaForAllTasks(TorchModel):
         self.max_image_size = self.cfg.model.get('max_image_size', 512)
         self.val_batch_size = self.cfg.model.get('valid_batch_size',
                                                  self.batch_size)
+        self.transtab = str.maketrans(
+            {key: None
+             for key in string.punctuation})
         self.gen_type = self.cfg.model.get('gen_type', 'generation')
         assert self.gen_type in ['generation', 'traverse'], \
             'model.gen_type must be in ["generation", "traverse"]'
@@ -116,6 +120,10 @@ class OfaForAllTasks(TorchModel):
 
     def postprocess(self, input: Dict[str, Tensor],
                     **kwargs) -> Dict[str, Tensor]:
+        if self.cfg.task == Tasks.image_captioning:
+            caption = input[OutputKeys.CAPTION]
+            caption = caption.translate(self.transtab).strip()
+            input[OutputKeys.CAPTION] = caption
         return input
 
     def _text_gen_inference(self, input):
diff --git a/modelscope/pipelines/cv/image_classification_pipeline.py b/modelscope/pipelines/cv/image_classification_pipeline.py
index b15ef025..439ea6d3 100644
--- a/modelscope/pipelines/cv/image_classification_pipeline.py
+++ b/modelscope/pipelines/cv/image_classification_pipeline.py
@@ -7,6 +7,7 @@ import PIL
 import torch
 
 from modelscope.metainfo import Pipelines
+from modelscope.models.multi_modal import OfaForAllTasks
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Model, Pipeline
 from modelscope.pipelines.builder import PIPELINES
@@ -35,7 +36,7 @@ class ImageClassificationPipeline(Pipeline):
         else:
             raise NotImplementedError
         pipe_model.model.eval()
-        if preprocessor is None and pipe_model:
+        if preprocessor is None and isinstance(pipe_model, OfaForAllTasks):
             preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
         super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
 
diff --git a/modelscope/pipelines/multi_modal/image_captioning_pipeline.py b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py
index 4d491ceb..2028e7dc 100644
--- a/modelscope/pipelines/multi_modal/image_captioning_pipeline.py
+++ b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py
@@ -2,6 +2,7 @@ from typing import Any, Dict, Optional, Union
 
 from modelscope.metainfo import Pipelines
+from modelscope.models.multi_modal import OfaForAllTasks
 from modelscope.pipelines.base import Model, Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import OfaPreprocessor, Preprocessor
 
@@ -34,7 +35,7 @@ class ImageCaptioningPipeline(Pipeline):
         else:
             raise NotImplementedError
         pipe_model.model.eval()
-        if preprocessor is None and pipe_model:
+        if preprocessor is None and isinstance(pipe_model, OfaForAllTasks):
             preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
         super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
 
diff --git a/modelscope/pipelines/multi_modal/visual_entailment_pipeline.py b/modelscope/pipelines/multi_modal/visual_entailment_pipeline.py
index e1bd3929..2a7bd1d0 100644
--- a/modelscope/pipelines/multi_modal/visual_entailment_pipeline.py
+++ b/modelscope/pipelines/multi_modal/visual_entailment_pipeline.py
@@ -2,6 +2,7 @@ from typing import Any, Dict, Union
 
 from modelscope.metainfo import Pipelines
+from modelscope.models.multi_modal import OfaForAllTasks
 from modelscope.pipelines.base import Model, Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import OfaPreprocessor, Preprocessor
 
@@ -34,7 +35,7 @@ class VisualEntailmentPipeline(Pipeline):
         else:
             raise NotImplementedError
         pipe_model.model.eval()
-        if preprocessor is None and pipe_model:
+        if preprocessor is None and isinstance(pipe_model, OfaForAllTasks):
             preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
         super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
 
diff --git a/modelscope/pipelines/multi_modal/visual_grounding_pipeline.py b/modelscope/pipelines/multi_modal/visual_grounding_pipeline.py
index a603d4fd..651109d9 100644
--- a/modelscope/pipelines/multi_modal/visual_grounding_pipeline.py
+++ b/modelscope/pipelines/multi_modal/visual_grounding_pipeline.py
@@ -2,6 +2,7 @@ from typing import Any, Dict, Union
 
 from modelscope.metainfo import Pipelines
+from modelscope.models.multi_modal import OfaForAllTasks
 from modelscope.pipelines.base import Model, Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import OfaPreprocessor, Preprocessor
 
@@ -34,7 +35,7 @@ class VisualGroundingPipeline(Pipeline):
         else:
             raise NotImplementedError
         pipe_model.model.eval()
-        if preprocessor is None and pipe_model:
+        if preprocessor is None and isinstance(pipe_model, OfaForAllTasks):
             preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
         super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
 
diff --git a/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py b/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py
index 47727a29..9c694500 100644
--- a/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py
+++ b/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py
@@ -5,11 +5,13 @@ import torch
 
 from modelscope.metainfo import Pipelines
 from modelscope.models import Model
-from modelscope.models.multi_modal import MPlugForVisualQuestionAnswering
+from modelscope.models.multi_modal import (MPlugForVisualQuestionAnswering,
+                                           OfaForAllTasks)
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Pipeline, Tensor
 from modelscope.pipelines.builder import PIPELINES
-from modelscope.preprocessors import MPlugVisualQuestionAnsweringPreprocessor
+from modelscope.preprocessors import (MPlugVisualQuestionAnsweringPreprocessor,
+                                      OfaPreprocessor)
 from modelscope.utils.constant import Tasks
 
 __all__ = ['VisualQuestionAnsweringPipeline']
@@ -35,8 +37,11 @@ class VisualQuestionAnsweringPipeline(Pipeline):
                                     Model) else Model.from_pretrained(model)
         self.tokenizer = None
         if preprocessor is None:
-            preprocessor = MPlugVisualQuestionAnsweringPreprocessor(
-                model.model_dir)
+            if isinstance(model, OfaForAllTasks):
+                preprocessor = OfaPreprocessor(model.model_dir)
+            elif isinstance(model, MPlugForVisualQuestionAnswering):
+                preprocessor = MPlugVisualQuestionAnsweringPreprocessor(
+                    model.model_dir)
         if isinstance(model, MPlugForVisualQuestionAnswering):
             model.eval()
             self.tokenizer = model.tokenizer
diff --git a/modelscope/pipelines/nlp/summarization_pipeline.py b/modelscope/pipelines/nlp/summarization_pipeline.py
index 148acc06..7c163f04 100644
--- a/modelscope/pipelines/nlp/summarization_pipeline.py
+++ b/modelscope/pipelines/nlp/summarization_pipeline.py
@@ -2,6 +2,7 @@ from typing import Any, Dict, Union
 
 from modelscope.metainfo import Pipelines
+from modelscope.models.multi_modal import OfaForAllTasks
 from modelscope.pipelines.base import Model, Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import OfaPreprocessor, Preprocessor
 
@@ -34,7 +35,7 @@ class SummarizationPipeline(Pipeline):
         else:
             raise NotImplementedError
         pipe_model.model.eval()
-        if preprocessor is None and pipe_model:
+        if preprocessor is None and isinstance(pipe_model, OfaForAllTasks):
             preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
         super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
 
diff --git a/modelscope/pipelines/nlp/text_classification_pipeline.py b/modelscope/pipelines/nlp/text_classification_pipeline.py
index f873d6d7..13d9964d 100644
--- a/modelscope/pipelines/nlp/text_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/text_classification_pipeline.py
@@ -2,6 +2,7 @@ from typing import Any, Dict, Union
 
 from modelscope.metainfo import Pipelines
+from modelscope.models.multi_modal import OfaForAllTasks
 from modelscope.pipelines.base import Model, Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import OfaPreprocessor, Preprocessor
 
@@ -34,7 +35,7 @@ class TextClassificationPipeline(Pipeline):
         else:
             raise NotImplementedError
         pipe_model.model.eval()
-        if preprocessor is None and pipe_model:
+        if preprocessor is None and isinstance(pipe_model, OfaForAllTasks):
             preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
         super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
 
diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py
index f3bba772..65578e6a 100644
--- a/modelscope/preprocessors/multi_modal.py
+++ b/modelscope/preprocessors/multi_modal.py
@@ -22,9 +22,7 @@ __all__ = [
 
 
 @PREPROCESSORS.register_module(
-    Fields.multi_modal, module_name=Preprocessors.ofa_image_caption)
-@PREPROCESSORS.register_module(
-    Fields.multi_modal, module_name=Preprocessors.ofa_text_to_image_synthesis)
+    Fields.multi_modal, module_name=Preprocessors.ofa_tasks_preprocessor)
 class OfaPreprocessor(Preprocessor):
 
     def __init__(self, model_dir: str, *args, **kwargs):
diff --git a/tests/pipelines/test_ofa_tasks.py b/tests/pipelines/test_ofa_tasks.py
index a2b23e48..2b890e8b 100644
--- a/tests/pipelines/test_ofa_tasks.py
+++ b/tests/pipelines/test_ofa_tasks.py
@@ -35,15 +35,15 @@ class OfaTasksTest(unittest.TestCase):
             task=Tasks.image_captioning,
             model=model,
         )
-        result = img_captioning(
-            {'image': 'data/test/images/image_captioning.png'})
+        image = 'data/test/images/image_captioning.png'
+        result = img_captioning({'image': image})
         print(result[OutputKeys.CAPTION])
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_image_captioning_with_name(self):
         img_captioning = pipeline(
             Tasks.image_captioning,
-            model='damo/ofa_image-caption_coco_distilled_en')
+            model='damo/ofa_image-caption_coco_large_en')
         result = img_captioning(
             {'image': 'data/test/images/image_captioning.png'})
         print(result[OutputKeys.CAPTION])
@@ -181,14 +181,9 @@ class OfaTasksTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_visual_question_answering_with_model(self):
-        from modelscope.preprocessors.multi_modal import OfaPreprocessor
         model = Model.from_pretrained(
             'damo/ofa_visual-question-answering_pretrain_large_en')
-        preprocessor = OfaPreprocessor(model_dir=model.model_dir)
-        ofa_pipe = pipeline(
-            Tasks.visual_question_answering,
-            model=model,
-            preprocessor=preprocessor)
+        ofa_pipe = pipeline(Tasks.visual_question_answering, model=model)
         image = 'data/test/images/visual_question_answering.png'
         text = 'what is grown on the plant?'
         input = {'image': image, 'text': text}
@@ -197,13 +192,8 @@ class OfaTasksTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_visual_question_answering_with_name(self):
-        from modelscope.preprocessors.multi_modal import OfaPreprocessor
         model = 'damo/ofa_visual-question-answering_pretrain_large_en'
-        preprocessor = OfaPreprocessor(model_dir=model)
-        ofa_pipe = pipeline(
-            Tasks.visual_question_answering,
-            model=model,
-            preprocessor=preprocessor)
+        ofa_pipe = pipeline(Tasks.visual_question_answering, model=model)
         image = 'data/test/images/visual_question_answering.png'
         text = 'what is grown on the plant?'
         input = {'image': image, 'text': text}
@@ -218,7 +208,8 @@ class OfaTasksTest(unittest.TestCase):
             task=Tasks.image_captioning,
             model=model,
         )
-        image = Image.open('data/test/images/image_captioning.png')
+        image_path = 'data/test/images/image_captioning.png'
+        image = Image.open(image_path)
         result = img_captioning(image)
         print(result[OutputKeys.CAPTION])
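-- 
Note: the caption clean-up added to OfaForAllTasks.postprocess() is just a
translation table that deletes ASCII punctuation. A minimal standalone sketch
of that behavior (plain Python, no modelscope imports; the sample caption
string is made up for illustration):

    import string

    # Map every ASCII punctuation character to None, i.e. delete it.
    transtab = str.maketrans({key: None for key in string.punctuation})

    caption = 'a dog, sitting on the grass.'  # hypothetical model output
    print(caption.translate(transtab).strip())
    # -> 'a dog sitting on the grass'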
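Note: a sketch of the call pattern this patch enables, based on the updated
tests (the model id and inputs are the ones used in
tests/pipelines/test_ofa_tasks.py; the result dict is printed whole rather
than assuming a particular output key):

    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    # No preprocessor argument: the pipeline now builds an OfaPreprocessor
    # internally when it is given an OfaForAllTasks model.
    ofa_pipe = pipeline(
        Tasks.visual_question_answering,
        model='damo/ofa_visual-question-answering_pretrain_large_en')

    result = ofa_pipe({
        'image': 'data/test/images/visual_question_answering.png',
        'text': 'what is grown on the plant?',
    })
    print(result)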