From 32c2adb650bc5080ebd09972afc024fc8865eb00 Mon Sep 17 00:00:00 2001
From: "yichang.zyc"
Date: Thu, 4 Aug 2022 17:24:44 +0800
Subject: [PATCH] [to #42322933] fix: adapt the vqa demo to work without a
 preprocessor, and fix related issues
 cr:https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9638497
 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9638497
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 modelscope/metainfo.py                        |  5 ++--
 modelscope/models/multi_modal/__init__.py     |  1 +
 .../models/multi_modal/ofa_for_all_tasks.py   |  8 +++++++
 .../cv/image_classification_pipeline.py       |  3 ++-
 .../multi_modal/image_captioning_pipeline.py  |  3 ++-
 .../multi_modal/visual_entailment_pipeline.py |  3 ++-
 .../multi_modal/visual_grounding_pipeline.py  |  3 ++-
 .../visual_question_answering_pipeline.py     | 13 +++++++----
 .../pipelines/nlp/summarization_pipeline.py   |  3 ++-
 .../nlp/text_classification_pipeline.py       |  3 ++-
 modelscope/preprocessors/multi_modal.py       |  4 +---
 tests/pipelines/test_ofa_tasks.py             | 23 ++++++-------------
 12 files changed, 40 insertions(+), 32 deletions(-)

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 9d9b255a..17102da0 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -201,9 +201,8 @@ class Preprocessors(object):
     wav_to_lists = 'wav-to-lists'
     wav_to_scp = 'wav-to-scp'
 
-    # multi-modal
-    ofa_image_caption = 'ofa-image-caption'
-    ofa_text_to_image_synthesis = 'ofa-text-to-image-synthesis'
+    # multi-modal preprocessor
+    ofa_tasks_preprocessor = 'ofa-tasks-preprocessor'
 
     mplug_visual_question_answering = 'mplug-visual-question-answering'
 
diff --git a/modelscope/models/multi_modal/__init__.py b/modelscope/models/multi_modal/__init__.py
index 0f9c9e85..9a0636ee 100644
--- a/modelscope/models/multi_modal/__init__.py
+++ b/modelscope/models/multi_modal/__init__.py
@@ -11,6 +11,7 @@ if TYPE_CHECKING:
     from .mmr import VideoCLIPForMultiModalEmbedding
     from .mplug_for_visual_question_answering import \
         MPlugForVisualQuestionAnswering
+    from .ofa_for_all_tasks import OfaForAllTasks
 
 else:
     _import_structure = {
diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py
index 363d552d..860b68d3 100644
--- a/modelscope/models/multi_modal/ofa_for_all_tasks.py
+++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py
@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import math
+import string
 from os import path as osp
 from typing import Any, Dict
 
@@ -58,6 +59,9 @@ class OfaForAllTasks(TorchModel):
         self.max_image_size = self.cfg.model.get('max_image_size', 512)
         self.val_batch_size = self.cfg.model.get('valid_batch_size',
                                                  self.batch_size)
+        self.transtab = str.maketrans(
+            {key: None
+             for key in string.punctuation})
         self.gen_type = self.cfg.model.get('gen_type', 'generation')
         assert self.gen_type in ['generation', 'traverse'], \
             'model.gen_type must be in ["generation", "traverse"]'
@@ -116,6 +120,10 @@ class OfaForAllTasks(TorchModel):
 
     def postprocess(self, input: Dict[str, Tensor],
                     **kwargs) -> Dict[str, Tensor]:
+        if self.cfg.task == Tasks.image_captioning:
+            caption = input[OutputKeys.CAPTION]
+            caption = caption.translate(self.transtab).strip()
+            input[OutputKeys.CAPTION] = caption
         return input
 
     def _text_gen_inference(self, input):
diff --git a/modelscope/pipelines/cv/image_classification_pipeline.py b/modelscope/pipelines/cv/image_classification_pipeline.py
index b15ef025..439ea6d3 100644
--- a/modelscope/pipelines/cv/image_classification_pipeline.py
+++ b/modelscope/pipelines/cv/image_classification_pipeline.py
@@ -7,6 +7,7 @@ import PIL
 import torch
 
 from modelscope.metainfo import Pipelines
+from modelscope.models.multi_modal import OfaForAllTasks
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Model, Pipeline
 from modelscope.pipelines.builder import PIPELINES
@@ -35,7 +36,7 @@ class ImageClassificationPipeline(Pipeline):
         else:
             raise NotImplementedError
         pipe_model.model.eval()
-        if preprocessor is None and pipe_model:
+        if preprocessor is None and isinstance(pipe_model, OfaForAllTasks):
             preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
         super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
 
diff --git a/modelscope/pipelines/multi_modal/image_captioning_pipeline.py b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py
index 4d491ceb..2028e7dc 100644
--- a/modelscope/pipelines/multi_modal/image_captioning_pipeline.py
+++ b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py
@@ -2,6 +2,7 @@ from typing import Any, Dict, Optional, Union
 
 from modelscope.metainfo import Pipelines
+from modelscope.models.multi_modal import OfaForAllTasks
 from modelscope.pipelines.base import Model, Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import OfaPreprocessor, Preprocessor
 
@@ -34,7 +35,7 @@ class ImageCaptioningPipeline(Pipeline):
         else:
             raise NotImplementedError
         pipe_model.model.eval()
-        if preprocessor is None and pipe_model:
+        if preprocessor is None and isinstance(pipe_model, OfaForAllTasks):
             preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
         super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
 
diff --git a/modelscope/pipelines/multi_modal/visual_entailment_pipeline.py b/modelscope/pipelines/multi_modal/visual_entailment_pipeline.py
index e1bd3929..2a7bd1d0 100644
--- a/modelscope/pipelines/multi_modal/visual_entailment_pipeline.py
+++ b/modelscope/pipelines/multi_modal/visual_entailment_pipeline.py
@@ -2,6 +2,7 @@ from typing import Any, Dict, Union
 
 from modelscope.metainfo import Pipelines
+from modelscope.models.multi_modal import OfaForAllTasks
 from modelscope.pipelines.base import Model, Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import OfaPreprocessor, Preprocessor
 
@@ -34,7 +35,7 @@ class VisualEntailmentPipeline(Pipeline):
         else:
             raise NotImplementedError
         pipe_model.model.eval()
-        if preprocessor is None and pipe_model:
+        if preprocessor is None and isinstance(pipe_model, OfaForAllTasks):
             preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
         super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
 
diff --git a/modelscope/pipelines/multi_modal/visual_grounding_pipeline.py b/modelscope/pipelines/multi_modal/visual_grounding_pipeline.py
index a603d4fd..651109d9 100644
--- a/modelscope/pipelines/multi_modal/visual_grounding_pipeline.py
+++ b/modelscope/pipelines/multi_modal/visual_grounding_pipeline.py
@@ -2,6 +2,7 @@ from typing import Any, Dict, Union
 
 from modelscope.metainfo import Pipelines
+from modelscope.models.multi_modal import OfaForAllTasks
 from modelscope.pipelines.base import Model, Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import OfaPreprocessor, Preprocessor
 
@@ -34,7 +35,7 @@ class VisualGroundingPipeline(Pipeline):
         else:
             raise NotImplementedError
         pipe_model.model.eval()
-        if preprocessor is None and pipe_model:
+        if preprocessor is None and isinstance(pipe_model, OfaForAllTasks):
             preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
         super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
 
diff --git a/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py b/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py
index 47727a29..9c694500 100644
--- a/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py
+++ b/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py
@@ -5,11 +5,13 @@ import torch
 
 from modelscope.metainfo import Pipelines
 from modelscope.models import Model
-from modelscope.models.multi_modal import MPlugForVisualQuestionAnswering
+from modelscope.models.multi_modal import (MPlugForVisualQuestionAnswering,
+                                           OfaForAllTasks)
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Pipeline, Tensor
 from modelscope.pipelines.builder import PIPELINES
-from modelscope.preprocessors import MPlugVisualQuestionAnsweringPreprocessor
+from modelscope.preprocessors import (MPlugVisualQuestionAnsweringPreprocessor,
+                                      OfaPreprocessor)
 from modelscope.utils.constant import Tasks
 
 __all__ = ['VisualQuestionAnsweringPipeline']
@@ -35,8 +37,11 @@ class VisualQuestionAnsweringPipeline(Pipeline):
                                     Model) else Model.from_pretrained(model)
         self.tokenizer = None
         if preprocessor is None:
-            preprocessor = MPlugVisualQuestionAnsweringPreprocessor(
-                model.model_dir)
+            if isinstance(model, OfaForAllTasks):
+                preprocessor = OfaPreprocessor(model.model_dir)
+            elif isinstance(model, MPlugForVisualQuestionAnswering):
+                preprocessor = MPlugVisualQuestionAnsweringPreprocessor(
+                    model.model_dir)
         if isinstance(model, MPlugForVisualQuestionAnswering):
             model.eval()
             self.tokenizer = model.tokenizer
diff --git a/modelscope/pipelines/nlp/summarization_pipeline.py b/modelscope/pipelines/nlp/summarization_pipeline.py
index 148acc06..7c163f04 100644
--- a/modelscope/pipelines/nlp/summarization_pipeline.py
+++ b/modelscope/pipelines/nlp/summarization_pipeline.py
@@ -2,6 +2,7 @@ from typing import Any, Dict, Union
 
 from modelscope.metainfo import Pipelines
+from modelscope.models.multi_modal import OfaForAllTasks
 from modelscope.pipelines.base import Model, Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import OfaPreprocessor, Preprocessor
 
@@ -34,7 +35,7 @@ class SummarizationPipeline(Pipeline):
         else:
             raise NotImplementedError
         pipe_model.model.eval()
-        if preprocessor is None and pipe_model:
+        if preprocessor is None and isinstance(pipe_model, OfaForAllTasks):
             preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
         super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
 
diff --git a/modelscope/pipelines/nlp/text_classification_pipeline.py b/modelscope/pipelines/nlp/text_classification_pipeline.py
index f873d6d7..13d9964d 100644
--- a/modelscope/pipelines/nlp/text_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/text_classification_pipeline.py
@@ -2,6 +2,7 @@ from typing import Any, Dict, Union
 
 from modelscope.metainfo import Pipelines
+from modelscope.models.multi_modal import OfaForAllTasks
 from modelscope.pipelines.base import Model, Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import OfaPreprocessor, Preprocessor
 
@@ -34,7 +35,7 @@ class TextClassificationPipeline(Pipeline):
         else:
             raise NotImplementedError
         pipe_model.model.eval()
-        if preprocessor is None and pipe_model:
+        if preprocessor is None and isinstance(pipe_model, OfaForAllTasks):
             preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
         super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
 
diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py
index f3bba772..65578e6a 100644
--- a/modelscope/preprocessors/multi_modal.py
+++ b/modelscope/preprocessors/multi_modal.py
@@ -22,9 +22,7 @@ __all__ = [
 
 
 @PREPROCESSORS.register_module(
-    Fields.multi_modal, module_name=Preprocessors.ofa_image_caption)
-@PREPROCESSORS.register_module(
-    Fields.multi_modal, module_name=Preprocessors.ofa_text_to_image_synthesis)
+    Fields.multi_modal, module_name=Preprocessors.ofa_tasks_preprocessor)
 class OfaPreprocessor(Preprocessor):
 
     def __init__(self, model_dir: str, *args, **kwargs):
diff --git a/tests/pipelines/test_ofa_tasks.py b/tests/pipelines/test_ofa_tasks.py
index a2b23e48..2b890e8b 100644
--- a/tests/pipelines/test_ofa_tasks.py
+++ b/tests/pipelines/test_ofa_tasks.py
@@ -35,15 +35,15 @@ class OfaTasksTest(unittest.TestCase):
             task=Tasks.image_captioning,
             model=model,
         )
-        result = img_captioning(
-            {'image': 'data/test/images/image_captioning.png'})
+        image = 'data/test/images/image_captioning.png'
+        result = img_captioning({'image': image})
         print(result[OutputKeys.CAPTION])
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_image_captioning_with_name(self):
         img_captioning = pipeline(
             Tasks.image_captioning,
-            model='damo/ofa_image-caption_coco_distilled_en')
+            model='damo/ofa_image-caption_coco_large_en')
         result = img_captioning(
             {'image': 'data/test/images/image_captioning.png'})
         print(result[OutputKeys.CAPTION])
@@ -181,14 +181,9 @@ class OfaTasksTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_visual_question_answering_with_model(self):
-        from modelscope.preprocessors.multi_modal import OfaPreprocessor
         model = Model.from_pretrained(
             'damo/ofa_visual-question-answering_pretrain_large_en')
-        preprocessor = OfaPreprocessor(model_dir=model.model_dir)
-        ofa_pipe = pipeline(
-            Tasks.visual_question_answering,
-            model=model,
-            preprocessor=preprocessor)
+        ofa_pipe = pipeline(Tasks.visual_question_answering, model=model)
         image = 'data/test/images/visual_question_answering.png'
         text = 'what is grown on the plant?'
         input = {'image': image, 'text': text}
@@ -197,13 +192,8 @@ class OfaTasksTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_visual_question_answering_with_name(self):
-        from modelscope.preprocessors.multi_modal import OfaPreprocessor
         model = 'damo/ofa_visual-question-answering_pretrain_large_en'
-        preprocessor = OfaPreprocessor(model_dir=model)
-        ofa_pipe = pipeline(
-            Tasks.visual_question_answering,
-            model=model,
-            preprocessor=preprocessor)
+        ofa_pipe = pipeline(Tasks.visual_question_answering, model=model)
         image = 'data/test/images/visual_question_answering.png'
         text = 'what is grown on the plant?'
         input = {'image': image, 'text': text}
@@ -218,7 +208,8 @@ class OfaTasksTest(unittest.TestCase):
             task=Tasks.image_captioning,
             model=model,
         )
-        image = Image.open('data/test/images/image_captioning.png')
+        image_path = 'data/test/images/image_captioning.png'
+        image = Image.open(image_path)
         result = img_captioning(image)
         print(result[OutputKeys.CAPTION])
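-- 
Note: the caption clean-up added to OfaForAllTasks.postprocess() is just a
translation table that deletes ASCII punctuation. A minimal standalone sketch
of that behavior (plain Python, no modelscope imports; the sample caption
string is made up for illustration):

    import string

    # Map every ASCII punctuation character to None, i.e. delete it.
    transtab = str.maketrans({key: None for key in string.punctuation})

    caption = 'a dog, sitting on the grass.'  # hypothetical model output
    print(caption.translate(transtab).strip())
    # -> 'a dog sitting on the grass'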
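Note: a sketch of the call pattern this patch enables, based on the updated
tests (the model id and inputs are the ones used in
tests/pipelines/test_ofa_tasks.py; the result dict is printed whole rather
than assuming a particular output key):

    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    # No preprocessor argument: the pipeline now builds an OfaPreprocessor
    # internally when it is given an OfaForAllTasks model.
    ofa_pipe = pipeline(
        Tasks.visual_question_answering,
        model='damo/ofa_visual-question-answering_pretrain_large_en')

    result = ofa_pipe({
        'image': 'data/test/images/visual_question_answering.png',
        'text': 'what is grown on the plant?',
    })
    print(result)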