Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9638497 (master)
@@ -201,9 +201,8 @@ class Preprocessors(object):
     wav_to_lists = 'wav-to-lists'
     wav_to_scp = 'wav-to-scp'

-    # multi-modal
-    ofa_image_caption = 'ofa-image-caption'
-    ofa_text_to_image_synthesis = 'ofa-text-to-image-synthesis'
+    # multi-modal preprocessor
+    ofa_tasks_preprocessor = 'ofa-tasks-preprocessor'
     mplug_visual_question_answering = 'mplug-visual-question-answering'
@@ -11,6 +11,7 @@ if TYPE_CHECKING:
     from .mmr import VideoCLIPForMultiModalEmbedding
     from .mplug_for_visual_question_answering import \
         MPlugForVisualQuestionAnswering
+    from .ofa_for_all_tasks import OfaForAllTasks
 else:
     _import_structure = {
@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import math
+import string
 from os import path as osp
 from typing import Any, Dict
@@ -58,6 +59,9 @@ class OfaForAllTasks(TorchModel):
         self.max_image_size = self.cfg.model.get('max_image_size', 512)
         self.val_batch_size = self.cfg.model.get('valid_batch_size',
                                                  self.batch_size)
+        self.transtab = str.maketrans(
+            {key: None
+             for key in string.punctuation})
         self.gen_type = self.cfg.model.get('gen_type', 'generation')
         assert self.gen_type in ['generation', 'traverse'], \
             'model.gen_type must be in ["generation", "traverse"]'
@@ -116,6 +120,10 @@ class OfaForAllTasks(TorchModel):
     def postprocess(self, input: Dict[str, Tensor],
                     **kwargs) -> Dict[str, Tensor]:
+        if self.cfg.task == Tasks.image_captioning:
+            caption = input[OutputKeys.CAPTION]
+            caption = caption.translate(self.transtab).strip()
+            input[OutputKeys.CAPTION] = caption
         return input

     def _text_gen_inference(self, input):
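A minimal standalone sketch (plain Python, no ModelScope dependency) of what the new translation table does: str.maketrans maps every punctuation character to None, so translate() deletes those characters from the generated caption before it is returned.

import string

# table that deletes every character in string.punctuation
transtab = str.maketrans({key: None for key in string.punctuation})
caption = 'a brown dog, sitting on the grass.'
print(caption.translate(transtab).strip())  # a brown dog sitting on the grass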
@@ -7,6 +7,7 @@ import PIL
 import torch
 from modelscope.metainfo import Pipelines
+from modelscope.models.multi_modal import OfaForAllTasks
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Model, Pipeline
 from modelscope.pipelines.builder import PIPELINES
@@ -35,7 +36,7 @@ class ImageClassificationPipeline(Pipeline):
         else:
             raise NotImplementedError
         pipe_model.model.eval()
-        if preprocessor is None and pipe_model:
+        if preprocessor is None and isinstance(pipe_model, OfaForAllTasks):
             preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
         super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
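The same guard is repeated in the pipelines below: the default OfaPreprocessor is only built when the wrapped model really is an OfaForAllTasks, so other model types keep preprocessor as None. A sketch of the effect (the model id is borrowed from the captioning tests, purely for illustration):

from modelscope.models import Model
from modelscope.models.multi_modal import OfaForAllTasks
from modelscope.preprocessors import OfaPreprocessor

pipe_model = Model.from_pretrained('damo/ofa_image-caption_coco_large_en')
preprocessor = None
if preprocessor is None and isinstance(pipe_model, OfaForAllTasks):
    # only OFA checkpoints get the shared OFA preprocessor by default
    preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)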
@@ -2,6 +2,7 @@
 from typing import Any, Dict, Optional, Union
 from modelscope.metainfo import Pipelines
+from modelscope.models.multi_modal import OfaForAllTasks
 from modelscope.pipelines.base import Model, Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import OfaPreprocessor, Preprocessor
@@ -34,7 +35,7 @@ class ImageCaptioningPipeline(Pipeline):
         else:
             raise NotImplementedError
         pipe_model.model.eval()
-        if preprocessor is None and pipe_model:
+        if preprocessor is None and isinstance(pipe_model, OfaForAllTasks):
             preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
         super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
@@ -2,6 +2,7 @@
 from typing import Any, Dict, Union
 from modelscope.metainfo import Pipelines
+from modelscope.models.multi_modal import OfaForAllTasks
 from modelscope.pipelines.base import Model, Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import OfaPreprocessor, Preprocessor
@@ -34,7 +35,7 @@ class VisualEntailmentPipeline(Pipeline):
         else:
             raise NotImplementedError
         pipe_model.model.eval()
-        if preprocessor is None and pipe_model:
+        if preprocessor is None and isinstance(pipe_model, OfaForAllTasks):
             preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
         super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
@@ -2,6 +2,7 @@
 from typing import Any, Dict, Union
 from modelscope.metainfo import Pipelines
+from modelscope.models.multi_modal import OfaForAllTasks
 from modelscope.pipelines.base import Model, Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import OfaPreprocessor, Preprocessor
@@ -34,7 +35,7 @@ class VisualGroundingPipeline(Pipeline):
         else:
             raise NotImplementedError
         pipe_model.model.eval()
-        if preprocessor is None and pipe_model:
+        if preprocessor is None and isinstance(pipe_model, OfaForAllTasks):
             preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
         super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
@@ -5,11 +5,13 @@ import torch
 from modelscope.metainfo import Pipelines
 from modelscope.models import Model
-from modelscope.models.multi_modal import MPlugForVisualQuestionAnswering
+from modelscope.models.multi_modal import (MPlugForVisualQuestionAnswering,
+                                           OfaForAllTasks)
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Pipeline, Tensor
 from modelscope.pipelines.builder import PIPELINES
-from modelscope.preprocessors import MPlugVisualQuestionAnsweringPreprocessor
+from modelscope.preprocessors import (MPlugVisualQuestionAnsweringPreprocessor,
+                                      OfaPreprocessor)
 from modelscope.utils.constant import Tasks

 __all__ = ['VisualQuestionAnsweringPipeline']
@@ -35,8 +37,11 @@ class VisualQuestionAnsweringPipeline(Pipeline):
                                    Model) else Model.from_pretrained(model)
         self.tokenizer = None
         if preprocessor is None:
-            preprocessor = MPlugVisualQuestionAnsweringPreprocessor(
-                model.model_dir)
+            if isinstance(model, OfaForAllTasks):
+                preprocessor = OfaPreprocessor(model.model_dir)
+            elif isinstance(model, MPlugForVisualQuestionAnswering):
+                preprocessor = MPlugVisualQuestionAnsweringPreprocessor(
+                    model.model_dir)
         if isinstance(model, MPlugForVisualQuestionAnswering):
             model.eval()
             self.tokenizer = model.tokenizer
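With this dispatch, callers can let the pipeline pick the right preprocessor from the model type, which is exactly what the simplified tests further down exercise. A hedged sketch of the resulting call, reusing the model id and inputs from those tests:

from modelscope.models import Model
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

model = Model.from_pretrained(
    'damo/ofa_visual-question-answering_pretrain_large_en')
# no explicit preprocessor: OfaPreprocessor is created inside the pipeline
ofa_pipe = pipeline(Tasks.visual_question_answering, model=model)
result = ofa_pipe({'image': 'data/test/images/visual_question_answering.png',
                   'text': 'what is grown on the plant?'})
print(result)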
@@ -2,6 +2,7 @@
 from typing import Any, Dict, Union
 from modelscope.metainfo import Pipelines
+from modelscope.models.multi_modal import OfaForAllTasks
 from modelscope.pipelines.base import Model, Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import OfaPreprocessor, Preprocessor
@@ -34,7 +35,7 @@ class SummarizationPipeline(Pipeline):
         else:
             raise NotImplementedError
         pipe_model.model.eval()
-        if preprocessor is None and pipe_model:
+        if preprocessor is None and isinstance(pipe_model, OfaForAllTasks):
             preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
         super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
@@ -2,6 +2,7 @@
 from typing import Any, Dict, Union
 from modelscope.metainfo import Pipelines
+from modelscope.models.multi_modal import OfaForAllTasks
 from modelscope.pipelines.base import Model, Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import OfaPreprocessor, Preprocessor
@@ -34,7 +35,7 @@ class TextClassificationPipeline(Pipeline):
         else:
             raise NotImplementedError
         pipe_model.model.eval()
-        if preprocessor is None and pipe_model:
+        if preprocessor is None and isinstance(pipe_model, OfaForAllTasks):
             preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
         super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)
@@ -22,9 +22,7 @@ __all__ = [
-@PREPROCESSORS.register_module(
-    Fields.multi_modal, module_name=Preprocessors.ofa_image_caption)
 @PREPROCESSORS.register_module(
-    Fields.multi_modal, module_name=Preprocessors.ofa_text_to_image_synthesis)
+    Fields.multi_modal, module_name=Preprocessors.ofa_tasks_preprocessor)
 class OfaPreprocessor(Preprocessor):

     def __init__(self, model_dir: str, *args, **kwargs):
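OfaPreprocessor is now registered once under Preprocessors.ofa_tasks_preprocessor and shared across OFA tasks instead of being registered per task. For callers who still want to build it explicitly, a sketch that mirrors the usage the tests used to spell out (model id and image path taken from this diff):

from modelscope.models import Model
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.preprocessors import OfaPreprocessor
from modelscope.utils.constant import Tasks

model = Model.from_pretrained('damo/ofa_image-caption_coco_large_en')
# one preprocessor class, configured purely from the model directory
preprocessor = OfaPreprocessor(model_dir=model.model_dir)
img_captioning = pipeline(
    Tasks.image_captioning, model=model, preprocessor=preprocessor)
result = img_captioning({'image': 'data/test/images/image_captioning.png'})
print(result[OutputKeys.CAPTION])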
@@ -35,15 +35,15 @@ class OfaTasksTest(unittest.TestCase):
             task=Tasks.image_captioning,
             model=model,
         )
-        result = img_captioning(
-            {'image': 'data/test/images/image_captioning.png'})
+        image = 'data/test/images/image_captioning.png'
+        result = img_captioning({'image': image})
         print(result[OutputKeys.CAPTION])

     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_image_captioning_with_name(self):
         img_captioning = pipeline(
             Tasks.image_captioning,
-            model='damo/ofa_image-caption_coco_distilled_en')
+            model='damo/ofa_image-caption_coco_large_en')
         result = img_captioning(
             {'image': 'data/test/images/image_captioning.png'})
         print(result[OutputKeys.CAPTION])
@@ -181,14 +181,9 @@ class OfaTasksTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_visual_question_answering_with_model(self):
-        from modelscope.preprocessors.multi_modal import OfaPreprocessor
         model = Model.from_pretrained(
             'damo/ofa_visual-question-answering_pretrain_large_en')
-        preprocessor = OfaPreprocessor(model_dir=model.model_dir)
-        ofa_pipe = pipeline(
-            Tasks.visual_question_answering,
-            model=model,
-            preprocessor=preprocessor)
+        ofa_pipe = pipeline(Tasks.visual_question_answering, model=model)
         image = 'data/test/images/visual_question_answering.png'
         text = 'what is grown on the plant?'
         input = {'image': image, 'text': text}
@@ -197,13 +192,8 @@ class OfaTasksTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_visual_question_answering_with_name(self):
-        from modelscope.preprocessors.multi_modal import OfaPreprocessor
         model = 'damo/ofa_visual-question-answering_pretrain_large_en'
-        preprocessor = OfaPreprocessor(model_dir=model)
-        ofa_pipe = pipeline(
-            Tasks.visual_question_answering,
-            model=model,
-            preprocessor=preprocessor)
+        ofa_pipe = pipeline(Tasks.visual_question_answering, model=model)
         image = 'data/test/images/visual_question_answering.png'
         text = 'what is grown on the plant?'
         input = {'image': image, 'text': text}
@@ -218,7 +208,8 @@ class OfaTasksTest(unittest.TestCase):
             task=Tasks.image_captioning,
             model=model,
         )
-        image = Image.open('data/test/images/image_captioning.png')
+        image_path = 'data/test/images/image_captioning.png'
+        image = Image.open(image_path)
         result = img_captioning(image)
         print(result[OutputKeys.CAPTION])