# Copyright (c) Alibaba, Inc. and its affiliates.

import cv2
import numpy as np
from PIL import Image

from modelscope.models.base.base_head import Input
from modelscope.utils.constant import Tasks


class InputKeys(object):
    IMAGE = 'image'
    TEXT = 'text'
    VIDEO = 'video'


class InputType(object):
    IMAGE = 'image'
    TEXT = 'text'
    AUDIO = 'audio'
    VIDEO = 'video'
    BOX = 'box'
    DICT = 'dict'
    LIST = 'list'
    INT = 'int'

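# Concrete Python types accepted for each logical InputType value above.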
INPUT_TYPE = {
    InputType.IMAGE: (str, np.ndarray, Image.Image),
    InputType.TEXT: str,
    InputType.AUDIO: (str, bytes, np.ndarray),
    InputType.VIDEO: (str, np.ndarray, cv2.VideoCapture),
    InputType.BOX: (list, np.ndarray),
    InputType.DICT: (dict, type(None)),
    InputType.LIST: (list, type(None)),
    InputType.INT: int,
}


def check_input_type(input_type, input):
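    """Assert that ``input`` is an instance of the Python type(s)
    registered for ``input_type`` in INPUT_TYPE."""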
    expected_type = INPUT_TYPE[input_type]
    assert isinstance(input, expected_type), \
        f'invalid input type for {input_type}, expected {expected_type} but got {type(input)}\n {input}'
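

# Example (illustrative; 'demo.jpg' is a hypothetical path):
#   check_input_type(InputType.IMAGE, 'demo.jpg')  # a str is accepted for IMAGE
#   check_input_type(InputType.IMAGE, 42)  # raises AssertionError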


TASK_INPUTS = {
    # If the task input is a single variable, the value is an InputType.
    # If the task input is a tuple, the value is a tuple of InputType.
    # If the task input is a dict, the value is a dict of InputType, where
    #     each key equals the one expected in the pipeline input dict.
    # If the task accepts several formats, the value is a list in which
    #     each element corresponds to one accepted format as described above.
    # (An illustrative validation sketch for these formats follows the table.)
    # ============ vision tasks ===================
    Tasks.ocr_detection:
    InputType.IMAGE,
    Tasks.ocr_recognition:
    InputType.IMAGE,
    Tasks.face_2d_keypoints:
    InputType.IMAGE,
    Tasks.face_detection:
    InputType.IMAGE,
    Tasks.facial_expression_recognition:
    InputType.IMAGE,
    Tasks.face_recognition:
    InputType.IMAGE,
    Tasks.human_detection:
    InputType.IMAGE,
    Tasks.face_image_generation:
    InputType.INT,
    Tasks.image_classification:
    InputType.IMAGE,
    Tasks.image_object_detection:
    InputType.IMAGE,
    Tasks.image_segmentation:
    InputType.IMAGE,
    Tasks.portrait_matting:
    InputType.IMAGE,

    # image editing tasks that take a single image
    Tasks.skin_retouching:
    InputType.IMAGE,
    Tasks.image_super_resolution:
    InputType.IMAGE,
    Tasks.image_colorization:
    InputType.IMAGE,
    Tasks.image_color_enhancement:
    InputType.IMAGE,
    Tasks.image_denoising:
    InputType.IMAGE,
    Tasks.image_portrait_enhancement:
    InputType.IMAGE,
    Tasks.crowd_counting:
    InputType.IMAGE,

    # image generation tasks
    Tasks.image_to_image_generation:
    InputType.IMAGE,
    Tasks.image_to_image_translation:
    InputType.IMAGE,
    Tasks.image_style_transfer: {
        'content': InputType.IMAGE,
        'style': InputType.IMAGE,
    },
    Tasks.image_portrait_stylization:
    InputType.IMAGE,
    Tasks.live_category:
    InputType.VIDEO,
    Tasks.action_recognition:
    InputType.VIDEO,
    Tasks.body_2d_keypoints:
    InputType.IMAGE,
    Tasks.body_3d_keypoints:
    InputType.VIDEO,
    Tasks.hand_2d_keypoints:
    InputType.IMAGE,
    Tasks.video_single_object_tracking: (InputType.VIDEO, InputType.BOX),
    Tasks.video_category:
    InputType.VIDEO,
    Tasks.product_retrieval_embedding:
    InputType.IMAGE,
    Tasks.video_embedding:
    InputType.VIDEO,
    Tasks.virtual_try_on: (InputType.IMAGE, InputType.IMAGE, InputType.IMAGE),
    Tasks.text_driven_segmentation: {
        InputKeys.IMAGE: InputType.IMAGE,
        InputKeys.TEXT: InputType.TEXT
    },
    Tasks.shop_segmentation:
    InputType.IMAGE,
    Tasks.movie_scene_segmentation:
    InputType.VIDEO,

    # ============ nlp tasks ===================
    Tasks.text_classification: [
        InputType.TEXT,
        (InputType.TEXT, InputType.TEXT),
        {
            'text': InputType.TEXT,
            'text2': InputType.TEXT
        },
    ],
    Tasks.sentence_similarity: (InputType.TEXT, InputType.TEXT),
    Tasks.nli: (InputType.TEXT, InputType.TEXT),
    Tasks.sentiment_classification:
    InputType.TEXT,
    Tasks.zero_shot_classification:
    InputType.TEXT,
    Tasks.relation_extraction:
    InputType.TEXT,
    Tasks.translation:
    InputType.TEXT,
    Tasks.word_segmentation: [InputType.TEXT, {
        'text': InputType.TEXT,
    }],
    Tasks.part_of_speech:
    InputType.TEXT,
    Tasks.named_entity_recognition:
    InputType.TEXT,
    Tasks.text_error_correction:
    InputType.TEXT,
    Tasks.sentence_embedding: {
        'source_sentence': InputType.LIST,
        'sentences_to_compare': InputType.LIST,
    },
    Tasks.passage_ranking: (InputType.TEXT, InputType.TEXT),
    Tasks.text_generation:
    InputType.TEXT,
    Tasks.fill_mask:
    InputType.TEXT,
    Tasks.task_oriented_conversation: {
        'user_input': InputType.TEXT,
        'history': InputType.DICT,
    },
    Tasks.table_question_answering: {
        'question': InputType.TEXT,
        'history_sql': InputType.DICT,
    },
    Tasks.faq_question_answering: {
        'query_set': InputType.LIST,
        'support_set': InputType.LIST,
    },

    # ============ audio tasks ===================
    Tasks.auto_speech_recognition:
    InputType.AUDIO,
    Tasks.speech_signal_process:
    InputType.AUDIO,
    Tasks.acoustic_echo_cancellation: {
        'nearend_mic': InputType.AUDIO,
        'farend_speech': InputType.AUDIO
    },
    Tasks.acoustic_noise_suppression:
    InputType.AUDIO,
    Tasks.text_to_speech:
    InputType.TEXT,
    Tasks.keyword_spotting:
    InputType.AUDIO,

    # ============ multi-modal tasks ===================
    Tasks.image_captioning: [InputType.IMAGE, {
        'image': InputType.IMAGE,
    }],
    Tasks.visual_grounding: {
        'image': InputType.IMAGE,
        'text': InputType.TEXT
    },
    Tasks.text_to_image_synthesis: {
        'text': InputType.TEXT,
    },
    Tasks.multi_modal_embedding: {
        'img': InputType.IMAGE,
        'text': InputType.TEXT
    },
    Tasks.generative_multi_modal_embedding: {
        'image': InputType.IMAGE,
        'text': InputType.TEXT
    },
    Tasks.multi_modal_similarity: {
        'img': InputType.IMAGE,
        'text': InputType.TEXT
    },
    Tasks.visual_question_answering: {
        'image': InputType.IMAGE,
        'text': InputType.TEXT
    },
    Tasks.visual_entailment: {
        'image': InputType.IMAGE,
        'text': InputType.TEXT,
        'text2': InputType.TEXT,
    },
    Tasks.action_detection:
    InputType.VIDEO,
    Tasks.image_reid_person:
    InputType.IMAGE,
    Tasks.video_inpainting: {
        'video_input_path': InputType.TEXT,
        'video_output_path': InputType.TEXT,
        'mask_path': InputType.TEXT,
    }
}
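

# Illustrative sketch, not part of this module's API: one way a caller could
# validate a raw pipeline input against the TASK_INPUTS table above. It covers
# the four schema shapes described at the top of the table; the function names
# here are hypothetical.
def _check_single_format(schema, pipeline_input):
    if isinstance(schema, str):
        # Single InputType (e.g. Tasks.ocr_detection).
        check_input_type(schema, pipeline_input)
    elif isinstance(schema, tuple):
        # Fixed sequence of inputs (e.g. Tasks.video_single_object_tracking).
        assert isinstance(pipeline_input, tuple) \
            and len(pipeline_input) == len(schema), \
            f'expected a tuple of {len(schema)} inputs'
        for input_type, value in zip(schema, pipeline_input):
            check_input_type(input_type, value)
    elif isinstance(schema, dict):
        # Named inputs keyed the same way as the schema
        # (e.g. Tasks.visual_grounding).
        for key, input_type in schema.items():
            check_input_type(input_type, pipeline_input[key])


def _check_task_input(task, pipeline_input):
    schema = TASK_INPUTS[task]
    if isinstance(schema, list):
        # A list enumerates alternative formats; accept the first that fits.
        for candidate in schema:
            try:
                _check_single_format(candidate, pipeline_input)
                return
            except (AssertionError, KeyError, TypeError):
                continue
        raise ValueError(f'input matches no accepted format for task {task}')
    else:
        _check_single_format(schema, pipeline_input)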