| @@ -0,0 +1,3 @@ | |||
| version https://git-lfs.github.com/spec/v1 | |||
| oid sha256:8bdb9627c3a40897e84ee186b2a959f272790571644224e1d2efca443f867e12 | |||
| size 202823 | |||
| @@ -0,0 +1,3 @@ | |||
| version https://git-lfs.github.com/spec/v1 | |||
| oid sha256:8b89734b9c9d89342e58fbe406d3b9bdc8e07447cb170a4ae2743000471fc969 | |||
| size 23069 | |||
| @@ -0,0 +1,3 @@ | |||
| version https://git-lfs.github.com/spec/v1 | |||
| oid sha256:d53e9fbdd129b234dcbec9b9fe6a15a0e05820e802a873f95955574267bbd2ff | |||
| size 121141 | |||
| @@ -69,6 +69,7 @@ class Pipelines(object): | |||
| action_recognition = 'TAdaConv_action-recognition' | |||
| animal_recognation = 'resnet101-animal_recog' | |||
| cmdssl_video_embedding = 'cmdssl-r2p1d_video_embedding' | |||
| image_classification = 'image-classification' | |||
| face_detection = 'resnet-face-detection-scrfd10gkps' | |||
| live_category = 'live-category' | |||
| general_image_classification = 'vit-base_image-classification_ImageNet-labels' | |||
| @@ -92,6 +93,7 @@ class Pipelines(object): | |||
| text_generation = 'text-generation' | |||
| sentiment_analysis = 'sentiment-analysis' | |||
| sentiment_classification = 'sentiment-classification' | |||
| text_classification = 'text-classification' | |||
| fill_mask = 'fill-mask' | |||
| csanmt_translation = 'csanmt-translation' | |||
| nli = 'nli' | |||
| @@ -113,6 +115,8 @@ class Pipelines(object): | |||
| multi_modal_embedding = 'multi-modal-embedding' | |||
| generative_multi_modal_embedding = 'generative-multi-modal-embedding' | |||
| visual_question_answering = 'visual-question-answering' | |||
| visual_grounding = 'visual-grounding' | |||
| visual_entailment = 'visual-entailment' | |||
| text_to_image_synthesis = 'text-to-image-synthesis' | |||
| video_multi_modal_embedding = 'video-multi-modal-embedding' | |||
| @@ -11,7 +11,6 @@ if TYPE_CHECKING: | |||
| from .mmr import VideoCLIPForMultiModalEmbedding | |||
| from .mplug_for_visual_question_answering import \ | |||
| MPlugForVisualQuestionAnswering | |||
| from .ofa_for_image_captioning_model import OfaForImageCaptioning | |||
| else: | |||
| _import_structure = { | |||
| @@ -21,7 +20,7 @@ else: | |||
| 'mmr': ['VideoCLIPForMultiModalEmbedding'], | |||
| 'mplug_for_visual_question_answering': | |||
| ['MPlugForVisualQuestionAnswering'], | |||
| 'ofa_for_image_captioning_model': ['OfaForImageCaptioning'] | |||
| 'ofa_for_all_tasks': ['OfaForAllTasks'] | |||
| } | |||
| import sys | |||
| @@ -1,86 +0,0 @@ | |||
| import os.path as osp | |||
| from typing import Any, Dict | |||
| import torch.cuda | |||
| from PIL import Image | |||
| from modelscope.metainfo import Models | |||
| from modelscope.models.base import Model | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| __all__ = ['OfaForImageCaptioning'] | |||
| @MODELS.register_module(Tasks.image_captioning, module_name=Models.ofa) | |||
| class OfaForImageCaptioning(Model): | |||
| def __init__(self, model_dir, *args, **kwargs): | |||
| super().__init__(model_dir=model_dir, *args, **kwargs) | |||
| ckpt_name = ModelFile.TORCH_MODEL_FILE | |||
| local_model = osp.join(model_dir, ckpt_name) | |||
| bpe_dir = model_dir | |||
| # turn on cuda if GPU is available | |||
| from fairseq import checkpoint_utils, tasks, utils | |||
| from ofa.tasks.mm_tasks import CaptionTask | |||
| from ofa.utils.eval_utils import eval_caption | |||
| self.eval_caption = eval_caption | |||
| tasks.register_task('caption', CaptionTask) | |||
| if torch.cuda.is_available(): | |||
| self._device = torch.device('cuda') | |||
| else: | |||
| self._device = torch.device('cpu') | |||
| self.use_fp16 = kwargs[ | |||
| 'use_fp16'] if 'use_fp16' in kwargs and torch.cuda.is_available()\ | |||
| else False | |||
| overrides = { | |||
| 'bpe_dir': bpe_dir, | |||
| 'eval_cider': False, | |||
| 'beam': 5, | |||
| 'max_len_b': 16, | |||
| 'no_repeat_ngram_size': 3, | |||
| 'seed': 7 | |||
| } | |||
| models, cfg, task = checkpoint_utils.load_model_ensemble_and_task( | |||
| utils.split_paths(local_model), arg_overrides=overrides) | |||
| # Move models to GPU | |||
| for model in models: | |||
| model.eval() | |||
| model.to(self._device) | |||
| if self.use_fp16: | |||
| model.half() | |||
| model.prepare_for_inference_(cfg) | |||
| self.models = models | |||
| # Initialize generator | |||
| self.generator = task.build_generator(models, cfg.generation) | |||
| # Initialize transform | |||
| from torchvision import transforms | |||
| mean = [0.5, 0.5, 0.5] | |||
| std = [0.5, 0.5, 0.5] | |||
| self.patch_resize_transform = transforms.Compose([ | |||
| lambda image: image.convert('RGB'), | |||
| transforms.Resize( | |||
| (cfg.task.patch_image_size, cfg.task.patch_image_size), | |||
| interpolation=Image.BICUBIC), | |||
| transforms.ToTensor(), | |||
| transforms.Normalize(mean=mean, std=std), | |||
| ]) | |||
| self.task = task | |||
| def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | |||
| import fairseq.utils | |||
| if torch.cuda.is_available(): | |||
| input = fairseq.utils.move_to_cuda(input, device=self._device) | |||
| results, _ = self.eval_caption(self.task, self.generator, self.models, | |||
| input) | |||
| from modelscope.outputs import OutputKeys | |||
| return { | |||
| 'image_id': results[0]['image_id'], | |||
| OutputKeys.CAPTION: results[0][OutputKeys.CAPTION] | |||
| } | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
| # What should we do here ? | |||
| return inputs | |||
| @@ -194,13 +194,6 @@ class SequenceGenerator(nn.Module): | |||
| bos_token: Optional[int] = None, | |||
| ): | |||
| model = EnsembleModel(models) | |||
| # incremental_states = torch.jit.annotate( | |||
| # List[Dict[str, Dict[str, Optional[Tensor]]]], | |||
| # [ | |||
| # torch.jit.annotate(Dict[str, Dict[str, Optional[Tensor]]], {}) | |||
| # for i in range(model.models_size) | |||
| # ], | |||
| # ) | |||
| incremental_states = torch.jit.annotate( | |||
| List[Tuple[Tuple[torch.Tensor]]], | |||
| [ | |||
| @@ -208,8 +201,6 @@ class SequenceGenerator(nn.Module): | |||
| for i in range(model.models_size) | |||
| ], | |||
| ) | |||
| # print("incremental_states",incremental_states) | |||
| # print("incremental_states[0]",incremental_states[0]) | |||
| net_input = sample['net_input'] | |||
| if 'src_tokens' in net_input: | |||
| @@ -281,7 +272,6 @@ class SequenceGenerator(nn.Module): | |||
| tokens = (torch.zeros(bsz * beam_size, | |||
| max_len + 2).to(src_tokens).long().fill_( | |||
| self.pad)) # +2 for eos and pad | |||
| # tokens[:, 0] = self.eos if bos_token is None else bos_token | |||
| tokens[:, 0] = self.bos | |||
| attn: Optional[Tensor] = None | |||
| @@ -335,7 +325,7 @@ class SequenceGenerator(nn.Module): | |||
| corr.unsqueeze(-1) * beam_size) | |||
| original_batch_idxs = original_batch_idxs[batch_idxs] | |||
| model.reorder_incremental_state(incremental_states, | |||
| reorder_state) # todo | |||
| reorder_state) | |||
| encoder_outs = model.reorder_encoder_out( | |||
| encoder_outs, reorder_state) | |||
| @@ -479,7 +469,6 @@ class SequenceGenerator(nn.Module): | |||
| batch_mask = torch.ones( | |||
| bsz, dtype=torch.bool, device=cand_indices.device) | |||
| batch_mask[finalized_sents] = False | |||
| # TODO replace `nonzero(as_tuple=False)` after TorchScript supports it | |||
| batch_idxs = torch.arange( | |||
| bsz, device=cand_indices.device).masked_select(batch_mask) | |||
| @@ -833,7 +822,7 @@ class EnsembleModel(nn.Module): | |||
| # decode each model | |||
| if self.has_incremental_states(): | |||
| decoder_out = model.decoder.forward( # todo: model inputs differ | |||
| decoder_out = model.decoder.forward( | |||
| input_ids=tokens, | |||
| attention_mask=attention_mask, | |||
| encoder_hidden_states=encoder_hidden_states, | |||
| @@ -846,7 +835,7 @@ class EnsembleModel(nn.Module): | |||
| else: | |||
| if hasattr(model, 'decoder'): | |||
| # decoder_out = model.decoder.forward(tokens, code_masks=code_mask, encoder_out=encoder_out) | |||
| decoder_out = model.decoder.forward( # todo: model inputs differ | |||
| decoder_out = model.decoder.forward( | |||
| input_ids=tokens, | |||
| attention_mask=attention_mask, | |||
| encoder_hidden_states=encoder_hidden_states, | |||
| @@ -855,32 +844,9 @@ class EnsembleModel(nn.Module): | |||
| src_pos_embed=src_pos_embed) | |||
| else: | |||
| decoder_out = model.forward(tokens) | |||
| # print('#### decoder_out ####', decoder_out) | |||
| # print('#### decoder_out ####', decoder_out.keys()) | |||
| # for k,v in decoder_out.items(): | |||
| # print(k) | |||
| # if isinstance(v, Tensor): | |||
| # print(v.shape) | |||
| # elif k == "past_key_values": | |||
| # print(len(v)) | |||
| # print([v[0][i].shape for i in range(len(v[0]))]) | |||
| # else: | |||
| # print(len(v)) | |||
| # print([v[i].shape for i in range(len(v))]) | |||
| attn: Optional[Tensor] = None | |||
| decoder_len = len(decoder_out) | |||
| # if decoder_len > 1 and decoder_out[1] is not None: | |||
| # if isinstance(decoder_out[1], Tensor): | |||
| # attn = decoder_out[1] | |||
| # else: | |||
| # attn_holder = decoder_out[1]["attn"] | |||
| # if isinstance(attn_holder, Tensor): | |||
| # attn = attn_holder | |||
| # elif attn_holder is not None: | |||
| # attn = attn_holder[0] | |||
| # if attn is not None: | |||
| # attn = attn[:, -1, :] | |||
| if 'cross_attentions' in decoder_out: | |||
| attn = decoder_out['cross_attentions'][-1].transpose(1, 0) | |||
| @@ -888,11 +854,6 @@ class EnsembleModel(nn.Module): | |||
| if attn is not None: | |||
| attn = attn[:, -1, :] | |||
| # decoder_out_tuple = ( | |||
| # decoder_out[0][:, -1:, :].div_(temperature), | |||
| # None if decoder_len <= 1 else decoder_out[1], | |||
| # ) | |||
| decoder_out_tuple = ( | |||
| decoder_out[0][:, -1:, :].div_(temperature), | |||
| None if decoder_len <= 1 else attn, | |||
| @@ -993,5 +954,5 @@ class EnsembleModel(nn.Module): | |||
| if not self.has_incremental_states(): | |||
| return | |||
| for i, model in enumerate(self.models): | |||
| model.decoder.reorder_incremental_state_scripting( # todo | |||
| model.decoder.reorder_incremental_state_scripting( | |||
| incremental_states[i], new_order) | |||
| @@ -0,0 +1,13 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.utils.constant import Tasks | |||
| OFA_TASK_KEY_MAPPING = { | |||
| Tasks.image_captioning: OutputKeys.CAPTION, | |||
| Tasks.summarization: OutputKeys.TEXT, | |||
| Tasks.visual_question_answering: OutputKeys.TEXT, | |||
| Tasks.visual_grounding: OutputKeys.BOXES, | |||
| Tasks.text_classification: (OutputKeys.SCORES, OutputKeys.LABELS), | |||
| Tasks.image_classification: OutputKeys.LABELS, | |||
| Tasks.visual_entailment: (OutputKeys.SCORES, OutputKeys.LABELS), | |||
| } | |||
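A quick illustration (not part of the patch) of how this mapping is consumed: given the configured task, the model looks up which output key(s) it should populate. The import path follows the patch; the tasks used below are just examples.

    from modelscope.models.multi_modal.ofa.utils.constant import OFA_TASK_KEY_MAPPING
    from modelscope.outputs import OutputKeys
    from modelscope.utils.constant import Tasks

    # Visual grounding emits boxes; visual entailment emits (scores, labels).
    assert OFA_TASK_KEY_MAPPING[Tasks.visual_grounding] == OutputKeys.BOXES
    scores_key, labels_key = OFA_TASK_KEY_MAPPING[Tasks.visual_entailment]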
| @@ -0,0 +1,19 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import Optional | |||
| import torch | |||
| def expand_mask(mask: torch.Tensor, | |||
| dtype: torch.dtype, | |||
| tgt_len: Optional[int] = None): | |||
| r""" | |||
| Expand a padding mask of shape `[bsz, src_len]` to an additive attention bias of shape `[bsz, 1, tgt_len, src_len]`, filling padded (True) positions with the dtype minimum. | |||
| """ | |||
| bsz, src_len = mask.size() | |||
| tgt_len = tgt_len if tgt_len is not None else src_len | |||
| expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, | |||
| src_len).to(dtype) | |||
| return expanded_mask.masked_fill(expanded_mask.bool(), | |||
| torch.finfo(dtype).min) | |||
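As a sanity sketch (not part of the patch): the helper turns a boolean padding mask (True at padded positions) into an additive attention bias, so padded positions receive the dtype minimum and everything else stays zero. The import path follows the patch.

    import torch

    from modelscope.models.multi_modal.ofa.utils.utils import expand_mask

    # Batch of 2, source length 3; the last position of the second row is padding.
    pad_mask = torch.tensor([[0, 0, 0], [0, 0, 1]], dtype=torch.bool)
    bias = expand_mask(pad_mask, dtype=torch.float32, tgt_len=2)
    print(bias.shape)     # torch.Size([2, 1, 2, 3])
    print(bias[1, 0, 0])  # tensor([0., 0., -3.4028e+38]); the padded position is suppressed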
| @@ -0,0 +1,259 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import math | |||
| from os import path as osp | |||
| from typing import Any, Dict | |||
| import json | |||
| import torch.cuda | |||
| import torch.nn.functional as F | |||
| from modelscope.metainfo import Models | |||
| from modelscope.models.base import Model, Tensor | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.preprocessors.ofa.utils.collate import collate_tokens | |||
| from modelscope.utils.config import Config | |||
| from modelscope.utils.constant import ModelFile | |||
| from modelscope.utils.trie import Trie | |||
| from .ofa import OFAModel, OFATokenizer | |||
| from .ofa.generate import sequence_generator as sg | |||
| from .ofa.generate.utils import move_to_device | |||
| from .ofa.utils.constant import OFA_TASK_KEY_MAPPING, Tasks | |||
| from .ofa.utils.utils import expand_mask | |||
| __all__ = ['OfaForAllTasks'] | |||
| @MODELS.register_module(Tasks.image_captioning, module_name=Models.ofa) | |||
| @MODELS.register_module(Tasks.visual_grounding, module_name=Models.ofa) | |||
| @MODELS.register_module( | |||
| Tasks.visual_question_answering, module_name=Models.ofa) | |||
| @MODELS.register_module(Tasks.visual_entailment, module_name=Models.ofa) | |||
| @MODELS.register_module(Tasks.image_classification, module_name=Models.ofa) | |||
| @MODELS.register_module(Tasks.summarization, module_name=Models.ofa) | |||
| @MODELS.register_module(Tasks.text_classification, module_name=Models.ofa) | |||
| class OfaForAllTasks(Model): | |||
| def __init__(self, model_dir, *args, **kwargs): | |||
| super().__init__(model_dir=model_dir, *args, **kwargs) | |||
| model = OFAModel.from_pretrained(model_dir) | |||
| self.cfg = Config.from_file( | |||
| osp.join(model_dir, ModelFile.CONFIGURATION)) | |||
| self.model = model.module if hasattr(model, 'module') else model | |||
| self.tokenizer = OFATokenizer.from_pretrained(model_dir) | |||
| self.tokenizer.add_tokens(['<code_{}>'.format(i) for i in range(8192)]) | |||
| self.tokenizer.add_tokens(['<bin_{}>'.format(i) for i in range(1000)]) | |||
| self.cfg.update({'num_bins': 1000, 'num_codes': 8192}) | |||
| self.batch_size = self.cfg.model.get('batch_size', 1) | |||
| self.val_batch_size = self.cfg.model.get('valid_batch_size', | |||
| self.batch_size) | |||
| self.gen_type = self.cfg.model.get('gen_type', 'generation') | |||
| assert self.gen_type in ['generation', 'traverse'], \ | |||
| 'model.gen_type must be in ["generation", "traverse"]' | |||
| self._device = torch.device('cuda') if torch.cuda.is_available() \ | |||
| else torch.device('cpu') | |||
| self.eos_item = torch.LongTensor([self.tokenizer.eos_token_id | |||
| ]).to(self._device) | |||
| self.index2ans = {} | |||
| self.ans2label_dict = {} | |||
| self.load_ans2label() | |||
| # Initialize generator | |||
| sg_args = { | |||
| 'tokenizer': self.tokenizer, | |||
| 'beam_size': 5, | |||
| 'max_len_b': 16, | |||
| 'min_len': 1, | |||
| 'no_repeat_ngram_size': 3, | |||
| 'constraint_range': None | |||
| } | |||
| if hasattr(self.cfg.model, 'beam_search'): | |||
| sg_args.update(self.cfg.model.beam_search) | |||
| if len(self.ans2label_dict) > 0: | |||
| self.constraint_trie = Trie(self.tokenizer.eos_token_id) | |||
| self.val_ans_l = [] | |||
| self.val_masks_l = [] | |||
| self.build_trie() | |||
| sg_args['constraint_trie'] = self.constraint_trie | |||
| self.model.to(self._device) | |||
| self.generator = sg.SequenceGenerator(**sg_args) | |||
| inference_d = { | |||
| 'generation': self._text_gen_inference, | |||
| 'traverse': self._traverse_inference, | |||
| } | |||
| self.task_inference_mapping = { | |||
| Tasks.image_captioning: self._text_gen_inference, | |||
| Tasks.summarization: self._text_gen_inference, | |||
| Tasks.visual_grounding: self._visual_grounding_inference, | |||
| Tasks.visual_entailment: inference_d[self.gen_type], | |||
| Tasks.visual_question_answering: inference_d[self.gen_type], | |||
| Tasks.text_classification: inference_d[self.gen_type], | |||
| Tasks.image_classification: inference_d[self.gen_type], | |||
| } | |||
| def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | |||
| ret = self.task_inference_mapping[self.cfg.task](input) | |||
| ret['samples'] = input['samples'] | |||
| for key in [ | |||
| OutputKeys.CAPTION, OutputKeys.TEXT, OutputKeys.BOXES, | |||
| OutputKeys.LABELS, OutputKeys.SCORES | |||
| ]: | |||
| if key in ret and len(ret[key]) == 1: | |||
| ret[key] = ret[key][0] | |||
| if key not in ret: | |||
| ret[key] = None | |||
| return ret | |||
| def postprocess(self, input: Dict[str, Tensor], | |||
| **kwargs) -> Dict[str, Tensor]: | |||
| return input | |||
| def _text_gen_inference(self, input): | |||
| input = move_to_device(input, self._device) | |||
| gen_output = self.generator.generate([self.model], input) | |||
| gen = [gen_output[i][0]['tokens'] for i in range(len(gen_output))] | |||
| result = self.tokenizer.batch_decode(gen, skip_special_tokens=True) | |||
| # generation itself yields no score; classification-style tasks get a placeholder score of 1.0 below | |||
| ret = {OFA_TASK_KEY_MAPPING[self.cfg.task]: result} | |||
| if self.cfg.task.endswith('classification'): | |||
| ret[OutputKeys.SCORES] = [1.0] * len(result) | |||
| return ret | |||
| def _visual_grounding_inference(self, input): | |||
| input = move_to_device(input, self._device) | |||
| gen_output = self.generator.generate([self.model], input) | |||
| tokens = [gen_output[i][0]['tokens'] for i in range(len(gen_output))] | |||
| region_coord_l = list() | |||
| for i in range(len(tokens)): | |||
| region_coord_l.append(tokens[i][:-1] | |||
| - len(self.tokenizer.get_vocab().items()) | |||
| + self.cfg.num_bins) | |||
| region_tensor = torch.stack(region_coord_l, dim=0) | |||
| region_tensor = region_tensor / ( | |||
| self.cfg.num_bins - 1) * self.cfg.model.get('max_image_size', 512) | |||
| region_tensor[:, ::2] /= input['w_resize_ratios'] | |||
| region_tensor[:, 1::2] /= input['h_resize_ratios'] | |||
| return { | |||
| OutputKeys.BOXES: move_to_device(region_tensor, | |||
| torch.device('cpu')), | |||
| OutputKeys.SCORES: [1.0] * region_tensor.shape[0] | |||
| } | |||
| def _traverse_inference(self, input): | |||
| input = move_to_device(input, self._device) | |||
| encoder_input = dict() | |||
| for key in input['net_input'].keys(): | |||
| encoder_input[key] = input['net_input'][key] | |||
| encoder_out = self.model.encoder(**encoder_input) | |||
| valid_result = [] | |||
| for val_ans, val_masks in zip(self.val_ans_l, self.val_masks_l): | |||
| valid_size = len(val_ans) | |||
| valid_tgt_items = [ | |||
| torch.cat([ | |||
| torch.tensor(decoder_prompt[1:]), valid_answer, | |||
| self.eos_item | |||
| ]) for decoder_prompt in input['decoder_prompts'] | |||
| for valid_answer in val_ans | |||
| ] | |||
| valid_prev_items = [ | |||
| torch.cat([torch.tensor(decoder_prompt), valid_answer]) | |||
| for decoder_prompt in input['decoder_prompts'] | |||
| for valid_answer in val_ans | |||
| ] | |||
| valid_constraint_mask_items = [ | |||
| torch.cat([ | |||
| torch.zeros( | |||
| len(decoder_prompt) - 1, | |||
| valid_constraint_mask.size(1)).bool().to(self._device), | |||
| valid_constraint_mask], dim=0) # yapf: disable | |||
| for decoder_prompt in input['decoder_prompts'] # yapf: disable | |||
| for valid_constraint_mask in val_masks] # yapf: disable | |||
| valid_tgt = collate_tokens( | |||
| valid_tgt_items, | |||
| pad_idx=self.tokenizer.pad_token_id).to(self._device) | |||
| valid_prev_output = collate_tokens( | |||
| valid_prev_items, | |||
| pad_idx=self.tokenizer.pad_token_id).to(self._device) | |||
| val_masks = collate_tokens( | |||
| valid_constraint_mask_items, | |||
| pad_idx=self.tokenizer.pad_token_id).to(self._device) | |||
| new_encoder_out = { | |||
| 'last_hidden_state': | |||
| encoder_out['last_hidden_state'].repeat_interleave( | |||
| valid_size, dim=0), | |||
| 'padding_mask': | |||
| encoder_out['padding_mask'].repeat_interleave( | |||
| valid_size, dim=0), | |||
| 'position_embedding': | |||
| encoder_out['position_embedding'].repeat_interleave( | |||
| valid_size, dim=0) | |||
| } | |||
| encoder_attention_mask = expand_mask( | |||
| new_encoder_out['padding_mask'], | |||
| new_encoder_out['last_hidden_state'].dtype, | |||
| valid_prev_output.shape[-1]) | |||
| decoder_out = self.model.decoder( | |||
| valid_prev_output, | |||
| encoder_hidden_states=new_encoder_out['last_hidden_state'], | |||
| encoder_attention_mask=encoder_attention_mask, | |||
| src_pos_embed=new_encoder_out['position_embedding']) | |||
| decoder_out[0].masked_fill_(~val_masks, -math.inf) | |||
| lprobs = self.model.get_normalized_probs( | |||
| decoder_out, log_probs=True) | |||
| scores = lprobs.gather( | |||
| dim=-1, index=valid_tgt.unsqueeze(-1)).squeeze(-1) | |||
| scores = scores.masked_fill( | |||
| valid_tgt.eq(self.tokenizer.pad_token_id), 0) | |||
| scores = scores.masked_fill((~val_masks).all(2), 0) | |||
| scores = scores.sum(1) | |||
| scores = scores.view(-1, valid_size) | |||
| valid_result.append(scores) | |||
| valid_result = torch.cat(valid_result, dim=-1) | |||
| predicts = valid_result.argmax(1).tolist() | |||
| probs = F.softmax(valid_result, dim=-1) | |||
| hyps = [self.index2ans[predict_index] for predict_index in predicts] | |||
| scores = [ | |||
| float(prob[idx].cpu().detach().numpy()) | |||
| for prob, idx in zip(probs, predicts) | |||
| ] | |||
| return {OutputKeys.LABELS: hyps, OutputKeys.SCORES: scores} | |||
| def build_trie(self): | |||
| answer_item_list = [] | |||
| for i, answer in enumerate(self.ans2label_dict.keys()): | |||
| answer_item = self.tokenizer( | |||
| ' ' + answer, return_tensors='pt', | |||
| add_special_tokens=False).input_ids.squeeze(0) | |||
| answer_item_list.append(answer_item) | |||
| self.index2ans[i] = answer | |||
| self.constraint_trie.insert([self.tokenizer.bos_token_id] | |||
| + answer_item.tolist() | |||
| + [self.tokenizer.eos_token_id]) | |||
| constraint_mask_list = [] | |||
| for answer_item in answer_item_list: | |||
| constraint_mask = torch.zeros( | |||
| (len(answer_item) + 1, | |||
| len(self.tokenizer.get_vocab()))).bool() | |||
| for i in range(len(answer_item) + 1): | |||
| constraint_prefix_token = [self.tokenizer.bos_token_id | |||
| ] + answer_item[:i].tolist() | |||
| constraint_nodes = self.constraint_trie.get_next_layer( | |||
| constraint_prefix_token) | |||
| constraint_mask[i][constraint_nodes] = True | |||
| constraint_mask_list.append(constraint_mask) | |||
| for i in range(0, len(answer_item_list), self.val_batch_size): | |||
| self.val_ans_l += [answer_item_list[i:i + self.val_batch_size]] | |||
| self.val_masks_l += [ | |||
| constraint_mask_list[i:i + self.val_batch_size] | |||
| ] | |||
| self.val_ans_l = move_to_device(self.val_ans_l, self._device) | |||
| self.val_masks_l = move_to_device(self.val_masks_l, self._device) | |||
| def load_ans2label(self): | |||
| if self.cfg.model.get('answer2label', None): | |||
| filename = osp.join(self.model_dir, self.cfg.model.answer2label) | |||
| self.ans2label_dict = json.load(open(filename)) | |||
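For orientation (not part of the patch), a hedged sketch of how the unified model is reached through the pipeline API once the registrations above are in place. The model id is a placeholder, and the dict input mirrors what the OFA preprocessors in this patch expect.

    from modelscope.outputs import OutputKeys
    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    # '<ofa-image-caption-model-id>' stands in for an OFA captioning model on the
    # ModelScope hub; OfaForAllTasks and OfaPreprocessor are resolved via the
    # registries populated by this patch.
    captioner = pipeline(Tasks.image_captioning, model='<ofa-image-caption-model-id>')
    result = captioner({'image': 'path/to/image.jpg'})
    print(result[OutputKeys.CAPTION])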
| @@ -1,53 +0,0 @@ | |||
| from typing import Any, Dict | |||
| import torch.cuda | |||
| from modelscope.metainfo import Models | |||
| from modelscope.models.base import Model | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.utils.constant import Tasks | |||
| from .ofa import OFAModel, OFATokenizer | |||
| from .ofa.generate import sequence_generator as sg | |||
| from .ofa.generate.utils import move_to_device | |||
| __all__ = ['OfaForImageCaptioning'] | |||
| @MODELS.register_module(Tasks.image_captioning, module_name=Models.ofa) | |||
| class OfaForImageCaptioning(Model): | |||
| def __init__(self, model_dir, *args, **kwargs): | |||
| super().__init__(model_dir=model_dir, *args, **kwargs) | |||
| model = OFAModel.from_pretrained(model_dir) | |||
| self.model = model.module if hasattr(model, 'module') else model | |||
| self.tokenizer = OFATokenizer.from_pretrained(model_dir) | |||
| self.tokenizer.add_tokens(['<code_{}>'.format(i) for i in range(8192)]) | |||
| self.tokenizer.add_tokens(['<bin_{}>'.format(i) for i in range(1000)]) | |||
| self._device = torch.device('cuda') if torch.cuda.is_available() \ | |||
| else torch.device('cpu') | |||
| self.model.to(self._device) | |||
| # Initialize generator | |||
| sg_args = { | |||
| 'tokenizer': self.tokenizer, | |||
| 'beam_size': 5, | |||
| 'max_len_b': 16, | |||
| 'min_len': 1, | |||
| 'no_repeat_ngram_size': 3, | |||
| 'constraint_range': None | |||
| } | |||
| if hasattr(kwargs, 'beam_search'): | |||
| sg_args.update(kwargs['beam_search']) | |||
| self.generator = sg.SequenceGenerator(**sg_args) | |||
| def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | |||
| input = move_to_device(input, self._device) | |||
| gen_output = self.generator.generate([self.model], input) | |||
| gen = [gen_output[i][0]['tokens'] for i in range(len(gen_output))] | |||
| result = self.tokenizer.batch_decode(gen, skip_special_tokens=True) | |||
| return {'image_id': '42', OutputKeys.CAPTION: result[0]} | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
| # What should we do here ? | |||
| return inputs | |||
| @@ -24,6 +24,7 @@ if TYPE_CHECKING: | |||
| from .ocr_detection_pipeline import OCRDetectionPipeline | |||
| from .video_category_pipeline import VideoCategoryPipeline | |||
| from .virtual_tryon_pipeline import VirtualTryonPipeline | |||
| from .image_classification_pipeline import ImageClassificationPipeline | |||
| else: | |||
| _import_structure = { | |||
| 'action_recognition_pipeline': ['ActionRecognitionPipeline'], | |||
| @@ -33,7 +34,7 @@ else: | |||
| 'face_image_generation_pipeline': ['FaceImageGenerationPipeline'], | |||
| 'face_recognition_pipeline': ['FaceRecognitionPipeline'], | |||
| 'image_classification_pipeline': | |||
| ['GeneralImageClassificationPipeline'], | |||
| ['GeneralImageClassificationPipeline', 'ImageClassificationPipeline'], | |||
| 'image_cartoon_pipeline': ['ImageCartoonPipeline'], | |||
| 'image_denoise_pipeline': ['ImageDenoisePipeline'], | |||
| 'image_color_enhance_pipeline': ['ImageColorEnhancePipeline'], | |||
| @@ -1,4 +1,5 @@ | |||
| from typing import Any, Dict | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import Any, Dict, Optional, Union | |||
| import cv2 | |||
| import numpy as np | |||
| @@ -7,16 +8,41 @@ import torch | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines.base import Input | |||
| from modelscope.preprocessors import load_image | |||
| from modelscope.pipelines.base import Input, Model, Pipeline | |||
| from modelscope.pipelines.builder import PIPELINES | |||
| from modelscope.preprocessors import OfaPreprocessor, Preprocessor, load_image | |||
| from modelscope.utils.constant import Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| from ..base import Pipeline | |||
| from ..builder import PIPELINES | |||
| logger = get_logger() | |||
| @PIPELINES.register_module( | |||
| Tasks.image_classification, module_name=Pipelines.image_classification) | |||
| class ImageClassificationPipeline(Pipeline): | |||
| def __init__(self, | |||
| model: Union[Model, str], | |||
| preprocessor: Optional[Preprocessor] = None, | |||
| **kwargs): | |||
| super().__init__(model=model) | |||
| assert isinstance(model, str) or isinstance(model, Model), \ | |||
| 'model must be a single str or OfaForAllTasks' | |||
| if isinstance(model, str): | |||
| pipe_model = Model.from_pretrained(model) | |||
| elif isinstance(model, Model): | |||
| pipe_model = model | |||
| else: | |||
| raise NotImplementedError | |||
| pipe_model.model.eval() | |||
| if preprocessor is None and pipe_model: | |||
| preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir) | |||
| super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
| return inputs | |||
| @PIPELINES.register_module( | |||
| Tasks.image_classification_imagenet, | |||
| module_name=Pipelines.general_image_classification) | |||
| @@ -27,7 +53,7 @@ class GeneralImageClassificationPipeline(Pipeline): | |||
| def __init__(self, model: str, **kwargs): | |||
| """ | |||
| use `model` and `preprocessor` to create a kws pipeline for prediction | |||
| use `model` and `preprocessor` to create an image classification pipeline for prediction | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| @@ -5,7 +5,9 @@ from modelscope.utils.import_utils import LazyImportModule | |||
| if TYPE_CHECKING: | |||
| from .generative_multi_modal_embedding_pipeline import GEMMMultiModalEmbeddingPipeline | |||
| from .image_captioning_pipeline import ImageCaptionPipeline | |||
| from .image_captioning_pipeline import ImageCaptioningPipeline | |||
| from .visual_entailment_pipeline import VisualEntailmentPipeline | |||
| from .visual_grounding_pipeline import VisualGroundingPipeline | |||
| from .multi_modal_embedding_pipeline import MultiModalEmbeddingPipeline | |||
| from .text_to_image_synthesis_pipeline import TextToImageSynthesisPipeline | |||
| from .video_multi_modal_embedding_pipeline import \ | |||
| @@ -14,7 +16,9 @@ if TYPE_CHECKING: | |||
| else: | |||
| _import_structure = { | |||
| 'image_captioning_pipeline': ['ImageCaptionPipeline'], | |||
| 'image_captioning_pipeline': ['ImageCaptioningPipeline'], | |||
| 'visual_entailment_pipeline': ['VisualEntailmentPipeline'], | |||
| 'visual_grounding_pipeline': ['VisualGroundingPipeline'], | |||
| 'multi_modal_embedding_pipeline': ['MultiModalEmbeddingPipeline'], | |||
| 'text_to_image_synthesis_pipeline': ['TextToImageSynthesisPipeline'], | |||
| 'visual_question_answering_pipeline': | |||
| @@ -1,9 +1,10 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import Any, Dict, Optional, Union | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.pipelines.base import Model, Pipeline | |||
| from modelscope.pipelines.builder import PIPELINES | |||
| from modelscope.preprocessors import OfaImageCaptionPreprocessor, Preprocessor | |||
| from modelscope.preprocessors import OfaPreprocessor, Preprocessor | |||
| from modelscope.utils.constant import Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| @@ -12,28 +13,29 @@ logger = get_logger() | |||
| @PIPELINES.register_module( | |||
| Tasks.image_captioning, module_name=Pipelines.image_captioning) | |||
| class ImageCaptionPipeline(Pipeline): | |||
| class ImageCaptioningPipeline(Pipeline): | |||
| def __init__(self, | |||
| model: Union[Model, str], | |||
| preprocessor: Optional[Preprocessor] = None, | |||
| **kwargs): | |||
| """ | |||
| use `model` and `preprocessor` to create a kws pipeline for prediction | |||
| use `model` and `preprocessor` to create an image captioning pipeline for prediction | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| super().__init__(model=model) | |||
| assert isinstance(model, str) or isinstance(model, Model), \ | |||
| 'model must be a single str or OfaForImageCaptioning' | |||
| 'model must be a single str or OfaForAllTasks' | |||
| if isinstance(model, str): | |||
| pipe_model = Model.from_pretrained(model) | |||
| elif isinstance(model, Model): | |||
| pipe_model = model | |||
| else: | |||
| raise NotImplementedError | |||
| pipe_model.model.eval() | |||
| if preprocessor is None and pipe_model: | |||
| preprocessor = OfaImageCaptionPreprocessor(model_dir=model) | |||
| preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir) | |||
| super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
| @@ -0,0 +1,42 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import Any, Dict, Optional, Union | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.pipelines.base import Model, Pipeline | |||
| from modelscope.pipelines.builder import PIPELINES | |||
| from modelscope.preprocessors import OfaPreprocessor, Preprocessor | |||
| from modelscope.utils.constant import Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| logger = get_logger() | |||
| @PIPELINES.register_module( | |||
| Tasks.visual_entailment, module_name=Pipelines.visual_entailment) | |||
| class VisualEntailmentPipeline(Pipeline): | |||
| def __init__(self, | |||
| model: Union[Model, str], | |||
| preprocessor: Optional[Preprocessor] = None, | |||
| **kwargs): | |||
| """ | |||
| use `model` and `preprocessor` to create a visual entailment pipeline for prediction | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| super().__init__(model=model) | |||
| assert isinstance(model, str) or isinstance(model, Model), \ | |||
| 'model must be a single str or OfaForAllTasks' | |||
| if isinstance(model, str): | |||
| pipe_model = Model.from_pretrained(model) | |||
| elif isinstance(model, Model): | |||
| pipe_model = model | |||
| else: | |||
| raise NotImplementedError | |||
| pipe_model.model.eval() | |||
| if preprocessor is None and pipe_model: | |||
| preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir) | |||
| super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
| return inputs | |||
| @@ -0,0 +1,42 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import Any, Dict, Optional, Union | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.pipelines.base import Model, Pipeline | |||
| from modelscope.pipelines.builder import PIPELINES | |||
| from modelscope.preprocessors import OfaPreprocessor, Preprocessor | |||
| from modelscope.utils.constant import Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| logger = get_logger() | |||
| @PIPELINES.register_module( | |||
| Tasks.visual_grounding, module_name=Pipelines.visual_grounding) | |||
| class VisualGroundingPipeline(Pipeline): | |||
| def __init__(self, | |||
| model: Union[Model, str], | |||
| preprocessor: Optional[Preprocessor] = None, | |||
| **kwargs): | |||
| """ | |||
| use `model` and `preprocessor` to create a visual grounding pipeline for prediction | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| super().__init__(model=model) | |||
| assert isinstance(model, str) or isinstance(model, Model), \ | |||
| 'model must be a single str or OfaForAllTasks' | |||
| if isinstance(model, str): | |||
| pipe_model = Model.from_pretrained(model) | |||
| elif isinstance(model, Model): | |||
| pipe_model = model | |||
| else: | |||
| raise NotImplementedError | |||
| pipe_model.model.eval() | |||
| if preprocessor is None and pipe_model: | |||
| preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir) | |||
| super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
| return inputs | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import Any, Dict, Optional, Union | |||
| import torch | |||
| @@ -30,15 +31,18 @@ class VisualQuestionAnsweringPipeline(Pipeline): | |||
| model (MPlugForVisualQuestionAnswering): a model instance | |||
| preprocessor (MPlugVisualQuestionAnsweringPreprocessor): a preprocessor instance | |||
| """ | |||
| model = model if isinstance( | |||
| model, | |||
| MPlugForVisualQuestionAnswering) else Model.from_pretrained(model) | |||
| model = model if isinstance(model, | |||
| Model) else Model.from_pretrained(model) | |||
| self.tokenizer = None | |||
| if preprocessor is None: | |||
| preprocessor = MPlugVisualQuestionAnsweringPreprocessor( | |||
| model.model_dir) | |||
| model.eval() | |||
| if isinstance(model, MPlugForVisualQuestionAnswering): | |||
| model.eval() | |||
| self.tokenizer = model.tokenizer | |||
| else: | |||
| model.model.eval() | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.tokenizer = model.tokenizer | |||
| def forward(self, inputs: Dict[str, Any], | |||
| **forward_params) -> Dict[str, Any]: | |||
| @@ -55,6 +59,8 @@ class VisualQuestionAnsweringPipeline(Pipeline): | |||
| Returns: | |||
| Dict[str, str]: the prediction results | |||
| """ | |||
| if self.tokenizer is None: | |||
| return inputs | |||
| replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''), | |||
| ('[unused1]', ''), (r' +', ' '), ('[SEP]', ''), | |||
| ('[unused2]', ''), ('[CLS]', ''), ('[UNK]', '')) | |||
| @@ -17,6 +17,8 @@ if TYPE_CHECKING: | |||
| from .translation_pipeline import TranslationPipeline | |||
| from .word_segmentation_pipeline import WordSegmentationPipeline | |||
| from .zero_shot_classification_pipeline import ZeroShotClassificationPipeline | |||
| from .summarization_pipeline import SummarizationPipeline | |||
| from .text_classification_pipeline import TextClassificationPipeline | |||
| from .text_error_correction_pipeline import TextErrorCorrectionPipeline | |||
| else: | |||
| @@ -38,6 +40,8 @@ else: | |||
| 'named_entity_recognition_pipeline': | |||
| ['NamedEntityRecognitionPipeline'], | |||
| 'translation_pipeline': ['TranslationPipeline'], | |||
| 'summarization_pipeline': ['SummarizationPipeline'], | |||
| 'text_classification_pipeline': ['TextClassificationPipeline'], | |||
| 'text_error_correction_pipeline': ['TextErrorCorrectionPipeline'] | |||
| } | |||
| @@ -0,0 +1,42 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import Any, Dict, Optional, Union | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.pipelines.base import Model, Pipeline | |||
| from modelscope.pipelines.builder import PIPELINES | |||
| from modelscope.preprocessors import OfaPreprocessor, Preprocessor | |||
| from modelscope.utils.constant import Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| logger = get_logger() | |||
| @PIPELINES.register_module( | |||
| Tasks.summarization, module_name=Pipelines.text_generation) | |||
| class SummarizationPipeline(Pipeline): | |||
| def __init__(self, | |||
| model: Union[Model, str], | |||
| preprocessor: Optional[Preprocessor] = None, | |||
| **kwargs): | |||
| """ | |||
| use `model` and `preprocessor` to create a summarization pipeline for prediction | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| super().__init__(model=model) | |||
| assert isinstance(model, str) or isinstance(model, Model), \ | |||
| 'model must be a single str or OfaForAllTasks' | |||
| if isinstance(model, str): | |||
| pipe_model = Model.from_pretrained(model) | |||
| elif isinstance(model, Model): | |||
| pipe_model = model | |||
| else: | |||
| raise NotImplementedError | |||
| pipe_model.model.eval() | |||
| if preprocessor is None and pipe_model: | |||
| preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir) | |||
| super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
| return inputs | |||
| @@ -0,0 +1,42 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import Any, Dict, Optional, Union | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.pipelines.base import Model, Pipeline | |||
| from modelscope.pipelines.builder import PIPELINES | |||
| from modelscope.preprocessors import OfaPreprocessor, Preprocessor | |||
| from modelscope.utils.constant import Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| logger = get_logger() | |||
| @PIPELINES.register_module( | |||
| Tasks.text_classification, module_name=Pipelines.text_classification) | |||
| class TextClassificationPipeline(Pipeline): | |||
| def __init__(self, | |||
| model: Union[Model, str], | |||
| preprocessor: Optional[Preprocessor] = None, | |||
| **kwargs): | |||
| """ | |||
| use `model` and `preprocessor` to create a text classification pipeline for prediction | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| super().__init__(model=model) | |||
| assert isinstance(model, str) or isinstance(model, Model), \ | |||
| 'model must be a single str or OfaForAllTasks' | |||
| if isinstance(model, str): | |||
| pipe_model = Model.from_pretrained(model) | |||
| elif isinstance(model, Model): | |||
| pipe_model = model | |||
| else: | |||
| raise NotImplementedError | |||
| pipe_model.model.eval() | |||
| if preprocessor is None and pipe_model: | |||
| preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir) | |||
| super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
| return inputs | |||
| @@ -14,7 +14,7 @@ if TYPE_CHECKING: | |||
| ImageInstanceSegmentationPreprocessor, | |||
| ImageDenoisePreprocessor) | |||
| from .kws import WavToLists | |||
| from .multi_modal import (OfaImageCaptionPreprocessor, | |||
| from .multi_modal import (OfaPreprocessor, | |||
| MPlugVisualQuestionAnsweringPreprocessor) | |||
| from .nlp import (Tokenize, SequenceClassificationPreprocessor, | |||
| TextGenerationPreprocessor, | |||
| @@ -41,10 +41,8 @@ else: | |||
| 'ImageInstanceSegmentationPreprocessor', 'ImageDenoisePreprocessor' | |||
| ], | |||
| 'kws': ['WavToLists'], | |||
| 'multi_modal': [ | |||
| 'OfaImageCaptionPreprocessor', | |||
| 'MPlugVisualQuestionAnsweringPreprocessor' | |||
| ], | |||
| 'multi_modal': | |||
| ['OfaPreprocessor', 'MPlugVisualQuestionAnsweringPreprocessor'], | |||
| 'nlp': [ | |||
| 'Tokenize', 'SequenceClassificationPreprocessor', | |||
| 'TextGenerationPreprocessor', 'TokenClassificationPreprocessor', | |||
| @@ -4,26 +4,25 @@ from typing import Any, Dict, Union | |||
| import torch | |||
| from PIL import Image | |||
| from torchvision import transforms | |||
| from modelscope.hub.snapshot_download import snapshot_download | |||
| from modelscope.metainfo import Preprocessors | |||
| from modelscope.models.multi_modal.ofa import OFATokenizer | |||
| from modelscope.utils.constant import Fields | |||
| from modelscope.utils.type_assert import type_assert | |||
| from modelscope.utils.config import Config | |||
| from modelscope.utils.constant import Fields, ModelFile, Tasks | |||
| from .base import Preprocessor | |||
| from .builder import PREPROCESSORS | |||
| from .image import load_image | |||
| from .ofa import * # noqa | |||
| from .ofa.utils.collate import collate_fn | |||
| __all__ = [ | |||
| 'OfaImageCaptionPreprocessor', | |||
| 'OfaPreprocessor', | |||
| 'MPlugVisualQuestionAnsweringPreprocessor', | |||
| ] | |||
| @PREPROCESSORS.register_module( | |||
| Fields.multi_modal, module_name=Preprocessors.ofa_image_caption) | |||
| class OfaImageCaptionPreprocessor(Preprocessor): | |||
| class OfaPreprocessor(Preprocessor): | |||
| def __init__(self, model_dir: str, *args, **kwargs): | |||
| """preprocess the data via the vocab.txt from the `model_dir` path | |||
| @@ -32,41 +31,28 @@ class OfaImageCaptionPreprocessor(Preprocessor): | |||
| model_dir (str): model path | |||
| """ | |||
| super().__init__(*args, **kwargs) | |||
| preprocess_mapping = { | |||
| Tasks.image_captioning: OfaImageCaptioningPreprocessor, | |||
| Tasks.visual_grounding: OfaVisualGroundingPreprocessor, | |||
| Tasks.visual_question_answering: | |||
| OfaVisualQuestionAnsweringPreprocessor, | |||
| Tasks.visual_entailment: OfaVisualEntailmentPreprocessor, | |||
| Tasks.image_classification: OfaImageClassificationPreprocessor, | |||
| Tasks.text_classification: OfaTextClassificationPreprocessor, | |||
| Tasks.summarization: OfaSummarizationPreprocessor | |||
| } | |||
| model_dir = model_dir if osp.exists(model_dir) else snapshot_download( | |||
| model_dir) | |||
| self.tokenizer = OFATokenizer.from_pretrained(model_dir) | |||
| self.tokenizer.add_tokens(['<code_{}>'.format(i) for i in range(8192)]) | |||
| self.tokenizer.add_tokens(['<bin_{}>'.format(i) for i in range(1000)]) | |||
| # Initialize transform | |||
| mean = [0.5, 0.5, 0.5] | |||
| std = [0.5, 0.5, 0.5] | |||
| patch_image_size = 480 | |||
| self.patch_resize_transform = transforms.Compose([ | |||
| lambda image: image.convert('RGB'), | |||
| transforms.Resize((patch_image_size, patch_image_size), | |||
| interpolation=Image.BICUBIC), | |||
| transforms.ToTensor(), | |||
| transforms.Normalize(mean=mean, std=std), | |||
| ]) | |||
| cfg = Config.from_file(osp.join(model_dir, ModelFile.CONFIGURATION)) | |||
| self.preprocess = preprocess_mapping[cfg.task](cfg, model_dir) | |||
| self.tokenizer = self.preprocess.tokenizer | |||
| @type_assert(object, (str, tuple, Image.Image)) | |||
| def __call__(self, data: Union[str, tuple]) -> Dict[str, Any]: | |||
| if isinstance(data, Image.Image): | |||
| patch_image = self.patch_resize_transform(data).unsqueeze(0) | |||
| else: | |||
| patch_image = self.patch_resize_transform( | |||
| load_image(data)).unsqueeze(0) | |||
| text = ' what does the image describe?' | |||
| inputs = self.tokenizer([text], max_length=1024, | |||
| return_tensors='pt')['input_ids'] | |||
| sample = dict() | |||
| sample['net_input'] = { | |||
| 'input_ids': inputs, | |||
| 'patch_images': patch_image, | |||
| 'patch_masks': torch.tensor([True]) | |||
| } | |||
| return sample | |||
| def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: | |||
| sample = self.preprocess(data) | |||
| sample['sample'] = data | |||
| return collate_fn([sample], | |||
| pad_idx=self.tokenizer.pad_token_id, | |||
| eos_idx=self.tokenizer.eos_token_id) | |||
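A sketch (not part of the patch) of the collated batch the rewritten preprocessor hands to OfaForAllTasks, based on the collate_fn added later in this patch. The directory path is a placeholder for a local OFA captioning model, and the shapes assume the default 480x480 patch size.

    from modelscope.preprocessors import OfaPreprocessor

    preprocessor = OfaPreprocessor(model_dir='/path/to/local/ofa-caption-model')
    batch = preprocessor({'image': 'path/to/image.jpg'})
    # Roughly:
    # {
    #     'nsentences': 1,
    #     'net_input': {
    #         'input_ids': ...,     # prompt token ids, shape [1, seq_len]
    #         'patch_images': ...,  # resized image tensor, shape [1, 3, 480, 480]
    #         'patch_masks': tensor([True]),
    #     },
    #     'samples': [{'image': 'path/to/image.jpg'}],
    # }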
| @PREPROCESSORS.register_module( | |||
| @@ -0,0 +1,8 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from .image_captioning import OfaImageCaptioningPreprocessor | |||
| from .image_classification import OfaImageClassificationPreprocessor | |||
| from .summarization import OfaSummarizationPreprocessor | |||
| from .text_classification import OfaTextClassificationPreprocessor | |||
| from .visual_entailment import OfaVisualEntailmentPreprocessor | |||
| from .visual_grounding import OfaVisualGroundingPreprocessor | |||
| from .visual_question_answering import OfaVisualQuestionAnsweringPreprocessor | |||
| @@ -0,0 +1,117 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import re | |||
| from os import path as osp | |||
| import json | |||
| import numpy as np | |||
| import torch | |||
| from modelscope.models.multi_modal.ofa import OFATokenizer | |||
| from modelscope.utils.trie import Trie | |||
| from .utils.random_help import set_torch_seed | |||
| class OfaBasePreprocessor: | |||
| def __init__(self, cfg, model_dir): | |||
| """preprocess the data via the vocab.txt from the `model_dir` path | |||
| Args: | |||
| cfg(modelscope.utils.config.ConfigDict) : model config | |||
| model_dir (str): model path | |||
| """ | |||
| self.cfg = cfg | |||
| tokenizer = OFATokenizer.from_pretrained(model_dir) | |||
| tokenizer.add_tokens(['<code_{}>'.format(i) for i in range(8192)]) | |||
| tokenizer.add_tokens(['<bin_{}>'.format(i) for i in range(1000)]) | |||
| self.tokenizer = tokenizer | |||
| self.bos_item = torch.LongTensor([tokenizer.bos_token_id]) | |||
| self.pad_item = torch.LongTensor([tokenizer.pad_token_id]) | |||
| self.eos_item = torch.LongTensor([tokenizer.eos_token_id]) | |||
| self.tgt_dict = self.src_dict = { | |||
| value: key | |||
| for key, value in tokenizer.get_vocab().items() | |||
| } | |||
| self.max_src_length = cfg.model.get('max_src_length', 256) | |||
| self.max_image_size = cfg.model.get('max_image_size', 512) | |||
| self.language = self.cfg.model.get('language', 'en') | |||
| self.prompt_type = self.cfg.model.get('prompt_type', 'none') | |||
| seed = self.cfg.model.get('seed', 7) | |||
| np.random.seed(seed) | |||
| set_torch_seed(seed) | |||
| imagenet_default_mean_and_std = self.cfg.model.get( | |||
| 'imagenet_default_mean_and_std', False) | |||
| if imagenet_default_mean_and_std: | |||
| self.mean = [0.485, 0.456, 0.406] | |||
| self.std = [0.229, 0.224, 0.225] | |||
| else: | |||
| self.mean = [0.5, 0.5, 0.5] | |||
| self.std = [0.5, 0.5, 0.5] | |||
| self.patch_image_size = self.cfg.model.get('patch_image_size', 480) | |||
| self.constraint_trie = None | |||
| self.index2ans = {} | |||
| if self.cfg.model.get('answer2label', False): | |||
| ans2label_file = osp.join(model_dir, self.cfg.model.answer2label) | |||
| ans2label_dict = json.load(open(ans2label_file, 'r')) | |||
| self.constraint_trie = Trie(tokenizer.eos_token_id) | |||
| for i, answer in enumerate(ans2label_dict.keys()): | |||
| answer_item = tokenizer( | |||
| ' ' + answer, | |||
| return_tensors='pt', | |||
| add_special_tokens=False).input_ids.squeeze(0) | |||
| self.constraint_trie.insert([tokenizer.bos_token_id] | |||
| + answer_item.tolist() | |||
| + [tokenizer.eos_token_id]) | |||
| def get_inputs(self, text, add_bos=True, add_eos=True): | |||
| inputs = self.tokenizer( | |||
| text, | |||
| max_length=self.max_src_length, | |||
| add_special_tokens=False, | |||
| return_tensors='pt')['input_ids'].squeeze(0) | |||
| if add_bos: | |||
| inputs = torch.cat([self.bos_item, inputs]) | |||
| if add_eos: | |||
| inputs = torch.cat([inputs, self.eos_item]) | |||
| return inputs | |||
| @staticmethod | |||
| def pre_caption(caption, max_words=None): | |||
| caption = caption.lower().lstrip(',.!?*#:;~').replace('-', ' ')\ | |||
| .replace('/', ' ').replace('<person>', 'person') | |||
| caption = re.sub( | |||
| r'\s{2,}', | |||
| ' ', | |||
| caption, | |||
| ) | |||
| caption = caption.rstrip('\n') | |||
| caption = caption.strip(' ') | |||
| # truncate caption | |||
| caption_words = caption.split(' ') | |||
| if max_words is not None and len(caption_words) > max_words: | |||
| caption = ' '.join(caption_words[:max_words]) | |||
| return caption | |||
| @staticmethod | |||
| def pre_question(question, max_ques_words): | |||
| question = question.lower().lstrip(',.!?*#:;~').replace('-', | |||
| ' ').replace( | |||
| '/', ' ') | |||
| question = re.sub( | |||
| r'\s{2,}', | |||
| ' ', | |||
| question, | |||
| ) | |||
| question = question.rstrip('\n') | |||
| question = question.strip(' ') | |||
| # truncate question | |||
| question_words = question.split(' ') | |||
| if len(question_words) > max_ques_words: | |||
| question = ' '.join(question_words[:max_ques_words]) | |||
| return question | |||
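A small example (not part of the patch) of the text normalisation these static helpers perform; the strings are made up, and the module path assumes the file lands at modelscope/preprocessors/ofa/base.py.

    from modelscope.preprocessors.ofa.base import OfaBasePreprocessor

    print(OfaBasePreprocessor.pre_caption('Two dogs -- playing  in the PARK!!', max_words=16))
    # two dogs playing in the park!!   (lower-cased, '-' and '/' replaced, spaces collapsed;
    # only leading punctuation is stripped, so the trailing '!!' survives)
    print(OfaBasePreprocessor.pre_question('What COLOR is the /cat?', max_ques_words=8))
    # what color is the cat?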
| @@ -0,0 +1,42 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import Any, Dict, Union | |||
| import torch | |||
| from PIL import Image | |||
| from torchvision import transforms | |||
| from modelscope.preprocessors.image import load_image | |||
| from .base import OfaBasePreprocessor | |||
| class OfaImageCaptioningPreprocessor(OfaBasePreprocessor): | |||
| def __init__(self, cfg, model_dir): | |||
| """preprocess the data via the vocab.txt from the `model_dir` path | |||
| Args: | |||
| cfg(modelscope.utils.config.ConfigDict) : model config | |||
| model_dir (str): model path | |||
| """ | |||
| super(OfaImageCaptioningPreprocessor, self).__init__(cfg, model_dir) | |||
| # Initialize transform | |||
| self.patch_resize_transform = transforms.Compose([ | |||
| lambda image: image.convert('RGB'), | |||
| transforms.Resize((self.patch_image_size, self.patch_image_size), | |||
| interpolation=Image.BICUBIC), | |||
| transforms.ToTensor(), | |||
| transforms.Normalize(mean=self.mean, std=self.std), | |||
| ]) | |||
| def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: | |||
| image = data['image'] if isinstance( | |||
| data['image'], Image.Image) else load_image(data['image']) | |||
| patch_image = self.patch_resize_transform(image) | |||
| prompt = self.cfg.model.get('prompt', ' what does the image describe?') | |||
| inputs = self.get_inputs(prompt) | |||
| sample = { | |||
| 'source': inputs, | |||
| 'patch_image': patch_image, | |||
| 'patch_mask': torch.tensor([True]) | |||
| } | |||
| return sample | |||
| @@ -0,0 +1,43 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import Any, Dict | |||
| import torch | |||
| from PIL import Image | |||
| from torchvision import transforms | |||
| from modelscope.preprocessors.image import load_image | |||
| from .base import OfaBasePreprocessor | |||
| class OfaImageClassificationPreprocessor(OfaBasePreprocessor): | |||
| def __init__(self, cfg, model_dir): | |||
| """preprocess the data via the vocab.txt from the `model_dir` path | |||
| Args: | |||
| cfg(modelscope.utils.config.ConfigDict) : model config | |||
| model_dir (str): model path | |||
| """ | |||
| super(OfaImageClassificationPreprocessor, | |||
| self).__init__(cfg, model_dir) | |||
| # Initialize transform | |||
| self.patch_resize_transform = transforms.Compose([ | |||
| lambda image: image.convert('RGB'), | |||
| transforms.Resize((self.patch_image_size, self.patch_image_size), | |||
| interpolation=Image.BICUBIC), | |||
| transforms.ToTensor(), | |||
| transforms.Normalize(mean=self.mean, std=self.std), | |||
| ]) | |||
| def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: | |||
| image = data['image'] if isinstance( | |||
| data['image'], Image.Image) else load_image(data['image']) | |||
| patch_image = self.patch_resize_transform(image) | |||
| prompt = self.cfg.model.get('prompt', ' what does the image describe?') | |||
| inputs = self.get_inputs(prompt) | |||
| sample = { | |||
| 'source': inputs, | |||
| 'patch_image': patch_image, | |||
| 'patch_mask': torch.tensor([True]) | |||
| } | |||
| return sample | |||
| @@ -0,0 +1,37 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import Any, Dict | |||
| from .base import OfaBasePreprocessor | |||
| class OfaSummarizationPreprocessor(OfaBasePreprocessor): | |||
| def __init__(self, cfg, model_dir): | |||
| """preprocess the data via the vocab.txt from the `model_dir` path | |||
| Args: | |||
| cfg(modelscope.utils.config.ConfigDict) : model config | |||
| model_dir (str): model path | |||
| """ | |||
| super(OfaSummarizationPreprocessor, self).__init__(cfg, model_dir) | |||
| def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: | |||
| source = super().pre_caption( | |||
| data['text'], max_words=self.max_src_length) | |||
| source = source.strip()[:self.max_src_length] | |||
| source = source.replace('[unk]', 'unk').replace('<unk>', 'unk') | |||
| prompt = self.cfg.model.get( | |||
| 'prompt', ' " {} " Summarize the article with a title: ') | |||
| text = prompt.format(source) | |||
| inputs = self.get_inputs(text) | |||
| if self.prompt_type == 'none': | |||
| decoder_prompt = self.bos_item | |||
| elif self.prompt_type == 'prev_output': | |||
| decoder_prompt = inputs[:-1] | |||
| else: | |||
| raise NotImplementedError | |||
| sample = { | |||
| 'source': inputs, | |||
| 'decoder_prompt': decoder_prompt, | |||
| } | |||
| return sample | |||
| @@ -0,0 +1,38 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import Any, Dict | |||
| from .base import OfaBasePreprocessor | |||
| class OfaTextClassificationPreprocessor(OfaBasePreprocessor): | |||
| def __init__(self, cfg, model_dir): | |||
| """preprocess the data via the vocab.txt from the `model_dir` path | |||
| Args: | |||
| cfg(modelscope.utils.config.ConfigDict) : model config | |||
| model_dir (str): model path | |||
| """ | |||
| super(OfaTextClassificationPreprocessor, self).__init__(cfg, model_dir) | |||
| def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: | |||
| text1 = ' '.join( | |||
| data['text'].lower().strip().split()[:self.max_src_length]) | |||
| text2 = ' '.join( | |||
| data['text2'].lower().strip().split()[:self.max_src_length]) | |||
| prompt = ' can text1 " {} " imply text2 " {} "?' | |||
| text = prompt.format(text1, text2) | |||
| inputs = self.get_inputs(text) | |||
| if self.prompt_type == 'none': | |||
| decoder_prompt = self.bos_item | |||
| elif self.prompt_type == 'src': | |||
| decoder_prompt = inputs | |||
| elif self.prompt_type == 'prev_output': | |||
| decoder_prompt = inputs[:-1] | |||
| else: | |||
| raise NotImplementedError | |||
| sample = { | |||
| 'source': inputs, | |||
| 'decoder_prompt': decoder_prompt, | |||
| } | |||
| return sample | |||
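For reference (not part of the patch), the entailment-style prompt this preprocessor renders for a hypothetical sentence pair before tokenization:

    text1 = 'a man is playing a guitar'
    text2 = 'a person is making music'
    prompt = ' can text1 " {} " imply text2 " {} "?'
    print(prompt.format(text1, text2))
    #  can text1 " a man is playing a guitar " imply text2 " a person is making music "?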
| @@ -0,0 +1,109 @@ | |||
| import numpy as np | |||
| import torch | |||
| def collate_fn(samples, pad_idx, eos_idx): | |||
| if len(samples) == 0: | |||
| return {} | |||
| def merge(key): | |||
| return collate_tokens([s[key] for s in samples], | |||
| pad_idx, | |||
| eos_idx=eos_idx) | |||
| src_tokens = merge('source') | |||
| batch = { | |||
| 'nsentences': len(samples), | |||
| 'net_input': { | |||
| 'input_ids': src_tokens, | |||
| }, | |||
| } | |||
| if samples[0].get('id', None) is not None: | |||
| batch['id'] = np.array([s['id'] for s in samples]) | |||
| if samples[0].get('target', None) is not None: | |||
| batch['target'] = merge('target') | |||
| tgt_lengths = torch.LongTensor( | |||
| [s['target'].ne(pad_idx).long().sum() for s in samples]) | |||
| ntokens = tgt_lengths.sum().item() | |||
| batch['ntokens'] = ntokens | |||
| if samples[0].get('prev_output_tokens', None) is not None: | |||
| batch['net_input']['decoder_input_ids'] = merge('prev_output_tokens') | |||
| if samples[0].get('patch_image', None) is not None: | |||
| batch['net_input']['patch_images'] = torch.stack( | |||
| [sample['patch_image'] for sample in samples], dim=0) | |||
| if samples[0].get('patch_mask', None) is not None: | |||
| batch['net_input']['patch_masks'] = torch.cat( | |||
| [sample['patch_mask'] for sample in samples]) | |||
| # image generation | |||
| if samples[0].get('code_mask', None) is not None: | |||
| batch['net_input']['code_masks'] = torch.cat( | |||
| [sample['code_mask'] for sample in samples]) | |||
| if samples[0].get('code_image', None) is not None: | |||
| batch['code_images'] = torch.cat( | |||
| [sample['code_image'] for sample in samples]) | |||
| # For classification tasks (i.e., VQA, SNLI-VE, GLUE) | |||
| if samples[0].get('conf', None) is not None: | |||
| batch['conf'] = torch.cat([s['conf'] for s in samples], dim=0) | |||
| if samples[0].get('ref_dict', None) is not None: | |||
| batch['ref_dict'] = np.array([s['ref_dict'] for s in samples]) | |||
| if samples[0].get('constraint_mask', None) is not None: | |||
| batch['constraint_masks'] = merge('constraint_mask') | |||
| if samples[0].get('decoder_prompt', None) is not None: | |||
| batch['decoder_prompts'] = np.array( | |||
| [s['decoder_prompt'].tolist() for s in samples]) | |||
| # For detection and visual grounding | |||
| if samples[0].get('w_resize_ratio', None) is not None: | |||
| batch['w_resize_ratios'] = torch.stack( | |||
| [s['w_resize_ratio'] for s in samples], dim=0) | |||
| if samples[0].get('h_resize_ratio', None) is not None: | |||
| batch['h_resize_ratios'] = torch.stack( | |||
| [s['h_resize_ratio'] for s in samples], dim=0) | |||
| if samples[0].get('region_coord', None) is not None: | |||
| batch['region_coords'] = torch.stack( | |||
| [s['region_coord'] for s in samples], dim=0) | |||
| if samples[0].get('sample', None) is not None: | |||
| batch['samples'] = [s['sample'] for s in samples] | |||
| return batch | |||
| def collate_tokens( | |||
| values, | |||
| pad_idx, | |||
| eos_idx=None, | |||
| left_pad=False, | |||
| move_eos_to_beginning=False, | |||
| pad_to_length=None, | |||
| pad_to_multiple=1, | |||
| pad_to_bsz=None, | |||
| ): | |||
| """Convert a list of 1d tensors into a padded 2d tensor.""" | |||
| size = max(v.size(0) for v in values) | |||
| size = size if pad_to_length is None else max(size, pad_to_length) | |||
| if pad_to_multiple != 1 and size % pad_to_multiple != 0: | |||
| size = int(((size - 0.1) // pad_to_multiple + 1) * pad_to_multiple) | |||
| def copy_tensor(src, dst): | |||
| assert dst.numel() == src.numel() | |||
| if move_eos_to_beginning: | |||
| if eos_idx is None: | |||
| # if no eos_idx is specified, then use the last token in src | |||
| dst[0] = src[-1] | |||
| else: | |||
| dst[0] = eos_idx | |||
| dst[1:] = src[:-1] | |||
| else: | |||
| dst.copy_(src) | |||
| if values[0].dim() == 1: | |||
| res = values[0].new(len(values), size).fill_(pad_idx) | |||
| elif values[0].dim() == 2: | |||
| assert move_eos_to_beginning is False | |||
| res = values[0].new(len(values), size, | |||
| values[0].size(1)).fill_(pad_idx) | |||
| else: | |||
| raise NotImplementedError | |||
| for i, v in enumerate(values): | |||
| copy_tensor(v, res[i][size - len(v):] if left_pad else res[i][:len(v)]) | |||
| return res | |||
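| As an illustration, a minimal sketch of the padding behaviour of collate_tokens defined above; the import path is an assumption, and the helper could equally be copied verbatim from this file: | |||
| import torch | |||
| from ofa_collate import collate_tokens  # hypothetical module name for the helpers above | |||
| src = [torch.tensor([5, 6, 7, 2]), torch.tensor([8, 9, 2])] | |||
| padded = collate_tokens(src, pad_idx=1, eos_idx=2) | |||
| # The shorter sequence is right-padded with pad_idx (left_pad defaults to False): | |||
| # tensor([[5, 6, 7, 2], | |||
| #         [8, 9, 2, 1]]) | |||
| print(padded) | |||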
| @@ -0,0 +1,42 @@ | |||
| import torch | |||
| try: | |||
| import torch_xla.core.xla_model as xm | |||
| except ImportError: | |||
| xm = None | |||
| def get_rng_state(): | |||
| state = {'torch_rng_state': torch.get_rng_state()} | |||
| if xm is not None: | |||
| state['xla_rng_state'] = xm.get_rng_state() | |||
| if torch.cuda.is_available(): | |||
| state['cuda_rng_state'] = torch.cuda.get_rng_state() | |||
| return state | |||
| def set_rng_state(state): | |||
| torch.set_rng_state(state['torch_rng_state']) | |||
| if xm is not None: | |||
| xm.set_rng_state(state['xla_rng_state']) | |||
| if torch.cuda.is_available(): | |||
| torch.cuda.set_rng_state(state['cuda_rng_state']) | |||
| class set_torch_seed(object): | |||
| def __init__(self, seed): | |||
| assert isinstance(seed, int) | |||
| self.rng_state = get_rng_state() | |||
| torch.manual_seed(seed) | |||
| if xm is not None: | |||
| xm.set_rng_state(seed) | |||
| if torch.cuda.is_available(): | |||
| torch.cuda.manual_seed(seed) | |||
| def __enter__(self): | |||
| return self | |||
| def __exit__(self, *exc): | |||
| set_rng_state(self.rng_state) | |||
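| A minimal usage sketch of the set_torch_seed context manager defined above (the module name in the import is an assumption): inside the block sampling is deterministic for the given seed, and the previous RNG state is restored on exit. | |||
| import torch | |||
| from ofa_rng import set_torch_seed  # hypothetical module name for the utility above | |||
| with set_torch_seed(42): | |||
|     a = torch.rand(3)  # reproducible draw for seed 42 | |||
| with set_torch_seed(42): | |||
|     b = torch.rand(3) | |||
| assert torch.equal(a, b)  # same seed -> same values | |||
| c = torch.rand(1)  # outer RNG stream resumes from the state saved before the block | |||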
| @@ -0,0 +1,557 @@ | |||
| # Copyright 2022 The OFA-Sys Team. | |||
| # All rights reserved. | |||
| # This source code is licensed under the Apache 2.0 license | |||
| # found in the LICENSE file in the root directory. | |||
| import random | |||
| import numpy as np | |||
| import torch | |||
| import torchvision.transforms as T | |||
| import torchvision.transforms.functional as F | |||
| from PIL import Image | |||
| def crop(image, target, region, delete=True): | |||
| cropped_image = F.crop(image, *region) | |||
| target = target.copy() | |||
| i, j, h, w = region | |||
| # should we do something wrt the original size? | |||
| target['size'] = torch.tensor([h, w]) | |||
| fields = ['labels', 'area'] | |||
| if 'boxes' in target: | |||
| boxes = target['boxes'] | |||
| max_size = torch.as_tensor([w, h], dtype=torch.float32) | |||
| cropped_boxes = boxes - torch.as_tensor([j, i, j, i]) | |||
| cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size) | |||
| cropped_boxes = cropped_boxes.clamp(min=0) | |||
| area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1) | |||
| target['boxes'] = cropped_boxes.reshape(-1, 4) | |||
| target['area'] = area | |||
| fields.append('boxes') | |||
| if 'polygons' in target: | |||
| polygons = target['polygons'] | |||
| num_polygons = polygons.shape[0] | |||
| max_size = torch.as_tensor([w, h], dtype=torch.float32) | |||
| start_coord = torch.cat([ | |||
| torch.tensor([j, i], dtype=torch.float32) | |||
| for _ in range(polygons.shape[1] // 2)], dim=0)  # yapf: disable | |||
| cropped_boxes = polygons - start_coord | |||
| cropped_boxes = torch.min( | |||
| cropped_boxes.reshape(num_polygons, -1, 2), max_size) | |||
| cropped_boxes = cropped_boxes.clamp(min=0) | |||
| target['polygons'] = cropped_boxes.reshape(num_polygons, -1) | |||
| fields.append('polygons') | |||
| if 'masks' in target: | |||
| # FIXME should we update the area here if there are no boxes? | |||
| target['masks'] = target['masks'][:, i:i + h, j:j + w] | |||
| fields.append('masks') | |||
| # remove elements for which the boxes or masks have zero area | |||
| if delete and ('boxes' in target or 'masks' in target): | |||
| # favor boxes selection when defining which elements to keep | |||
| # this is compatible with previous implementation | |||
| if 'boxes' in target: | |||
| cropped_boxes = target['boxes'].reshape(-1, 2, 2) | |||
| keep = torch.all( | |||
| cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1) | |||
| else: | |||
| keep = target['masks'].flatten(1).any(1) | |||
| for field in fields: | |||
| target[field] = target[field][keep.tolist()] | |||
| return cropped_image, target | |||
| def hflip(image, target): | |||
| flipped_image = F.hflip(image) | |||
| w, h = image.size | |||
| target = target.copy() | |||
| if 'boxes' in target: | |||
| boxes = target['boxes'] | |||
| boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor( | |||
| [-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0]) | |||
| target['boxes'] = boxes | |||
| if 'polygons' in target: | |||
| polygons = target['polygons'] | |||
| num_polygons = polygons.shape[0] | |||
| polygons = polygons.reshape(num_polygons, -1, 2) * torch.as_tensor( | |||
| [-1, 1]) + torch.as_tensor([w, 0]) | |||
| target['polygons'] = polygons | |||
| if 'masks' in target: | |||
| target['masks'] = target['masks'].flip(-1) | |||
| return flipped_image, target | |||
| def resize(image, target, size, max_size=None): | |||
| # size can be min_size (scalar) or (w, h) tuple | |||
| def get_size_with_aspect_ratio(image_size, size, max_size=None): | |||
| w, h = image_size | |||
| if (w <= h and w == size) or (h <= w and h == size): | |||
| if max_size is not None: | |||
| max_size = int(max_size) | |||
| h = min(h, max_size) | |||
| w = min(w, max_size) | |||
| return (h, w) | |||
| if w < h: | |||
| ow = size | |||
| oh = int(size * h / w) | |||
| else: | |||
| oh = size | |||
| ow = int(size * w / h) | |||
| if max_size is not None: | |||
| max_size = int(max_size) | |||
| oh = min(oh, max_size) | |||
| ow = min(ow, max_size) | |||
| return (oh, ow) | |||
| def get_size(image_size, size, max_size=None): | |||
| if isinstance(size, (list, tuple)): | |||
| return size[::-1] | |||
| else: | |||
| return get_size_with_aspect_ratio(image_size, size, max_size) | |||
| size = get_size(image.size, size, max_size) | |||
| rescaled_image = F.resize(image, size, interpolation=Image.BICUBIC) | |||
| if target is None: | |||
| return rescaled_image | |||
| ratios = tuple( | |||
| float(s) / float(s_orig) | |||
| for s, s_orig in zip(rescaled_image.size, image.size)) | |||
| ratio_width, ratio_height = ratios | |||
| target = target.copy() | |||
| if 'boxes' in target: | |||
| boxes = target['boxes'] | |||
| scaled_boxes = boxes * torch.as_tensor( | |||
| [ratio_width, ratio_height, ratio_width, ratio_height]) | |||
| target['boxes'] = scaled_boxes | |||
| if 'polygons' in target: | |||
| polygons = target['polygons'] | |||
| scaled_ratio = torch.cat([ | |||
| torch.tensor([ratio_width, ratio_height]) | |||
| for _ in range(polygons.shape[1] // 2)], dim=0) # yapf: disable | |||
| scaled_polygons = polygons * scaled_ratio | |||
| target['polygons'] = scaled_polygons | |||
| if 'area' in target: | |||
| area = target['area'] | |||
| scaled_area = area * (ratio_width * ratio_height) | |||
| target['area'] = scaled_area | |||
| h, w = size | |||
| target['size'] = torch.tensor([h, w]) | |||
| if 'masks' in target: | |||
| assert False | |||
| return rescaled_image, target | |||
| class CenterCrop(object): | |||
| def __init__(self, size): | |||
| self.size = size | |||
| def __call__(self, img, target): | |||
| image_width, image_height = img.size | |||
| crop_height, crop_width = self.size | |||
| crop_top = int(round((image_height - crop_height) / 2.)) | |||
| crop_left = int(round((image_width - crop_width) / 2.)) | |||
| return crop(img, target, | |||
| (crop_top, crop_left, crop_height, crop_width)) | |||
| class ObjectCenterCrop(object): | |||
| def __init__(self, size): | |||
| self.size = size | |||
| def __call__(self, img, target): | |||
| image_width, image_height = img.size | |||
| crop_height, crop_width = self.size | |||
| x0 = float(target['boxes'][0][0]) | |||
| y0 = float(target['boxes'][0][1]) | |||
| x1 = float(target['boxes'][0][2]) | |||
| y1 = float(target['boxes'][0][3]) | |||
| center_x = (x0 + x1) / 2 | |||
| center_y = (y0 + y1) / 2 | |||
| crop_left = max( | |||
| center_x - crop_width / 2 | |||
| + min(image_width - center_x - crop_width / 2, 0), 0) | |||
| crop_top = max( | |||
| center_y - crop_height / 2 | |||
| + min(image_height - center_y - crop_height / 2, 0), 0) | |||
| return crop( | |||
| img, | |||
| target, (crop_top, crop_left, crop_height, crop_width), | |||
| delete=False) | |||
| class RandomHorizontalFlip(object): | |||
| def __init__(self, p=0.5): | |||
| self.p = p | |||
| def __call__(self, img, target): | |||
| if random.random() < self.p: | |||
| return hflip(img, target) | |||
| return img, target | |||
| class RandomResize(object): | |||
| def __init__(self, sizes, max_size=None, equal=False): | |||
| assert isinstance(sizes, (list, tuple)) | |||
| self.sizes = sizes | |||
| self.max_size = max_size | |||
| self.equal = equal | |||
| def __call__(self, img, target=None): | |||
| size = random.choice(self.sizes) | |||
| if self.equal: | |||
| return resize(img, target, size, size) | |||
| else: | |||
| return resize(img, target, size, self.max_size) | |||
| class ToTensor(object): | |||
| def __call__(self, img, target): | |||
| return F.to_tensor(img), target | |||
| class Normalize(object): | |||
| def __init__(self, mean, std, max_image_size=512): | |||
| self.mean = mean | |||
| self.std = std | |||
| self.max_image_size = max_image_size | |||
| def __call__(self, image, target=None): | |||
| image = F.normalize(image, mean=self.mean, std=self.std) | |||
| if target is None: | |||
| return image, None | |||
| target = target.copy() | |||
| # h, w = image.shape[-2:] | |||
| h, w = target['size'][0], target['size'][1] | |||
| if 'boxes' in target: | |||
| boxes = target['boxes'] | |||
| boxes = boxes / self.max_image_size | |||
| target['boxes'] = boxes | |||
| if 'polygons' in target: | |||
| polygons = target['polygons'] | |||
| scale = torch.cat([ | |||
| torch.tensor([w, h], dtype=torch.float32) | |||
| for _ in range(polygons.shape[1] // 2)], dim=0) # yapf: disable | |||
| polygons = polygons / scale | |||
| target['polygons'] = polygons | |||
| return image, target | |||
| class Compose(object): | |||
| def __init__(self, transforms): | |||
| self.transforms = transforms | |||
| def __call__(self, image, target): | |||
| for t in self.transforms: | |||
| image, target = t(image, target) | |||
| return image, target | |||
| def __repr__(self): | |||
| format_string = self.__class__.__name__ + '(' | |||
| for t in self.transforms: | |||
| format_string += '\n' | |||
| format_string += ' {0}'.format(t) | |||
| format_string += '\n)' | |||
| return format_string | |||
| class LargeScaleJitter(object): | |||
| """ | |||
| Implementation of large-scale jitter from the Copy-Paste augmentation strategy. | |||
| """ | |||
| def __init__(self, output_size=512, aug_scale_min=0.3, aug_scale_max=2.0): | |||
| self.desired_size = torch.tensor([output_size]) | |||
| self.aug_scale_min = aug_scale_min | |||
| self.aug_scale_max = aug_scale_max | |||
| def rescale_target(self, scaled_size, image_size, target): | |||
| # compute rescaled targets | |||
| image_scale = scaled_size / image_size | |||
| ratio_height, ratio_width = image_scale | |||
| target = target.copy() | |||
| target['size'] = scaled_size | |||
| if 'boxes' in target: | |||
| boxes = target['boxes'] | |||
| scaled_boxes = boxes * torch.as_tensor( | |||
| [ratio_width, ratio_height, ratio_width, ratio_height]) | |||
| target['boxes'] = scaled_boxes | |||
| if 'area' in target: | |||
| area = target['area'] | |||
| scaled_area = area * (ratio_width * ratio_height) | |||
| target['area'] = scaled_area | |||
| if 'masks' in target: | |||
| assert False | |||
| masks = target['masks'] | |||
| # masks = interpolate( | |||
| # masks[:, None].float(), scaled_size, mode="nearest")[:, 0] > 0.5 | |||
| target['masks'] = masks | |||
| return target | |||
| def crop_target(self, region, target): | |||
| i, j, h, w = region | |||
| fields = ['labels', 'area'] | |||
| target = target.copy() | |||
| target['size'] = torch.tensor([h, w]) | |||
| if 'boxes' in target: | |||
| boxes = target['boxes'] | |||
| max_size = torch.as_tensor([w, h], dtype=torch.float32) | |||
| cropped_boxes = boxes - torch.as_tensor([j, i, j, i]) | |||
| cropped_boxes = torch.min( | |||
| cropped_boxes.reshape(-1, 2, 2), max_size) | |||
| cropped_boxes = cropped_boxes.clamp(min=0) | |||
| area = (cropped_boxes[:, 1, :] | |||
| - cropped_boxes[:, 0, :]).prod(dim=1) | |||
| target['boxes'] = cropped_boxes.reshape(-1, 4) | |||
| target['area'] = area | |||
| fields.append('boxes') | |||
| if 'masks' in target: | |||
| # FIXME should we update the area here if there are no boxes? | |||
| target['masks'] = target['masks'][:, i:i + h, j:j + w] | |||
| fields.append('masks') | |||
| # remove elements for which the boxes or masks have zero area | |||
| if 'boxes' in target or 'masks' in target: | |||
| # favor boxes selection when defining which elements to keep | |||
| # this is compatible with previous implementation | |||
| if 'boxes' in target: | |||
| cropped_boxes = target['boxes'].reshape(-1, 2, 2) | |||
| keep = torch.all( | |||
| cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1) | |||
| else: | |||
| keep = target['masks'].flatten(1).any(1) | |||
| for field in fields: | |||
| target[field] = target[field][keep.tolist()] | |||
| return target | |||
| def pad_target(self, padding, target): | |||
| target = target.copy() | |||
| if 'masks' in target: | |||
| target['masks'] = torch.nn.functional.pad( | |||
| target['masks'], (0, padding[1], 0, padding[0])) | |||
| return target | |||
| def __call__(self, image, target=None): | |||
| image_size = image.size | |||
| image_size = torch.tensor(image_size[::-1]) | |||
| random_scale = torch.rand(1) * ( | |||
| self.aug_scale_max - self.aug_scale_min) + self.aug_scale_min | |||
| scaled_size = (random_scale * self.desired_size).round() | |||
| scale = torch.maximum(scaled_size / image_size[0], | |||
| scaled_size / image_size[1]) | |||
| scaled_size = (image_size * scale).round().int() | |||
| scaled_image = F.resize( | |||
| image, scaled_size.tolist(), interpolation=Image.BICUBIC) | |||
| if target is not None: | |||
| target = self.rescale_target(scaled_size, image_size, target) | |||
| # randomly crop or pad images | |||
| if random_scale >= 1: | |||
| # Selects non-zero random offset (x, y) if scaled image is larger than desired_size. | |||
| max_offset = scaled_size - self.desired_size | |||
| offset = (max_offset * torch.rand(2)).floor().int() | |||
| region = (offset[0].item(), offset[1].item(), | |||
| self.desired_size[0].item(), self.desired_size[0].item()) | |||
| output_image = F.crop(scaled_image, *region) | |||
| if target is not None: | |||
| target = self.crop_target(region, target) | |||
| else: | |||
| assert False | |||
| padding = self.desired_size - scaled_size | |||
| output_image = F.pad(scaled_image, | |||
| [0, 0, padding[1].item(), padding[0].item()]) | |||
| if target is not None: | |||
| target = self.pad_target(padding, target) | |||
| return output_image, target | |||
| class OriginLargeScaleJitter(object): | |||
| """ | |||
| Implementation of large-scale jitter from the Copy-Paste augmentation strategy. | |||
| """ | |||
| def __init__(self, output_size=512, aug_scale_min=0.3, aug_scale_max=2.0): | |||
| self.desired_size = torch.tensor(output_size) | |||
| self.aug_scale_min = aug_scale_min | |||
| self.aug_scale_max = aug_scale_max | |||
| def rescale_target(self, scaled_size, image_size, target): | |||
| # compute rescaled targets | |||
| image_scale = scaled_size / image_size | |||
| ratio_height, ratio_width = image_scale | |||
| target = target.copy() | |||
| target['size'] = scaled_size | |||
| if 'boxes' in target: | |||
| boxes = target['boxes'] | |||
| scaled_boxes = boxes * torch.as_tensor( | |||
| [ratio_width, ratio_height, ratio_width, ratio_height]) | |||
| target['boxes'] = scaled_boxes | |||
| if 'area' in target: | |||
| area = target['area'] | |||
| scaled_area = area * (ratio_width * ratio_height) | |||
| target['area'] = scaled_area | |||
| if 'masks' in target: | |||
| assert False | |||
| masks = target['masks'] | |||
| # masks = interpolate( | |||
| # masks[:, None].float(), scaled_size, mode="nearest")[:, 0] > 0.5 | |||
| target['masks'] = masks | |||
| return target | |||
| def crop_target(self, region, target): | |||
| i, j, h, w = region | |||
| fields = ['labels', 'area'] | |||
| target = target.copy() | |||
| target['size'] = torch.tensor([h, w]) | |||
| if 'boxes' in target: | |||
| boxes = target['boxes'] | |||
| max_size = torch.as_tensor([w, h], dtype=torch.float32) | |||
| cropped_boxes = boxes - torch.as_tensor([j, i, j, i]) | |||
| cropped_boxes = torch.min( | |||
| cropped_boxes.reshape(-1, 2, 2), max_size) | |||
| cropped_boxes = cropped_boxes.clamp(min=0) | |||
| area = (cropped_boxes[:, 1, :] | |||
| - cropped_boxes[:, 0, :]).prod(dim=1) | |||
| target['boxes'] = cropped_boxes.reshape(-1, 4) | |||
| target['area'] = area | |||
| fields.append('boxes') | |||
| if 'masks' in target: | |||
| # FIXME should we update the area here if there are no boxes? | |||
| target['masks'] = target['masks'][:, i:i + h, j:j + w] | |||
| fields.append('masks') | |||
| # remove elements for which the boxes or masks have zero area | |||
| if 'boxes' in target or 'masks' in target: | |||
| # favor boxes selection when defining which elements to keep | |||
| # this is compatible with previous implementation | |||
| if 'boxes' in target: | |||
| cropped_boxes = target['boxes'].reshape(-1, 2, 2) | |||
| keep = torch.all( | |||
| cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1) | |||
| else: | |||
| keep = target['masks'].flatten(1).any(1) | |||
| for field in fields: | |||
| target[field] = target[field][keep.tolist()] | |||
| return target | |||
| def pad_target(self, padding, target): | |||
| target = target.copy() | |||
| if 'masks' in target: | |||
| target['masks'] = torch.nn.functional.pad( | |||
| target['masks'], (0, padding[1], 0, padding[0])) | |||
| return target | |||
| def __call__(self, image, target=None): | |||
| image_size = image.size | |||
| image_size = torch.tensor(image_size[::-1]) | |||
| out_desired_size = (self.desired_size * image_size | |||
| / max(image_size)).round().int() | |||
| random_scale = torch.rand(1) * ( | |||
| self.aug_scale_max - self.aug_scale_min) + self.aug_scale_min | |||
| scaled_size = (random_scale * self.desired_size).round() | |||
| scale = torch.minimum(scaled_size / image_size[0], | |||
| scaled_size / image_size[1]) | |||
| scaled_size = (image_size * scale).round().int() | |||
| scaled_image = F.resize(image, scaled_size.tolist()) | |||
| if target is not None: | |||
| target = self.rescale_target(scaled_size, image_size, target) | |||
| # randomly crop or pad images | |||
| if random_scale > 1: | |||
| # Selects non-zero random offset (x, y) if scaled image is larger than desired_size. | |||
| max_offset = scaled_size - out_desired_size | |||
| offset = (max_offset * torch.rand(2)).floor().int() | |||
| region = (offset[0].item(), offset[1].item(), | |||
| out_desired_size[0].item(), out_desired_size[1].item()) | |||
| output_image = F.crop(scaled_image, *region) | |||
| if target is not None: | |||
| target = self.crop_target(region, target) | |||
| else: | |||
| padding = out_desired_size - scaled_size | |||
| output_image = F.pad(scaled_image, | |||
| [0, 0, padding[1].item(), padding[0].item()]) | |||
| if target is not None: | |||
| target = self.pad_target(padding, target) | |||
| return output_image, target | |||
| class RandomDistortion(object): | |||
| """ | |||
| Distort image w.r.t. brightness, contrast, saturation and hue. | |||
| """ | |||
| def __init__(self, | |||
| brightness=0, | |||
| contrast=0, | |||
| saturation=0, | |||
| hue=0, | |||
| prob=0.5): | |||
| self.prob = prob | |||
| self.tfm = T.ColorJitter(brightness, contrast, saturation, hue) | |||
| def __call__(self, img, target=None): | |||
| if np.random.random() < self.prob: | |||
| return self.tfm(img), target | |||
| else: | |||
| return img, target | |||
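| These transforms operate on an (image, target) pair, where target carries 'boxes' in [x0, y0, x1, y1] pixel coordinates together with matching 'labels' and 'area' entries. A minimal sketch of composing them, assuming the classes above are in scope; the mean/std values are the usual ImageNet statistics and are an assumption here: | |||
| import torch | |||
| from PIL import Image | |||
| # Assumes Compose, RandomHorizontalFlip, RandomResize, ToTensor and Normalize above are in scope. | |||
| transform = Compose([ | |||
|     RandomHorizontalFlip(p=0.5), | |||
|     RandomResize([480], max_size=640), | |||
|     ToTensor(), | |||
|     Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], max_image_size=512), | |||
| ]) | |||
| img = Image.new('RGB', (640, 480)) | |||
| target = { | |||
|     'boxes': torch.tensor([[10., 20., 110., 220.]]),  # [x0, y0, x1, y1] in pixels | |||
|     'labels': torch.tensor([1]), | |||
|     'area': torch.tensor([100. * 200.]), | |||
|     'size': torch.tensor([480, 640]),  # (h, w) | |||
| } | |||
| img_t, target_t = transform(img, target) | |||
| # img_t is a normalized CHW tensor; target_t['boxes'] is rescaled and divided by max_image_size. | |||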
| @@ -0,0 +1,357 @@ | |||
| # Copyright 2022 The OFA-Sys Team. | |||
| # All rights reserved. | |||
| # This source code is licensed under the Apache 2.0 license | |||
| # found in the LICENSE file in the root directory. | |||
| import cv2 | |||
| import numpy as np | |||
| def identity_func(img): | |||
| return img | |||
| def autocontrast_func(img, cutoff=0): | |||
| ''' | |||
| same output as PIL.ImageOps.autocontrast | |||
| ''' | |||
| n_bins = 256 | |||
| def tune_channel(ch): | |||
| n = ch.size | |||
| cut = cutoff * n // 100 | |||
| if cut == 0: | |||
| high, low = ch.max(), ch.min() | |||
| else: | |||
| hist = cv2.calcHist([ch], [0], None, [n_bins], [0, n_bins]) | |||
| low = np.argwhere(np.cumsum(hist) > cut) | |||
| low = 0 if low.shape[0] == 0 else low[0] | |||
| high = np.argwhere(np.cumsum(hist[::-1]) > cut) | |||
| high = n_bins - 1 if high.shape[0] == 0 else n_bins - 1 - high[0] | |||
| if high <= low: | |||
| table = np.arange(n_bins) | |||
| else: | |||
| scale = (n_bins - 1) / (high - low) | |||
| offset = -low * scale | |||
| table = np.arange(n_bins) * scale + offset | |||
| table[table < 0] = 0 | |||
| table[table > n_bins - 1] = n_bins - 1 | |||
| table = table.clip(0, 255).astype(np.uint8) | |||
| return table[ch] | |||
| channels = [tune_channel(ch) for ch in cv2.split(img)] | |||
| out = cv2.merge(channels) | |||
| return out | |||
| def equalize_func(img): | |||
| ''' | |||
| same output as PIL.ImageOps.equalize | |||
| PIL's implementation is different from cv2.equalize | |||
| ''' | |||
| n_bins = 256 | |||
| def tune_channel(ch): | |||
| hist = cv2.calcHist([ch], [0], None, [n_bins], [0, n_bins]) | |||
| non_zero_hist = hist[hist != 0].reshape(-1) | |||
| step = np.sum(non_zero_hist[:-1]) // (n_bins - 1) | |||
| if step == 0: | |||
| return ch | |||
| n = np.empty_like(hist) | |||
| n[0] = step // 2 | |||
| n[1:] = hist[:-1] | |||
| table = (np.cumsum(n) // step).clip(0, 255).astype(np.uint8) | |||
| return table[ch] | |||
| channels = [tune_channel(ch) for ch in cv2.split(img)] | |||
| out = cv2.merge(channels) | |||
| return out | |||
| def rotate_func(img, degree, fill=(0, 0, 0)): | |||
| ''' | |||
| like PIL, rotate by degree, not radians | |||
| ''' | |||
| H, W = img.shape[0], img.shape[1] | |||
| center = W / 2, H / 2 | |||
| M = cv2.getRotationMatrix2D(center, degree, 1) | |||
| out = cv2.warpAffine(img, M, (W, H), borderValue=fill) | |||
| return out | |||
| def solarize_func(img, thresh=128): | |||
| ''' | |||
| same output as PIL.ImageOps.solarize | |||
| ''' | |||
| table = np.array([el if el < thresh else 255 - el for el in range(256)]) | |||
| table = table.clip(0, 255).astype(np.uint8) | |||
| out = table[img] | |||
| return out | |||
| def color_func(img, factor): | |||
| # same output as PIL.ImageEnhance.Color | |||
| M = ( | |||
| np.float32([[0.886, -0.114, -0.114], [-0.587, 0.413, -0.587], | |||
| [-0.299, -0.299, 0.701]]) * factor | |||
| + np.float32([[0.114], [0.587], [0.299]])) | |||
| out = np.matmul(img, M).clip(0, 255).astype(np.uint8) | |||
| return out | |||
| def contrast_func(img, factor): | |||
| """ | |||
| same output as PIL.ImageEnhance.Contrast | |||
| """ | |||
| mean = np.sum(np.mean(img, axis=(0, 1)) * np.array([0.114, 0.587, 0.299])) | |||
| table = np.array([(el - mean) * factor + mean | |||
| for el in range(256)]).clip(0, 255).astype(np.uint8) | |||
| out = table[img] | |||
| return out | |||
| def brightness_func(img, factor): | |||
| ''' | |||
| same output as PIL.ImageEnhance.Brightness | |||
| ''' | |||
| table = (np.arange(256, dtype=np.float32) * factor).clip(0, 255).astype( | |||
| np.uint8) | |||
| out = table[img] | |||
| return out | |||
| def sharpness_func(img, factor): | |||
| ''' | |||
| The differences between this result and PIL are all on the 4 boundaries; the | |||
| center areas are the same | |||
| ''' | |||
| kernel = np.ones((3, 3), dtype=np.float32) | |||
| kernel[1][1] = 5 | |||
| kernel /= 13 | |||
| degenerate = cv2.filter2D(img, -1, kernel) | |||
| if factor == 0.0: | |||
| out = degenerate | |||
| elif factor == 1.0: | |||
| out = img | |||
| else: | |||
| out = img.astype(np.float32) | |||
| degenerate = degenerate.astype(np.float32)[1:-1, 1:-1, :] | |||
| out[1:-1, 1:-1, :] = degenerate + factor * ( | |||
| out[1:-1, 1:-1, :] - degenerate) | |||
| out = out.astype(np.uint8) | |||
| return out | |||
| def shear_x_func(img, factor, fill=(0, 0, 0)): | |||
| H, W = img.shape[0], img.shape[1] | |||
| M = np.float32([[1, factor, 0], [0, 1, 0]]) | |||
| out = cv2.warpAffine( | |||
| img, M, (W, H), borderValue=fill, | |||
| flags=cv2.INTER_LINEAR).astype(np.uint8) | |||
| return out | |||
| def translate_x_func(img, offset, fill=(0, 0, 0)): | |||
| ''' | |||
| same output as PIL.Image.transform | |||
| ''' | |||
| H, W = img.shape[0], img.shape[1] | |||
| M = np.float32([[1, 0, -offset], [0, 1, 0]]) | |||
| out = cv2.warpAffine( | |||
| img, M, (W, H), borderValue=fill, | |||
| flags=cv2.INTER_LINEAR).astype(np.uint8) | |||
| return out | |||
| def translate_y_func(img, offset, fill=(0, 0, 0)): | |||
| ''' | |||
| same output as PIL.Image.transform | |||
| ''' | |||
| H, W = img.shape[0], img.shape[1] | |||
| M = np.float32([[1, 0, 0], [0, 1, -offset]]) | |||
| out = cv2.warpAffine( | |||
| img, M, (W, H), borderValue=fill, | |||
| flags=cv2.INTER_LINEAR).astype(np.uint8) | |||
| return out | |||
| def posterize_func(img, bits): | |||
| ''' | |||
| same output as PIL.ImageOps.posterize | |||
| ''' | |||
| out = np.bitwise_and(img, np.uint8(255 << (8 - bits))) | |||
| return out | |||
| def shear_y_func(img, factor, fill=(0, 0, 0)): | |||
| H, W = img.shape[0], img.shape[1] | |||
| M = np.float32([[1, 0, 0], [factor, 1, 0]]) | |||
| out = cv2.warpAffine( | |||
| img, M, (W, H), borderValue=fill, | |||
| flags=cv2.INTER_LINEAR).astype(np.uint8) | |||
| return out | |||
| def cutout_func(img, pad_size, replace=(0, 0, 0)): | |||
| replace = np.array(replace, dtype=np.uint8) | |||
| H, W = img.shape[0], img.shape[1] | |||
| rh, rw = np.random.random(2) | |||
| pad_size = pad_size // 2 | |||
| ch, cw = int(rh * H), int(rw * W) | |||
| x1, x2 = max(ch - pad_size, 0), min(ch + pad_size, H) | |||
| y1, y2 = max(cw - pad_size, 0), min(cw + pad_size, W) | |||
| out = img.copy() | |||
| out[x1:x2, y1:y2, :] = replace | |||
| return out | |||
| # level to args | |||
| def enhance_level_to_args(MAX_LEVEL): | |||
| def level_to_args(level): | |||
| return ((level / MAX_LEVEL) * 1.8 + 0.1, ) | |||
| return level_to_args | |||
| def shear_level_to_args(MAX_LEVEL, replace_value): | |||
| def level_to_args(level): | |||
| level = (level / MAX_LEVEL) * 0.3 | |||
| if np.random.random() > 0.5: | |||
| level = -level | |||
| return level, replace_value | |||
| return level_to_args | |||
| def translate_level_to_args(translate_const, MAX_LEVEL, replace_value): | |||
| def level_to_args(level): | |||
| level = (level / MAX_LEVEL) * float(translate_const) | |||
| if np.random.random() > 0.5: | |||
| level = -level | |||
| return (level, replace_value) | |||
| return level_to_args | |||
| def cutout_level_to_args(cutout_const, MAX_LEVEL, replace_value): | |||
| def level_to_args(level): | |||
| level = int((level / MAX_LEVEL) * cutout_const) | |||
| return (level, replace_value) | |||
| return level_to_args | |||
| def solarize_level_to_args(MAX_LEVEL): | |||
| def level_to_args(level): | |||
| level = int((level / MAX_LEVEL) * 256) | |||
| return (level, ) | |||
| return level_to_args | |||
| def none_level_to_args(level): | |||
| return () | |||
| def posterize_level_to_args(MAX_LEVEL): | |||
| def level_to_args(level): | |||
| level = int((level / MAX_LEVEL) * 4) | |||
| return (level, ) | |||
| return level_to_args | |||
| def rotate_level_to_args(MAX_LEVEL, replace_value): | |||
| def level_to_args(level): | |||
| level = (level / MAX_LEVEL) * 30 | |||
| if np.random.random() < 0.5: | |||
| level = -level | |||
| return (level, replace_value) | |||
| return level_to_args | |||
| func_dict = { | |||
| 'Identity': identity_func, | |||
| 'AutoContrast': autocontrast_func, | |||
| 'Equalize': equalize_func, | |||
| 'Rotate': rotate_func, | |||
| 'Solarize': solarize_func, | |||
| 'Color': color_func, | |||
| 'Contrast': contrast_func, | |||
| 'Brightness': brightness_func, | |||
| 'Sharpness': sharpness_func, | |||
| 'ShearX': shear_x_func, | |||
| 'TranslateX': translate_x_func, | |||
| 'TranslateY': translate_y_func, | |||
| 'Posterize': posterize_func, | |||
| 'ShearY': shear_y_func, | |||
| } | |||
| translate_const = 10 | |||
| MAX_LEVEL = 10 | |||
| replace_value = (128, 128, 128) | |||
| arg_dict = { | |||
| 'Identity': | |||
| none_level_to_args, | |||
| 'AutoContrast': | |||
| none_level_to_args, | |||
| 'Equalize': | |||
| none_level_to_args, | |||
| 'Rotate': | |||
| rotate_level_to_args(MAX_LEVEL, replace_value), | |||
| 'Solarize': | |||
| solarize_level_to_args(MAX_LEVEL), | |||
| 'Color': | |||
| enhance_level_to_args(MAX_LEVEL), | |||
| 'Contrast': | |||
| enhance_level_to_args(MAX_LEVEL), | |||
| 'Brightness': | |||
| enhance_level_to_args(MAX_LEVEL), | |||
| 'Sharpness': | |||
| enhance_level_to_args(MAX_LEVEL), | |||
| 'ShearX': | |||
| shear_level_to_args(MAX_LEVEL, replace_value), | |||
| 'TranslateX': | |||
| translate_level_to_args(translate_const, MAX_LEVEL, replace_value), | |||
| 'TranslateY': | |||
| translate_level_to_args(translate_const, MAX_LEVEL, replace_value), | |||
| 'Posterize': | |||
| posterize_level_to_args(MAX_LEVEL), | |||
| 'ShearY': | |||
| shear_level_to_args(MAX_LEVEL, replace_value), | |||
| } | |||
| class RandomAugment(object): | |||
| def __init__(self, N=2, M=10, isPIL=False, augs=[]): | |||
| self.N = N | |||
| self.M = M | |||
| self.isPIL = isPIL | |||
| if augs: | |||
| self.augs = augs | |||
| else: | |||
| self.augs = list(arg_dict.keys()) | |||
| def get_random_ops(self): | |||
| sampled_ops = np.random.choice(self.augs, self.N) | |||
| return [(op, 0.5, self.M) for op in sampled_ops] | |||
| def __call__(self, img): | |||
| if self.isPIL: | |||
| img = np.array(img) | |||
| ops = self.get_random_ops() | |||
| for name, prob, level in ops: | |||
| if np.random.random() > prob: | |||
| continue | |||
| args = arg_dict[name](level) | |||
| img = func_dict[name](img, *args) | |||
| return img | |||
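| A minimal usage sketch of RandomAugment above: each call samples N operations from the op table, applies each with probability 0.5 at magnitude M, and expects an HxWx3 uint8 array (with isPIL=True the input is first converted via np.array): | |||
| import numpy as np | |||
| # Assumes RandomAugment and the op tables above are in scope. | |||
| aug = RandomAugment( | |||
|     N=2, M=9, isPIL=True, | |||
|     augs=['Identity', 'AutoContrast', 'Equalize', 'Brightness', 'Sharpness', | |||
|           'ShearX', 'ShearY', 'TranslateX', 'TranslateY', 'Rotate']) | |||
| img = np.random.randint(0, 256, size=(224, 224, 3), dtype=np.uint8) | |||
| out = aug(img) | |||
| assert out.shape == (224, 224, 3) and out.dtype == np.uint8 | |||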
| @@ -0,0 +1,62 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import Any, Dict | |||
| import torch | |||
| from PIL import Image | |||
| from torchvision import transforms | |||
| from modelscope.preprocessors.image import load_image | |||
| from .base import OfaBasePreprocessor | |||
| class OfaVisualEntailmentPreprocessor(OfaBasePreprocessor): | |||
| def __init__(self, cfg, model_dir): | |||
| """preprocess the data via the vocab.txt from the `model_dir` path | |||
| Args: | |||
| cfg(modelscope.utils.config.ConfigDict) : model config | |||
| model_dir (str): model path | |||
| """ | |||
| super(OfaVisualEntailmentPreprocessor, self).__init__(cfg, model_dir) | |||
| # Initialize transform | |||
| self.patch_resize_transform = transforms.Compose([ | |||
| lambda image: image.convert('RGB'), | |||
| transforms.Resize((self.patch_image_size, self.patch_image_size), | |||
| interpolation=Image.BICUBIC), | |||
| transforms.ToTensor(), | |||
| transforms.Normalize(mean=self.mean, std=self.std), | |||
| ]) | |||
| def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: | |||
| image = data['image'] if isinstance( | |||
| data['image'], Image.Image) else load_image(data['image']) | |||
| patch_image = self.patch_resize_transform(image) | |||
| if 'text2' not in data: | |||
| hypothesis = self.pre_caption(data['text'], self.max_src_length) | |||
| prompt = self.cfg.model.get('prompt', | |||
| ' does the image describe " {} "?') | |||
| text = prompt.format(hypothesis) | |||
| else: | |||
| assert 'text' in data, f'text must be in the input {data.keys()}' | |||
| caption = self.pre_caption(data['text2'], self.max_src_length) | |||
| hypothesis = self.pre_caption(data['text'], self.max_src_length) | |||
| prompt = self.cfg.model.get( | |||
| 'prompt', ' can image and text1 " {} " imply text2 " {} "?') | |||
| text = prompt.format(caption, hypothesis) | |||
| inputs = self.get_inputs(text) | |||
| if self.prompt_type == 'none': | |||
| decoder_prompt = self.bos_item | |||
| elif self.prompt_type == 'src': | |||
| decoder_prompt = inputs | |||
| elif self.prompt_type == 'prev_output': | |||
| decoder_prompt = inputs[:-1] | |||
| else: | |||
| raise NotImplementedError | |||
| sample = { | |||
| 'source': inputs, | |||
| 'patch_image': patch_image, | |||
| 'patch_mask': torch.tensor([True]), | |||
| 'decoder_prompt': decoder_prompt, | |||
| } | |||
| return sample | |||
| @@ -0,0 +1,50 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import Any, Dict | |||
| import torch | |||
| from PIL import Image | |||
| from torchvision import transforms | |||
| from modelscope.preprocessors.image import load_image | |||
| from .base import OfaBasePreprocessor | |||
| class OfaVisualGroundingPreprocessor(OfaBasePreprocessor): | |||
| def __init__(self, cfg, model_dir): | |||
| """preprocess the data via the vocab.txt from the `model_dir` path | |||
| Args: | |||
| cfg(modelscope.utils.config.ConfigDict) : model config | |||
| model_dir (str): model path | |||
| """ | |||
| super(OfaVisualGroundingPreprocessor, self).__init__(cfg, model_dir) | |||
| # Initialize transform | |||
| self.patch_resize_transform = transforms.Compose([ | |||
| lambda image: image.convert('RGB'), | |||
| transforms.Resize((self.patch_image_size, self.patch_image_size), | |||
| interpolation=Image.BICUBIC), | |||
| transforms.ToTensor(), | |||
| transforms.Normalize(mean=self.mean, std=self.std), | |||
| ]) | |||
| def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: | |||
| image = data['image'] if isinstance( | |||
| data['image'], Image.Image) else load_image(data['image']) | |||
| w, h = image.size | |||
| patch_image = self.patch_resize_transform(image) | |||
| w_resize_ratio = torch.tensor(self.patch_image_size / w) | |||
| h_resize_ratio = torch.tensor(self.patch_image_size / h) | |||
| src_caption = self.pre_caption(data['text'], self.max_src_length) | |||
| prompt = self.cfg.model.get( | |||
| 'prompt', ' which region does the text " {} " describe?') | |||
| text = prompt.format(src_caption) | |||
| src_item = self.get_inputs(text) | |||
| sample = { | |||
| 'source': src_item, | |||
| 'patch_image': patch_image, | |||
| 'patch_mask': torch.tensor([True]), | |||
| 'w_resize_ratio': w_resize_ratio, | |||
| 'h_resize_ratio': h_resize_ratio, | |||
| } | |||
| return sample | |||
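| The w_resize_ratio / h_resize_ratio entries record how much the original image was scaled to fit the square patch; presumably they let downstream code map boxes predicted in patch coordinates back to the original image. A sketch of that mapping under this assumption (patch_image_size and the box values are made up): | |||
| import torch | |||
| patch_image_size = 480  # assumption; the real value comes from the model config | |||
| w, h = 640, 360  # original image size | |||
| w_resize_ratio = torch.tensor(patch_image_size / w) | |||
| h_resize_ratio = torch.tensor(patch_image_size / h) | |||
| box_in_patch = torch.tensor([120., 90., 360., 300.])  # hypothetical [x0, y0, x1, y1] | |||
| box_in_original = box_in_patch / torch.stack( | |||
|     [w_resize_ratio, h_resize_ratio, w_resize_ratio, h_resize_ratio]) | |||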
| @@ -0,0 +1,52 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import Any, Dict | |||
| import torch | |||
| from PIL import Image | |||
| from torchvision import transforms | |||
| from modelscope.preprocessors.image import load_image | |||
| from .base import OfaBasePreprocessor | |||
| class OfaVisualQuestionAnsweringPreprocessor(OfaBasePreprocessor): | |||
| def __init__(self, cfg, model_dir): | |||
| """preprocess the data via the vocab.txt from the `model_dir` path | |||
| Args: | |||
| cfg(modelscope.utils.config.ConfigDict) : model config | |||
| model_dir (str): model path | |||
| """ | |||
| super(OfaVisualQuestionAnsweringPreprocessor, | |||
| self).__init__(cfg, model_dir) | |||
| # Initialize transform | |||
| self.patch_resize_transform = transforms.Compose([ | |||
| lambda image: image.convert('RGB'), | |||
| transforms.Resize((self.patch_image_size, self.patch_image_size), | |||
| interpolation=Image.BICUBIC), | |||
| transforms.ToTensor(), | |||
| transforms.Normalize(mean=self.mean, std=self.std), | |||
| ]) | |||
| def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: | |||
| image = data['image'] if isinstance( | |||
| data['image'], Image.Image) else load_image(data['image']) | |||
| patch_image = self.patch_resize_transform(image) | |||
| text = ' {}'.format(data['text']) | |||
| inputs = self.get_inputs(text) | |||
| if self.prompt_type == 'none': | |||
| decoder_prompt = self.bos_item | |||
| elif self.prompt_type == 'src': | |||
| decoder_prompt = inputs | |||
| elif self.prompt_type == 'prev_output': | |||
| decoder_prompt = inputs[:-1] | |||
| else: | |||
| raise NotImplementedError | |||
| sample = { | |||
| 'source': inputs, | |||
| 'patch_image': patch_image, | |||
| 'patch_mask': torch.tensor([True]), | |||
| 'decoder_prompt': decoder_prompt, | |||
| } | |||
| return sample | |||
| @@ -85,6 +85,7 @@ class MultiModalTasks(object): | |||
| multi_modal_embedding = 'multi-modal-embedding' | |||
| generative_multi_modal_embedding = 'generative-multi-modal-embedding' | |||
| visual_question_answering = 'visual-question-answering' | |||
| visual_entailment = 'visual-entailment' | |||
| video_multi_modal_embedding = 'video-multi-modal-embedding' | |||
| @@ -0,0 +1,29 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from collections import defaultdict | |||
| class TreeNode: | |||
| def __init__(self): | |||
| self.child = defaultdict(TreeNode) | |||
| class Trie: | |||
| def __init__(self, eos): | |||
| self.root = TreeNode() | |||
| self.eos = eos | |||
| def insert(self, word): | |||
| cur = self.root | |||
| for c in word: | |||
| cur = cur.child[c] | |||
| def get_next_layer(self, word): | |||
| cur = self.root | |||
| for c in word: | |||
| cur = cur.child.get(c) | |||
| if cur is None: | |||
| return [self.eos] | |||
| return list(cur.child.keys()) | |||
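| A minimal usage sketch of the Trie above, as it would presumably be used to constrain decoding to tokenized candidate answers (the token ids below are made up): | |||
| # Assumes Trie above is in scope; token id sequences are made up for illustration. | |||
| eos = 2 | |||
| trie = Trie(eos) | |||
| trie.insert([5, 6, 7, eos]) | |||
| trie.insert([5, 8, eos]) | |||
| print(trie.get_next_layer([5]))     # -> [6, 8]: tokens that may follow the prefix [5] | |||
| print(trie.get_next_layer([5, 6]))  # -> [7] | |||
| print(trie.get_next_layer([9]))     # -> [2]: unknown prefix falls back to eos | |||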
| @@ -1,23 +0,0 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import unittest | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines import pipeline | |||
| from modelscope.utils.constant import Tasks | |||
| from modelscope.utils.test_utils import test_level | |||
| class ImageCaptionTest(unittest.TestCase): | |||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
| def test_run(self): | |||
| img_captioning = pipeline( | |||
| Tasks.image_captioning, | |||
| model='damo/ofa_image-caption_coco_distilled_en') | |||
| result = img_captioning('data/test/images/image_captioning.png') | |||
| print(result[OutputKeys.CAPTION]) | |||
| if __name__ == '__main__': | |||
| unittest.main() | |||
| @@ -0,0 +1,179 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import unittest | |||
| from modelscope.models import Model | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines import pipeline | |||
| from modelscope.utils.constant import Tasks | |||
| from modelscope.utils.test_utils import test_level | |||
| class OfaTasksTest(unittest.TestCase): | |||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
| def test_run_with_image_captioning_with_model(self): | |||
| model = Model.from_pretrained( | |||
| 'damo/ofa_image-caption_coco_distilled_en') | |||
| img_captioning = pipeline( | |||
| task=Tasks.image_captioning, | |||
| model=model, | |||
| ) | |||
| result = img_captioning( | |||
| {'image': 'data/test/images/image_captioning.png'}) | |||
| print(result[OutputKeys.CAPTION]) | |||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
| def test_run_with_image_captioning_with_name(self): | |||
| img_captioning = pipeline( | |||
| Tasks.image_captioning, | |||
| model='damo/ofa_image-caption_coco_distilled_en') | |||
| result = img_captioning( | |||
| {'image': 'data/test/images/image_captioning.png'}) | |||
| print(result[OutputKeys.CAPTION]) | |||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
| def test_run_with_image_classification_with_model(self): | |||
| model = Model.from_pretrained( | |||
| 'damo/ofa_image-classification_imagenet_large_en') | |||
| ofa_pipe = pipeline(Tasks.image_classification, model=model) | |||
| image = 'data/test/images/image_classification.png' | |||
| input = {'image': image} | |||
| result = ofa_pipe(input) | |||
| print(result) | |||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
| def test_run_with_image_classification_with_name(self): | |||
| ofa_pipe = pipeline( | |||
| Tasks.image_classification, | |||
| model='damo/ofa_image-classification_imagenet_large_en') | |||
| image = 'data/test/images/image_classification.png' | |||
| input = {'image': image} | |||
| result = ofa_pipe(input) | |||
| print(result) | |||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
| def test_run_with_summarization_with_model(self): | |||
| model = Model.from_pretrained( | |||
| 'damo/ofa_summarization_gigaword_large_en') | |||
| ofa_pipe = pipeline(Tasks.summarization, model=model) | |||
| text = 'five-time world champion michelle kwan withdrew ' + \ | |||
| 'from the #### us figure skating championships on wednesday ,' + \ | |||
| ' but will petition us skating officials for the chance to ' + \ | |||
| 'compete at the #### turin olympics .' | |||
| input = {'text': text} | |||
| result = ofa_pipe(input) | |||
| print(result) | |||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
| def test_run_with_summarization_with_name(self): | |||
| ofa_pipe = pipeline( | |||
| Tasks.summarization, | |||
| model='damo/ofa_summarization_gigaword_large_en') | |||
| text = 'five-time world champion michelle kwan withdrew ' + \ | |||
| 'from the #### us figure skating championships on wednesday ,' + \ | |||
| ' but will petition us skating officials for the chance to ' + \ | |||
| 'compete at the #### turin olympics .' | |||
| input = {'text': text} | |||
| result = ofa_pipe(input) | |||
| print(result) | |||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
| def test_run_with_text_classification_with_model(self): | |||
| model = Model.from_pretrained( | |||
| 'damo/ofa_text-classification_mnli_large_en') | |||
| ofa_pipe = pipeline(Tasks.text_classification, model=model) | |||
| text = 'One of our number will carry out your instructions minutely.' | |||
| text2 = 'A member of my team will execute your orders with immense precision.' | |||
| input = {'text': text, 'text2': text2} | |||
| result = ofa_pipe(input) | |||
| print(result) | |||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
| def test_run_with_text_classification_with_name(self): | |||
| ofa_pipe = pipeline( | |||
| Tasks.text_classification, | |||
| model='damo/ofa_text-classification_mnli_large_en') | |||
| text = 'One of our number will carry out your instructions minutely.' | |||
| text2 = 'A member of my team will execute your orders with immense precision.' | |||
| input = {'text': text, 'text2': text2} | |||
| result = ofa_pipe(input) | |||
| print(result) | |||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
| def test_run_with_visual_entailment_with_model(self): | |||
| model = Model.from_pretrained( | |||
| 'damo/ofa_visual-entailment_snli-ve_large_en') | |||
| ofa_pipe = pipeline(Tasks.visual_entailment, model=model) | |||
| image = 'data/test/images/dogs.jpg' | |||
| text = 'there are two birds.' | |||
| input = {'image': image, 'text': text} | |||
| result = ofa_pipe(input) | |||
| print(result) | |||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
| def test_run_with_visual_entailment_with_name(self): | |||
| ofa_pipe = pipeline( | |||
| Tasks.visual_entailment, | |||
| model='damo/ofa_visual-entailment_snli-ve_large_en') | |||
| image = 'data/test/images/dogs.jpg' | |||
| text = 'there are two birds.' | |||
| input = {'image': image, 'text': text} | |||
| result = ofa_pipe(input) | |||
| print(result) | |||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
| def test_run_with_visual_grounding_with_model(self): | |||
| model = Model.from_pretrained( | |||
| 'damo/ofa_visual-grounding_refcoco_large_en') | |||
| ofa_pipe = pipeline(Tasks.visual_grounding, model=model) | |||
| image = 'data/test/images/visual_grounding.png' | |||
| text = 'a blue turtle-like pokemon with round head' | |||
| input = {'image': image, 'text': text} | |||
| result = ofa_pipe(input) | |||
| print(result) | |||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
| def test_run_with_visual_grounding_with_name(self): | |||
| ofa_pipe = pipeline( | |||
| Tasks.visual_grounding, | |||
| model='damo/ofa_visual-grounding_refcoco_large_en') | |||
| image = 'data/test/images/visual_grounding.png' | |||
| text = 'a blue turtle-like pokemon with round head' | |||
| input = {'image': image, 'text': text} | |||
| result = ofa_pipe(input) | |||
| print(result) | |||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
| def test_run_with_visual_question_answering_with_model(self): | |||
| from modelscope.preprocessors.multi_modal import OfaPreprocessor | |||
| model = Model.from_pretrained( | |||
| 'damo/ofa_visual-question-answering_pretrain_large_en') | |||
| preprocessor = OfaPreprocessor(model_dir=model.model_dir) | |||
| ofa_pipe = pipeline( | |||
| Tasks.visual_question_answering, | |||
| model=model, | |||
| preprocessor=preprocessor) | |||
| image = 'data/test/images/visual_question_answering.png' | |||
| text = 'what is grown on the plant?' | |||
| input = {'image': image, 'text': text} | |||
| result = ofa_pipe(input) | |||
| print(result) | |||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
| def test_run_with_visual_question_answering_with_name(self): | |||
| from modelscope.preprocessors.multi_modal import OfaPreprocessor | |||
| model = 'damo/ofa_visual-question-answering_pretrain_large_en' | |||
| preprocessor = OfaPreprocessor(model_dir=model) | |||
| ofa_pipe = pipeline( | |||
| Tasks.visual_question_answering, | |||
| model=model, | |||
| preprocessor=preprocessor) | |||
| image = 'data/test/images/visual_question_answering.png' | |||
| text = 'what is grown on the plant?' | |||
| input = {'image': image, 'text': text} | |||
| result = ofa_pipe(input) | |||
| print(result) | |||
| if __name__ == '__main__': | |||
| unittest.main() | |||