[to #42322933] add ofa source code and caption distill model. Code review link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9373663

master · yichang.zyc · 3 years ago · parent · commit b5ff6e9b8c
21 changed files with 6051 additions and 68 deletions
  1. modelscope/metainfo.py (+1 / -1)
  2. modelscope/models/multi_modal/__init__.py (+1 / -1)
  3. modelscope/models/multi_modal/ofa/__init__.py (+2 / -0)
  4. modelscope/models/multi_modal/ofa/configuration_ofa.py (+194 / -0)
  5. modelscope/models/multi_modal/ofa/generate/__init__.py (+0 / -0)
  6. modelscope/models/multi_modal/ofa/generate/incremental_decoding_utils.py (+51 / -0)
  7. modelscope/models/multi_modal/ofa/generate/multihead_attention.py (+510 / -0)
  8. modelscope/models/multi_modal/ofa/generate/ngram_repeat_block.py (+155 / -0)
  9. modelscope/models/multi_modal/ofa/generate/search.py (+848 / -0)
  10. modelscope/models/multi_modal/ofa/generate/sequence_generator.py (+996 / -0)
  11. modelscope/models/multi_modal/ofa/generate/token_generation_constraints.py (+512 / -0)
  12. modelscope/models/multi_modal/ofa/generate/utils.py (+124 / -0)
  13. modelscope/models/multi_modal/ofa/modeling_ofa.py (+2192 / -0)
  14. modelscope/models/multi_modal/ofa/resnet.py (+283 / -0)
  15. modelscope/models/multi_modal/ofa/tokenization_ofa.py (+48 / -0)
  16. modelscope/models/multi_modal/ofa/tokenization_ofa_fast.py (+59 / -0)
  17. modelscope/models/multi_modal/ofa_for_image_captioning_model.py (+53 / -0)
  18. modelscope/pipelines/builder.py (+1 / -1)
  19. modelscope/pipelines/multi_modal/image_captioning_pipeline.py (+1 / -1)
  20. modelscope/preprocessors/multi_modal.py (+19 / -63)
  21. tests/pipelines/test_image_captioning.py (+1 / -1)

modelscope/metainfo.py (+1 / -1)

@@ -73,7 +73,7 @@ class Pipelines(object):
asr_inference = 'asr-inference'

# multi-modal tasks
- image_caption = 'image-captioning'
+ image_captioning = 'image-captioning'
multi_modal_embedding = 'multi-modal-embedding'
visual_question_answering = 'visual-question-answering'
text_to_image_synthesis = 'text-to-image-synthesis'


modelscope/models/multi_modal/__init__.py (+1 / -1)

@@ -1,5 +1,5 @@
from .clip.clip_model import CLIPForMultiModalEmbedding
- from .image_captioning_model import OfaForImageCaptioning
from .imagen.imagen_model import ImagenForTextToImageSynthesis
from .mplug_for_visual_question_answering import \
MPlugForVisualQuestionAnswering
+ from .ofa_for_image_captioning_model import OfaForImageCaptioning

modelscope/models/multi_modal/ofa/__init__.py (+2 / -0)

@@ -0,0 +1,2 @@
from .modeling_ofa import OFADecoder, OFAEncoder, OFAModel, OFAPreTrainedModel
from .tokenization_ofa import OFATokenizer

modelscope/models/multi_modal/ofa/configuration_ofa.py (+194 / -0)

@@ -0,0 +1,194 @@
# Copyright 2022 Alibaba Group and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" OFA model configuration"""
import warnings

from transformers import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

OFA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
'ofa-medium': 'https://huggingface.co/ofa-base/resolve/main/config.json',
# OFA models are implemented to be compatible with both huggingface
# and modelscope frameworks. For all OFA models available on huggingface,
# please refer to https://huggingface.co/models?filter=ofa
}


class OFAConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`~OFAModel`]. It is used to instantiate an OFA
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the OFA [ofa-base](https://huggingface.co/ofa-base)
architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.


Args:
vocab_size (`int`, *optional*, defaults to 59457):
Vocabulary size of the OFA model. Defines the number of different tokens that can be represented by the
`input_ids` passed when calling [`~OFAModel`].
d_model (`int`, *optional*, defaults to 512):
Dimension of the layers and the pooler layer.
encoder_layers (`int`, *optional*, defaults to 4):
Number of encoder layers.
decoder_layers (`int`, *optional*, defaults to 4):
Number of decoder layers.
encoder_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer encoder.
decoder_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer decoder.
decoder_ffn_dim (`int`, *optional*, defaults to 2048):
Dimension of the "intermediate" (often named feed-forward) layer in the decoder.
encoder_ffn_dim (`int`, *optional*, defaults to 2048):
Dimension of the "intermediate" (often named feed-forward) layer in the encoder.
activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"silu"` and `"gelu_new"` are supported.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
classifier_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for classifier.
max_position_embeddings (`int`, *optional*, defaults to 1024):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
encoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
for more details.
decoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
for more details.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
"""

model_type = 'ofa'
keys_to_ignore_at_inference = ['past_key_values']

attribute_map = {
'num_attention_heads': 'encoder_attention_heads',
'hidden_size': 'd_model'
}

def __init__(self,
vocab_size=59457,
max_position_embeddings=1024,
encoder_layers=4,
encoder_ffn_dim=512 * 4,
encoder_attention_heads=8,
decoder_layers=4,
decoder_ffn_dim=512 * 4,
decoder_attention_heads=8,
encoder_layerdrop=0.0,
decoder_layerdrop=0.0,
use_cache=True,
is_encoder_decoder=True,
activation_function='gelu',
d_model=512,
dropout=0.1,
attention_dropout=0.0,
activation_dropout=0.0,
init_std=0.02,
classifier_dropout=0.0,
scale_embedding=False,
pad_token_id=1,
bos_token_id=0,
decoder_start_token_id=0,
eos_token_id=2,
forced_eos_token_id=2,
encoder_normalize_before=True,
decoder_normalize_before=True,
normformer=True,
encoder_drop_path_rate=0.0,
decoder_drop_path_rate=0.0,
layernorm_embedding=True,
patch_layernorm_embedding=True,
resnet_type='resnet101',
resnet_model_path=None,
resnet_drop_path_rate=0.0,
token_bucket_size=256,
image_bucket_size=42,
add_type_embedding=True,
share_decoder_input_output_embed=True,
attn_scale_factor=2.,
code_layernorm_embedding=True,
code_image_size=128,
entangle_position_embedding=False,
**kwargs):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.d_model = d_model
self.encoder_ffn_dim = encoder_ffn_dim
self.encoder_layers = encoder_layers
self.encoder_attention_heads = encoder_attention_heads
self.decoder_ffn_dim = decoder_ffn_dim
self.decoder_layers = decoder_layers
self.decoder_attention_heads = decoder_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.init_std = init_std
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.classifier_dropout = classifier_dropout
self.use_cache = use_cache
self.num_hidden_layers = encoder_layers
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
self.encoder_normalize_before = encoder_normalize_before
self.decoder_normalize_before = decoder_normalize_before
self.normformer = normformer
self.encoder_drop_path_rate = encoder_drop_path_rate
self.decoder_drop_path_rate = decoder_drop_path_rate
self.layernorm_embedding = layernorm_embedding
self.patch_layernorm_embedding = patch_layernorm_embedding
self.resnet_type = resnet_type
self.resnet_model_path = resnet_model_path
self.resnet_drop_path_rate = resnet_drop_path_rate
self.token_bucket_size = token_bucket_size
self.image_bucket_size = image_bucket_size
self.add_type_embedding = add_type_embedding
self.share_decoder_input_output_embed = share_decoder_input_output_embed
self.attn_scale_factor = attn_scale_factor
self.code_layernorm_embedding = code_layernorm_embedding
self.code_image_size = code_image_size
self.entangle_position_embedding = entangle_position_embedding

super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
forced_eos_token_id=forced_eos_token_id,
**kwargs,
)

# ensure backward compatibility for BART CNN models
if self.forced_bos_token_id is None and kwargs.get(
'force_bos_token_to_be_generated', False):
self.forced_bos_token_id = self.bos_token_id
warnings.warn(
f'Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. '
'The config can simply be saved and uploaded again to be fixed.'
)
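
For reference, a minimal usage sketch of the configuration class above (assuming `transformers` is installed and the package is importable as laid out in this commit); the printed values follow from the defaults in `__init__` and from `attribute_map`:

from modelscope.models.multi_modal.ofa.configuration_ofa import OFAConfig

config = OFAConfig()               # defaults defined above: d_model=512, 4 encoder/decoder layers
print(config.hidden_size)          # 512, resolved through attribute_map -> d_model
print(config.num_attention_heads)  # 8, resolved through attribute_map -> encoder_attention_heads
print(config.is_encoder_decoder)   # True, forwarded to PretrainedConfig.__init__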

modelscope/models/multi_modal/ofa/generate/__init__.py (+0 / -0)


modelscope/models/multi_modal/ofa/generate/incremental_decoding_utils.py (+51 / -0)

@@ -0,0 +1,51 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license which can be found at
# https://github.com/facebookresearch/fairseq/blob/main/LICENSE

import uuid
from typing import Dict, Optional

from torch import Tensor


class FairseqIncrementalState(object):

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.init_incremental_state()

def init_incremental_state(self):
self._incremental_state_id = str(uuid.uuid4())

def _get_full_incremental_state_key(self, key: str) -> str:
return '{}.{}'.format(self._incremental_state_id, key)

def get_incremental_state(
self,
incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
key: str,
) -> Optional[Dict[str, Optional[Tensor]]]:
"""Helper for getting incremental state for an nn.Module."""
full_key = self._get_full_incremental_state_key(key)
if incremental_state is None or full_key not in incremental_state:
return None
return incremental_state[full_key]

def set_incremental_state(
self,
incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
key: str,
value: Dict[str, Optional[Tensor]],
) -> Optional[Dict[str, Dict[str, Optional[Tensor]]]]:
"""Helper for setting incremental state for an nn.Module."""
if incremental_state is not None:
full_key = self._get_full_incremental_state_key(key)
incremental_state[full_key] = value
return incremental_state


def with_incremental_state(cls):
cls.__bases__ = (FairseqIncrementalState, ) + tuple(
b for b in cls.__bases__ if b != FairseqIncrementalState)
return cls
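
A small illustrative sketch of how the decorator is used (the `CachingLayer` module is hypothetical, not part of the commit): a decorated nn.Module gains get/set helpers that cache tensors per instance under a uuid-prefixed key, which is how the attention layers below reuse keys and values across incremental decoding steps.

import torch
from torch import nn

from modelscope.models.multi_modal.ofa.generate.incremental_decoding_utils import \
    with_incremental_state


@with_incremental_state
class CachingLayer(nn.Module):
    """Toy layer that concatenates each new input onto a cached prefix."""

    def forward(self, x, incremental_state=None):
        if incremental_state is not None:
            buf = self.get_incremental_state(incremental_state, 'buf') or {}
            prev = buf.get('prev_x')
            if prev is not None:
                x = torch.cat([prev, x], dim=0)
            self.set_incremental_state(incremental_state, 'buf', {'prev_x': x})
        return x


layer = CachingLayer()
state = {}
print(layer(torch.randn(2, 1, 4), state).shape)  # torch.Size([2, 1, 4])
print(layer(torch.randn(2, 1, 4), state).shape)  # torch.Size([4, 1, 4]) - cached prefix reused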

modelscope/models/multi_modal/ofa/generate/multihead_attention.py (+510 / -0)

@@ -0,0 +1,510 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license which can be found at
# https://github.com/facebookresearch/fairseq/blob/main/LICENSE

import math
from typing import Dict, Optional, Tuple

import torch
import torch.nn.functional as F
from fairseq import utils
from fairseq.incremental_decoding_utils import with_incremental_state
from fairseq.modules.fairseq_dropout import FairseqDropout
from fairseq.modules.quant_noise import quant_noise
from torch import Tensor, nn
from torch.nn import Parameter


@with_incremental_state
class MultiheadAttention(nn.Module):
"""Multi-headed attention.

See "Attention Is All You Need" for more details.
"""

def __init__(
self,
embed_dim,
num_heads,
kdim=None,
vdim=None,
dropout=0.0,
bias=True,
add_bias_kv=False,
add_zero_attn=False,
self_attention=False,
encoder_decoder_attention=False,
q_noise=0.0,
qn_block_size=8,
):
super().__init__()
self.embed_dim = embed_dim
self.kdim = kdim if kdim is not None else embed_dim
self.vdim = vdim if vdim is not None else embed_dim
self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

self.num_heads = num_heads
self.dropout_module = FairseqDropout(
dropout, module_name=self.__class__.__name__)

self.head_dim = embed_dim // num_heads
assert (self.head_dim * num_heads == self.embed_dim
), 'embed_dim must be divisible by num_heads'
self.scaling = self.head_dim**-0.5

self.self_attention = self_attention
self.encoder_decoder_attention = encoder_decoder_attention

assert not self.self_attention or self.qkv_same_dim, (
'Self-attention requires query, key and '
'value to be of the same size')

self.k_proj = quant_noise(
nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size)
self.v_proj = quant_noise(
nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size)
self.q_proj = quant_noise(
nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size)

self.out_proj = quant_noise(
nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size)

if add_bias_kv:
self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
else:
self.bias_k = self.bias_v = None

self.add_zero_attn = add_zero_attn

self.reset_parameters()

self.onnx_trace = False

def prepare_for_onnx_export_(self):
self.onnx_trace = True

def reset_parameters(self):
if self.qkv_same_dim:
# Empirically observed the convergence to be much better with
# the scaled initialization
nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
else:
nn.init.xavier_uniform_(self.k_proj.weight)
nn.init.xavier_uniform_(self.v_proj.weight)
nn.init.xavier_uniform_(self.q_proj.weight)

nn.init.xavier_uniform_(self.out_proj.weight)
if self.out_proj.bias is not None:
nn.init.constant_(self.out_proj.bias, 0.0)
if self.bias_k is not None:
nn.init.xavier_normal_(self.bias_k)
if self.bias_v is not None:
nn.init.xavier_normal_(self.bias_v)

def forward(
self,
query,
key: Optional[Tensor],
value: Optional[Tensor],
key_padding_mask: Optional[Tensor] = None,
incremental_state: Optional[Dict[str, Dict[str,
Optional[Tensor]]]] = None,
need_weights: bool = True,
static_kv: bool = False,
attn_mask: Optional[Tensor] = None,
before_softmax: bool = False,
need_head_weights: bool = False,
) -> Tuple[Tensor, Optional[Tensor]]:
"""Input shape: Time x Batch x Channel

Args:
key_padding_mask (ByteTensor, optional): mask to exclude
keys that are pads, of shape `(batch, src_len)`, where
padding elements are indicated by 1s.
need_weights (bool, optional): return the attention weights,
averaged over heads (default: True).
attn_mask (ByteTensor, optional): typically used to
implement causal attention, where the mask prevents the
attention from looking forward in time (default: None).
before_softmax (bool, optional): return the raw attention
weights and values before the attention softmax.
need_head_weights (bool, optional): return the attention
weights for each head. Implies *need_weights*. Default:
return the average attention weights over all heads.
"""
if need_head_weights:
need_weights = True

is_tpu = query.device.type == 'xla'

tgt_len, bsz, embed_dim = query.size()
src_len = tgt_len
assert embed_dim == self.embed_dim, f'query dim {embed_dim} != {self.embed_dim}'
assert list(query.size()) == [tgt_len, bsz, embed_dim]
if key is not None:
src_len, key_bsz, _ = key.size()
if not torch.jit.is_scripting():
assert key_bsz == bsz
assert value is not None
assert value.shape[:2] == (src_len, bsz)

if (not self.onnx_trace
and not is_tpu # don't use PyTorch version on TPUs
and incremental_state is None and not static_kv
# A workaround for quantization to work. Otherwise JIT compilation
# treats bias in linear module as method.
and not torch.jit.is_scripting()):
assert key is not None and value is not None
return F.multi_head_attention_forward(
query,
key,
value,
self.embed_dim,
self.num_heads,
torch.empty([0]),
torch.cat(
(self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)),
self.bias_k,
self.bias_v,
self.add_zero_attn,
self.dropout_module.p,
self.out_proj.weight,
self.out_proj.bias,
self.training or self.dropout_module.apply_during_inference,
key_padding_mask,
need_weights,
attn_mask,
use_separate_proj_weight=True,
q_proj_weight=self.q_proj.weight,
k_proj_weight=self.k_proj.weight,
v_proj_weight=self.v_proj.weight,
)

if incremental_state is not None:
saved_state = self._get_input_buffer(incremental_state)
if saved_state is not None and 'prev_key' in saved_state:
# previous time steps are cached - no need to recompute
# key and value if they are static
if static_kv:
assert self.encoder_decoder_attention and not self.self_attention
key = value = None
else:
saved_state = None

if self.self_attention:
q = self.q_proj(query)
k = self.k_proj(query)
v = self.v_proj(query)
elif self.encoder_decoder_attention:
# encoder-decoder attention
q = self.q_proj(query)
if key is None:
assert value is None
k = v = None
else:
k = self.k_proj(key)
v = self.v_proj(key)

else:
assert key is not None and value is not None
q = self.q_proj(query)
k = self.k_proj(key)
v = self.v_proj(value)
q *= self.scaling

if self.bias_k is not None:
assert self.bias_v is not None
k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
if attn_mask is not None:
attn_mask = torch.cat(
[attn_mask,
attn_mask.new_zeros(attn_mask.size(0), 1)],
dim=1)
if key_padding_mask is not None:
key_padding_mask = torch.cat(
[
key_padding_mask,
key_padding_mask.new_zeros(
key_padding_mask.size(0), 1),
],
dim=1,
)

q = (
q.contiguous().view(tgt_len, bsz * self.num_heads,
self.head_dim).transpose(0, 1))
if k is not None:
k = (
k.contiguous().view(-1, bsz * self.num_heads,
self.head_dim).transpose(0, 1))
if v is not None:
v = (
v.contiguous().view(-1, bsz * self.num_heads,
self.head_dim).transpose(0, 1))

if saved_state is not None:
# saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
if 'prev_key' in saved_state:
_prev_key = saved_state['prev_key']
assert _prev_key is not None
prev_key = _prev_key.view(bsz * self.num_heads, -1,
self.head_dim)
if static_kv:
k = prev_key
else:
assert k is not None
k = torch.cat([prev_key, k], dim=1)
src_len = k.size(1)
if 'prev_value' in saved_state:
_prev_value = saved_state['prev_value']
assert _prev_value is not None
prev_value = _prev_value.view(bsz * self.num_heads, -1,
self.head_dim)
if static_kv:
v = prev_value
else:
assert v is not None
v = torch.cat([prev_value, v], dim=1)
prev_key_padding_mask: Optional[Tensor] = None
if 'prev_key_padding_mask' in saved_state:
prev_key_padding_mask = saved_state['prev_key_padding_mask']
assert k is not None and v is not None
key_padding_mask = MultiheadAttention._append_prev_key_padding_mask(
key_padding_mask=key_padding_mask,
prev_key_padding_mask=prev_key_padding_mask,
batch_size=bsz,
src_len=k.size(1),
static_kv=static_kv,
)

saved_state['prev_key'] = k.view(bsz, self.num_heads, -1,
self.head_dim)
saved_state['prev_value'] = v.view(bsz, self.num_heads, -1,
self.head_dim)
saved_state['prev_key_padding_mask'] = key_padding_mask
# In this branch incremental_state is never None
assert incremental_state is not None
incremental_state = self._set_input_buffer(incremental_state,
saved_state)
assert k is not None
assert k.size(1) == src_len

# This is part of a workaround to get around fork/join parallelism
# not supporting Optional types.
if key_padding_mask is not None and key_padding_mask.dim() == 0:
key_padding_mask = None

if key_padding_mask is not None:
assert key_padding_mask.size(0) == bsz
assert key_padding_mask.size(1) == src_len

if self.add_zero_attn:
assert v is not None
src_len += 1
k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])],
dim=1)
v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])],
dim=1)
if attn_mask is not None:
attn_mask = torch.cat(
[attn_mask,
attn_mask.new_zeros(attn_mask.size(0), 1)],
dim=1)
if key_padding_mask is not None:
key_padding_mask = torch.cat(
[
key_padding_mask,
torch.zeros(key_padding_mask.size(0),
1).type_as(key_padding_mask),
],
dim=1,
)

attn_weights = torch.bmm(q, k.transpose(1, 2))
attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len,
bsz)

assert list(
attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]

if attn_mask is not None:
attn_mask = attn_mask.unsqueeze(0)
if self.onnx_trace:
attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1)
attn_weights += attn_mask

if key_padding_mask is not None:
# don't attend to padding symbols
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len,
src_len)
if not is_tpu:
attn_weights = attn_weights.masked_fill(
key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
float('-inf'),
)
else:
attn_weights = attn_weights.transpose(0, 2)
attn_weights = attn_weights.masked_fill(
key_padding_mask, float('-inf'))
attn_weights = attn_weights.transpose(0, 2)
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len,
src_len)

if before_softmax:
return attn_weights, v

attn_weights_float = utils.softmax(
attn_weights, dim=-1, onnx_trace=self.onnx_trace)
attn_weights = attn_weights_float.type_as(attn_weights)
attn_probs = self.dropout_module(attn_weights)

assert v is not None
attn = torch.bmm(attn_probs, v)
assert list(
attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
if self.onnx_trace and attn.size(1) == 1:
# when ONNX tracing a single decoder step (sequence length == 1)
# the transpose is a no-op copy before view, thus unnecessary
attn = attn.contiguous().view(tgt_len, bsz, embed_dim)
else:
attn = attn.transpose(0,
1).contiguous().view(tgt_len, bsz, embed_dim)
attn = self.out_proj(attn)
attn_weights: Optional[Tensor] = None
if need_weights:
attn_weights = attn_weights_float.view(bsz, self.num_heads,
tgt_len,
src_len).transpose(1, 0)
if not need_head_weights:
# average attention weights over heads
attn_weights = attn_weights.mean(dim=0)

return attn, attn_weights

@staticmethod
def _append_prev_key_padding_mask(
key_padding_mask: Optional[Tensor],
prev_key_padding_mask: Optional[Tensor],
batch_size: int,
src_len: int,
static_kv: bool,
) -> Optional[Tensor]:
# saved key padding masks have shape (bsz, seq_len)
if prev_key_padding_mask is not None and static_kv:
new_key_padding_mask = prev_key_padding_mask
elif prev_key_padding_mask is not None and key_padding_mask is not None:
new_key_padding_mask = torch.cat(
[prev_key_padding_mask.float(),
key_padding_mask.float()],
dim=1)
# During incremental decoding, as the padding token enters and
# leaves the frame, there will be a time when prev or current
# is None
elif prev_key_padding_mask is not None:
if src_len > prev_key_padding_mask.size(1):
filler = torch.zeros(
(batch_size, src_len - prev_key_padding_mask.size(1)),
device=prev_key_padding_mask.device,
)
new_key_padding_mask = torch.cat(
[prev_key_padding_mask.float(),
filler.float()], dim=1)
else:
new_key_padding_mask = prev_key_padding_mask.float()
elif key_padding_mask is not None:
if src_len > key_padding_mask.size(1):
filler = torch.zeros(
(batch_size, src_len - key_padding_mask.size(1)),
device=key_padding_mask.device,
)
new_key_padding_mask = torch.cat(
[filler.float(), key_padding_mask.float()], dim=1)
else:
new_key_padding_mask = key_padding_mask.float()
else:
new_key_padding_mask = prev_key_padding_mask
return new_key_padding_mask

@torch.jit.export
def reorder_incremental_state(
self,
incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
new_order: Tensor,
):
"""Reorder buffered internal state (for incremental generation)."""
input_buffer = self._get_input_buffer(incremental_state)
if input_buffer is not None:
for k in input_buffer.keys():
input_buffer_k = input_buffer[k]
if input_buffer_k is not None:
if self.encoder_decoder_attention and input_buffer_k.size(
0) == new_order.size(0):
break
input_buffer[k] = input_buffer_k.index_select(0, new_order)
incremental_state = self._set_input_buffer(incremental_state,
input_buffer)
return incremental_state

def _get_input_buffer(
self, incremental_state: Optional[Dict[str, Dict[str,
Optional[Tensor]]]]
) -> Dict[str, Optional[Tensor]]:
result = self.get_incremental_state(incremental_state, 'attn_state')
if result is not None:
return result
else:
empty_result: Dict[str, Optional[Tensor]] = {}
return empty_result

def _set_input_buffer(
self,
incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
buffer: Dict[str, Optional[Tensor]],
):
return self.set_incremental_state(incremental_state, 'attn_state',
buffer)

def apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int,
bsz: int):
return attn_weights

def upgrade_state_dict_named(self, state_dict, name):
prefix = name + '.' if name != '' else ''
items_to_add = {}
keys_to_remove = []
for k in state_dict.keys():
if k.endswith(prefix + 'in_proj_weight'):
# in_proj_weight used to be q + k + v with same dimensions
dim = int(state_dict[k].shape[0] / 3)
items_to_add[prefix + 'q_proj.weight'] = state_dict[k][:dim]
items_to_add[prefix + 'k_proj.weight'] = state_dict[k][dim:2
* dim]
items_to_add[prefix + 'v_proj.weight'] = state_dict[k][2
* dim:]

keys_to_remove.append(k)

k_bias = prefix + 'in_proj_bias'
if k_bias in state_dict.keys():
dim = int(state_dict[k].shape[0] / 3)
items_to_add[prefix
+ 'q_proj.bias'] = state_dict[k_bias][:dim]
items_to_add[prefix
+ 'k_proj.bias'] = state_dict[k_bias][dim:2
* dim]
items_to_add[prefix
+ 'v_proj.bias'] = state_dict[k_bias][2
* dim:]

keys_to_remove.append(prefix + 'in_proj_bias')

for k in keys_to_remove:
del state_dict[k]

for key, value in items_to_add.items():
state_dict[key] = value
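
A minimal smoke test of the attention module above (a sketch; it assumes `fairseq` is installed, since this file imports FairseqDropout and quant_noise from it). Inputs follow the Time x Batch x Channel convention documented in `forward`:

import torch

from modelscope.models.multi_modal.ofa.generate.multihead_attention import \
    MultiheadAttention

attn = MultiheadAttention(embed_dim=512, num_heads=8, self_attention=True)
x = torch.randn(10, 2, 512)  # (tgt_len, bsz, embed_dim)
out, weights = attn(query=x, key=x, value=x, need_weights=True)
print(out.shape)      # torch.Size([10, 2, 512])
print(weights.shape)  # torch.Size([2, 10, 10]) - attention weights averaged over heads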

modelscope/models/multi_modal/ofa/generate/ngram_repeat_block.py (+155 / -0)

@@ -0,0 +1,155 @@
# Originally from Microsoft Corporation.
# Licensed under the MIT License.
""" Wrapper for ngram_repeat_block cuda extension """
import math
import warnings
from typing import Dict, List

import torch
from torch import nn

try:
from fairseq import ngram_repeat_block_cuda

EXTENSION_BUILT = True
except ImportError:
EXTENSION_BUILT = False


def is_cuda_extension_usable() -> bool:
"""Check whether ngram_repeat_block_cuda is built properly"""
if not EXTENSION_BUILT or not torch.cuda.is_available():
return False
bsz = 2
tokens = torch.tensor([[4, 4, 3, 2], [1, 2, 3, 4]],
dtype=torch.long,
device='cuda')
lprobs = torch.rand((8, 12), device='cuda')
try:
outputs = ngram_repeat_block_cuda.forward(tokens, lprobs, bsz, 3, 4, 3)
outputs = outputs + 4 # This line breaks if the extension is built incorrectly.
return True
except RuntimeError:
warnings.warn(
'NGramRepeatBlock extension must be rebuilt.'
'Run TORCH_CUDA_ARCH_LIST="6.0;6.1;7.0" python setup.py build_ext --inplace'
)
return False


class NGramRepeatBlock(nn.Module):
""" Wrapper class for calling ngram_repeat_block cuda extension """

def __init__(self, no_repeat_ngram_size: int, use_extension: bool = True):
super().__init__()
self.use_extension = is_cuda_extension_usable(
) if use_extension else False
self.no_repeat_ngram_size = no_repeat_ngram_size

def reset_parameters(self):
pass

@torch.jit.unused
def call_cuda_extension(
self,
tokens,
lprobs,
bsz: int,
beam_size: int,
step: int,
):
return ngram_repeat_block_cuda.forward(tokens, lprobs, bsz, step,
beam_size,
self.no_repeat_ngram_size)

def forward(
self,
tokens,
lprobs,
bsz: int,
beam_size: int,
step: int,
):
"""
Args:
tokens (Tensor): input tokens, shape (bsz * beam, seq_len)
lprobs (Tensor): log-probabilities over the vocabulary, shape
(bsz * beam, vocab_size); expected to be updated in place
bsz (int): batch size
beam_size (int): beam size
step (int): current step
"""
msg = f'expected {bsz * beam_size} got'
assert tokens.size(0) == bsz * beam_size, f'{msg} {tokens.size(0)}'
assert lprobs.size(0) == bsz * beam_size, f'{msg} {lprobs.size(0)}'
if self.use_extension:
return self.call_cuda_extension(tokens, lprobs, bsz, beam_size,
step)

else:
return self._no_repeat_ngram(
tokens,
lprobs,
bsz,
beam_size,
step,
)

def _no_repeat_ngram(self, tokens, lprobs, bsz: int, beam_size: int,
step: int):
"""For each hypothesis generate a list of previous ngrams and set associated lprobs to -inf"""
gen_ngrams: List[Dict[str, List[int]]] = [
torch.jit.annotate(Dict[str, List[int]], {})
for bbsz_idx in range(bsz * beam_size)
]
cpu_tokens = tokens.cpu()
for bbsz_idx in range(bsz * beam_size):
gen_tokens: List[int] = cpu_tokens[bbsz_idx].tolist()
for ngram in self.transpose_list([
gen_tokens[i:] for i in range(self.no_repeat_ngram_size)
]): # noqa
key = ','.join([str(x) for x in ngram[:-1]])
gen_ngrams[bbsz_idx][key] = gen_ngrams[bbsz_idx].get(
key, torch.jit.annotate(List[int], [])) + [ngram[-1]]
if step + 2 - self.no_repeat_ngram_size >= 0:
# no banned tokens if we haven't generated no_repeat_ngram_size tokens yet
banned_tokens = [
self.calculate_banned_tokens(tokens, step, gen_ngrams,
self.no_repeat_ngram_size,
bbsz_idx)
for bbsz_idx in range(bsz * beam_size)
]
else:
banned_tokens = [
torch.jit.annotate(List[int], [])
for bbsz_idx in range(bsz * beam_size)
]
for bbsz_idx in range(bsz * beam_size):
lprobs[bbsz_idx][torch.tensor(
banned_tokens[bbsz_idx],
dtype=torch.int64)] = torch.tensor(-math.inf).to(lprobs)
return lprobs

@staticmethod
def calculate_banned_tokens(
tokens,
step: int,
gen_ngrams: List[Dict[str, List[int]]],
no_repeat_ngram_size: int,
bbsz_idx: int,
):
tokens_list: List[int] = tokens[bbsz_idx,
step + 2 - no_repeat_ngram_size:step
+ 1].tolist() # noqa
# before decoding the next token, prevent decoding of ngrams that have already appeared
ngram_index = ','.join([str(x) for x in tokens_list])
return gen_ngrams[bbsz_idx].get(ngram_index,
torch.jit.annotate(List[int], []))

@staticmethod
def transpose_list(l: List[List[int]]): # noqa
# GeneratorExp aren't supported in TS so ignoring the lint
min_len = min([len(x) for x in l]) # noqa
l2 = [[row[i] for row in l] for i in range(min_len)]
return l2
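
To make the blocking rule concrete, here is a small sketch of the pure-PyTorch fallback path (`use_extension=False`): with a 2-gram block and the generated history 5, 6, 5, the continuation 6 is banned because the bigram (5, 6) has already occurred.

import torch

from modelscope.models.multi_modal.ofa.generate.ngram_repeat_block import \
    NGramRepeatBlock

blocker = NGramRepeatBlock(no_repeat_ngram_size=2, use_extension=False)
tokens = torch.tensor([[5, 6, 5]])  # (bsz * beam_size, seq_len); last generated token is 5
lprobs = torch.zeros(1, 10)         # (bsz * beam_size, vocab_size)
out = blocker(tokens, lprobs, bsz=1, beam_size=1, step=2)
print(out[0, 6])                    # tensor(-inf): token 6 may not follow 5 again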

modelscope/models/multi_modal/ofa/generate/search.py (+848 / -0)

@@ -0,0 +1,848 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license which can be found at
# https://github.com/facebookresearch/fairseq/blob/main/LICENSE

import math
from typing import List, Optional

import torch
import torch.nn as nn
from torch import Tensor

from .token_generation_constraints import (ConstraintState,
OrderedConstraintState,
UnorderedConstraintState)


class Search(nn.Module):

def __init__(self, tokenizer):
super().__init__()
self.pad = tokenizer.pad_token_id
self.unk = tokenizer.unk_token_id
self.eos = tokenizer.eos_token_id
tgt_dict = {value: key for key, value in tokenizer.get_vocab().items()}
added = {
value: key
for key, value in tokenizer.get_added_vocab().items()
}
tgt_dict.update(added)
self.vocab_size = len(tgt_dict)
self.src_lengths = torch.tensor(-1)
self.supports_constraints = False
self.stop_on_max_len = False

def step(self,
step,
lprobs,
scores,
prev_output_tokens=None,
original_batch_idxs=None):
"""Take a single search step.

Args:
step: the current search step, starting at 0
lprobs: (bsz x input_beam_size x vocab_size)
the model's log-probabilities over the vocabulary at the current step
scores: (bsz x input_beam_size x step)
the historical model scores of each hypothesis up to this point
prev_output_tokens: (bsz x step)
the previously generated output tokens
original_batch_idxs: (bsz)
the tensor with the batch indices, in the range [0, bsz);
this is useful in case a re-ordering has been applied
and we need to know the original indices

Return: A tuple of (scores, indices, beams) where:
scores: (bsz x output_beam_size)
the scores of the chosen elements; output_beam_size can be
larger than input_beam_size, e.g., we may return
2*input_beam_size to account for EOS
indices: (bsz x output_beam_size)
the indices of the chosen elements
beams: (bsz x output_beam_size)
the hypothesis ids of the chosen elements, in the range [0, input_beam_size)
"""
raise NotImplementedError

@torch.jit.export
def set_src_lengths(self, src_lengths):
self.src_lengths = src_lengths

@torch.jit.export
def init_constraints(self, batch_constraints: Optional[Tensor],
beam_size: int):
"""Initialize constraint states for constrained decoding (if supported).

Args:
batch_constraints: (torch.Tensor, optional)
the list of constraints, in packed form
beam_size: (int)
the beam size
"""
pass

def prune_sentences(self, batch_idxs: Tensor):
"""
Removes constraint states for completed sentences (if supported).
This is called from sequence_generator._generate() when sentences are
deleted from the batch.

Args:
batch_idxs: Indices of *sentences* whose constraint state should be *kept*.
"""
pass

def update_constraints(self, active_hypos: Tensor):
"""
Updates the constraint states by selecting the beam items that are retained.
This is called at each time step of sequence_generator._generate() when
the set of 2 * {beam_size} candidate hypotheses is reduced to the beam size.

Args:
active_hypos: (batch size, beam size)
list of integers denoting, for each sentence, which beam candidate items
should be kept.
"""
pass


class BeamSearch(Search):

def __init__(self, tgt_dict):
super().__init__(tgt_dict)
self.constraint_states = None

@torch.jit.export
def step(
self,
step: int,
lprobs,
scores: Optional[Tensor],
prev_output_tokens: Optional[Tensor] = None,
original_batch_idxs: Optional[Tensor] = None,
):
bsz, beam_size, vocab_size = lprobs.size()

if step == 0:
# at the first step all hypotheses are equally likely, so use
# only the first beam
lprobs = lprobs[:, ::beam_size, :].contiguous()
else:
# make probs contain cumulative scores for each hypothesis
assert scores is not None
lprobs = lprobs + scores[:, :, step - 1].unsqueeze(-1)

top_prediction = torch.topk(
lprobs.view(bsz, -1),
k=min(
# Take the best 2 x beam_size predictions. We'll choose the first
# beam_size of these which don't predict eos to continue with.
beam_size * 2,
lprobs.view(bsz, -1).size(1) - 1, # -1 so we never select pad
),
)
scores_buf = top_prediction[0]
indices_buf = top_prediction[1]
# Project back into relative indices and beams
beams_buf = indices_buf // vocab_size
indices_buf = indices_buf.fmod(vocab_size)

# At this point, beams_buf and indices_buf are single-dim and contain relative indices
return scores_buf, indices_buf, beams_buf


class PrefixConstrainedBeamSearch(Search):

def __init__(self, tgt_dict, prefix_allowed_tokens_fn):
super().__init__(tgt_dict)
self.prefix_allowed_tokens_fn = prefix_allowed_tokens_fn
self.stop_on_max_len = True

@torch.jit.export
def apply_mask(self, x, prev_output_tokens, original_batch_idxs):
beam_size = x.shape[0] // original_batch_idxs.shape[0]
original_batch_idxs = (
original_batch_idxs.unsqueeze(-1).repeat(
(1, beam_size)).flatten().tolist())

mask = torch.full_like(x, -math.inf)
for sent_i, (sent, batch_i) in enumerate(
zip(prev_output_tokens, original_batch_idxs)):
mask[sent_i, :, self.prefix_allowed_tokens_fn(batch_i, sent)] = 0

return mask

@torch.jit.export
def step(
self,
step: int,
lprobs: Tensor,
scores: Tensor,
prev_output_tokens: Tensor,
original_batch_idxs: Tensor,
):
bsz, beam_size, vocab_size = lprobs.size()

lprobs += self.apply_mask(
lprobs.view(bsz * beam_size, 1, vocab_size),
prev_output_tokens,
original_batch_idxs,
).view(bsz, beam_size, vocab_size)

if step == 0:
# at the first step all hypotheses are equally likely, so use
# only the first beam
lprobs = lprobs[:, ::beam_size, :].contiguous()
else:
# make probs contain cumulative scores for each hypothesis
assert scores is not None
lprobs = lprobs + scores[:, :, step - 1].unsqueeze(-1)

top_prediction = torch.topk(
lprobs.view(bsz, -1),
k=min(
# Take the best beam_size predictions. We'll choose the first
# beam_size of these which don't predict eos to continue with.
beam_size,
lprobs.view(bsz, -1).size(1) - 1, # -1 so we never select pad
),
)
scores_buf = top_prediction[0]
indices_buf = top_prediction[1]
beams_buf = indices_buf // vocab_size
indices_buf = indices_buf.fmod(vocab_size)
return scores_buf, indices_buf, beams_buf


class LexicallyConstrainedBeamSearch(Search):
"""Implements lexically constrained beam search as described in

Fast Lexically Constrained Decoding with Dynamic Beam
Allocation for Neural Machine Translation. Post & Vilar,
NAACL 2018. https://www.aclweb.org/anthology/N18-1119/

and

Improved Lexically Constrained Decoding for Translation and
Monolingual Rewriting. Hu et al, NAACL
2019. https://www.aclweb.org/anthology/N19-1090/

This is accomplished by maintaining, for each beam hypothesis, a
ConstraintState object (see constraints.py) that tracks which
constraints have been generated and using this information to
shape the beam for each input sentence.
"""

def __init__(self, tokenizer, representation):
super().__init__(tokenizer)
self.representation = representation
tgt_dict = {value: key for key, value in tokenizer.get_vocab().items()}
added = {
value: key
for key, value in tokenizer.get_added_vocab().items()
}
tgt_dict.update(added)
self.vocab_size = len(tgt_dict)
self.num_cands = 0
self.supports_constraints = True

@torch.jit.export
def init_constraints(self, batch_constraints: Optional[Tensor],
beam_size: int):
self.constraint_states = []
for constraint_tensor in batch_constraints:
if self.representation == 'ordered':
constraint_state = OrderedConstraintState.create(
constraint_tensor)
elif self.representation == 'unordered':
constraint_state = UnorderedConstraintState.create(
constraint_tensor)

self.constraint_states.append(
[constraint_state for i in range(beam_size)])

@torch.jit.export
def prune_sentences(self, batch_idxs: Tensor):
self.constraint_states = [
self.constraint_states[i] for i in batch_idxs.tolist()
]

@torch.jit.export
def update_constraints(self, active_hypos: Tensor):
if self.constraint_states:
batch_size = active_hypos.size(0)
for sentid in range(batch_size):
self.constraint_states[sentid] = [
self.constraint_states[sentid][i]
for i in active_hypos[sentid]
]

@torch.jit.export
def step(
self,
step: int,
lprobs: Tensor,
scores: Optional[Tensor],
prev_output_tokens: Optional[Tensor] = None,
original_batch_idxs: Optional[Tensor] = None,
):
"""
A constrained step builds a large candidates list from the following:
- the top 2 * {beam_size} items over the whole beam
- for each item in the beam
- the top {each_k} (default 1)
- all next constraints
We then compute the constrained state of each beam item, and assign
stripe codes: 0 to the best in each bank, 1 to the 2nd-best, and so
on. We then sort by (stripe, score), and truncate the list at
2 * beam size.

Args:
step: the decoder step
lprobs: (batch size, beam size, target vocab)
the target-vocab distributions for each item in the beam.
Return: A tuple of (scores, indices, beams, constraints) where:
scores: (batch, output beam size)
the scores of the chosen elements
indices: (batch, output beam size)
the target vocab indices of the chosen elements
beams: (batch, output beam size)
the 0-indexed hypothesis ids of the chosen elements
constraints: (batch, output beam size)
the new constraint states
"""
each_k = 1
device = lprobs.device

batch_size, beam_size, vocab_size = lprobs.size()

self.num_cands = min(
# Just take the k-best. We'll get another k from the 1-best from each
# row, plus more from the constraints
beam_size * 2,
lprobs.view(batch_size, -1).size(1)
- 1, # -1 so we never select pad
)

# STEP 0: Preliminary. Prevent EOS for unfinished hyps across all batch items
constraint_states = self.constraint_states
if constraint_states and step > 0:
not_finished_indices = []
for sentno, sent_constraints in enumerate(constraint_states):
for beamno, state in enumerate(sent_constraints):
index = sentno * beam_size + beamno
if not state.finished:
not_finished_indices.append(index)
not_finished_indices = torch.tensor(not_finished_indices)
if not_finished_indices.numel() > 0:
lprobs.view(batch_size * beam_size, -1)[not_finished_indices,
self.eos] = -math.inf

if step == 0:
# at the first step all hypotheses are equally likely, so use
# only the first beam entry for each batch item
lprobs = lprobs[:, ::beam_size, :].contiguous()
else:
# make probs contain cumulative scores for each hypothesis
assert scores is not None
lprobs = lprobs + scores[:, :, step - 1].unsqueeze(-1)

top_prediction = torch.topk(
lprobs.view(batch_size, -1),
self.num_cands,
)
scores_buf, indices_buf = top_prediction
# Project back into relative indices and beams
beams_buf = indices_buf // vocab_size
indices_buf = indices_buf.fmod(vocab_size)

# Short circuit if there are no constraints in this batch
if not constraint_states:
return scores_buf, indices_buf, beams_buf

# STEP 1: get top-1 from each hypothesis across all sentences in the batch
if step > 0:
top_scores, top_indices = torch.topk(
lprobs.view(batch_size * beam_size, -1),
k=each_k,
dim=1,
)
top_scores = top_scores.view(batch_size, -1)
top_indices = top_indices.view(batch_size, -1)
scores_buf = torch.cat((scores_buf, top_scores), dim=1)
indices_buf = torch.cat((indices_buf, top_indices), dim=1)
new_beams = torch.arange(
0, beam_size, device=device).repeat(batch_size, 1)
beams_buf = torch.cat((beams_buf, new_beams), dim=1)

# Now, process sentences in the batch one by one.
new_scores_buf = torch.zeros((batch_size, 2 * beam_size),
device=device)
new_indices_buf = torch.zeros((batch_size, 2 * beam_size),
device=device).long()
new_beams_buf = torch.zeros((batch_size, 2 * beam_size),
device=device).long()
for sentno, states in enumerate(constraint_states):
scores, indices, beams, new_states = self.step_sentence(
step,
sentno,
lprobs[sentno],
constraint_states[sentno],
beams_buf[sentno].clone(),
indices_buf[sentno].clone(),
scores_buf[sentno].clone(),
)
new_scores_buf[sentno] = scores
new_indices_buf[sentno] = indices
new_beams_buf[sentno] = beams
self.constraint_states[sentno] = new_states

return new_scores_buf, new_indices_buf, new_beams_buf

@torch.jit.export
def step_sentence(
self,
step: int,
sentno: int,
lprobs: Tensor,
constraint_states: List[List[ConstraintState]],
beams_buf: Tensor,
indices_buf: Tensor,
scores_buf: Tensor,
):
"""Does per-sentence processing. Adds all constraints for each
hypothesis to the list of candidates; then removes duplicates,
sorts, and dynamically stripes across the banks. All tensor inputs
are collapsed to those pertaining to a single input sentence.
"""
device = lprobs.device

# STEP 2: Add all constraints for each beam item
for beamno, state in enumerate(constraint_states):
next_tokens = torch.tensor(
list(state.next_tokens()), device=device).long()
if next_tokens.numel() != 0:
indices_buf = torch.cat((indices_buf, next_tokens))
next_beams = (
torch.tensor(beamno, device=device).repeat(
next_tokens.size(0)).long())
beams_buf = torch.cat((beams_buf, next_beams))
next_values = lprobs[beamno].take(next_tokens.view(-1))
scores_buf = torch.cat((scores_buf, next_values))

# At the 0th time step, there is just one beam item
if step == 0:
break

# STEP 3: Compute the "bank" for each candidate. This is the
# number of constraints it's generated. We need this so that
# we can do round-robin allocation of the beam across these
# banks. If C is the number of constraints, we select the best
# item in bank C, then the best in bank C-1, etc, followed by
# the 2nd-best in bank C, the 2nd-best in bank C-1, etc, and so
# on, until the maximum beam size. We accomplish this by
# creating a sort key and striping across the banks.

# Compute the new states for all candidates
cands_size = indices_buf.size(0)
constraint_states = [
constraint_states[beams_buf[i]].advance(indices_buf[i])
for i in range(cands_size)
]

banks = torch.tensor([state.bank for state in constraint_states],
device=device)

# STEP 4: Sort
num_constraint_tokens = len(state.tokens)

# Sort by keys (bank, score) (i.e., sort banks together, and scores
# within banks). AFAIK pytorch doesn't support either stable sort or
# multi-key sorting, so we have to hack this.
MAX_SCORE = -100
sort_key = (num_constraint_tokens - banks) * MAX_SCORE + scores_buf
sort_values, sort_indices = sort_key.sort(dim=0, descending=True)
scores_buf = scores_buf[sort_indices]
indices_buf = indices_buf[sort_indices]
beams_buf = beams_buf[sort_indices]
banks = banks[sort_indices]

# Sort the constraints to follow suit
constraint_states = [constraint_states[i] for i in sort_indices]

# STEP 5: Remove duplicates. The topk calls (overall and
# per-row) plus the per-row generation of constraints will
# produce duplicates. Here we remove them.

def roll(t):
"""Rolls a 1d tensor left by 1.

[0, 1, 2, 3, 4] becomes [4, 0, 1, 2, 3]
"""
return torch.cat((t[-1].unsqueeze(0), t[0:-1]), dim=0)

# We map candidates (beam, token_id) to a single dimension.
# This is then shifted by 1. We can then easily identify
# duplicates and create a mask that identifies unique
# extensions.
uniques_mask = beams_buf * (self.vocab_size + 1) + indices_buf
uniques_mask = roll(uniques_mask) != uniques_mask

# Use the mask to pare down the data structures
scores_buf = torch.masked_select(scores_buf, uniques_mask)
indices_buf = torch.masked_select(indices_buf, uniques_mask)
beams_buf = torch.masked_select(beams_buf, uniques_mask)
banks = torch.masked_select(banks, uniques_mask)
i = 1
for mask in uniques_mask[1:]:
if not mask:
constraint_states.pop(i)
i += mask

# STEP 6: Assign IDs round-robin across banks, sort, and
# truncate. Now that the candidates are sorted by (bank,
# score) and uniqed, we dynamically allocate the {beam_size}
# beam by striping across the candidates. These stripes will
# be used as sort keys to do round-robin selection. This is
# accomplished in a single pass with offsets. Sorting by
# highest-banks (furthest-along hypotheses) first ensures
# progress through the constraints.
#
# e.g., BANKS: 3 3 3 2 2 2 2 1 1 1 0 0
# OLD STRIPES: 0 1 2 0 1 2 3 0 1 2 0 1
# NEW STRIPES: 0 1+4 2+8 0+1 1+5 2+9 3+11 0+2 1+6 2+10 0+3 1+7
# = 0 5 10 1 6 11 13 2 7 12 3 8
#
# Sorting by this then gives the following banks:
#
# 3 2 1 0 3 2 1 0 3 2 1 2
#
# We'll take the top {beam_size} of these.
stripe_offsets = [
offset * (len(banks) + 1) for offset in range(len(banks) + 1)
]
stripes = torch.zeros_like(banks)
cur_bank_count = -1
cur_bank = banks[0]
for i, bank in enumerate(banks):
if bank != cur_bank:
cur_bank_count = 0
cur_bank = bank
else:
cur_bank_count += 1
stripes[i] = num_constraint_tokens - bank + stripe_offsets[
cur_bank_count]

# STEP 7: Sort by the stripes values
sort_values, sort_indices = stripes.sort(dim=0)
scores_buf = scores_buf[sort_indices]
indices_buf = indices_buf[sort_indices]
beams_buf = beams_buf[sort_indices]
constraint_states = [constraint_states[i] for i in sort_indices]

# STEP 8: Truncate to the candidates size!
scores_buf = scores_buf[:self.num_cands]
indices_buf = indices_buf[:self.num_cands]
beams_buf = beams_buf[:self.num_cands]

return scores_buf, indices_buf, beams_buf, constraint_states


class LengthConstrainedBeamSearch(Search):

def __init__(self, tgt_dict, min_len_a, min_len_b, max_len_a, max_len_b):
super().__init__(tgt_dict)
self.min_len_a = min_len_a
self.min_len_b = min_len_b
self.max_len_a = max_len_a
self.max_len_b = max_len_b
self.beam = BeamSearch(tgt_dict)
self.needs_src_lengths = True

def step(
self,
step: int,
lprobs,
scores,
prev_output_tokens: Optional[Tensor] = None,
original_batch_idxs: Optional[Tensor] = None,
):
min_lens = self.min_len_a * self.src_lengths + self.min_len_b
max_lens = self.max_len_a * self.src_lengths + self.max_len_b
lprobs[step < min_lens, :, self.eos] = -math.inf
lprobs[step >= max_lens, :, self.eos] = 0
return self.beam.step(step, lprobs, scores)


class DiverseBeamSearch(Search):
"""Diverse Beam Search.

See "Diverse Beam Search: Decoding Diverse Solutions from Neural Sequence
Models" for details.

We only implement the Hamming Diversity penalty here, which performed best
in the original paper.
"""

def __init__(self, tgt_dict, num_groups, diversity_strength):
super().__init__(tgt_dict)
self.num_groups = num_groups
self.diversity_strength = -diversity_strength
self.beam = BeamSearch(tgt_dict)

@torch.jit.export
def step(
self,
step: int,
lprobs,
scores,
prev_output_tokens: Optional[Tensor] = None,
original_batch_idxs: Optional[Tensor] = None,
):
bsz, beam_size, vocab_size = lprobs.size()
if beam_size % self.num_groups != 0:
raise ValueError(
'DiverseBeamSearch requires --beam to be divisible by the number of groups'
)

# initialize diversity penalty
diversity_buf = torch.zeros(lprobs[:, 0, :].size()).to(lprobs)

scores_G, indices_G, beams_G = [], [], []
for g in range(self.num_groups):
lprobs_g = lprobs[:, g::self.num_groups, :]
scores_g = scores[:, g::self.num_groups, :] if step > 0 else None

# apply diversity penalty
if g > 0:
lprobs_g = torch.add(
lprobs_g,
other=diversity_buf.unsqueeze(1),
alpha=self.diversity_strength,
)
else:
lprobs_g = lprobs_g.contiguous()

scores_buf, indices_buf, beams_buf = self.beam.step(
step, lprobs_g, scores_g)
beams_buf.mul_(self.num_groups).add_(g)

scores_G.append(scores_buf.clone())
indices_G.append(indices_buf.clone())
beams_G.append(beams_buf.clone())

# update diversity penalty
diversity_buf.scatter_add_(
1, indices_buf,
torch.ones(indices_buf.size()).to(diversity_buf))

# interleave results from different groups
scores_buf = torch.stack(scores_G, dim=2).view(bsz, -1)
indices_buf = torch.stack(indices_G, dim=2).view(bsz, -1)
beams_buf = torch.stack(beams_G, dim=2).view(bsz, -1)
return scores_buf, indices_buf, beams_buf


class Sampling(Search):
sampling_topk: int
sampling_topp: float

def __init__(self, tgt_dict, sampling_topk=-1, sampling_topp=-1.0):
super().__init__(tgt_dict)
self.sampling_topk = sampling_topk
self.sampling_topp = sampling_topp

def _sample_topp(self, lprobs):
"""Sample among the smallest set of elements whose cumulative probability mass exceeds p.

See `"The Curious Case of Neural Text Degeneration"
(Holtzman et al., 2019) <https://arxiv.org/abs/1904.09751>`_.

Args:
lprobs: (bsz x input_beam_size x vocab_size)
the model's log-probabilities over the vocabulary at the current step

Return: A tuple of (trimed_probs, truncated_indices) where:
trimed_probs: (bsz x input_beam_size x ?)
the model's probabilities over the elements selected to sample from. The
width of the third dimension is determined by top-P.
truncated_indices: (bsz x input_beam_size x ?)
the indices of the chosen elements.
"""
probs = lprobs.exp_()

# sort the last dimension (vocab dimension) in descending order
sorted_probs, sorted_indices = probs.sort(descending=True)

# compute a mask to indicate the words to be included in the top-P set.
cumsum_probs = sorted_probs.cumsum(dim=2)
mask = cumsum_probs.lt(self.sampling_topp)

# note that mask was computed by 'lt'. One more word needs to be included
# so that the cumulative probability mass can exceed p.
cumsum_mask = mask.cumsum(dim=2)
last_included = cumsum_mask[:, :, -1:]
last_included.clamp_(0, mask.size()[2] - 1)
mask = mask.scatter_(2, last_included, 1)

# truncate unnecessary dims.
max_dim = last_included.max()
truncated_mask = mask[:, :, :max_dim + 1]
truncated_probs = sorted_probs[:, :, :max_dim + 1]
truncated_indices = sorted_indices[:, :, :max_dim + 1]

# trim the words that are not in top-P by setting their probabilities
# to 0, so that they would not be sampled later.
trim_mask = ~truncated_mask
trimed_probs = truncated_probs.masked_fill_(trim_mask, 0)
return trimed_probs, truncated_indices

@torch.jit.export
def step(
self,
step: int,
lprobs,
scores,
prev_output_tokens: Optional[Tensor] = None,
original_batch_idxs: Optional[Tensor] = None,
):
bsz, beam_size, vocab_size = lprobs.size()

if step == 0:
# at the first step all hypotheses are equally likely, so use
# only the first beam
lprobs = lprobs[:, ::beam_size, :].contiguous()

if self.sampling_topp > 0:
# only sample from the smallest set of words whose cumulative probability mass exceeds p
probs, top_indices = self._sample_topp(lprobs)
elif self.sampling_topk > 0:
# only sample from top-k candidates
lprobs, top_indices = lprobs.topk(self.sampling_topk)
probs = lprobs.exp_()
else:
probs = lprobs.exp_()

# dummy data to be consistent with true branch for type check
top_indices = torch.empty(0).to(probs)
# sample
if step == 0:
indices_buf = torch.multinomial(
probs.view(bsz, -1),
beam_size,
replacement=True,
).view(bsz, beam_size)
else:
indices_buf = torch.multinomial(
probs.view(bsz * beam_size, -1),
1,
replacement=True,
).view(bsz, beam_size)

if step == 0:
# expand to beam size
probs = probs.expand(bsz, beam_size, -1)

# gather scores
scores_buf = torch.gather(
probs, dim=2, index=indices_buf.unsqueeze(-1))
scores_buf = scores_buf.log_().view(bsz, -1)

# remap indices if using top-k or top-P sampling
if self.sampling_topk > 0 or self.sampling_topp > 0:
indices_buf = torch.gather(
top_indices.expand(bsz, beam_size, -1),
dim=2,
index=indices_buf.unsqueeze(-1),
).squeeze(2)

if step == 0:
beams_buf = indices_buf.new_zeros(bsz, beam_size)
else:
beams_buf = torch.arange(0,
beam_size).to(indices_buf).repeat(bsz, 1)
# make scores cumulative
scores_buf.add_(
torch.gather(scores[:, :, step - 1], dim=1, index=beams_buf))

return scores_buf, indices_buf, beams_buf


class DiverseSiblingsSearch(Search):
"""
Beam search with diverse siblings.

See "A Simple, Fast Diverse Decoding Algorithm for Neural Generation" for details.
https://arxiv.org/abs/1611.08562

1/ Calculate hypotheses for each beam
2/ Intra-sibling ordering
3/ Rewrite scores
4/ Choose top K hypotheses

If diversity_rate == 0, this is equivalent to BeamSearch.
"""

def __init__(self, tgt_dict, diversity_rate):
super().__init__(tgt_dict)
self.diversity_rate = diversity_rate
self.beam = BeamSearch(tgt_dict)

def step(
self,
step: int,
lprobs,
scores,
prev_output_tokens: Optional[Tensor] = None,
original_batch_idxs: Optional[Tensor] = None,
):
bsz, beam_size, vocab_size = lprobs.size()
k = min(
# Take the best 2 x beam_size predictions. We'll choose the first
# beam_size of these which don't predict eos to continue with.
beam_size * 2,
lprobs.view(bsz, -1).size(1) - 1, # -1 so we never select pad
)
s_list: List[Tensor]
i_list: List[Tensor]
s_list = [torch.empty(0).to(lprobs) for i in range(beam_size)]
i_list = [
torch.LongTensor().to(device=lprobs.device)
for i in range(beam_size)
]
sibling_score = torch.arange(1, k + 1).to(lprobs) * self.diversity_rate

if step == 0:
return self.beam.step(step, lprobs, scores)
lprobs.add_(scores[:, :, step - 1].unsqueeze(-1))

# 1/ Calculate hypotheses for each beam
for i in range(beam_size):
torch.topk(
lprobs[:, i, :].view(bsz, -1), k, out=(s_list[i], i_list[i]))
i_list[i].fmod_(vocab_size)

# 2/ Intra-sibling ordering by default from topk + 3/ Rewrite scores
s_list[i].sub_(sibling_score)

# 4/ Choose top K hypotheses
indices = torch.stack(i_list, dim=1).view(bsz, -1)

final_scores = torch.empty(0).to(lprobs)
final_indices = torch.LongTensor().to(device=lprobs.device)
final_beams = torch.LongTensor().to(device=lprobs.device)
(final_scores, final_indices) = torch.topk(
torch.stack(s_list, dim=1).view(bsz, -1),
k,
)

final_beams = final_indices // k

for i in range(bsz):
final_indices[i] = indices[i][final_indices[i]]

return final_scores, final_indices, final_beams
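

# A minimal numeric sketch of the sibling penalty used in step() above
# (illustration only): within one parent beam, the r-th ranked candidate
# (1-based) loses r * diversity_rate, so near-duplicate siblings are pushed
# down relative to candidates coming from other parent beams.
def _demo_sibling_penalty(diversity_rate: float = 0.5):
    topk_lprobs = torch.tensor([-0.10, -0.20, -0.30, -0.40])  # one beam, k=4
    penalty = torch.arange(1, topk_lprobs.numel() + 1).to(topk_lprobs) * diversity_rate
    # returns tensor([-0.6000, -1.2000, -1.8000, -2.4000])
    return topk_lprobs - penalty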

+ 996
- 0
modelscope/models/multi_modal/ofa/generate/sequence_generator.py View File

@@ -0,0 +1,996 @@
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0

import math
import sys
from typing import Dict, List, Optional, Tuple

import torch
import torch.nn as nn
from torch import Tensor

from ..generate import search
from .ngram_repeat_block import NGramRepeatBlock


def _expand_mask(mask: torch.Tensor,
dtype: torch.dtype,
tgt_len: Optional[int] = None):
r"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
bsz, src_len = mask.size()
tgt_len = tgt_len if tgt_len is not None else src_len

expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len,
src_len).to(dtype)
return expanded_mask.masked_fill(expanded_mask.bool(),
torch.finfo(dtype).min)
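

# A small sketch of what _expand_mask produces (illustration only): padding
# positions (mask == 1) are filled with the dtype's most negative value so
# attention softmax ignores them, while visible positions stay 0.
def _demo_expand_mask():
    pad_mask = torch.tensor([[0, 0], [0, 1]])  # [bsz=2, src_len=2]
    expanded = _expand_mask(pad_mask, torch.float32, tgt_len=1)
    # expanded.shape == (2, 1, 1, 2);
    # expanded[1, 0, 0, 1] == torch.finfo(torch.float32).min
    return expanded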


class SequenceGenerator(nn.Module):

def __init__(self,
tokenizer,
beam_size=1,
max_len_a=0,
max_len_b=200,
max_len=0,
min_len=1,
normalize_scores=True,
len_penalty=1.0,
unk_penalty=0.0,
temperature=1.0,
match_source_len=False,
no_repeat_ngram_size=0,
search_strategy=None,
eos=None,
symbols_to_strip_from_output=None,
lm_model=None,
lm_weight=1.0,
constraint_trie=None,
constraint_range=None,
gen_code=False,
gen_box=False,
ignore_eos=False,
zero_shot=False):
"""Generates translations of a given source sentence.

Args:
            tokenizer: tokenizer providing the vocabulary and the special
                token ids (pad/unk/bos/eos) used during generation
beam_size (int, optional): beam width (default: 1)
max_len_a/b (int, optional): generate sequences of maximum length
ax + b, where x is the source length
max_len (int, optional): the maximum length of the generated output
(not including end-of-sentence)
min_len (int, optional): the minimum length of the generated output
(not including end-of-sentence)
normalize_scores (bool, optional): normalize scores by the length
of the output (default: True)
len_penalty (float, optional): length penalty, where <1.0 favors
shorter, >1.0 favors longer sentences (default: 1.0)
unk_penalty (float, optional): unknown word penalty, where <0
produces more unks, >0 produces fewer (default: 0.0)
temperature (float, optional): temperature, where values
>1.0 produce more uniform samples and values <1.0 produce
sharper samples (default: 1.0)
match_source_len (bool, optional): outputs should match the source
length (default: False)
"""
super().__init__()
self.gen_code = gen_code
self.gen_box = gen_box
self.ignore_eos = ignore_eos
self.tokenizer = tokenizer
self.tgt_dict = {
value: key
for key, value in tokenizer.get_vocab().items()
}
added = {
value: key
for key, value in tokenizer.get_added_vocab().items()
}
self.tgt_dict.update(added)
self.pad = tokenizer.pad_token_id
self.unk = tokenizer.unk_token_id
self.bos = tokenizer.bos_token_id
self.eos = tokenizer.eos_token_id
self.symbols_to_strip_from_output = (
symbols_to_strip_from_output.union({self.eos}) if
symbols_to_strip_from_output is not None else {self.bos, self.eos})
self.vocab_size = len(self.tgt_dict)
self.beam_size = beam_size
# the max beam size is the dictionary size - 1, since we never select pad
self.beam_size = min(beam_size, self.vocab_size - 1)
self.max_len_a = max_len_a
self.max_len_b = max_len_b
self.min_len = min_len
self.max_len = max_len

self.normalize_scores = normalize_scores
self.len_penalty = len_penalty
self.unk_penalty = unk_penalty
self.temperature = temperature
self.match_source_len = match_source_len
self.zero_shot = zero_shot

if no_repeat_ngram_size > 0:
self.repeat_ngram_blocker = NGramRepeatBlock(no_repeat_ngram_size)
else:
self.repeat_ngram_blocker = None

assert temperature > 0, '--temperature must be greater than 0'

self.search = (
search.BeamSearch(self.tokenizer)
if search_strategy is None else search_strategy)
# We only need to set src_lengths in LengthConstrainedBeamSearch.
# As a module attribute, setting it would break in multithread
# settings when the model is shared.
self.should_set_src_lengths = (
hasattr(self.search, 'needs_src_lengths')
and self.search.needs_src_lengths)

self.lm_model = lm_model
self.lm_weight = lm_weight
if self.lm_model is not None:
self.lm_model.eval()

self.constraint_trie = constraint_trie

self.constraint_start = None
self.constraint_end = None
if constraint_range is not None:
constraint_start, constraint_end = constraint_range.split(',')
self.constraint_start = int(constraint_start)
self.constraint_end = int(constraint_end)

@torch.no_grad()
def forward(
self,
sample: Dict[str, Dict[str, Tensor]],
prefix_tokens: Optional[Tensor] = None,
bos_token: Optional[int] = None,
):
"""Generate a batch of translations.

Args:
sample (dict): batch
prefix_tokens (torch.LongTensor, optional): force decoder to begin
with these tokens
bos_token (int, optional): beginning of sentence token
(default: self.eos)
"""
return self._generate(sample, prefix_tokens, bos_token=bos_token)

@torch.no_grad()
def generate(self, models, sample: Dict[str, Dict[str, Tensor]],
**kwargs) -> List[List[Dict[str, Tensor]]]:
"""Generate translations. Match the api of other fairseq generators.

Args:
models (List[~fairseq.models.FairseqModel]): ensemble of models
sample (dict): batch
prefix_tokens (torch.LongTensor, optional): force decoder to begin
with these tokens
constraints (torch.LongTensor, optional): force decoder to include
the list of constraints
bos_token (int, optional): beginning of sentence token
(default: self.eos)
"""
return self._generate(models, sample, **kwargs)

def _generate(
self,
models,
sample: Dict[str, Dict[str, Tensor]],
prefix_tokens: Optional[Tensor] = None,
constraints: Optional[Tensor] = None,
bos_token: Optional[int] = None,
):
model = EnsembleModel(models)
# incremental_states = torch.jit.annotate(
# List[Dict[str, Dict[str, Optional[Tensor]]]],
# [
# torch.jit.annotate(Dict[str, Dict[str, Optional[Tensor]]], {})
# for i in range(model.models_size)
# ],
# )
incremental_states = torch.jit.annotate(
List[Tuple[Tuple[torch.Tensor]]],
[
torch.jit.annotate(Tuple[Tuple[torch.Tensor]], {})
for i in range(model.models_size)
],
)
# print("incremental_states",incremental_states)
# print("incremental_states[0]",incremental_states[0])
net_input = sample['net_input']

if 'src_tokens' in net_input:
src_tokens = net_input['src_tokens']
# length of the source text being the character length except EndOfSentence and pad
src_lengths = ((src_tokens.ne(self.eos)
& src_tokens.ne(self.pad)).long().sum(dim=1))
elif 'input_ids' in net_input:
src_tokens = net_input['input_ids']
# length of the source text being the character length except EndOfSentence and pad
src_lengths = ((src_tokens.ne(self.eos)
& src_tokens.ne(self.pad)).long().sum(dim=1))
elif 'source' in net_input:
src_tokens = net_input['source']
src_lengths = (
net_input['padding_mask'].size(-1)
- net_input['padding_mask'].sum(-1)
if net_input['padding_mask'] is not None else torch.tensor(
src_tokens.size(-1)).to(src_tokens))
elif 'features' in net_input:
src_tokens = net_input['features']
src_lengths = (
net_input['padding_mask'].size(-1)
- net_input['padding_mask'].sum(-1)
if net_input['padding_mask'] is not None else torch.tensor(
src_tokens.size(-1)).to(src_tokens))
else:
raise Exception(
'expected src_tokens or source in net input. input keys: '
+ str(net_input.keys()))

# bsz: total number of sentences in beam
# Note that src_tokens may have more than 2 dimensions (i.e. audio features)
bsz, src_len = src_tokens.size()[:2]
beam_size = self.beam_size

if constraints is not None and not self.search.supports_constraints:
raise NotImplementedError(
"Target-side constraints were provided, but search method doesn't support them"
)

# Initialize constraints, when active
self.search.init_constraints(constraints, beam_size)

max_len: int = -1
if self.match_source_len:
max_len = src_lengths.max().item()
else:
max_len = int(self.max_len_a * src_len + self.max_len_b)
assert (
self.min_len <= max_len
), 'min_len cannot be larger than max_len, please adjust these!'
# compute the encoder output for each beam
with torch.autograd.profiler.record_function(
'EnsembleModel: forward_encoder'):
encoder_outs = model.forward_encoder(net_input)

        # placeholder of indices for bsz * beam_size to hold tokens and cumulative scores
new_order = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1)
new_order = new_order.to(src_tokens.device).long()
encoder_outs = model.reorder_encoder_out(encoder_outs, new_order)
# ensure encoder_outs is a List.
assert encoder_outs is not None

# initialize buffers
scores = (torch.zeros(bsz * beam_size,
max_len + 1).to(src_tokens).float()
) # +1 for eos; pad is never chosen for scoring
tokens = (torch.zeros(bsz * beam_size,
max_len + 2).to(src_tokens).long().fill_(
self.pad)) # +2 for eos and pad
# tokens[:, 0] = self.eos if bos_token is None else bos_token
tokens[:, 0] = self.bos
attn: Optional[Tensor] = None

# A list that indicates candidates that should be ignored.
# For example, suppose we're sampling and have already finalized 2/5
# samples. Then cands_to_ignore would mark 2 positions as being ignored,
# so that we only finalize the remaining 3 samples.
cands_to_ignore = (torch.zeros(bsz, beam_size).to(src_tokens).eq(-1)
) # forward and backward-compatible False mask

# list of completed sentences
finalized = torch.jit.annotate(
List[List[Dict[str, Tensor]]],
[
torch.jit.annotate(List[Dict[str, Tensor]], [])
for i in range(bsz)
],
        ) # contains lists of dictionaries of information about the hypothesis being finalized at each step

# a boolean array indicating if the sentence at the index is finished or not
finished = [False for i in range(bsz)]
num_remaining_sent = bsz # number of sentences remaining

# number of candidate hypos per step
cand_size = 2 * beam_size # 2 x beam size in case half are EOS

# offset arrays for converting between different indexing schemes
bbsz_offsets = ((torch.arange(0, bsz)
* beam_size).unsqueeze(1).type_as(tokens).to(
src_tokens.device))
cand_offsets = torch.arange(0, cand_size).type_as(tokens).to(
src_tokens.device)

reorder_state: Optional[Tensor] = None
batch_idxs: Optional[Tensor] = None

original_batch_idxs: Optional[Tensor] = None
if 'id' in sample and isinstance(sample['id'], Tensor):
original_batch_idxs = sample['id']
else:
original_batch_idxs = torch.arange(0, bsz).type_as(tokens)

for step in range(max_len + 1): # one extra step for EOS marker
# reorder decoder internal states based on the prev choice of beams
if reorder_state is not None:
if batch_idxs is not None:
# update beam indices to take into account removed sentences
corr = batch_idxs - torch.arange(
batch_idxs.numel()).type_as(batch_idxs)
reorder_state.view(-1, beam_size).add_(
corr.unsqueeze(-1) * beam_size)
original_batch_idxs = original_batch_idxs[batch_idxs]
model.reorder_incremental_state(incremental_states,
reorder_state) # todo
encoder_outs = model.reorder_encoder_out(
encoder_outs, reorder_state)

with torch.autograd.profiler.record_function(
'EnsembleModel: forward_decoder'):
lprobs, avg_attn_scores = model.forward_decoder(
tokens[:, :step + 1],
encoder_outs,
incremental_states,
self.temperature,
constraint_trie=self.constraint_trie,
constraint_start=self.constraint_start,
constraint_end=self.constraint_end,
gen_code=self.gen_code,
zero_shot=self.zero_shot,
prefix_tokens=prefix_tokens)

if self.lm_model is not None:
lm_out = self.lm_model(tokens[:, :step + 1])
probs = self.lm_model.get_normalized_probs(
lm_out, log_probs=True, sample=None)
probs = probs[:, -1, :] * self.lm_weight
lprobs += probs
# handle prefix tokens (possibly with different lengths)
if (prefix_tokens is not None and step < prefix_tokens.size(1)
and step < max_len):
lprobs, tokens, scores = self._prefix_tokens(
step, lprobs, scores, tokens, prefix_tokens, beam_size)
elif step < self.min_len:
# minimum length constraint (does not apply if using prefix_tokens)
lprobs[:, self.eos] = -math.inf

lprobs[lprobs != lprobs] = torch.tensor(-math.inf).to(lprobs)

lprobs[:, self.pad] = -math.inf # never select pad
lprobs[:, self.unk] -= self.unk_penalty # apply unk penalty

if (self.gen_code or self.gen_box) and step < max_len:
lprobs[:, :4] = -math.inf
if self.gen_box:
lprobs[:, -1] = -math.inf
if (step + 1) % 5 == 0:
lprobs[:, self.constraint_start:59457] = -math.inf
else:
lprobs[:, 59457:] = -math.inf

# handle max length constraint
if step >= max_len:
lprobs[:, :self.eos] = -math.inf
lprobs[:, self.eos + 1:] = -math.inf
if self.ignore_eos:
lprobs[:, self.eos] = 1

            # Record attention scores; only supported when avg_attn_scores is a Tensor
if avg_attn_scores is not None:
if attn is None:
attn = torch.empty(bsz * beam_size,
avg_attn_scores.size(1),
max_len + 2).to(scores)
# print("+++++++ debug attention shape +++++++")
# print("attn", attn.shape)
# print("avg_attn_scores", avg_attn_scores.shape)
attn[:, :, step + 1].copy_(avg_attn_scores)
# print("attn[:, :, step + 1]", attn[:, :, step + 1].shape)
# print("attn", attn.shape)

scores = scores.type_as(lprobs)
eos_bbsz_idx = torch.empty(0).to(
tokens
) # indices of hypothesis ending with eos (finished sentences)
eos_scores = torch.empty(0).to(
scores
) # scores of hypothesis ending with eos (finished sentences)

if self.should_set_src_lengths:
self.search.set_src_lengths(src_lengths)

if self.repeat_ngram_blocker is not None:
lprobs = self.repeat_ngram_blocker(tokens, lprobs, bsz,
beam_size, step)

# Shape: (batch, cand_size)
cand_scores, cand_indices, cand_beams = self.search.step(
step,
lprobs.view(bsz, -1, self.vocab_size),
scores.view(bsz, beam_size, -1)[:, :, :step],
tokens[:, :step + 1],
original_batch_idxs,
)

# cand_bbsz_idx contains beam indices for the top candidate
# hypotheses, with a range of values: [0, bsz*beam_size),
# and dimensions: [bsz, cand_size]
cand_bbsz_idx = cand_beams.add(bbsz_offsets)

# finalize hypotheses that end in eos
# Shape of eos_mask: (batch size, beam size)
eos_mask = cand_indices.eq(self.eos) & cand_scores.ne(-math.inf)
eos_mask[:, :beam_size][cands_to_ignore] = torch.tensor(0).to(
eos_mask)

# only consider eos when it's among the top beam_size indices
# Now we know what beam item(s) to finish
# Shape: 1d list of absolute-numbered
eos_bbsz_idx = torch.masked_select(
cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size])

finalized_sents: List[int] = []
if eos_bbsz_idx.numel() > 0:
eos_scores = torch.masked_select(
cand_scores[:, :beam_size], mask=eos_mask[:, :beam_size])

finalized_sents = self.finalize_hypos(
step,
eos_bbsz_idx,
eos_scores,
tokens,
scores,
finalized,
finished,
beam_size,
attn,
src_lengths,
max_len,
)
num_remaining_sent -= len(finalized_sents)

assert num_remaining_sent >= 0
if num_remaining_sent == 0:
break
if self.search.stop_on_max_len and step >= max_len:
break
            assert step < max_len, f'{step} < {max_len}'

# Remove finalized sentences (ones for which {beam_size}
# finished hypotheses have been generated) from the batch.
if len(finalized_sents) > 0:
new_bsz = bsz - len(finalized_sents)

# construct batch_idxs which holds indices of batches to keep for the next pass
batch_mask = torch.ones(
bsz, dtype=torch.bool, device=cand_indices.device)
batch_mask[finalized_sents] = False
# TODO replace `nonzero(as_tuple=False)` after TorchScript supports it
batch_idxs = torch.arange(
bsz, device=cand_indices.device).masked_select(batch_mask)

# Choose the subset of the hypothesized constraints that will continue
self.search.prune_sentences(batch_idxs)

eos_mask = eos_mask[batch_idxs]
cand_beams = cand_beams[batch_idxs]
bbsz_offsets.resize_(new_bsz, 1)
cand_bbsz_idx = cand_beams.add(bbsz_offsets)
cand_scores = cand_scores[batch_idxs]
cand_indices = cand_indices[batch_idxs]

if prefix_tokens is not None:
prefix_tokens = prefix_tokens[batch_idxs]
src_lengths = src_lengths[batch_idxs]
cands_to_ignore = cands_to_ignore[batch_idxs]

scores = scores.view(bsz, -1)[batch_idxs].view(
new_bsz * beam_size, -1)
tokens = tokens.view(bsz, -1)[batch_idxs].view(
new_bsz * beam_size, -1)
if attn is not None:
attn = attn.view(bsz, -1)[batch_idxs].view(
new_bsz * beam_size, attn.size(1), -1)
bsz = new_bsz
else:
batch_idxs = None

# Set active_mask so that values > cand_size indicate eos hypos
# and values < cand_size indicate candidate active hypos.
# After, the min values per row are the top candidate active hypos

# Rewrite the operator since the element wise or is not supported in torchscript.

eos_mask[:, :beam_size] = ~( # noqa
(~cands_to_ignore) & (~eos_mask[:, :beam_size])) # noqa
active_mask = torch.add(
eos_mask.type_as(cand_offsets) * cand_size,
cand_offsets[:eos_mask.size(1)],
)

# get the top beam_size active hypotheses, which are just
# the hypos with the smallest values in active_mask.
# {active_hypos} indicates which {beam_size} hypotheses
# from the list of {2 * beam_size} candidates were
# selected. Shapes: (batch size, beam size)
new_cands_to_ignore, active_hypos = torch.topk(
active_mask, k=beam_size, dim=1, largest=False)

# update cands_to_ignore to ignore any finalized hypos.
cands_to_ignore = new_cands_to_ignore.ge(cand_size)[:, :beam_size]
# Make sure there is at least one active item for each sentence in the batch.
assert (~cands_to_ignore).any(dim=1).all()

# update cands_to_ignore to ignore any finalized hypos

# {active_bbsz_idx} denotes which beam number is continued for each new hypothesis (a beam
# can be selected more than once).
active_bbsz_idx = torch.gather(
cand_bbsz_idx, dim=1, index=active_hypos)
active_scores = torch.gather(
cand_scores, dim=1, index=active_hypos)

active_bbsz_idx = active_bbsz_idx.view(-1)
active_scores = active_scores.view(-1)

# copy tokens and scores for active hypotheses

# Set the tokens for each beam (can select the same row more than once)
tokens[:, :step + 1] = torch.index_select(
tokens[:, :step + 1], dim=0, index=active_bbsz_idx)
# Select the next token for each of them
tokens.view(bsz, beam_size, -1)[:, :, step + 1] = torch.gather(
cand_indices, dim=1, index=active_hypos)
if step > 0:
scores[:, :step] = torch.index_select(
scores[:, :step], dim=0, index=active_bbsz_idx)
scores.view(bsz, beam_size, -1)[:, :, step] = torch.gather(
cand_scores, dim=1, index=active_hypos)

# Update constraints based on which candidates were selected for the next beam
self.search.update_constraints(active_hypos)

# copy attention for active hypotheses
if attn is not None:
attn[:, :, :step + 2] = torch.index_select(
attn[:, :, :step + 2], dim=0, index=active_bbsz_idx)

# reorder incremental state in decoder
reorder_state = active_bbsz_idx

# sort by score descending
for sent in range(len(finalized)):
scores = torch.tensor(
[float(elem['score'].item()) for elem in finalized[sent]])
_, sorted_scores_indices = torch.sort(scores, descending=True)
finalized[sent] = [
finalized[sent][ssi] for ssi in sorted_scores_indices
]
finalized[sent] = torch.jit.annotate(List[Dict[str, Tensor]],
finalized[sent])
return finalized

def _prefix_tokens(self, step: int, lprobs, scores, tokens, prefix_tokens,
beam_size: int):
"""Handle prefix tokens"""
prefix_toks = prefix_tokens[:, step].unsqueeze(-1).repeat(
1, beam_size).view(-1)
prefix_lprobs = lprobs.gather(-1, prefix_toks.unsqueeze(-1))
prefix_mask = prefix_toks.ne(self.pad)
if self.constraint_trie is None:
lprobs[prefix_mask] = torch.min(prefix_lprobs) - 1
else:
lprobs[prefix_mask] = -math.inf
lprobs[prefix_mask] = lprobs[prefix_mask].scatter(
-1, prefix_toks[prefix_mask].unsqueeze(-1),
prefix_lprobs[prefix_mask])
# if prefix includes eos, then we should make sure tokens and
# scores are the same across all beams
eos_mask = prefix_toks.eq(self.eos)
if eos_mask.any():
# validate that the first beam matches the prefix
first_beam = tokens[eos_mask].view(-1, beam_size,
tokens.size(-1))[:, 0,
1:step + 1]
eos_mask_batch_dim = eos_mask.view(-1, beam_size)[:, 0]
target_prefix = prefix_tokens[eos_mask_batch_dim][:, :step]
assert (first_beam == target_prefix).all()

# copy tokens, scores and lprobs from the first beam to all beams
tokens = self.replicate_first_beam(tokens, eos_mask_batch_dim,
beam_size)
scores = self.replicate_first_beam(scores, eos_mask_batch_dim,
beam_size)
lprobs = self.replicate_first_beam(lprobs, eos_mask_batch_dim,
beam_size)
return lprobs, tokens, scores

def replicate_first_beam(self, tensor, mask, beam_size: int):
tensor = tensor.view(-1, beam_size, tensor.size(-1))
tensor[mask] = tensor[mask][:, :1, :]
return tensor.view(-1, tensor.size(-1))

def finalize_hypos(
self,
step: int,
bbsz_idx,
eos_scores,
tokens,
scores,
finalized: List[List[Dict[str, Tensor]]],
finished: List[bool],
beam_size: int,
attn: Optional[Tensor],
src_lengths,
max_len: int,
):
"""Finalize hypothesis, store finalized information in `finalized`, and change `finished` accordingly.
A sentence is finalized when {beam_size} finished items have been collected for it.

Returns number of sentences (not beam items) being finalized.
These will be removed from the batch and not processed further.
Args:
            bbsz_idx (Tensor): flattened (batch * beam) indices of the hypotheses that end in EOS at this step
"""
assert bbsz_idx.numel() == eos_scores.numel()

# clone relevant token and attention tensors.
# tokens is (batch * beam, max_len). So the index_select
# gets the newly EOS rows, then selects cols 1..{step + 2}
tokens_clone = tokens.index_select(
0, bbsz_idx)[:, 1:step + 2] # skip the first index, which is EOS

tokens_clone[:, step] = self.eos
attn_clone = (
attn.index_select(0, bbsz_idx)[:, :, 1:step
+ 2] if attn is not None else None)

# compute scores per token position
pos_scores = scores.index_select(0, bbsz_idx)[:, :step + 1]
pos_scores[:, step] = eos_scores
# convert from cumulative to per-position scores
pos_scores[:, 1:] = pos_scores[:, 1:] - pos_scores[:, :-1]

# normalize sentence-level scores
if self.normalize_scores:
eos_scores /= (step + 1)**self.len_penalty

# cum_unfin records which sentences in the batch are finished.
# It helps match indexing between (a) the original sentences
# in the batch and (b) the current, possibly-reduced set of
# sentences.
cum_unfin: List[int] = []
prev = 0
for f in finished:
if f:
prev += 1
else:
cum_unfin.append(prev)
cum_fin_tensor = torch.tensor(cum_unfin, dtype=torch.int).to(bbsz_idx)

unfin_idx = bbsz_idx // beam_size
sent = unfin_idx + torch.index_select(cum_fin_tensor, 0, unfin_idx)

# Create a set of "{sent}{unfin_idx}", where
# "unfin_idx" is the index in the current (possibly reduced)
# list of sentences, and "sent" is the index in the original,
# unreduced batch
# For every finished beam item
# sentence index in the current (possibly reduced) batch
seen = (sent << 32) + unfin_idx
unique_seen: List[int] = torch.unique(seen).tolist()

if self.match_source_len:
condition = step > torch.index_select(src_lengths, 0, unfin_idx)
eos_scores = torch.where(condition, torch.tensor(-math.inf),
eos_scores)
sent_list: List[int] = sent.tolist()
for i in range(bbsz_idx.size()[0]):
# An input sentence (among those in a batch) is finished when
# beam_size hypotheses have been collected for it
if len(finalized[sent_list[i]]) < beam_size:
if attn_clone is not None:
# remove padding tokens from attn scores
hypo_attn = attn_clone[i]
else:
hypo_attn = torch.empty(0)

finalized[sent_list[i]].append({
'tokens':
tokens_clone[i],
'score':
eos_scores[i],
'attention':
hypo_attn, # src_len x tgt_len
'alignment':
torch.empty(0),
'positional_scores':
pos_scores[i],
})

newly_finished: List[int] = []
for unique_s in unique_seen:
# check termination conditions for this sentence
unique_sent: int = unique_s >> 32
unique_unfin_idx: int = unique_s - (unique_sent << 32)

if not finished[unique_sent] and self.is_finished(
step, unique_unfin_idx, max_len, len(
finalized[unique_sent]), beam_size):
finished[unique_sent] = True
newly_finished.append(unique_unfin_idx)

return newly_finished

def is_finished(
self,
step: int,
unfin_idx: int,
max_len: int,
finalized_sent_len: int,
beam_size: int,
):
"""
Check whether decoding for a sentence is finished, which
occurs when the list of finalized sentences has reached the
beam size, or when we reach the maximum length.
"""
assert finalized_sent_len <= beam_size
if finalized_sent_len == beam_size or step == max_len:
return True
return False


class EnsembleModel(nn.Module):
"""A wrapper around an ensemble of models."""

def __init__(self, models):
super().__init__()
self.models_size = len(models)
# method '__len__' is not supported in ModuleList for torch script
self.single_model = models[0]
self.models = nn.ModuleList(models)

# self.has_incremental: bool = False
# if all(
# hasattr(m, "decoder") and isinstance(m.decoder, FairseqIncrementalDecoder)
# for m in models
# ):
# self.has_incremental = True

self.has_incremental = True

def forward(self):
pass

def has_encoder(self):
return hasattr(self.single_model, 'encoder')

def has_incremental_states(self):
return self.has_incremental

def max_decoder_positions(self):
return min([
m.max_decoder_positions()
for m in self.models if hasattr(m, 'max_decoder_positions')
] + [sys.maxsize]) #

@torch.jit.export
def forward_encoder(self, net_input: Dict[str, Tensor]):
if not self.has_encoder():
return None
encoder_input = {
k: v
for k, v in net_input.items() if k != 'decoder_input_ids'
}
encoder_input['output_hidden_states'] = True
return [
model.encoder.forward(**encoder_input) for model in self.models
]

@torch.jit.export
def forward_decoder(self,
tokens,
encoder_outs: List[Dict[str, List[Tensor]]],
incremental_states: List[Optional[torch.Tensor]],
temperature: float = 1.0,
constraint_trie=None,
constraint_start=None,
constraint_end=None,
gen_code=False,
zero_shot=False,
prefix_tokens=None):
log_probs = []
avg_attn: Optional[Tensor] = None
encoder_out: Optional[Dict[str, List[Tensor]]] = None
code_mask = (tokens.new_ones(tokens.size(0)) * gen_code).bool()

for i, model in enumerate(self.models):
if self.has_encoder():
encoder_out = encoder_outs[i]
encoder_hidden_states = encoder_out.last_hidden_state
encoder_attention_mask = _expand_mask(
encoder_out.padding_mask, encoder_hidden_states.dtype,
tokens.shape[-1])
src_pos_embed = encoder_out.position_embedding

# if tokens.eq(self.single_model.config.pad_token_id).any():
attention_mask = tokens.eq(self.single_model.padding_idx)

# decode each model
if self.has_incremental_states():
                decoder_out = model.decoder.forward(  # TODO: decoder inputs differ across model variants
input_ids=tokens,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
code_masks=code_mask,
src_pos_embed=src_pos_embed,
past_key_values=incremental_states[i],
use_cache=True,
output_attentions=True)
else:
if hasattr(model, 'decoder'):
# decoder_out = model.decoder.forward(tokens, code_masks=code_mask, encoder_out=encoder_out)
                    decoder_out = model.decoder.forward(  # TODO: decoder inputs differ across model variants
input_ids=tokens,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
code_masks=code_mask,
src_pos_embed=src_pos_embed)
else:
decoder_out = model.forward(tokens)
# print('#### decoder_out ####', decoder_out)
# print('#### decoder_out ####', decoder_out.keys())
# for k,v in decoder_out.items():
# print(k)
# if isinstance(v, Tensor):
# print(v.shape)
# elif k == "past_key_values":
# print(len(v))
# print([v[0][i].shape for i in range(len(v[0]))])
# else:
# print(len(v))
# print([v[i].shape for i in range(len(v))])

attn: Optional[Tensor] = None
decoder_len = len(decoder_out)
# if decoder_len > 1 and decoder_out[1] is not None:
# if isinstance(decoder_out[1], Tensor):
# attn = decoder_out[1]
# else:
# attn_holder = decoder_out[1]["attn"]
# if isinstance(attn_holder, Tensor):
# attn = attn_holder
# elif attn_holder is not None:
# attn = attn_holder[0]
# if attn is not None:
# attn = attn[:, -1, :]

if 'cross_attentions' in decoder_out:
attn = decoder_out['cross_attentions'][-1].transpose(1, 0)
attn = attn.mean(dim=0) # (B, tgt_len, src_len)
if attn is not None:
attn = attn[:, -1, :]

# decoder_out_tuple = (
# decoder_out[0][:, -1:, :].div_(temperature),
# None if decoder_len <= 1 else decoder_out[1],
# )

decoder_out_tuple = (
decoder_out[0][:, -1:, :].div_(temperature),
None if decoder_len <= 1 else attn,
)

beam_size = decoder_out_tuple[0].size(0) // prefix_tokens.size(
0) if prefix_tokens is not None else 0
if constraint_trie is not None and not zero_shot:
assert constraint_start is None and constraint_end is None
constraint_masks = decoder_out_tuple[0].new_zeros(
decoder_out_tuple[0].size()).bool()
constraint_prefix_tokens = tokens.tolist()
for token_index, constraint_prefix_token in enumerate(
constraint_prefix_tokens):
prefix_len = prefix_tokens[token_index // beam_size].ne(
1).sum().item() if prefix_tokens is not None else 0
if len(constraint_prefix_token) > prefix_len:
constraint_prefix_token = [
0
] + constraint_prefix_token[prefix_len + 1:]
constraint_nodes = constraint_trie.get_next_layer(
constraint_prefix_token)
constraint_masks[token_index][:,
constraint_nodes] = True
else:
constraint_masks[token_index] = True
decoder_out_tuple[0].masked_fill_(~constraint_masks, -math.inf)
if constraint_start is not None and constraint_end is not None and not zero_shot:
assert constraint_trie is None
decoder_out_tuple[0][:, :, 4:constraint_start] = -math.inf
decoder_out_tuple[0][:, :, constraint_end:] = -math.inf

probs = model.get_normalized_probs(
decoder_out_tuple, log_probs=True, sample=None)
if constraint_trie is not None and zero_shot:
assert constraint_start is None and constraint_end is None
constraint_masks = decoder_out_tuple[0].new_zeros(
decoder_out_tuple[0].size()).bool()
constraint_prefix_tokens = tokens.tolist()
for token_index, constraint_prefix_token in enumerate(
constraint_prefix_tokens):
constraint_nodes = constraint_trie.get_next_layer(
constraint_prefix_token)
constraint_masks[token_index][:, constraint_nodes] = True
probs.masked_fill_(~constraint_masks, -math.inf)
if constraint_start is not None and constraint_end is not None and zero_shot:
assert constraint_trie is None
probs[:, :, 4:constraint_start] = -math.inf
probs[:, :, constraint_end:] = -math.inf
probs = probs[:, -1, :]
if self.models_size == 1:
return probs, attn

log_probs.append(probs)
if attn is not None:
if avg_attn is None:
avg_attn = attn
else:
avg_attn.add_(attn)

avg_probs = torch.logsumexp(
torch.stack(log_probs, dim=0), dim=0) - math.log(self.models_size)

if avg_attn is not None:
avg_attn.div_(self.models_size)
return avg_probs, avg_attn

@torch.jit.export
def reorder_encoder_out(self,
encoder_outs: Optional[List[Dict[str,
List[Tensor]]]],
new_order):
"""
Reorder encoder output according to *new_order*.

Args:
encoder_out: output from the ``forward()`` method
new_order (LongTensor): desired order

Returns:
*encoder_out* rearranged according to *new_order*
"""
new_outs: List[Dict[str, List[Tensor]]] = []
if not self.has_encoder():
return new_outs
for i, model in enumerate(self.models):
assert encoder_outs is not None
new_outs.append(
model.encoder.reorder_encoder_out(encoder_outs[i], new_order))
return new_outs

@torch.jit.export
def reorder_incremental_state(
self,
incremental_states: List[Optional[torch.Tensor]],
new_order,
):
if not self.has_incremental_states():
return
for i, model in enumerate(self.models):
model.decoder.reorder_incremental_state_scripting( # todo
incremental_states[i], new_order)
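

# A minimal usage sketch of SequenceGenerator (illustration only). `ofa_model`,
# `ofa_tokenizer` and `input_ids` are assumed to be an already-loaded OFAModel,
# its OFATokenizer and a batch of encoder token ids; real tasks typically put
# additional keys (e.g. image inputs) into 'net_input' to match the encoder's
# forward signature.
def _demo_generate(ofa_model, ofa_tokenizer, input_ids):
    generator = SequenceGenerator(
        tokenizer=ofa_tokenizer,
        beam_size=5,
        max_len_b=16,
        no_repeat_ngram_size=3)
    sample = {'net_input': {'input_ids': input_ids}}
    hypos = generator.generate([ofa_model], sample)
    # hypos[i][0]['tokens'] holds the highest-scoring hypothesis for sentence i
    return [h[0]['tokens'] for h in hypos]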

+ 512
- 0
modelscope/models/multi_modal/ofa/generate/token_generation_constraints.py View File

@@ -0,0 +1,512 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license which can be found at
# https://github.com/facebookresearch/fairseq/blob/main/LICENSE
"""Implements tracking of constraints for a beam item.

A list of constraints is given as a list of one or more token
sequences, each of length at least one token. For example, for an input sentence

> Die maschinelle Übersetzung ist schwer zu kontrollieren.

We could have the constraints:
* to influence
* hard

There are two implementations:
* OrderedConstraintState: Tracks progress through an ordered list of multitoken constraints.
* UnorderedConstraintState: Tracks progress through an unordered list of multitoken constraints.

The difference is that in the first, the constraints are assumed to be
in order; the algorithm will permit zero or more tokens between them.
In the second, the constraints are not ordered, so many orderings will
be explored.

The same sequence can be present any number of times, and will appear
that many times in the output.
"""

from collections import Counter
from typing import List, Set

import torch


class ConstraintState:

def __init__(self):
pass


def pack_constraints(
batch_constraints: List[List[torch.Tensor]]) -> torch.Tensor:
"""Takes a list of list of constraints in tensor form (a list of
tensor constraints for each sentence) and transforms it into a
packed Tensor. For example, here is a batch of size 3 with 3, 0,
and 1 constraints:

[ [ [3 1 2], [3], [4 5 6 7], ]
[],
[ [1 8 9 10 1 4 11 12], ]
]

Its corresponding packed structure is:

[ [ 3 3 1 2 0 3 0 4 5 6 7 0],
[ 0 0 0 0 0 0 0 0 0 0 0 0],
[ 1 1 8 9 10 1 4 11 12 0 0 0] ]

The packed tensor has shape (batch size, maxlen), where
maxlen is defined below. Each row contains concatenated
constraint tokens for that sentence, with 0 appended after
each constraint. The first item in each row is the number
of constraints for that sentence. So maxlen is the maximum
of

(number of constraints) + (sum length of constraints) + 1.

across all sentences in the batch.
"""
# The maximum word length of concatenated constraints for any sentence
max_constraints_len = 1
for sentence_constraints in batch_constraints:
if len(sentence_constraints):
            # number of constraints, plus sum of constraint lengths, plus a zero after each
constraints_len = (1
+ sum([c.size(0) for c in sentence_constraints])
+ len(sentence_constraints))
max_constraints_len = max(max_constraints_len, constraints_len)

batch_size = len(batch_constraints)
constraints_tensor = torch.zeros((batch_size, max_constraints_len)).long()
for i, sentence_constraints in enumerate(batch_constraints):
constraints_tensor[i, 0] = len(sentence_constraints)
offset = 1
for j, constraint in enumerate(sentence_constraints):
this_len = constraint.size(0)
constraints_tensor[i, offset:offset + this_len] = constraint
offset += this_len + 1

return constraints_tensor.long()


def unpack_constraints(constraint_tensor: torch.Tensor) -> List[torch.Tensor]:
"""
Transforms *one row* of a packed constraint tensor (e.g., for one
sentence in the batch) into a list of constraint tensors.
"""
constraint_list = []
num_constraints = constraint_tensor[0]
constraints = constraint_tensor.tolist()
offset = 1
for i in range(num_constraints):
where = constraints.index(0, offset)
constraint_list.append(constraint_tensor[offset:where])
offset = where + 1

return constraint_list
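

# A round-trip sketch of the packed layout described above (illustration only):
# each row starts with the constraint count, then each constraint followed by a
# 0 separator, padded with zeros to the batch-wide maximum length.
def _demo_pack_unpack():
    batch = [
        [torch.tensor([3, 1, 2]), torch.tensor([4])],  # two constraints
        [],                                            # no constraints
    ]
    packed = pack_constraints(batch)
    assert packed[0].tolist() == [2, 3, 1, 2, 0, 4, 0]
    assert [c.tolist() for c in unpack_constraints(packed[0])] == [[3, 1, 2], [4]]
    return packed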


class ConstraintNode:
"""
Represents a node in a trie managing unordered constraints.
"""

def __init__(self, token: int = None, parent=None):
        # The token associated with this node (None for the root)
self.token = int(token) if token is not None else None
# The parent (None at the root)
self.parent = parent
# Whether this node is a completed constraint
self.terminal = 0
# List of child nodes
self.children = {}

# The cumulative number of constraints from this point in the
# trie forward
self.num_constraints = 0

@property
def id(self):
return self.token

def __str__(self):
term = self.terminal != 0
return f'[{self.token}].{term}#{self.num_constraints}'

def __getitem__(self, key: int):
return self.children.get(key, None)

def next_tokens(self) -> Set[int]:
"""The set of child labels."""
return set(self.children.keys())

@staticmethod
def create(constraints: List[List[int]]):
root = ConstraintNode()
for sequence in constraints:
root.add_sequence(sequence)

return root

@staticmethod
def print_graph(node: 'ConstraintNode'):
if len(node.children) == 0:
return str(node)
else:
s = f'({node}'
for child in node.children.values():
s += ' ' + ConstraintNode.print_graph(child)
s += ')'
return s

def token_counts(self) -> Counter:
"""Returns a counter of the number of times each token is used
in a constraint.
"""
token_counts = Counter()
kids = list(self.children.values())
while len(kids) > 0:
kid = kids.pop()
token_counts[kid.id] += kid.num_constraints
kids += list(kid.children.values())

return token_counts

def tokens(self) -> Set[int]:
"""Returns the set of tokens in constraints."""
return set(self.token_counts().keys())

def add_sequence(self, sequence: List[int]):
"""Adds a constraint, represented as a list of integers, to
the trie."""
assert len(sequence) > 0

token = int(sequence[0])
if token not in self.children:
self.children[token] = ConstraintNode(token, parent=self)

node = self.children[token]
if len(sequence) == 1:
node.terminal += 1
node.num_constraints += 1
parent = node.parent
while parent is not None:
parent.num_constraints += 1
parent = parent.parent
else:
node.add_sequence(sequence[1:])


class UnorderedConstraintState(ConstraintState):
"""
Records progress through the set of constraints for each item in the beam
using a trie.
"""

def __init__(self,
node: ConstraintNode,
copy_from: 'ConstraintState' = None):
self.node = node

if copy_from is None:
# The root node
self.root = node
# The set of states in the graph that have been completed
self.completed = Counter()
            # Counter of trie nodes that have been generated (visited) so far
self.generated = Counter()
# The list of tokens we need to generate
self.needed_tokens = self.root.tokens()
else:
self.completed = Counter(copy_from.completed)
self.generated = Counter(copy_from.generated)
self.root = copy_from.root

# Mark the node as generated
if self.node != self.root:
self.generated[node] += 1

@staticmethod
def create(constraint_tensor: torch.Tensor):
constraint_list = unpack_constraints(constraint_tensor)
constraint_trie_root = ConstraintNode.create(constraint_list)
return UnorderedConstraintState(constraint_trie_root)

def __str__(self):
gen_str = ','.join([str(node) for node in self.generated])
return f'{self.name}/{self.bank}({gen_str})x{self.num_completed}'

def __copy__(self):
copied_state = UnorderedConstraintState(self.node, copy_from=self)
return copied_state

def copy(self):
return self.__copy__()

@property
def name(self):
if self.node.id is None:
return 'ROOT'
else:
return str(self.node.id)

@property
def is_root(self):
return self.node == self.root

@property
def bank(self):
return sum(self.generated.values())

@property
def num_completed(self):
"""The number of constraints (not constraint tokens) that are completed.
In addition to the already-completed states, we need to account for the
current state, which might get marked as completed when another token
is generated.
"""
in_final = self.node.terminal and self.completed[
self.node] < self.node.terminal
return sum(self.completed.values()) + in_final

@property
def finished(self):
return self.root.num_constraints - self.num_completed == 0

@property
def token_counts(self):
return self.root.token_counts()

@property
def tokens(self):
return self.root.tokens()

@property
def num_constraint_tokens(self):
return sum(self.token_counts.values())

def next_tokens(self) -> Set[int]:
"""Returns the list of tokens that could come next.
These are (a) all tokens extending the root state and, for
non-root states, additionally all tokens extending the current
state."""

if self.node != self.root:
return self.root.next_tokens().union(self.node.next_tokens())
else:
return self.root.next_tokens()

def advance(self, token: int):
"""Reads in a token and advances the state. Here's how it works.

We can advance to the next state if:
- there is a matching child
- its path isn't blocked

A path is blocked when all constraints that are descendants of
that node have already been generated, in the current state.

If we are not able to advance from the current state, we "fall
off the graph" and return to the root state. There, we again
try to advance, checking the same criteria.

In any case, when falling off the graph, we need to do some
bookkeeping. We:
- check whether any constraints were met (all prefixes of
current state)
- if one is found, mark it as completed
- adjust visited nodes accordingly
"""
token = int(token)

next_state = None
child = self.node[token]
if child is not None and self.generated[child] < child.num_constraints:
next_state = UnorderedConstraintState(child, copy_from=self)

def rewind():
"""If we're mid-trie and an "illegal" token is chosen next, we need
to reset our state to the root state. However, along the way, we need
to check whether a prefix of the current trie state represents a state
we could mark as completed.
"""
node = self.node
while node != self.root:
if node.terminal and self.completed[node] < node.terminal:
next_state.completed[node] += 1
return

next_state.generated[node] -= 1
node = node.parent

# Fall off the graph, check the root
if next_state is None and token in self.root.next_tokens():
child = self.root[token]
# We can only traverse this edge if it's not saturated
if self.generated[child] < child.num_constraints:
next_state = UnorderedConstraintState(child, copy_from=self)
else:
next_state = UnorderedConstraintState(
self.root, copy_from=self)

# Rewind
rewind()

elif next_state is None:
next_state = UnorderedConstraintState(self.root, copy_from=self)
# Rewind
rewind()

return next_state


class ConstraintSequence:

def __init__(self, sequences: List[List[int]]):
"""Represents a set of possibly multitoken constraints by
concatenating them and internally recording the end points.
"""
self.sequences = []
self.endpoints = []
self.num_tokens = 0
self.tokens = set()
for sequence in sequences:
for token in sequence:
self.tokens.add(token)
self.num_tokens += len(sequence)
self.endpoints += [False
for x in range(len(sequence) - 1)] + [True]
self.sequences += sequence

def __getitem__(self, key: int):
return self.sequences[key]

def __len__(self):
return len(self.sequences)

def __str__(self):
return str(self.sequences)


class OrderedConstraintState(ConstraintState):
"""
Records progress through the set of linear nonbranching constraints with gaps.
"""

def __init__(self, sequence: ConstraintSequence, state: int = -1):
self.sequence = sequence
self.state = state

@staticmethod
def create(constraint_tensor: torch.Tensor):
constraint_list = unpack_constraints(constraint_tensor)
return OrderedConstraintState(ConstraintSequence(constraint_list), -1)

def __str__(self):
return f'{self.state}/{self.bank}x{self.num_completed}'

def __copy__(self):
return OrderedConstraintState(self.sequence, self.state)

def copy(self):
return self.__copy__()

@property
def num_completed(self):
if self.state == -1:
return 0
count = len(
list(
filter(lambda x: x,
self.sequence.endpoints[0:self.state + 1])))
return count

@property
def is_root(self):
return self.state == -1

@property
def name(self):
if self.state == -1:
return 'ROOT'
else:
return str(self.sequence[self.state])

@property
def bank(self) -> int:
return self.state + 1

@property
def finished(self):
return self.state + 1 == len(self.sequence)

@property
def token_counts(self):
return self.sequence.token_counts()

@property
def tokens(self):
return self.sequence.tokens

@property
def num_constraint_tokens(self):
return sum(self.token_counts.values())

def next_tokens(self) -> Set[int]:
"""Returns the list of tokens that could come next.
These are (a) all tokens extending the root state and, for
non-root states, additionally all tokens extending the current
state."""

tokens = set()
if self.state > 0:
tokens.add(self.sequence[0])
if not self.finished:
tokens.add(self.sequence[self.state + 1])
return tokens

def advance(self, token: int):
"""Reads in a token and advances the state. Here's how it works.

We can advance to the next state if:
- there is a matching child
- its path isn't blocked

A path is blocked when all constraints that are descendants of
that node have already been generated, in the current state.

If we are not able to advance from the current state, we "fall
off the graph" and return to the root state. There, we again
try to advance, checking the same criteria.

In any case, when falling off the graph, we need to do some
bookkeeping. We:
- check whether any constraints were met (all prefixes of
current state)
- if one is found, mark it as completed
- adjust visited nodes accordingly
"""
token = int(token)
# print(f"{self} ADVANCE({token}) {self.sequence} -> ", end="")

if self.finished:
# Accept anything
next_state = self.copy()

elif self.sequence[self.state + 1] == token:
# Advance to the next token
next_state = OrderedConstraintState(self.sequence, self.state + 1)

elif self.sequence.endpoints[self.state]:
# Accept anything between constraints (*)
next_state = self.copy()

elif token == self.sequence[0]:
# Start over having generated the first token
next_state = OrderedConstraintState(self.sequence, 0)
else:
# Start over from the root
next_state = OrderedConstraintState(self.sequence, -1)

return next_state
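

# A short walk-through of OrderedConstraintState.advance (illustration only),
# using two constraints for one sentence: the phrase [3, 1, 2] and the single
# token [4]. Unconstrained tokens are accepted in the gaps between completed
# constraints, while an off-constraint token in the middle of a phrase resets
# progress to the root.
def _demo_ordered_constraints():
    seq = ConstraintSequence([[3, 1, 2], [4]])
    state = OrderedConstraintState(seq, -1)
    for token in (3, 1, 2):                 # complete the first constraint
        state = state.advance(token)
    assert state.num_completed == 1 and not state.finished
    state = state.advance(7)                # a free token in the gap is fine
    state = state.advance(4)                # complete the second constraint
    assert state.num_completed == 2 and state.finished
    return state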

+ 124
- 0
modelscope/models/multi_modal/ofa/generate/utils.py View File

@@ -0,0 +1,124 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license which can be found at
# https://github.com/facebookresearch/fairseq/blob/main/LICENSE

import collections
from collections import abc
from itertools import accumulate

import torch
import torch.nn.functional as F

try:
from amp_C import multi_tensor_l2norm

multi_tensor_l2norm_available = True
except ImportError:
multi_tensor_l2norm_available = False

try:
import torch_xla.core.xla_model as xm
except ImportError:
xm = None

MANIFOLD_PATH_SEP = '|'


def apply_to_sample(f, sample):
if hasattr(sample, '__len__') and len(sample) == 0:
return {}

def _apply(x):
if torch.is_tensor(x):
return f(x)
elif isinstance(x, collections.OrderedDict):
            # OrderedDict has attributes that need to be preserved
od = collections.OrderedDict(
(key, _apply(value)) for key, value in x.items())
od.__dict__ = x.__dict__
return od
elif isinstance(x, dict):
return {key: _apply(value) for key, value in x.items()}
elif isinstance(x, list):
return [_apply(x) for x in x]
elif isinstance(x, tuple):
return tuple(_apply(x) for x in x)
elif isinstance(x, set):
return {_apply(x) for x in x}
else:
return x

return _apply(sample)


def move_to_device(batch, device):
r"""Puts each data field to the device"""
if isinstance(batch, torch.Tensor):
return batch.to(device)
elif isinstance(batch, (list, tuple)):
return tuple(move_to_device(item, device) for item in batch)
elif isinstance(batch, abc.Mapping):
return {
key: move_to_device(value, device)
for key, value in batch.items()
}
else:
return batch
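

# A small sketch of how these helpers are used on a nested batch (illustration
# only): apply_to_sample applies an arbitrary tensor function recursively,
# while move_to_device only relocates the tensors it finds.
def _demo_nested_batch(device='cpu'):
    batch = {
        'net_input': {'input_ids': torch.ones(2, 4, dtype=torch.long)},
        'id': torch.arange(2),
    }
    on_device = move_to_device(batch, device)
    halved = apply_to_sample(lambda t: t.float() / 2, batch)
    return on_device, halved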


def strip_pad(tensor, pad):
return tensor[tensor.ne(pad)]


def get_token_to_word_mapping(tokens, exclude_list):
n = len(tokens)
word_start = [int(token not in exclude_list) for token in tokens]
word_idx = list(accumulate(word_start))
token_to_word = {i: word_idx[i] for i in range(n)}
return token_to_word


def extract_hard_alignment(attn, src_sent, tgt_sent, pad, eos):
tgt_valid = (((tgt_sent != pad) & # noqa
(tgt_sent != eos)).nonzero(as_tuple=False).squeeze(dim=-1))
src_invalid = (((src_sent == pad) | # noqa
(src_sent == eos)).nonzero(as_tuple=False).squeeze(dim=-1))
src_token_to_word = get_token_to_word_mapping(src_sent, [eos, pad])
tgt_token_to_word = get_token_to_word_mapping(tgt_sent, [eos, pad])
alignment = []
if len(tgt_valid) != 0 and len(src_invalid) < len(src_sent):
attn_valid = attn[tgt_valid]
attn_valid[:, src_invalid] = float('-inf')
_, src_indices = attn_valid.max(dim=1)
for tgt_idx, src_idx in zip(tgt_valid, src_indices):
alignment.append((
src_token_to_word[src_idx.item()] - 1,
tgt_token_to_word[tgt_idx.item()] - 1,
))
return alignment


def softmax(x, dim: int, onnx_trace: bool = False):
if onnx_trace:
return F.softmax(x.float(), dim=dim)
else:
return F.softmax(x, dim=dim, dtype=torch.float32)


def log_softmax(x, dim: int, onnx_trace: bool = False):
if onnx_trace:
return F.log_softmax(x.float(), dim=dim)
else:
return F.log_softmax(x, dim=dim, dtype=torch.float32)


def extract_soft_alignment(attn, src_sent, tgt_sent, pad, eos):
tgt_valid = (tgt_sent != pad).nonzero(as_tuple=False)
src_valid = (src_sent != pad).nonzero(as_tuple=False).squeeze(dim=-1)
alignment = []
if len(tgt_valid) != 0 and len(src_valid) != 0:
attn_valid = attn[tgt_valid, src_valid]
alignment = [['{:.6f}'.format(p) for p in src_probs.tolist()]
for src_probs in attn_valid]
return alignment

+ 2192
- 0
modelscope/models/multi_modal/ofa/modeling_ofa.py
File diff suppressed because it is too large
View File


+ 283
- 0
modelscope/models/multi_modal/ofa/resnet.py View File

@@ -0,0 +1,283 @@
import torch
import torch.nn as nn


def drop_path(x, drop_prob: float = 0., training: bool = False):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    This is the same as the DropConnect impl I created for EfficientNet, etc. networks, however,
    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
    'survival rate' as the argument.
"""
if drop_prob == 0. or not training:
return x
keep_prob = 1 - drop_prob
shape = (x.shape[0], ) + (1, ) * (
x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
random_tensor = keep_prob + torch.rand(
shape, dtype=x.dtype, device=x.device)
random_tensor.floor_() # binarize
output = x.div(keep_prob) * random_tensor
return output


class DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""

def __init__(self, drop_prob=None):
super(DropPath, self).__init__()
self.drop_prob = drop_prob

def forward(self, x):
return drop_path(x, self.drop_prob, self.training)
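

# A quick sketch of the DropPath behaviour described above (illustration only):
# in training, each sample's residual branch is zeroed with probability
# drop_prob and the survivors are rescaled by 1 / keep_prob so the expectation
# is unchanged; in eval mode the module is an identity.
def _demo_drop_path():
    m = DropPath(drop_prob=0.2)
    x = torch.ones(8, 16)
    m.train()
    y_train = m(x)   # each row is either all zeros or all 1 / 0.8 == 1.25
    m.eval()
    y_eval = m(x)    # identical to x
    return y_train, y_eval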


def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
"""3x3 convolution with padding"""
return nn.Conv2d(
in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=dilation,
groups=groups,
bias=False,
dilation=dilation)


def conv1x1(in_planes, out_planes, stride=1):
"""1x1 convolution"""
return nn.Conv2d(
in_planes, out_planes, kernel_size=1, stride=stride, bias=False)


class BasicBlock(nn.Module):
expansion = 1

def __init__(self,
inplanes,
planes,
stride=1,
downsample=None,
groups=1,
base_width=64,
dilation=1,
norm_layer=None):
super(BasicBlock, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
if groups != 1 or base_width != 64:
raise ValueError(
'BasicBlock only supports groups=1 and base_width=64')
if dilation > 1:
raise NotImplementedError(
'Dilation > 1 not supported in BasicBlock')
# Both self.conv1 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = norm_layer(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = norm_layer(planes)
self.downsample = downsample
self.stride = stride

def forward(self, x):
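        # BasicBlock does not appear to be used in this backbone (only
        # Bottleneck blocks are instantiated in ResNet._make_layer below);
        # the assert guards against accidental use.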
assert False
identity = x

out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)

out = self.conv2(out)
out = self.bn2(out)

if self.downsample is not None:
identity = self.downsample(x)

out += identity
out = self.relu(out)

return out


class Bottleneck(nn.Module):
# Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
# while original implementation places the stride at the first 1x1 convolution(self.conv1)
# according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
# This variant is also known as ResNet V1.5 and improves accuracy according to
# https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.

expansion = 4

def __init__(self,
inplanes,
planes,
stride=1,
downsample=None,
groups=1,
base_width=64,
dilation=1,
norm_layer=None,
drop_path_rate=0.0):
super(Bottleneck, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
width = int(planes * (base_width / 64.)) * groups
# Both self.conv2 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv1x1(inplanes, width)
self.bn1 = norm_layer(width)
self.conv2 = conv3x3(width, width, stride, groups, dilation)
self.bn2 = norm_layer(width)
self.conv3 = conv1x1(width, planes * self.expansion)
self.bn3 = norm_layer(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
self.drop_path = DropPath(
drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()

def forward(self, x):
identity = x

out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)

out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)

out = self.conv3(out)
out = self.bn3(out)

if self.downsample is not None:
identity = self.downsample(x)

out = identity + self.drop_path(out)
out = self.relu(out)

return out


class ResNet(nn.Module):

def __init__(self,
layers,
zero_init_residual=False,
groups=1,
width_per_group=64,
replace_stride_with_dilation=None,
norm_layer=None,
drop_path_rate=0.0):
super(ResNet, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
self._norm_layer = norm_layer

self.inplanes = 64
self.dilation = 1
if replace_stride_with_dilation is None:
# each element in the tuple indicates if we should replace
# the 2x2 stride with a dilated convolution instead
replace_stride_with_dilation = [False, False, False]
if len(replace_stride_with_dilation) != 3:
raise ValueError('replace_stride_with_dilation should be None '
'or a 3-element tuple, got {}'.format(
replace_stride_with_dilation))
self.groups = groups
self.base_width = width_per_group
self.conv1 = nn.Conv2d(
3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
self.bn1 = norm_layer(self.inplanes)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(
Bottleneck, 64, layers[0], drop_path_rate=drop_path_rate)
self.layer2 = self._make_layer(
Bottleneck,
128,
layers[1],
stride=2,
dilate=replace_stride_with_dilation[0],
drop_path_rate=drop_path_rate)
self.layer3 = self._make_layer(
Bottleneck,
256,
layers[2],
stride=2,
dilate=replace_stride_with_dilation[1],
drop_path_rate=drop_path_rate)

for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(
m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m,
(nn.SyncBatchNorm, nn.BatchNorm2d, nn.GroupNorm)):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)

# Zero-initialize the last BN in each residual branch,
# so that the residual branch starts with zeros, and each residual block behaves like an identity.
# This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
if zero_init_residual:
for m in self.modules():
if isinstance(m, Bottleneck):
nn.init.constant_(m.bn3.weight, 0)
elif isinstance(m, BasicBlock):
nn.init.constant_(m.bn2.weight, 0)

def _make_layer(self,
block,
planes,
blocks,
stride=1,
dilate=False,
drop_path_rate=0.0):
norm_layer = self._norm_layer
downsample = None
previous_dilation = self.dilation
if dilate:
self.dilation *= stride
stride = 1
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
conv1x1(self.inplanes, planes * block.expansion, stride),
norm_layer(planes * block.expansion),
)

layers = []
layers.append(
block(self.inplanes, planes, stride, downsample, self.groups,
self.base_width, previous_dilation, norm_layer))
self.inplanes = planes * block.expansion

dpr = [x.item() for x in torch.linspace(0, drop_path_rate, blocks)]
for i in range(1, blocks):
layers.append(
block(
self.inplanes,
planes,
groups=self.groups,
base_width=self.base_width,
dilation=self.dilation,
norm_layer=norm_layer,
drop_path_rate=dpr[i]))

return nn.Sequential(*layers)

def _forward_impl(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
return x

def forward(self, x):
return self._forward_impl(x)
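

# A shape sketch for the truncated backbone above (illustration only): only
# layer1-layer3 are built (no layer4, no classification head), so a 224x224
# image yields a 1024-channel map at 1/16 resolution, which the OFA encoder
# can then flatten into image patch features.
def _demo_backbone():
    backbone = ResNet(layers=[3, 4, 23])   # ResNet-101-style stage depths
    feats = backbone(torch.randn(1, 3, 224, 224))
    assert feats.shape == (1, 1024, 14, 14)
    return feats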

+ 48
- 0
modelscope/models/multi_modal/ofa/tokenization_ofa.py View File

@@ -0,0 +1,48 @@
# Copyright 2022 OFA-Sys Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OFA."""
from transformers.models.bart.tokenization_bart import BartTokenizer
from transformers.utils import logging

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {'vocab_file': 'vocab.json', 'merges_file': 'merges.txt'}

PRETRAINED_VOCAB_FILES_MAP = {
'vocab_file': {
'ofa-base': 'https://huggingface.co/ofa-base/resolve/main/vocab.json',
},
'merges_file': {
'ofa-base': 'https://huggingface.co/ofa-base/resolve/main/merges.txt',
},
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'ofa-base': 1024,
}


class OFATokenizer(BartTokenizer):
"""
Construct an OFA tokenizer.

[`~OFATokenizer`] is identical to [`BartTokenizer`] and runs end-to-end tokenization: punctuation splitting and
byte-level BPE.

Refer to superclass [`BartTokenizer`] for usage examples and documentation concerning parameters.
"""

vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
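
A minimal usage sketch of this tokenizer, mirroring how the captioning model and preprocessor elsewhere in this change consume it; the checkpoint directory is a placeholder, and the <code_*>/<bin_*> additions follow ofa_for_image_captioning_model.py below.

from modelscope.models.multi_modal.ofa import OFATokenizer

tokenizer = OFATokenizer.from_pretrained('/path/to/ofa_checkpoint')  # placeholder path
tokenizer.add_tokens(['<code_{}>'.format(i) for i in range(8192)])
tokenizer.add_tokens(['<bin_{}>'.format(i) for i in range(1000)])

# the leading space is intentional: byte-level BPE encodes ' what' differently from 'what'
input_ids = tokenizer([' what does the image describe?'],
                      max_length=1024, return_tensors='pt')['input_ids']
print(input_ids.shape)  # (1, prompt_length)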

+ 59
- 0
modelscope/models/multi_modal/ofa/tokenization_ofa_fast.py View File

@@ -0,0 +1,59 @@
# Copyright 2022 OFA-Sys Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OFA."""
from transformers.models.bart.tokenization_bart_fast import BartTokenizerFast
from transformers.utils import logging

from .tokenization_ofa import OFATokenizer

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
'vocab_file': 'vocab.json',
'merges_file': 'merges.txt',
'tokenizer_file': 'tokenizer.json'
}

PRETRAINED_VOCAB_FILES_MAP = {
'vocab_file': {
'ofa-base': 'https://huggingface.co/ofa-base/resolve/main/vocab.json',
},
'merges_file': {
'ofa-base': 'https://huggingface.co/ofa-base/resolve/main/merges.txt',
},
'tokenizer_file': {
'ofa-base':
'https://huggingface.co/ofa-base/resolve/main/tokenizer.json',
},
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'ofa-base': 1024,
}


class OFATokenizerFast(BartTokenizerFast):
r"""
Construct a "fast" OFA tokenizer (backed by HuggingFace's *tokenizers* library).

[`~OFATokenizerFast`] is identical to [`BartTokenizerFast`] and runs end-to-end tokenization: punctuation splitting
and byte-level BPE.

Refer to superclass [`BartTokenizerFast`] for usage examples and documentation concerning parameters.
"""

vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = OFATokenizer
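
Because slow_tokenizer_class points back to OFATokenizer, the two tokenizers should produce identical ids. A quick consistency check, again with a placeholder checkpoint path (the fast variant needs a tokenizer.json or an available slow-to-fast conversion):

from modelscope.models.multi_modal.ofa import OFATokenizer
from modelscope.models.multi_modal.ofa.tokenization_ofa_fast import OFATokenizerFast

slow = OFATokenizer.from_pretrained('/path/to/ofa_checkpoint')      # placeholder path
fast = OFATokenizerFast.from_pretrained('/path/to/ofa_checkpoint')  # placeholder path
prompt = ' what does the image describe?'
assert slow(prompt)['input_ids'] == fast(prompt)['input_ids']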

+ 53
- 0
modelscope/models/multi_modal/ofa_for_image_captioning_model.py View File

@@ -0,0 +1,53 @@
from typing import Any, Dict

import torch.cuda

from modelscope.metainfo import Models
from modelscope.outputs import OutputKeys
from modelscope.utils.constant import Tasks
from ..base import Model
from ..builder import MODELS
from .ofa import OFAModel, OFATokenizer
from .ofa.generate import sequence_generator as sg
from .ofa.generate.utils import move_to_device

__all__ = ['OfaForImageCaptioning']


@MODELS.register_module(Tasks.image_captioning, module_name=Models.ofa)
class OfaForImageCaptioning(Model):

def __init__(self, model_dir, *args, **kwargs):
super().__init__(model_dir=model_dir, *args, **kwargs)
model = OFAModel.from_pretrained(model_dir)

self.model = model.module if hasattr(model, 'module') else model
self.tokenizer = OFATokenizer.from_pretrained(model_dir)
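# OFA's vocabulary carries 8192 image-codebook tokens (<code_i>) and 1000 coordinate-bin tokens (<bin_i>);
# adding them keeps the tokenizer's vocab size aligned with the checkpoint's embedding table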
self.tokenizer.add_tokens(['<code_{}>'.format(i) for i in range(8192)])
self.tokenizer.add_tokens(['<bin_{}>'.format(i) for i in range(1000)])
self._device = torch.device('cuda') if torch.cuda.is_available() \
else torch.device('cpu')
self.model.to(self._device)
# Initialize generator
sg_args = {
'tokenizer': self.tokenizer,
'beam_size': 5,
'max_len_b': 16,
'min_len': 1,
'no_repeat_ngram_size': 3,
'constraint_range': None
}
if 'beam_search' in kwargs:  # kwargs is a dict, so membership test rather than hasattr
sg_args.update(kwargs['beam_search'])
self.generator = sg.SequenceGenerator(**sg_args)

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
input = move_to_device(input, self._device)
gen_output = self.generator.generate([self.model], input)
gen = [gen_output[i][0]['tokens'] for i in range(len(gen_output))]
result = self.tokenizer.batch_decode(gen, skip_special_tokens=True)
return {'image_id': '42', OutputKeys.CAPTION: result[0]}

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
# forward() already decodes the caption, so no extra postprocessing is needed
return inputs
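
For reference, a minimal end-to-end sketch wiring this model to the preprocessor updated later in this change; the checkpoint path is a placeholder, and the beam_search override relies on the 'beam_search' in kwargs check above (assuming the base Model tolerates the extra keyword).

from modelscope.models.multi_modal import OfaForImageCaptioning
from modelscope.outputs import OutputKeys
from modelscope.preprocessors.multi_modal import OfaImageCaptionPreprocessor

model_dir = '/path/to/ofa_image-caption_checkpoint'  # placeholder path
preprocessor = OfaImageCaptionPreprocessor(model_dir)
model = OfaForImageCaptioning(model_dir, beam_search={'beam_size': 10, 'max_len_b': 20})

sample = preprocessor('data/test/images/image_captioning.png')
print(model.forward(sample)[OutputKeys.CAPTION])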

+ 1
- 1
modelscope/pipelines/builder.py View File

@@ -44,7 +44,7 @@ DEFAULT_MODEL_FOR_PIPELINE = {
'damo/nlp_space_dialog-modeling'),
Tasks.dialog_state_tracking: (Pipelines.dialog_state_tracking,
'damo/nlp_space_dialog-state-tracking'),
Tasks.image_captioning: (Pipelines.image_caption,
Tasks.image_captioning: (Pipelines.image_captioning,
'damo/ofa_image-caption_coco_large_en'),
Tasks.image_generation:
(Pipelines.person_image_cartoon,


+ 1
- 1
modelscope/pipelines/multi_modal/image_captioning_pipeline.py View File

@@ -11,7 +11,7 @@ logger = get_logger()


@PIPELINES.register_module(
Tasks.image_captioning, module_name=Pipelines.image_caption)
Tasks.image_captioning, module_name=Pipelines.image_captioning)
class ImageCaptionPipeline(Pipeline):

def __init__(self,


+ 19
- 63
modelscope/preprocessors/multi_modal.py View File

@@ -2,13 +2,14 @@
import os.path as osp
from typing import Any, Dict, Union

import numpy as np
import torch
from PIL import Image
from torchvision import transforms

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.metainfo import Preprocessors
from modelscope.utils.constant import Fields, ModelFile
from modelscope.models.multi_modal.ofa import OFATokenizer
from modelscope.utils.constant import Fields
from modelscope.utils.type_assert import type_assert
from .base import Preprocessor
from .builder import PREPROCESSORS
@@ -31,84 +32,39 @@ class OfaImageCaptionPreprocessor(Preprocessor):
model_dir (str): model path
"""
super().__init__(*args, **kwargs)
model_dir = model_dir if osp.exists(model_dir) else snapshot_download(
model_dir)
self.tokenizer = OFATokenizer.from_pretrained(model_dir)
self.tokenizer.add_tokens(['<code_{}>'.format(i) for i in range(8192)])
self.tokenizer.add_tokens(['<bin_{}>'.format(i) for i in range(1000)])

if osp.exists(model_dir):
local_model_dir = model_dir
else:
local_model_dir = snapshot_download(model_dir)
local_model = osp.join(local_model_dir, ModelFile.TORCH_MODEL_FILE)
bpe_dir = local_model_dir

from fairseq import checkpoint_utils, tasks, utils
from ofa.tasks.mm_tasks import CaptionTask

tasks.register_task('caption', CaptionTask)

overrides = {
'bpe_dir': bpe_dir,
'eval_cider': False,
'beam': 5,
'max_len_b': 16,
'no_repeat_ngram_size': 3,
'seed': 7
}
model, cfg, task = checkpoint_utils.load_model_ensemble_and_task(
utils.split_paths(local_model), arg_overrides=overrides)
del model
# Initialize transform
from torchvision import transforms
mean = [0.5, 0.5, 0.5]
std = [0.5, 0.5, 0.5]
patch_image_size = 480
self.patch_resize_transform = transforms.Compose([
lambda image: image.convert('RGB'),
transforms.Resize(
(cfg.task.patch_image_size, cfg.task.patch_image_size),
interpolation=Image.BICUBIC),
transforms.Resize((patch_image_size, patch_image_size),
interpolation=Image.BICUBIC),
transforms.ToTensor(),
transforms.Normalize(mean=mean, std=std),
])

self.task = task
self.bos_item = torch.LongTensor([task.src_dict.bos()])
self.eos_item = torch.LongTensor([task.src_dict.eos()])
self.pad_idx = task.src_dict.pad()

@type_assert(object, (str, tuple, Image.Image))
def __call__(self, data: Union[str, tuple, Image.Image]) -> Dict[str, Any]:

def encode_text(text, length=None, append_bos=False, append_eos=False):
s = self.task.tgt_dict.encode_line(
line=self.task.bpe.encode(text),
add_if_not_exist=False,
append_eos=False).long()
if length is not None:
s = s[:length]
if append_bos:
s = torch.cat([self.bos_item, s])
if append_eos:
s = torch.cat([s, self.eos_item])
return s

if isinstance(data, Image.Image):
patch_image = self.patch_resize_transform(data).unsqueeze(0)
else:
patch_image = self.patch_resize_transform(
load_image(data)).unsqueeze(0)
patch_mask = torch.tensor([True])
text = 'what does the image describe?'
src_text = encode_text(
text, append_bos=True, append_eos=True).unsqueeze(0)
src_length = torch.LongTensor(
[s.ne(self.pad_idx).long().sum() for s in src_text])
sample = {
'id': np.array(['42']),
'net_input': {
'src_tokens': src_text,
'src_lengths': src_length,
'patch_images': patch_image,
'patch_masks': patch_mask,
}
text = ' what does the image describe?'
inputs = self.tokenizer([text], max_length=1024,
return_tensors='pt')['input_ids']
sample = dict()
sample['net_input'] = {
'input_ids': inputs,
'patch_images': patch_image,
'patch_masks': torch.tensor([True])
}
return sample
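
As a sanity check, the sample produced above should look roughly like this; the model id reuses the one from the updated test below, and the 480x480 patch size follows the transform defined in this preprocessor.

from modelscope.preprocessors.multi_modal import OfaImageCaptionPreprocessor

sample = OfaImageCaptionPreprocessor('damo/ofa_image-caption_coco_distilled_en')(
    'data/test/images/image_captioning.png')

net_input = sample['net_input']
print(net_input['input_ids'].shape)     # (1, prompt_length) ids for ' what does the image describe?'
print(net_input['patch_images'].shape)  # (1, 3, 480, 480) normalized RGB patch
print(net_input['patch_masks'])         # tensor([True])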



+ 1
- 1
tests/pipelines/test_image_captioning.py View File

@@ -14,7 +14,7 @@ class ImageCaptionTest(unittest.TestCase):
def test_run(self):
img_captioning = pipeline(
Tasks.image_captioning,
model='damo/ofa_image-caption_coco_large_en')
model='damo/ofa_image-caption_coco_distilled_en')
result = img_captioning('data/test/images/image_captioning.png')
print(result[OutputKeys.CAPTION])


