yingda.chen (ya235025) · commit 4c08bd752a · 3 years ago
12 changed files with 1791 additions and 469 deletions
  1. modelscope/models/multi_modal/__init__.py (+2, -0)
  2. modelscope/models/multi_modal/clip/__init__.py (+1, -1)
  3. modelscope/models/multi_modal/clip/bert_tokenizer.py (+422, -0)
  4. modelscope/models/multi_modal/clip/clip_bert.py (+0, -29)
  5. modelscope/models/multi_modal/clip/clip_model.py (+0, -216)
  6. modelscope/models/multi_modal/clip/clip_vit.py (+0, -131)
  7. modelscope/models/multi_modal/clip/configuration_bert.py (+82, -0)
  8. modelscope/models/multi_modal/clip/model.py (+677, -0)
  9. modelscope/models/multi_modal/clip/modeling_bert.py (+507, -0)
  10. modelscope/models/multi_modal/mplug/clip/clip.py (+61, -1)
  11. tests/pipelines/test_multi_modal_embedding.py (+39, -31)
  12. tests/trainers/test_clip_multi_modal_embedding_trainer.py (+0, -60)

modelscope/models/multi_modal/__init__.py (+2, -0)

@@ -12,6 +12,8 @@ if TYPE_CHECKING:
from .mplug_for_visual_question_answering import \
MPlugForVisualQuestionAnswering
from .ofa_for_all_tasks import OfaForAllTasks
from .ofa_for_text_to_image_synthesis_model import \
OfaForTextToImageSynthesis

else:
_import_structure = {


modelscope/models/multi_modal/clip/__init__.py (+1, -1)

@@ -1 +1 @@
- from .clip_model import CLIPForMultiModalEmbedding
+ from .model import CLIPForMultiModalEmbedding

modelscope/models/multi_modal/clip/bert_tokenizer.py (+422, -0)

@@ -0,0 +1,422 @@
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""

from __future__ import absolute_import, division, print_function
import collections
import os
import re
import unicodedata

import six


def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
"""Checks whether the casing config is consistent with the checkpoint name."""

# The casing has to be passed in by the user and there is no explicit check
# as to whether it matches the checkpoint. The casing information probably
# should have been stored in the bert_config.json file, but it's not, so
# we have to heuristically detect it to validate.

if not init_checkpoint:
return

m = re.match('^.*?([A-Za-z0-9_-]+)/bert_model.ckpt', init_checkpoint)
if m is None:
return

model_name = m.group(1)

lower_models = [
'uncased_L-24_H-1024_A-16', 'uncased_L-12_H-768_A-12',
'multilingual_L-12_H-768_A-12', 'chinese_L-12_H-768_A-12'
]

cased_models = [
'cased_L-12_H-768_A-12', 'cased_L-24_H-1024_A-16',
'multi_cased_L-12_H-768_A-12'
]

is_bad_config = False
if model_name in lower_models and not do_lower_case:
is_bad_config = True
actual_flag = 'False'
case_name = 'lowercased'
opposite_flag = 'True'

if model_name in cased_models and do_lower_case:
is_bad_config = True
actual_flag = 'True'
case_name = 'cased'
opposite_flag = 'False'

if is_bad_config:
raise ValueError(
'You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. '
'However, `%s` seems to be a %s model, so you '
'should pass in `--do_lower_case=%s` so that the fine-tuning matches '
'how the model was pre-trained. If this error is wrong, please '
'just comment out this check.' %
(actual_flag, init_checkpoint, model_name, case_name,
opposite_flag))


def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode('utf-8', 'ignore')
else:
raise ValueError('Unsupported string type: %s' % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode('utf-8', 'ignore')
elif isinstance(text, unicode):
return text
else:
raise ValueError('Unsupported string type: %s' % (type(text)))
else:
raise ValueError('Not running on Python2 or Python 3?')


def printable_text(text):
"""Returns text encoded in a way suitable for print or `tf.logging`."""

# These functions want `str` for both Python2 and Python3, but in one case
# it's a Unicode string and in the other it's a byte string.
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode('utf-8', 'ignore')
else:
raise ValueError('Unsupported string type: %s' % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text
elif isinstance(text, unicode):
return text.encode('utf-8')
else:
raise ValueError('Unsupported string type: %s' % (type(text)))
else:
raise ValueError('Not running on Python2 or Python 3?')


def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
index = 0
with open(vocab_file, 'r') as reader:
while True:
token = convert_to_unicode(reader.readline())
if not token:
break
token = token.strip()
vocab[token] = index
index += 1
return vocab


def convert_by_vocab(vocab, items):
"""Converts a sequence of [tokens|ids] using the vocab."""
output = []
for item in items:
output.append(vocab[item])
return output


def convert_tokens_to_ids(vocab, tokens):
return convert_by_vocab(vocab, tokens)


def convert_ids_to_tokens(inv_vocab, ids):
return convert_by_vocab(inv_vocab, ids)


def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens


class FullTokenizer(object):
"""Runs end-to-end tokenziation."""

def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)

return split_tokens

def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)

def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)

@staticmethod
def convert_tokens_to_string(tokens, clean_up_tokenization_spaces=True):
""" Converts a sequence of tokens (string) in a single string. """

def clean_up_tokenization(out_string):
""" Clean up a list of simple English tokenization artifacts
like spaces before punctuation and abbreviated forms.
"""
out_string = (
out_string.replace(' .', '.').replace(' ?', '?').replace(
' !', '!').replace(' ,', ',').replace(" ' ", "'").replace(
" n't", "n't").replace(" 'm", "'m").replace(
" 's", "'s").replace(" 've",
"'ve").replace(" 're", "'re"))
return out_string

text = ' '.join(tokens).replace(' ##', '').strip()
if clean_up_tokenization_spaces:
clean_text = clean_up_tokenization(text)
return clean_text
else:
return text

def vocab_size(self):
return len(self.vocab)


class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

def __init__(self, do_lower_case=True):
"""Constructs a BasicTokenizer.

Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case

def tokenize(self, text):
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text)

# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia).
text = self._tokenize_chinese_chars(text)

orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))

output_tokens = whitespace_tokenize(' '.join(split_tokens))
return output_tokens

def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize('NFD', text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == 'Mn':
continue
output.append(char)
return ''.join(output)

def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1

return [''.join(x) for x in output]

def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(' ')
output.append(char)
output.append(' ')
else:
output.append(char)
return ''.join(output)

def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF)
or (cp >= 0x20000 and cp <= 0x2A6DF)
or (cp >= 0x2A700 and cp <= 0x2B73F)
or (cp >= 0x2B740 and cp <= 0x2B81F)
or (cp >= 0x2B820 and cp <= 0x2CEAF)
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F)):
return True

return False

def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(' ')
else:
output.append(char)
return ''.join(output)


class WordpieceTokenizer(object):
"""Runs WordPiece tokenziation."""

def __init__(self, vocab, unk_token='[UNK]', max_input_chars_per_word=200):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word

def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.

This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.

For example:
input = "unaffable"
output = ["un", "##aff", "##able"]

Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.

Returns:
A list of wordpiece tokens.
"""

text = convert_to_unicode(text)

output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue

is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = ''.join(chars[start:end])
if start > 0:
substr = '##' + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end

if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens


def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically contorl characters but we treat them
# as whitespace since they are generally considered as such.
if char == ' ' or char == '\t' or char == '\n' or char == '\r':
return True
cat = unicodedata.category(char)
if cat == 'Zs':
return True
return False


def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == '\t' or char == '\n' or char == '\r':
return False
cat = unicodedata.category(char)
if cat in ('Cc', 'Cf'):
return True
return False


def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64)
or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith('P'):
return True
return False
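
For reference, a minimal usage sketch of the FullTokenizer added above (illustrative only, not part of the commit; the vocab path and sample sentence are assumptions):

from modelscope.models.multi_modal.clip.bert_tokenizer import FullTokenizer

# Build the tokenizer from a WordPiece vocab file (path is hypothetical).
tokenizer = FullTokenizer(vocab_file='vocab.txt', do_lower_case=True)

# BasicTokenizer + WordpieceTokenizer, then map tokens to vocab ids.
tokens = tokenizer.tokenize('CLIP learns joint image-text embeddings.')
ids = tokenizer.convert_tokens_to_ids(tokens)

# Round-trip back to a readable string, merging '##' word pieces.
text = FullTokenizer.convert_tokens_to_string(tokens)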

modelscope/models/multi_modal/clip/clip_bert.py (+0, -29)

@@ -1,29 +0,0 @@
import torch.nn as nn
from transformers import BertConfig, BertForMaskedLM


class TextTransformer(nn.Module):

def __init__(self, config_dict, feat_dim=768, use_grad_ckp=True):
super(TextTransformer, self).__init__()
bert_config = BertConfig.from_dict(config_dict)
if use_grad_ckp:
bert_config.gradient_checkpointing = True

self.bert = BertForMaskedLM(bert_config).bert

self.projector = nn.Linear(
bert_config.hidden_size, feat_dim, bias=False)

def forward(self, input_ids, attention_mask):
trans_features = {
'input_ids': input_ids,
'attention_mask': attention_mask
}

output_states = self.bert(**trans_features, return_dict=False)
output_tokens = output_states[0]

cls_tokens = output_tokens[:, 0, :]

return self.projector(cls_tokens)

modelscope/models/multi_modal/clip/clip_model.py (+0, -216)

@@ -1,216 +0,0 @@
from typing import Any, Dict

import cv2
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from tokenizers import BertWordPieceTokenizer
from torch.distributed.nn.functional import \
all_gather as all_gather_with_backprop
from torchvision.transforms import Compose, Normalize, Resize, ToTensor

from modelscope.metainfo import Models
from modelscope.models import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.multi_modal.clip.clip_bert import TextTransformer
from modelscope.models.multi_modal.clip.clip_vit import VisionTransformer
from modelscope.utils.constant import ModeKeys, ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()

__all__ = ['CLIPForMultiModalEmbedding']


class CLIPModel(nn.Module):

def __init__(self, model_dir):
super(CLIPModel, self).__init__()
# including vision config and text config
model_config = json.load(
open('{}/encoder_config.json'.format(model_dir)))

# vision encoder
vision_config = model_config['vision_config']
self.img_size = vision_config['input_resolution']
self.vision_encoder = VisionTransformer(
input_resolution=self.img_size,
patch_size=vision_config['patch_size'],
width=vision_config['width'],
layers=vision_config['layers'],
heads=vision_config['heads'],
output_dim=vision_config['feat_dim'],
use_grad_ckp=True)

# text encoder
text_config = model_config['text_config']
self.text_encoder = TextTransformer(
text_config['bert_config'], feat_dim=text_config['feat_dim'])

self.logit_scale = nn.Parameter(torch.ones([]) * 4.6)

def contrastive_loss(self, logits, dim):
neg_ce = torch.diag(F.log_softmax(logits, dim=dim))
return -neg_ce.mean()

def clip_loss(self, t2i_sim, i2t_sim, img_idx=None, all_img_idx=None):
if img_idx is not None and all_img_idx is not None:
with torch.no_grad():
false_neg_indicator = (
img_idx[:, None] == all_img_idx[None, :])
false_neg_indicator.fill_diagonal_(False)
t2i_sim.masked_fill_(false_neg_indicator, float('-inf'))
i2t_sim.masked_fill_(false_neg_indicator, float('-inf'))
caption_loss = self.contrastive_loss(t2i_sim, dim=1)
image_loss = self.contrastive_loss(i2t_sim, dim=1)
else:
caption_loss = self.contrastive_loss(t2i_sim, dim=1)
image_loss = self.contrastive_loss(i2t_sim, dim=1)
return (caption_loss + image_loss) / 2.0

def get_loss(self, img_tensor, text_ids_tensor, text_masks_tensor,
img_id_list):
img_feat = self.forward(img_tensor, input_type='img')
text_feat = self.forward((text_ids_tensor, text_masks_tensor),
input_type='text')

global_img_feat = torch.cat(all_gather_with_backprop(img_feat), dim=0)
global_text_feat = torch.cat(
all_gather_with_backprop(text_feat), dim=0)
global_img_id_list = torch.cat(
all_gather_with_backprop(img_id_list), dim=0)

t2i_sim_mat = text_feat @ global_img_feat.t()
i2t_sim_mat = img_feat @ global_text_feat.t()

logit_scale = self.logit_scale.exp().clamp(max=100.0)
t2i_sim_mat_logits = t2i_sim_mat * logit_scale
i2t_sim_mat_logits = i2t_sim_mat * logit_scale

loss = self.clip_loss(
t2i_sim_mat_logits,
i2t_sim_mat_logits,
img_idx=img_id_list,
all_img_idx=global_img_id_list)

return loss

def forward(self, input_data, input_type):
if input_type == 'img':
img_embedding = self.vision_encoder(input_data)
img_embedding = F.normalize(img_embedding, p=2.0, dim=1)
return img_embedding
elif input_type == 'text':
text_ids_tensor, text_mask_tensor = input_data
text_embedding = self.text_encoder(text_ids_tensor,
text_mask_tensor)
text_embedding = F.normalize(text_embedding, p=2.0, dim=1)
return text_embedding
elif input_type == ModeKeys.TRAIN:
return self.get_loss(*input_data)
else:
raise ValueError('Unknown input type')


@MODELS.register_module(Tasks.multi_modal_embedding, module_name=Models.clip)
class CLIPForMultiModalEmbedding(TorchModel):

def __init__(self, model_dir, device_id=-1):
super().__init__(model_dir=model_dir, device_id=device_id)
self.clip_model = CLIPModel(model_dir=model_dir)
pretrained_params = torch.load(
'{}/pytorch_model.bin'.format(model_dir), 'cpu')
self.clip_model.load_state_dict(pretrained_params)
self.clip_model.eval()

self.device_id = device_id
if self.device_id >= 0:
self.clip_model.to('cuda:{}'.format(self.device_id))
logger.info('Use GPU: {}'.format(self.device_id))
else:
logger.info('Use CPU for inference')

# image preprocessor
norm_op = Normalize((0.48145466, 0.4578275, 0.40821073),
(0.26862954, 0.26130258, 0.27577711))
self.img_preprocessor = Compose([
Resize((self.clip_model.img_size, self.clip_model.img_size),
interpolation=Image.BICUBIC),
ToTensor(), norm_op
])

# text tokenizer
vocab_path = f'{model_dir}/{ModelFile.VOCAB_FILE}'
self.text_tokenizer = BertWordPieceTokenizer(
vocab_path, lowercase=False)
self.text_tokenizer.enable_truncation(max_length=30)

def tokenize_text(self, text_str):
tokens = self.text_tokenizer.encode(text_str)
max_tokens = 30
text_ids_tensor = torch.zeros((1, max_tokens)).long()
text_mask_tensor = torch.zeros((1, max_tokens))

text_ids, text_mask = tokens.ids, tokens.attention_mask
text_ids_tensor[0, 0:len(text_ids)] = torch.tensor(text_ids)
text_mask_tensor[0, 0:len(text_mask)] = torch.tensor(text_mask)

return text_ids_tensor, text_mask_tensor

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
from modelscope.outputs import OutputKeys
output = {
OutputKeys.IMG_EMBEDDING: None,
OutputKeys.TEXT_EMBEDDING: None
}
if 'img' in input and input['img'] is not None:
input_img = input['img']
if isinstance(input_img, Image.Image):
img_tensor = self.img_preprocessor(input_img)[None, ...]
elif isinstance(input_img, np.ndarray):
if len(input_img.shape) == 2:
input_img = cv2.cvtColor(input_img, cv2.COLOR_GRAY2BGR)
input_img = input_img[:, :, ::-1] # in rgb order
input_img = Image.fromarray(
input_img.astype('uint8')).convert('RGB')
img_tensor = self.img_preprocessor(input_img)[None, ...]
else:
raise TypeError(
f'img should be either PIL.Image or np.array, but got {type(input_img)}'
)

if self.device_id >= 0:
img_tensor = img_tensor.to('cuda:{}'.format(self.device_id))

img_embedding = self.clip_model(
input_data=img_tensor, input_type='img')
from modelscope.outputs import OutputKeys
output[OutputKeys.IMG_EMBEDDING] = img_embedding.data.cpu().numpy()

if 'text' in input and input['text'] is not None:
text_str = input['text']
if isinstance(text_str, str):
text_ids_tensor, text_mask_tensor = self.tokenize_text(
text_str)
else:
raise TypeError(
f'text should be str, but got {type(text_str)}')

if self.device_id >= 0:
text_ids_tensor = text_ids_tensor.to('cuda:{}'.format(
self.device_id))
text_mask_tensor = text_mask_tensor.to('cuda:{}'.format(
self.device_id))

text_embedding = self.clip_model(
input_data=(text_ids_tensor, text_mask_tensor),
input_type='text')
output['text_embedding'] = text_embedding.data.cpu().numpy()

return output

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs

modelscope/models/multi_modal/clip/clip_vit.py (+0, -131)

@@ -1,131 +0,0 @@
# Copyright 2021 The OpenAI CLIP Authors. All rights reserved.

from collections import OrderedDict
from typing import Tuple, Union

import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from torch import nn


class LayerNorm(nn.LayerNorm):
"""Subclass torch's LayerNorm to handle fp16."""

def forward(self, x: torch.Tensor):
orig_type = x.dtype
ret = super().forward(x.type(torch.float32))
return ret.type(orig_type)


class QuickGELU(nn.Module):

def forward(self, x: torch.Tensor):
return x * torch.sigmoid(1.702 * x)


class ResidualAttentionBlock(nn.Module):

def __init__(self,
d_model: int,
n_head: int,
attn_mask: torch.Tensor = None):
super().__init__()

self.attn = nn.MultiheadAttention(d_model, n_head)
self.ln_1 = LayerNorm(d_model)
self.mlp = nn.Sequential(
OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
('gelu', QuickGELU()),
('c_proj', nn.Linear(d_model * 4, d_model))]))
self.ln_2 = LayerNorm(d_model)
self.attn_mask = attn_mask

def attention(self, x: torch.Tensor):
self.attn_mask = self.attn_mask.to(
dtype=x.dtype,
device=x.device) if self.attn_mask is not None else None
return self.attn(
x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

def forward(self, x: torch.Tensor):
x = x + self.attention(self.ln_1(x))
x = x + self.mlp(self.ln_2(x))
return x


class Transformer(nn.Module):

def __init__(self,
width: int,
layers: int,
heads: int,
attn_mask: torch.Tensor = None,
use_grad_ckp: bool = True):
super().__init__()
self.width = width
self.layers = layers
self.resblocks = nn.Sequential(*[
ResidualAttentionBlock(width, heads, attn_mask)
for _ in range(layers)
])

self.use_grad_ckp = use_grad_ckp

def forward(self, x: torch.Tensor):
if self.use_grad_ckp:
for each_block in self.resblocks:
x = checkpoint.checkpoint(each_block, x)
return x
else:
return self.resblocks(x)


class VisionTransformer(nn.Module):

def __init__(self, input_resolution: int, patch_size: int, width: int,
layers: int, heads: int, output_dim: int, use_grad_ckp: bool):
super().__init__()
self.input_resolution = input_resolution
self.output_dim = output_dim
self.conv1 = nn.Conv2d(
in_channels=3,
out_channels=width,
kernel_size=patch_size,
stride=patch_size,
bias=False)

scale = width**-0.5
self.class_embedding = nn.Parameter(scale * torch.randn(width))
self.positional_embedding = nn.Parameter(scale * torch.randn(
(input_resolution // patch_size)**2 + 1, width))
self.ln_pre = LayerNorm(width)

self.transformer = Transformer(
width, layers, heads, use_grad_ckp=use_grad_ckp)

self.ln_post = LayerNorm(width)
self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

def forward(self, x: torch.Tensor):
x = self.conv1(x) # shape = [*, width, grid, grid]
x = x.reshape(x.shape[0], x.shape[1],
-1) # shape = [*, width, grid ** 2]
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
class_embeddings = self.class_embedding.to(x.dtype) + \
torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
x = torch.cat([class_embeddings, x], dim=1)
x = x + self.positional_embedding.to(x.dtype)
x = self.ln_pre(x)

x = x.permute(1, 0, 2) # NLD -> LND
x = self.transformer(x)
x = x.permute(1, 0, 2) # LND -> NLD

x = self.ln_post(x[:, 0, :])

if self.proj is not None:
x = x @ self.proj

return x

modelscope/models/multi_modal/clip/configuration_bert.py (+82, -0)

@@ -0,0 +1,82 @@
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" BERT model configuration """

from __future__ import (absolute_import, division, print_function,
unicode_literals)
import logging

logger = logging.getLogger(__name__)


class BertConfig(object):
r"""
:class:`~transformers.BertConfig` is the configuration class to store the configuration of a
`BertModel`.


Arguments:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`BertModel`.
initializer_range: The stddev of the truncated_normal_initializer for
initializing all weight matrices.
layer_norm_eps: The epsilon used by LayerNorm.
"""

def __init__(self,
vocab_size_or_config_json_file=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act='gelu',
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
output_attentions=False,
output_hidden_states=False):
self.vocab_size = vocab_size_or_config_json_file
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.output_attentions = output_attentions
self.output_hidden_states = output_hidden_states
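
Illustrative only (not part of the commit): constructing the BertConfig defined above, much as the CLIP class in model.py does for its text tower; the concrete values here are assumptions.

from modelscope.models.multi_modal.clip.configuration_bert import BertConfig

config = BertConfig(
    vocab_size_or_config_json_file=21128,  # assumed vocab size
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    max_position_embeddings=512)

# The constructor simply stores these values as attributes.
print(config.vocab_size, config.hidden_size, config.layer_norm_eps)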

modelscope/models/multi_modal/clip/model.py (+677, -0)

@@ -0,0 +1,677 @@
import os
from collections import OrderedDict
from typing import Any, Dict, Iterable, List, Tuple, Union

import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from torchvision.transforms import Compose, Normalize, Resize, ToTensor

from modelscope.metainfo import Models
from modelscope.models import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.multi_modal.clip.bert_tokenizer import FullTokenizer
from modelscope.models.multi_modal.clip.configuration_bert import BertConfig
from modelscope.models.multi_modal.clip.modeling_bert import BertModel
from modelscope.utils.constant import ModeKeys, ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()

__all__ = ['CLIPForMultiModalEmbedding']


class Bottleneck(nn.Module):
expansion = 4

def __init__(self, inplanes, planes, stride=1):
super().__init__()

# all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)

self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)

self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()

self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion)

self.relu = nn.ReLU(inplace=True)
self.downsample = None
self.stride = stride

if stride > 1 or inplanes != planes * Bottleneck.expansion:
# downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
self.downsample = nn.Sequential(
OrderedDict([('-1', nn.AvgPool2d(stride)),
('0',
nn.Conv2d(
inplanes,
planes * self.expansion,
1,
stride=1,
bias=False)),
('1', nn.BatchNorm2d(planes * self.expansion))]))

def forward(self, x: torch.Tensor):
identity = x

out = self.relu(self.bn1(self.conv1(x)))
out = self.relu(self.bn2(self.conv2(out)))
out = self.avgpool(out)
out = self.bn3(self.conv3(out))

if self.downsample is not None:
identity = self.downsample(x)

out += identity
out = self.relu(out)
return out


class AttentionPool2d(nn.Module):

def __init__(self,
spacial_dim: int,
embed_dim: int,
num_heads: int,
output_dim: int = None):
super().__init__()
self.positional_embedding = nn.Parameter(
torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5)
self.k_proj = nn.Linear(embed_dim, embed_dim)
self.q_proj = nn.Linear(embed_dim, embed_dim)
self.v_proj = nn.Linear(embed_dim, embed_dim)
self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
self.num_heads = num_heads

def forward(self, x):
x = x.reshape(x.shape[0], x.shape[1],
x.shape[2] * x.shape[3]).permute(2, 0,
1) # NCHW -> (HW)NC
x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC
x, _ = F.multi_head_attention_forward(
query=x,
key=x,
value=x,
embed_dim_to_check=x.shape[-1],
num_heads=self.num_heads,
q_proj_weight=self.q_proj.weight,
k_proj_weight=self.k_proj.weight,
v_proj_weight=self.v_proj.weight,
in_proj_weight=None,
in_proj_bias=torch.cat(
[self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
bias_k=None,
bias_v=None,
add_zero_attn=False,
dropout_p=0,
out_proj_weight=self.c_proj.weight,
out_proj_bias=self.c_proj.bias,
use_separate_proj_weight=True,
training=self.training,
need_weights=False)

return x[0]


class ModifiedResNet(nn.Module):
"""
A ResNet class that is similar to torchvision's but contains the following changes:
- There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
- Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
- The final pooling layer is a QKV attention instead of an average pool
"""

def __init__(self,
layers,
output_dim,
heads,
input_resolution=224,
width=64):
super().__init__()
self.output_dim = output_dim
self.input_resolution = input_resolution

# the 3-layer stem
self.conv1 = nn.Conv2d(
3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(width // 2)
self.conv2 = nn.Conv2d(
width // 2, width // 2, kernel_size=3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(width // 2)
self.conv3 = nn.Conv2d(
width // 2, width, kernel_size=3, padding=1, bias=False)
self.bn3 = nn.BatchNorm2d(width)
self.avgpool = nn.AvgPool2d(2)
self.relu = nn.ReLU(inplace=True)

# residual layers
self._inplanes = width # this is a *mutable* variable used during construction
self.layer1 = self._make_layer(width, layers[0])
self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
self.layer4 = self._make_layer(width * 8, layers[3], stride=2)

embed_dim = width * 32 # the ResNet feature dimension
self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim,
heads, output_dim)

def _make_layer(self, planes, blocks, stride=1):
layers = [Bottleneck(self._inplanes, planes, stride)]

self._inplanes = planes * Bottleneck.expansion
for _ in range(1, blocks):
layers.append(Bottleneck(self._inplanes, planes))

return nn.Sequential(*layers)

def forward(self, x):

def stem(x):
for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2),
(self.conv3, self.bn3)]:
x = self.relu(bn(conv(x)))
x = self.avgpool(x)
return x

x = x.type(self.conv1.weight.dtype)
x = stem(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.attnpool(x)

return x


class LayerNorm(nn.LayerNorm):
"""Subclass torch's LayerNorm to handle fp16."""

def forward(self, x: torch.Tensor):
orig_type = x.dtype
ret = super().forward(x.type(torch.float32))
return ret.type(orig_type)


class QuickGELU(nn.Module):

def forward(self, x: torch.Tensor):
return x * torch.sigmoid(1.702 * x)


class ResidualAttentionBlock(nn.Module):

def __init__(self,
d_model: int,
n_head: int,
attn_mask: torch.Tensor = None):
super().__init__()

self.attn = nn.MultiheadAttention(d_model, n_head)
self.ln_1 = LayerNorm(d_model)
self.mlp = nn.Sequential(
OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
('gelu', QuickGELU()),
('c_proj', nn.Linear(d_model * 4, d_model))]))
self.ln_2 = LayerNorm(d_model)
self.attn_mask = attn_mask

def attention(self, x: torch.Tensor):
self.attn_mask = self.attn_mask.to(
dtype=x.dtype,
device=x.device) if self.attn_mask is not None else None
return self.attn(
x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

def forward(self, x: torch.Tensor):
x = x + self.attention(self.ln_1(x))
x = x + self.mlp(self.ln_2(x))
return x


class Transformer(nn.Module):

def __init__(self,
width: int,
layers: int,
heads: int,
attn_mask: torch.Tensor = None):
super().__init__()
self.width = width
self.layers = layers
self.resblocks = nn.Sequential(*[
ResidualAttentionBlock(width, heads, attn_mask)
for _ in range(layers)
])

def forward(self, x: torch.Tensor):
return self.resblocks(x)


class VisualTransformer(nn.Module):

def __init__(self, input_resolution: int, patch_size: int, width: int,
layers: int, heads: int, output_dim: int):
super().__init__()
self.input_resolution = input_resolution
self.output_dim = output_dim
self.conv1 = nn.Conv2d(
in_channels=3,
out_channels=width,
kernel_size=patch_size,
stride=patch_size,
bias=False)

scale = width**-0.5
self.class_embedding = nn.Parameter(scale * torch.randn(width))
self.positional_embedding = nn.Parameter(scale * torch.randn(
(input_resolution // patch_size)**2 + 1, width))
self.ln_pre = LayerNorm(width)

self.transformer = Transformer(width, layers, heads)

self.ln_post = LayerNorm(width)
self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

def forward(self, x: torch.Tensor):
x = self.conv1(x) # shape = [*, width, grid, grid]
x = x.reshape(x.shape[0], x.shape[1],
-1) # shape = [*, width, grid ** 2]
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
x = torch.cat(
[ # noqa
self.class_embedding.to(x.dtype) + torch.zeros( # noqa
x.shape[0],
1,
x.shape[-1],
dtype=x.dtype,
device=x.device),
x # noqa
],
dim=1) # noqa shape = [*, grid ** 2 + 1, width]
x = x + self.positional_embedding.to(x.dtype)
x = self.ln_pre(x)

x = x.permute(1, 0, 2) # NLD -> LND
x = self.transformer(x)
x = x.permute(1, 0, 2) # LND -> NLD

x = self.ln_post(x[:, 0, :])

if self.proj is not None:
x = x @ self.proj

return x


class CLIP(nn.Module):

def __init__(
self,
embed_dim: int,
# vision
image_resolution: int,
vision_layers: Union[Tuple[int, int, int, int], int],
vision_width: int,
vision_patch_size: int,
# text
vocab_size: int,
text_attention_probs_dropout_prob: float,
text_hidden_act: str,
text_hidden_dropout_prob: float,
text_hidden_size: int,
text_initializer_range: float,
text_intermediate_size: int,
text_max_position_embeddings: int,
text_num_attention_heads: int,
text_num_hidden_layers: int,
text_type_vocab_size: int,
tokenizer: FullTokenizer,
):
super().__init__()

if isinstance(vision_layers, (tuple, list)):
vision_heads = vision_width * 32 // 64
self.visual = ModifiedResNet(
layers=vision_layers,
output_dim=embed_dim,
heads=vision_heads,
input_resolution=image_resolution,
width=vision_width)
else:
vision_heads = vision_width // 64
self.visual = VisualTransformer(
input_resolution=image_resolution,
patch_size=vision_patch_size,
width=vision_width,
layers=vision_layers,
heads=vision_heads,
output_dim=embed_dim)

self.bert_config = BertConfig(
vocab_size_or_config_json_file=vocab_size,
hidden_size=text_hidden_size,
num_hidden_layers=text_num_hidden_layers,
num_attention_heads=text_num_attention_heads,
intermediate_size=text_intermediate_size,
hidden_act=text_hidden_act,
hidden_dropout_prob=text_hidden_dropout_prob,
attention_probs_dropout_prob=text_attention_probs_dropout_prob,
max_position_embeddings=text_max_position_embeddings,
type_vocab_size=text_type_vocab_size,
initializer_range=text_initializer_range,
layer_norm_eps=1e-12,
)
self.bert = BertModel(self.bert_config)

self.text_projection = nn.Parameter(
torch.empty(text_hidden_size, embed_dim))
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

self.tokenizer = tokenizer

self.initialize_parameters()

def initialize_parameters(self):
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

if isinstance(self.visual, ModifiedResNet):
if self.visual.attnpool is not None:
std = self.visual.attnpool.c_proj.in_features**-0.5
nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)

for resnet_block in [
self.visual.layer1, self.visual.layer2, self.visual.layer3,
self.visual.layer4
]:
for name, param in resnet_block.named_parameters():
if name.endswith('bn3.weight'):
nn.init.zeros_(param)

if self.text_projection is not None:
nn.init.normal_(
self.text_projection, std=self.bert_config.hidden_size**-0.5)

@property
def dtype(self):
return self.visual.conv1.weight.dtype

def encode_image(self, image):
return self.visual(image.type(self.dtype))

def encode_text(self, text):
pad_index = self.tokenizer.vocab['[PAD]']
attn_mask = text.ne(pad_index).type(self.dtype)
x = self.bert(
text, attention_mask=attn_mask)[0].type(
self.dtype) # [batch_size, seq_length, hidden_size]
return x[:, 0, :] @ self.text_projection

def forward(self, image, text):
assert image is not None or text is not None, 'text and image cannot both be None!'

if image is None:
return self.encode_text(text)
elif text is None:
return self.encode_image(image)
image_features = self.encode_image(image)
text_features = self.encode_text(text)

image_features = image_features / image_features.norm(
dim=-1, keepdim=True)
text_features = text_features / text_features.norm(
dim=-1, keepdim=True)

return image_features, text_features, self.logit_scale.exp()

def get_similarity(self, image, text):
image_features = self.encode_image(image)
text_features = self.encode_text(text)

# normalized features
image_features = image_features / image_features.norm(
dim=1, keepdim=True)
text_features = text_features / text_features.norm(dim=1, keepdim=True)

# cosine similarity as logits
logit_scale = self.logit_scale.exp()
logits_per_image = logit_scale * image_features @ text_features.t()
logits_per_text = logits_per_image.t()

# shape = [global_batch_size, global_batch_size]
return logits_per_image, logits_per_text


def convert_models_to_fp32(model):
for p in model.parameters():
p.data = p.data.float()
if p.grad is not None:
p.grad.data = p.grad.data.float()


def convert_weights(model: nn.Module):
"""Convert applicable model parameters to fp16"""

def _convert_weights_to_fp16(module):
if isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Linear)):
module.weight.data = module.weight.data.half()
if module.bias is not None:
module.bias.data = module.bias.data.half()

if isinstance(module, nn.MultiheadAttention):
for attr in [
*[f'{s}_proj_weight' for s in ['in', 'q', 'k', 'v']],
'in_proj_bias', 'bias_k', 'bias_v'
]:
tensor = getattr(module, attr)
if tensor is not None:
tensor.data = tensor.data.half()

if isinstance(module, BertModel):
module.to(torch.half)

for name in ['text_projection', 'proj']:
if hasattr(module, name):
attr = getattr(module, name)
if attr is not None:
attr.data = attr.data.half()

model.apply(_convert_weights_to_fp16)


def _convert_to_rgb(image):
return image.convert('RGB')


def image_transform(image_size=224):
transform = Compose([
_convert_to_rgb,
Resize((image_size, image_size)),
ToTensor(),
Normalize((0.48145466, 0.4578275, 0.40821073),
(0.26862954, 0.26130258, 0.27577711)),
])
return transform


@MODELS.register_module(Tasks.multi_modal_embedding, module_name=Models.clip)
class CLIPForMultiModalEmbedding(TorchModel):

def __init__(self, model_dir, device_id=-1):
super().__init__(model_dir=model_dir, device_id=device_id)

# Initialize the model.
vision_model_config_file = '{}/vision_model_config.json'.format(
model_dir)
logger.info(
f'Loading vision model config from {vision_model_config_file}')
assert os.path.exists(vision_model_config_file)

text_model_config_file = '{}/text_model_config.json'.format(model_dir)
logger.info(f'Loading text model config from {text_model_config_file}')
assert os.path.exists(text_model_config_file)

with open(vision_model_config_file,
'r') as fv, open(text_model_config_file, 'r') as ft:
model_info = json.load(fv)
for k, v in json.load(ft).items():
model_info[k] = v

# image preprocess
self.img_preprocess = image_transform(model_info['image_resolution'])

# text tokenizer
vocab_file = f'{model_dir}/{ModelFile.VOCAB_FILE}'
self.tokenizer = FullTokenizer(vocab_file=vocab_file)

# initialize the model
self.clip_model = CLIP(**model_info, tokenizer=self.tokenizer)
convert_weights(self.clip_model)

# restore the pretrained weight
checkpoint = torch.load(
f'{model_dir}/{ModelFile.TORCH_MODEL_BIN_FILE}', 'cpu')
sd = checkpoint['state_dict']
if next(iter(sd.items()))[0].startswith('module'):
sd = {k[len('module.'):]: v for k, v in sd.items()}
self.clip_model.load_state_dict(sd)
self.clip_model.eval()

# place the model
self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
if self.device == 'cuda':
self.clip_model.to(self.device)
logger.info('Use GPU for inference')
else:
self.clip_model.float()
logger.info('Use CPU for inference')

def tokenize(self,
texts: Union[str, List[str]],
context_length: int = 52) -> torch.LongTensor:
"""
Returns the tokenized representation of given input string(s)
Parameters
----------
texts : Union[str, List[str]]
An input string or a list of input strings to tokenize
context_length : int
The context length to use (the default here is 52)
Returns
-------
A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
"""
if isinstance(texts, str):
texts = [texts]

all_tokens = []
for text in texts:
all_tokens.append(
[self.tokenizer.vocab['[CLS]']]
+ self.tokenizer.convert_tokens_to_ids(
self.tokenizer.tokenize(text))[:context_length - 2]
+ [self.tokenizer.vocab['[SEP]']])

result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

for i, tokens in enumerate(all_tokens):
assert len(tokens) <= context_length
result[i, :len(tokens)] = torch.tensor(tokens)

return result

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
from modelscope.outputs import OutputKeys
output = {
OutputKeys.IMG_EMBEDDING: None,
OutputKeys.TEXT_EMBEDDING: None
}
if 'img' in input and input['img'] is not None:
image_input = input['img']

# single image input
if isinstance(image_input, Image.Image):
image_tensor = self.img_preprocess(image_input).unsqueeze(0)
# multi images input
elif isinstance(image_input, list):
if all([isinstance(elem, Image.Image)
for elem in image_input]):
image_tensor = torch.stack(
[self.img_preprocess(elem) for elem in image_input],
dim=0)
else:
unsupported_elem_type = [
type(elem) for elem in image_input
if not isinstance(elem, Image.Image)
][0]
raise TypeError(
f'img should be PIL.Image or List[PIL.Image], \
but got a List containing one {unsupported_elem_type}'
)
# others
else:
raise TypeError(
f'img should be PIL.Image or List[PIL.Image], but got {type(image_input)}'
)

image_tensor = image_tensor.to(self.device)

with torch.no_grad():
image_features = self.clip_model.encode_image(image_tensor)
image_features /= image_features.norm(
dim=-1, keepdim=True) # l2-normalize

output[OutputKeys.IMG_EMBEDDING] = image_features

if 'text' in input and input['text'] is not None:
text_input = input['text']

# single text input
if isinstance(text_input, str):
text_tensor = self.tokenize(text_input)
# multi texts input
elif isinstance(text_input, list):
if all([isinstance(elem, str) for elem in text_input]):
text_tensor = self.tokenize(text_input)
else:
unsupported_elem_type = [
type(elem) for elem in text_input
if not isinstance(elem, str)
][0]
raise TypeError(
f'text should be str or List[str], but got a List containing one {unsupported_elem_type}'
)
# others
else:
raise TypeError(
f'text should be str or List[str], but got {type(text_input)}'
)

text_tensor = text_tensor.to(self.device)

with torch.no_grad():
text_features = self.clip_model.encode_text(text_tensor)
text_features /= text_features.norm(
dim=-1, keepdim=True) # l2-normalize
output[OutputKeys.TEXT_EMBEDDING] = text_features

return output

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs

@property
def temperature(self):
return 1.0 / self.clip_model.logit_scale.exp()
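
A hedged end-to-end sketch of the new CLIPForMultiModalEmbedding (not part of the commit): the model directory and image path are assumptions, and the directory must contain the vision/text configs, vocab file and checkpoint that __init__ loads.

from PIL import Image

from modelscope.models.multi_modal.clip.model import CLIPForMultiModalEmbedding
from modelscope.outputs import OutputKeys

model = CLIPForMultiModalEmbedding(model_dir='/path/to/clip_model_dir')  # hypothetical path

inputs = {
    'img': Image.open('example.jpg'),             # a single PIL image or a list of them
    'text': ['a running dog', 'a sleeping cat'],  # a single string or a list of strings
}
output = model.forward(inputs)

img_emb = output[OutputKeys.IMG_EMBEDDING]    # [1, embed_dim], L2-normalized
text_emb = output[OutputKeys.TEXT_EMBEDDING]  # [2, embed_dim], L2-normalized

# Cosine similarities, scaled by the learned temperature exposed as a property.
logits = (img_emb @ text_emb.t()) / model.temperature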

modelscope/models/multi_modal/clip/modeling_bert.py (+507, -0)

@@ -0,0 +1,507 @@
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch BERT model. """

from __future__ import (absolute_import, division, print_function,
unicode_literals)
import logging
import math
import os
import sys
from io import open

import json
import torch
from torch import nn

from .configuration_bert import BertConfig

logger = logging.getLogger(__name__)


def gelu(x):
""" Original Implementation of the gelu activation function in Google Bert repo when initially created.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
Also see https://arxiv.org/abs/1606.08415
"""
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def gelu_new(x):
""" Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
Also see https://arxiv.org/abs/1606.08415
"""
return 0.5 * x * (1 + torch.tanh(
math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))


def swish(x):
return x * torch.sigmoid(x)


ACT2FN = {
'gelu': gelu,
'relu': torch.nn.functional.relu,
'swish': swish,
'gelu_new': gelu_new
}

BertLayerNorm = torch.nn.LayerNorm


class BertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings.
"""

def __init__(self, config):
super(BertEmbeddings, self).__init__()
self.word_embeddings = nn.Embedding(
config.vocab_size, config.hidden_size, padding_idx=0)
self.position_embeddings = nn.Embedding(config.max_position_embeddings,
config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
config.hidden_size)

# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = BertLayerNorm(
config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)

def forward(self, input_ids, token_type_ids=None, position_ids=None):
seq_length = input_ids.size(1)
if position_ids is None:
position_ids = torch.arange(
seq_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)

words_embeddings = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)

embeddings = words_embeddings + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings


class BertSelfAttention(nn.Module):

def __init__(self, config):
super(BertSelfAttention, self).__init__()
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
'The hidden size (%d) is not a multiple of the number of attention '
'heads (%d)' %
(config.hidden_size, config.num_attention_heads))
self.output_attentions = config.output_attentions

self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size
/ config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size

self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)

self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads,
self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)

def forward(self, hidden_states, attention_mask=None, head_mask=None):
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)

query_layer = self.transpose_for_scores(mixed_query_layer)
key_layer = self.transpose_for_scores(mixed_key_layer)
value_layer = self.transpose_for_scores(mixed_value_layer)

# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer,
key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(
self.attention_head_size)
if attention_mask is not None:
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
attention_scores = attention_scores + attention_mask

# Normalize the attention scores to probabilities.
attention_probs = nn.Softmax(dim=-1)(attention_scores)

# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)

# Mask heads if we want to
if head_mask is not None:
attention_probs = attention_probs * head_mask

context_layer = torch.matmul(attention_probs, value_layer)

context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (
self.all_head_size, )
context_layer = context_layer.view(*new_context_layer_shape)

outputs = (context_layer,
attention_probs) if self.output_attentions else (
context_layer, )
return outputs


class BertSelfOutput(nn.Module):

def __init__(self, config):
super(BertSelfOutput, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = BertLayerNorm(
config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)

def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states


class BertAttention(nn.Module):

def __init__(self, config):
super(BertAttention, self).__init__()
self.self = BertSelfAttention(config)
self.output = BertSelfOutput(config)
self.pruned_heads = set()

def forward(self, input_tensor, attention_mask=None, head_mask=None):
self_outputs = self.self(input_tensor, attention_mask, head_mask)
attention_output = self.output(self_outputs[0], input_tensor)
outputs = (attention_output,
) + self_outputs[1:] # add attentions if we output them
return outputs


class BertIntermediate(nn.Module):

def __init__(self, config):
super(BertIntermediate, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act,
str) or (sys.version_info[0] == 2
and isinstance(config.hidden_act, unicode)):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act

def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states


class BertOutput(nn.Module):

def __init__(self, config):
super(BertOutput, self).__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = BertLayerNorm(
config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)

def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states


class BertLayer(nn.Module):

def __init__(self, config):
super(BertLayer, self).__init__()
self.attention = BertAttention(config)
self.intermediate = BertIntermediate(config)
self.output = BertOutput(config)

def forward(self, hidden_states, attention_mask=None, head_mask=None):
attention_outputs = self.attention(hidden_states, attention_mask,
head_mask)
attention_output = attention_outputs[0]
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
outputs = (layer_output, ) + attention_outputs[
1:] # add attentions if we output them
return outputs


class BertEncoder(nn.Module):

def __init__(self, config):
super(BertEncoder, self).__init__()
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.layer = nn.ModuleList(
[BertLayer(config) for _ in range(config.num_hidden_layers)])

def forward(self, hidden_states, attention_mask=None, head_mask=None):
all_hidden_states = ()
all_attentions = ()
for i, layer_module in enumerate(self.layer):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states, )

layer_outputs = layer_module(hidden_states, attention_mask,
head_mask[i])
hidden_states = layer_outputs[0]

if self.output_attentions:
all_attentions = all_attentions + (layer_outputs[1], )

# Add last layer
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states, )

outputs = (hidden_states, )
if self.output_hidden_states:
outputs = outputs + (all_hidden_states, )
if self.output_attentions:
outputs = outputs + (all_attentions, )
return outputs # last-layer hidden state, (all hidden states), (all attentions)


class BertPooler(nn.Module):

def __init__(self, config):
super(BertPooler, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()

def forward(self, hidden_states):
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output


class BertPredictionHeadTransform(nn.Module):

def __init__(self, config):
super(BertPredictionHeadTransform, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act,
str) or (sys.version_info[0] == 2
and isinstance(config.hidden_act, unicode)):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = BertLayerNorm(
config.hidden_size, eps=config.layer_norm_eps)

def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states


class BertLMPredictionHead(nn.Module):

def __init__(self, config):
super(BertLMPredictionHead, self).__init__()
self.transform = BertPredictionHeadTransform(config)

# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.decoder = nn.Linear(
config.hidden_size, config.vocab_size, bias=False)

self.bias = nn.Parameter(torch.zeros(config.vocab_size))

def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states) + self.bias
return hidden_states


class BertOnlyMLMHead(nn.Module):

def __init__(self, config):
super(BertOnlyMLMHead, self).__init__()
self.predictions = BertLMPredictionHead(config)

def forward(self, sequence_output):
prediction_scores = self.predictions(sequence_output)
return prediction_scores


class BertOnlyNSPHead(nn.Module):

def __init__(self, config):
super(BertOnlyNSPHead, self).__init__()
self.seq_relationship = nn.Linear(config.hidden_size, 2)

def forward(self, pooled_output):
seq_relationship_score = self.seq_relationship(pooled_output)
return seq_relationship_score


class BertPreTrainingHeads(nn.Module):

def __init__(self, config):
super(BertPreTrainingHeads, self).__init__()
self.predictions = BertLMPredictionHead(config)
self.seq_relationship = nn.Linear(config.hidden_size, 2)

def forward(self, sequence_output, pooled_output):
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
return prediction_scores, seq_relationship_score


class BertPreTrainedModel(nn.Module):
config_class = BertConfig
base_model_prefix = 'bert'

def __init__(self, config):
super(BertPreTrainedModel, self).__init__()
self.config = config

def _init_weights(self, module):
""" Initialize the weights """
if isinstance(module, (nn.Linear, nn.Embedding)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(
mean=0.0, std=self.config.initializer_range)
elif isinstance(module, BertLayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()


class BertModel(BertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
Sequence of hidden-states at the output of the last layer of the model.
**pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
Last layer hidden-state of the first token of the sequence (classification token)
further processed by a Linear layer and a Tanh activation function. The Linear
layer weights are trained from the next sentence prediction (classification)
            objective during Bert pretraining. This output is usually *not* a good summary
            of the semantic content of the input; you're often better off averaging or pooling
            the sequence of hidden-states for the whole input sequence.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer)
of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax,
used to compute the weighted average in the self-attention heads.

Examples::

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple

"""

def __init__(self, config):
super(BertModel, self).__init__(config)

self.embeddings = BertEmbeddings(config)
self.encoder = BertEncoder(config)
self.pooler = BertPooler(config)

self.apply(self._init_weights)

def forward(self,
input_ids,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None):
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)

# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
        # this attention mask is simpler than the triangular masking of causal attention
        # used in OpenAI GPT; we just need to prepare the broadcast dimension here.
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
extended_attention_mask = extended_attention_mask.to(
dtype=next(self.parameters()).dtype) # fp16 compatibility
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
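        # For example, a (batch_size, seq_len) mask of ones and zeros becomes a
        # (batch_size, 1, 1, seq_len) tensor of 0.0 / -10000.0 values that is
        # broadcast-added to the pre-softmax attention scores of every head.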

# Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
if head_mask is not None:
if head_mask.dim() == 1:
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(
-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.num_hidden_layers, -1,
-1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(
-1) # We can specify head_mask for each layer
                head_mask = head_mask.to(dtype=next(self.parameters(
                )).dtype)  # switch to float if needed + fp16 compatibility
else:
head_mask = [None] * self.config.num_hidden_layers

embedding_output = self.embeddings(
input_ids,
position_ids=position_ids,
token_type_ids=token_type_ids)
encoder_outputs = self.encoder(
embedding_output, extended_attention_mask, head_mask=head_mask)
sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output)

outputs = (
sequence_output,
pooled_output,
) + encoder_outputs[
1:] # add hidden_states and attentions if they are here
return outputs # sequence_output, pooled_output, (hidden_states), (attentions)
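
A minimal usage sketch for the encoder above (not part of the diff; it assumes the BertConfig added in configuration_bert.py in this commit can be constructed with its default arguments and exposes vocab_size and hidden_size fields):

    # Hypothetical illustration only: build the encoder from a default config
    # and run a small batch of token ids through it.
    import torch
    from modelscope.models.multi_modal.clip.configuration_bert import BertConfig
    from modelscope.models.multi_modal.clip.modeling_bert import BertModel

    config = BertConfig()                  # default hyper-parameters
    model = BertModel(config).eval()
    input_ids = torch.randint(0, config.vocab_size, (1, 16))  # batch of 1, 16 tokens
    with torch.no_grad():
        sequence_output, pooled_output = model(input_ids)[:2]
    print(sequence_output.shape)           # -> (1, 16, config.hidden_size)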

+ 61
- 1
modelscope/models/multi_modal/mplug/clip/clip.py View File

@@ -5,9 +5,69 @@ from typing import Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from torch import nn

from modelscope.models.multi_modal.clip.clip_vit import Transformer

class QuickGELU(nn.Module):

def forward(self, x: torch.Tensor):
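        # Sigmoid-based approximation of GELU, x * sigmoid(1.702 * x),
        # matching the activation used in the original CLIP implementation.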
return x * torch.sigmoid(1.702 * x)


class ResidualAttentionBlock(nn.Module):

def __init__(self,
d_model: int,
n_head: int,
attn_mask: torch.Tensor = None):
super().__init__()
self.attn = nn.MultiheadAttention(d_model, n_head)
self.ln_1 = LayerNorm(d_model)
self.mlp = nn.Sequential(
OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
('gelu', QuickGELU()),
('c_proj', nn.Linear(d_model * 4, d_model))]))
self.ln_2 = LayerNorm(d_model)
self.attn_mask = attn_mask

def attention(self, x: torch.Tensor):
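        # Re-cast the optional attention mask to the input's dtype and device on
        # every call so it stays valid under fp16 and multi-GPU execution.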
self.attn_mask = self.attn_mask.to(
dtype=x.dtype,
device=x.device) if self.attn_mask is not None else None
return self.attn(
x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

def forward(self, x: torch.Tensor):
x = x + self.attention(self.ln_1(x))
x = x + self.mlp(self.ln_2(x))
return x


class Transformer(nn.Module):

def __init__(self,
width: int,
layers: int,
heads: int,
attn_mask: torch.Tensor = None,
use_grad_ckp: bool = True):
super().__init__()
self.width = width
self.layers = layers
self.resblocks = nn.Sequential(*[
ResidualAttentionBlock(width, heads, attn_mask)
for _ in range(layers)
])
self.use_grad_ckp = use_grad_ckp

def forward(self, x: torch.Tensor):
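        # With gradient checkpointing enabled, each block's activations are
        # recomputed during the backward pass, trading compute for lower peak memory.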
if self.use_grad_ckp:
for each_block in self.resblocks:
x = checkpoint.checkpoint(each_block, x)
return x
else:
return self.resblocks(x)


class Bottleneck(nn.Module):


+ 39
- 31
tests/pipelines/test_multi_modal_embedding.py View File

@@ -2,50 +2,58 @@

import unittest

import numpy as np
import torch

from modelscope.models import Model
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.test_utils import test_level


class MultiModalEmbeddingTest(unittest.TestCase):
model_id = 'damo/multi-modal_clip-vit-large-patch14_zh'
test_text = {'text': '一张风景图'}
model_id = 'damo/multi-modal_clip-vit-base-patch16_zh'
test_input = {'text': '皮卡丘'}
model_version = 'dev'

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run(self):
pipe_line_multi_modal_embedding = pipeline(
Tasks.multi_modal_embedding, model=self.model_id)
test_str_embedding = pipe_line_multi_modal_embedding(
self.test_text)['text_embedding']
print(np.sum(np.abs(test_str_embedding)))

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
pipeline_multi_modal_embedding = pipeline(
Tasks.multi_modal_embedding,
model=self.model_id,
model_revision=self.model_version)
text_embedding = pipeline_multi_modal_embedding(
self.test_input)[OutputKeys.TEXT_EMBEDDING]
print('l1-norm: {}'.format(
torch.norm(text_embedding, p=1, dim=-1).item()))
print('l2-norm: {}'.format(torch.norm(text_embedding,
dim=-1).item())) # should be 1.0

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_with_model_from_modelhub(self):
model = Model.from_pretrained(self.model_id)
pipe_line_multi_modal_embedding = pipeline(
task=Tasks.multi_modal_embedding, model=model)
test_str_embedding = pipe_line_multi_modal_embedding(
self.test_text)['text_embedding']
print(np.sum(np.abs(test_str_embedding)))

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_with_model_name(self):
pipe_line_multi_modal_embedding = pipeline(
task=Tasks.multi_modal_embedding, model=self.model_id)
test_str_embedding = pipe_line_multi_modal_embedding(
self.test_text)['text_embedding']
print(np.sum(np.abs(test_str_embedding)))

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
pipeline_multi_modal_embedding = pipeline(
task=Tasks.multi_modal_embedding,
model=model,
model_revision=self.model_version)
text_embedding = pipeline_multi_modal_embedding(
self.test_input)[OutputKeys.TEXT_EMBEDDING]
print('l1-norm: {}'.format(
torch.norm(text_embedding, p=1, dim=-1).item()))
print('l2-norm: {}'.format(torch.norm(text_embedding,
dim=-1).item())) # should be 1.0

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_with_default_model(self):
pipe_line_multi_modal_embedding = pipeline(
task=Tasks.multi_modal_embedding)
test_str_embedding = pipe_line_multi_modal_embedding(
self.test_text)['text_embedding']
print(np.sum(np.abs(test_str_embedding)))
pipeline_multi_modal_embedding = pipeline(
task=Tasks.multi_modal_embedding,
model_revision=self.model_version)
text_embedding = pipeline_multi_modal_embedding(
self.test_input)[OutputKeys.TEXT_EMBEDDING]
print('l1-norm: {}'.format(
torch.norm(text_embedding, p=1, dim=-1).item()))
print('l2-norm: {}'.format(torch.norm(text_embedding,
dim=-1).item())) # should be 1.0


if __name__ == '__main__':
    unittest.main()


+ 0
- 60
tests/trainers/test_clip_multi_modal_embedding_trainer.py View File

@@ -1,60 +0,0 @@
import os
import tempfile
import unittest

import requests
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer
from modelscope.utils.constant import ModelFile
from modelscope.utils.logger import get_logger
from modelscope.utils.test_utils import test_level

logger = get_logger()


def clip_train_worker(local_rank, ngpus, node_size, node_rank):
global_rank = local_rank + node_rank * ngpus
dist_world_size = node_size * ngpus

dist.init_process_group(
backend='nccl', world_size=dist_world_size, rank=global_rank)

model_id = 'damo/multi-modal_clip-vit-large-patch14_zh'
local_model_dir = snapshot_download(model_id)

default_args = dict(
cfg_file='{}/{}'.format(local_model_dir, ModelFile.CONFIGURATION),
model=model_id,
device_id=local_rank)
trainer = build_trainer(
name=Trainers.clip_multi_modal_embedding, default_args=default_args)

trainer.train()
trainer.evaluate()


class CLIPMultiModalEmbeddingTrainerTest(unittest.TestCase):

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_trainer(self):
os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_PORT'] = '2001'
NODE_SIZE, NODE_RANK = 1, 0
logger.info('Train clip with {} machines'.format(NODE_SIZE))
ngpus = torch.cuda.device_count()
logger.info('Machine: {} has {} GPUs'.format(NODE_RANK, ngpus))
mp.spawn(
clip_train_worker,
nprocs=ngpus,
args=(ngpus, NODE_SIZE, NODE_RANK))
logger.info('Training done')


if __name__ == '__main__':
unittest.main()