diff --git a/data/test/images/image_depth_estimation.jpg b/data/test/images/image_depth_estimation.jpg new file mode 100644 index 00000000..1a5943d1 --- /dev/null +++ b/data/test/images/image_depth_estimation.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b230497f6ca10be42aed92b86db435d74fd7306746a059b4ad1e0d6b0652806 +size 35694 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 4a416875..3d566da8 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -36,6 +36,7 @@ class Models(object): swinL_semantic_segmentation = 'swinL-semantic-segmentation' vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' text_driven_segmentation = 'text-driven-segmentation' + newcrfs_depth_estimation = 'newcrfs-depth-estimation' resnet50_bert = 'resnet50-bert' referring_video_object_segmentation = 'swinT-referring-video-object-segmentation' fer = 'fer' @@ -210,6 +211,7 @@ class Pipelines(object): video_summarization = 'googlenet_pgl_video_summarization' language_guided_video_summarization = 'clip-it-video-summarization' image_semantic_segmentation = 'image-semantic-segmentation' + image_depth_estimation = 'image-depth-estimation' image_reid_person = 'passvitb-image-reid-person' image_inpainting = 'fft-inpainting' text_driven_segmentation = 'text-driven-segmentation' diff --git a/modelscope/models/base/base_model.py b/modelscope/models/base/base_model.py index 721478c3..1f464bf3 100644 --- a/modelscope/models/base/base_model.py +++ b/modelscope/models/base/base_model.py @@ -5,10 +5,10 @@ from abc import ABC, abstractmethod from typing import Any, Callable, Dict, List, Optional, Union from modelscope.hub.snapshot_download import snapshot_download -from modelscope.models.builder import MODELS, build_model +from modelscope.models.builder import build_model from modelscope.utils.checkpoint import save_checkpoint, save_pretrained from modelscope.utils.config import Config -from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile, Tasks +from modelscope.utils.constant import DEFAULT_MODEL_REVISION, Invoke, ModelFile from modelscope.utils.device import verify_device from modelscope.utils.logger import get_logger @@ -94,6 +94,10 @@ class Model(ABC): if prefetched is not None: kwargs.pop('model_prefetched') + invoked_by = kwargs.get(Invoke.KEY) + if invoked_by is not None: + kwargs.pop(Invoke.KEY) + if osp.exists(model_name_or_path): local_model_dir = model_name_or_path else: @@ -101,7 +105,13 @@ class Model(ABC): raise RuntimeError( 'Expecting model is pre-fetched locally, but is not found.' 
) - local_model_dir = snapshot_download(model_name_or_path, revision) + + if invoked_by is not None: + invoked_by = '%s/%s' % (Invoke.KEY, invoked_by) + else: + invoked_by = '%s/%s' % (Invoke.KEY, Invoke.PRETRAINED) + local_model_dir = snapshot_download( + model_name_or_path, revision, user_agent=invoked_by) logger.info(f'initialize model from {local_model_dir}') if cfg_dict is not None: cfg = cfg_dict @@ -133,6 +143,7 @@ class Model(ABC): model.cfg = cfg model.name = model_name_or_path + model.model_dir = local_model_dir return model def save_pretrained(self, diff --git a/modelscope/models/cv/body_3d_keypoints/body_3d_pose.py b/modelscope/models/cv/body_3d_keypoints/body_3d_pose.py index 3e920d12..6bedf2f3 100644 --- a/modelscope/models/cv/body_3d_keypoints/body_3d_pose.py +++ b/modelscope/models/cv/body_3d_keypoints/body_3d_pose.py @@ -224,8 +224,8 @@ class BodyKeypointsDetection3D(TorchModel): lst_pose2d_cannoical.append(pose2d_canonical[:, i - pad:i + pad + 1]) - input_pose2d_rr = torch.concat(lst_pose2d_cannoical, axis=0) - input_pose2d_cannoical = torch.concat(lst_pose2d_cannoical, axis=0) + input_pose2d_rr = torch.cat(lst_pose2d_cannoical, axis=0) + input_pose2d_cannoical = torch.cat(lst_pose2d_cannoical, axis=0) if self.cfg.model.MODEL.USE_CANONICAL_COORDS: input_pose2d_abs = input_pose2d_cannoical.clone() diff --git a/modelscope/models/cv/image_depth_estimation/__init__.py b/modelscope/models/cv/image_depth_estimation/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/models/cv/image_depth_estimation/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. diff --git a/modelscope/models/cv/image_depth_estimation/networks/__init__.py b/modelscope/models/cv/image_depth_estimation/networks/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/models/cv/image_depth_estimation/networks/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. diff --git a/modelscope/models/cv/image_depth_estimation/networks/newcrf_depth.py b/modelscope/models/cv/image_depth_estimation/networks/newcrf_depth.py new file mode 100644 index 00000000..1e5444e2 --- /dev/null +++ b/modelscope/models/cv/image_depth_estimation/networks/newcrf_depth.py @@ -0,0 +1,215 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .newcrf_layers import NewCRF +from .swin_transformer import SwinTransformer +from .uper_crf_head import PSP + + +class NewCRFDepth(nn.Module): + """ + Depth network based on neural window FC-CRFs architecture. 
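+
+    Args:
+        version (str): Swin backbone variant plus window size, e.g. 'large07'
+            selects the 'large' backbone with window size 7.
+        inv_depth (bool): Stored flag; not used by the forward pass in this
+            implementation.
+        pretrained (str, optional): Path to pre-trained backbone weights.
+            Defaults to None.
+        frozen_stages (int): Backbone stages to freeze; -1 means not freezing
+            any parameters.
+        min_depth (float): Lower bound of the depth range (stored, not applied
+            in forward).
+        max_depth (float): Upper bound of the depth range; the sigmoid output
+            of the depth head is scaled by this value.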
+ """ + + def __init__(self, + version=None, + inv_depth=False, + pretrained=None, + frozen_stages=-1, + min_depth=0.1, + max_depth=100.0, + **kwargs): + super().__init__() + + self.inv_depth = inv_depth + self.with_auxiliary_head = False + self.with_neck = False + + norm_cfg = dict(type='BN', requires_grad=True) + # norm_cfg = dict(type='GN', requires_grad=True, num_groups=8) + + window_size = int(version[-2:]) + + if version[:-2] == 'base': + embed_dim = 128 + depths = [2, 2, 18, 2] + num_heads = [4, 8, 16, 32] + in_channels = [128, 256, 512, 1024] + elif version[:-2] == 'large': + embed_dim = 192 + depths = [2, 2, 18, 2] + num_heads = [6, 12, 24, 48] + in_channels = [192, 384, 768, 1536] + elif version[:-2] == 'tiny': + embed_dim = 96 + depths = [2, 2, 6, 2] + num_heads = [3, 6, 12, 24] + in_channels = [96, 192, 384, 768] + + backbone_cfg = dict( + embed_dim=embed_dim, + depths=depths, + num_heads=num_heads, + window_size=window_size, + ape=False, + drop_path_rate=0.3, + patch_norm=True, + use_checkpoint=False, + frozen_stages=frozen_stages) + + embed_dim = 512 + decoder_cfg = dict( + in_channels=in_channels, + in_index=[0, 1, 2, 3], + pool_scales=(1, 2, 3, 6), + channels=embed_dim, + dropout_ratio=0.0, + num_classes=32, + norm_cfg=norm_cfg, + align_corners=False) + + self.backbone = SwinTransformer(**backbone_cfg) + # v_dim = decoder_cfg['num_classes'] * 4 + win = 7 + crf_dims = [128, 256, 512, 1024] + v_dims = [64, 128, 256, embed_dim] + self.crf3 = NewCRF( + input_dim=in_channels[3], + embed_dim=crf_dims[3], + window_size=win, + v_dim=v_dims[3], + num_heads=32) + self.crf2 = NewCRF( + input_dim=in_channels[2], + embed_dim=crf_dims[2], + window_size=win, + v_dim=v_dims[2], + num_heads=16) + self.crf1 = NewCRF( + input_dim=in_channels[1], + embed_dim=crf_dims[1], + window_size=win, + v_dim=v_dims[1], + num_heads=8) + self.crf0 = NewCRF( + input_dim=in_channels[0], + embed_dim=crf_dims[0], + window_size=win, + v_dim=v_dims[0], + num_heads=4) + + self.decoder = PSP(**decoder_cfg) + self.disp_head1 = DispHead(input_dim=crf_dims[0]) + + self.up_mode = 'bilinear' + if self.up_mode == 'mask': + self.mask_head = nn.Sequential( + nn.Conv2d(crf_dims[0], 64, 3, padding=1), + nn.ReLU(inplace=True), nn.Conv2d(64, 16 * 9, 1, padding=0)) + + self.min_depth = min_depth + self.max_depth = max_depth + + self.init_weights(pretrained=pretrained) + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone and heads. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
+ """ + # print(f'== Load encoder backbone from: {pretrained}') + self.backbone.init_weights(pretrained=pretrained) + self.decoder.init_weights() + if self.with_auxiliary_head: + if isinstance(self.auxiliary_head, nn.ModuleList): + for aux_head in self.auxiliary_head: + aux_head.init_weights() + else: + self.auxiliary_head.init_weights() + + def upsample_mask(self, disp, mask): + """ Upsample disp [H/4, W/4, 1] -> [H, W, 1] using convex combination """ + N, _, H, W = disp.shape + mask = mask.view(N, 1, 9, 4, 4, H, W) + mask = torch.softmax(mask, dim=2) + + up_disp = F.unfold(disp, kernel_size=3, padding=1) + up_disp = up_disp.view(N, 1, 9, 1, 1, H, W) + + up_disp = torch.sum(mask * up_disp, dim=2) + up_disp = up_disp.permute(0, 1, 4, 2, 5, 3) + return up_disp.reshape(N, 1, 4 * H, 4 * W) + + def forward(self, imgs): + + feats = self.backbone(imgs) + if self.with_neck: + feats = self.neck(feats) + + ppm_out = self.decoder(feats) + + e3 = self.crf3(feats[3], ppm_out) + e3 = nn.PixelShuffle(2)(e3) + e2 = self.crf2(feats[2], e3) + e2 = nn.PixelShuffle(2)(e2) + e1 = self.crf1(feats[1], e2) + e1 = nn.PixelShuffle(2)(e1) + e0 = self.crf0(feats[0], e1) + + if self.up_mode == 'mask': + mask = self.mask_head(e0) + d1 = self.disp_head1(e0, 1) + d1 = self.upsample_mask(d1, mask) + else: + d1 = self.disp_head1(e0, 4) + + depth = d1 * self.max_depth + + return depth + + +class DispHead(nn.Module): + + def __init__(self, input_dim=100): + super(DispHead, self).__init__() + # self.norm1 = nn.BatchNorm2d(input_dim) + self.conv1 = nn.Conv2d(input_dim, 1, 3, padding=1) + # self.relu = nn.ReLU(inplace=True) + self.sigmoid = nn.Sigmoid() + + def forward(self, x, scale): + # x = self.relu(self.norm1(x)) + x = self.sigmoid(self.conv1(x)) + if scale > 1: + x = upsample(x, scale_factor=scale) + return x + + +class DispUnpack(nn.Module): + + def __init__(self, input_dim=100, hidden_dim=128): + super(DispUnpack, self).__init__() + self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1) + self.conv2 = nn.Conv2d(hidden_dim, 16, 3, padding=1) + self.relu = nn.ReLU(inplace=True) + self.sigmoid = nn.Sigmoid() + self.pixel_shuffle = nn.PixelShuffle(4) + + def forward(self, x, output_size): + x = self.relu(self.conv1(x)) + x = self.sigmoid(self.conv2(x)) # [b, 16, h/4, w/4] + # x = torch.reshape(x, [x.shape[0], 1, x.shape[2]*4, x.shape[3]*4]) + x = self.pixel_shuffle(x) + + return x + + +def upsample(x, scale_factor=2, mode='bilinear', align_corners=False): + """Upsample input tensor by a factor of 2 + """ + return F.interpolate( + x, scale_factor=scale_factor, mode=mode, align_corners=align_corners) diff --git a/modelscope/models/cv/image_depth_estimation/networks/newcrf_layers.py b/modelscope/models/cv/image_depth_estimation/networks/newcrf_layers.py new file mode 100644 index 00000000..a57081e3 --- /dev/null +++ b/modelscope/models/cv/image_depth_estimation/networks/newcrf_layers.py @@ -0,0 +1,504 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
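+# Building blocks of the neural window FC-CRFs (NeWCRFs) decoder: window
+# partition/reverse helpers, a window attention module that computes query/key
+# from the input feature and takes its value from an externally supplied
+# decoder feature, CRF blocks with optional shifted windows, and the NewCRF
+# module that stacks them into a single decoder stage.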
+import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ + + +class Mlp(nn.Module): + """ Multilayer perceptron.""" + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, + C) + windows = x.permute(0, 1, 3, 2, 4, + 5).contiguous().view(-1, window_size, window_size, C) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, + window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class WindowAttention(nn.Module): + """ Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 + """ + + def __init__(self, + dim, + window_size, + num_heads, + v_dim, + qkv_bias=True, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), + num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, + None] - coords_flatten[:, + None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute( + 1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, + 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer('relative_position_index', + relative_position_index) + + self.qk = nn.Linear(dim, dim * 2, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(v_dim, v_dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, v, mask=None): + """ Forward function. + + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qk = self.qk(x).reshape(B_, N, 2, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k = qk[0], qk[ + 1] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], + -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, + N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + # assert self.dim % v.shape[-1] == 0, "self.dim % v.shape[-1] != 0" + # repeat_num = self.dim // v.shape[-1] + # v = v.view(B_, N, self.num_heads // repeat_num, -1).transpose(1, 2).repeat(1, repeat_num, 1, 1) + + assert self.dim == v.shape[-1], 'self.dim != v.shape[-1]' + v = v.view(B_, N, self.num_heads, -1).transpose(1, 2) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class CRFBlock(nn.Module): + """ CRF Block. + + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. 
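+        v_dim (int): Number of channels of the value feature v injected
+            from the decoder branch.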
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, + dim, + num_heads, + v_dim, + window_size=7, + shift_size=0, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.v_dim = v_dim + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + assert 0 <= self.shift_size < self.window_size, 'shift_size must in 0-window_size' + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + v_dim=v_dim, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(v_dim) + mlp_hidden_dim = int(v_dim * mlp_ratio) + self.mlp = Mlp( + in_features=v_dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + self.H = None + self.W = None + + def forward(self, x, v, mask_matrix): + """ Forward function. + + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + mask_matrix: Attention mask for cyclic shift. + """ + B, L, C = x.shape + H, W = self.H, self.W + assert L == H * W, 'input feature has wrong size' + + shortcut = x + x = self.norm1(x) + x = x.view(B, H, W, C) + + # pad feature maps to multiples of window size + pad_l = pad_t = 0 + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) + v = F.pad(v, (0, 0, pad_l, pad_r, pad_t, pad_b)) + _, Hp, Wp, _ = x.shape + + # cyclic shift + if self.shift_size > 0: + shifted_x = torch.roll( + x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + shifted_v = torch.roll( + v, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + attn_mask = mask_matrix + else: + shifted_x = x + shifted_v = v + attn_mask = None + + # partition windows + x_windows = window_partition( + shifted_x, self.window_size) # nW*B, window_size, window_size, C + x_windows = x_windows.view(-1, self.window_size * self.window_size, + C) # nW*B, window_size*window_size, C + v_windows = window_partition( + shifted_v, self.window_size) # nW*B, window_size, window_size, C + v_windows = v_windows.view( + -1, self.window_size * self.window_size, + v_windows.shape[-1]) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn( + x_windows, v_windows, + mask=attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, + self.window_size, self.v_dim) + shifted_x = window_reverse(attn_windows, self.window_size, Hp, + Wp) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll( + shifted_x, + shifts=(self.shift_size, self.shift_size), + dims=(1, 2)) + else: + x = shifted_x + + if pad_r > 0 or 
pad_b > 0: + x = x[:, :H, :W, :].contiguous() + + x = x.view(B, H * W, self.v_dim) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + +class BasicCRFLayer(nn.Module): + """ A basic NeWCRFs layer for one stage. + + Args: + dim (int): Number of feature channels + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (int): Local window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + """ + + def __init__(self, + dim, + depth, + num_heads, + v_dim, + window_size=7, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False): + super().__init__() + self.window_size = window_size + self.shift_size = window_size // 2 + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList([ + CRFBlock( + dim=dim, + num_heads=num_heads, + v_dim=v_dim, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer) for i in range(depth) + ]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x, v, H, W): + """ Forward function. + + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. 
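+            v: Value feature from the decoder, tensor size (B, H, W, C).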
+ """ + + # calculate attention mask for SW-MSA + Hp = int(np.ceil(H / self.window_size)) * self.window_size + Wp = int(np.ceil(W / self.window_size)) * self.window_size + img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, + -self.shift_size), slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, + -self.shift_size), slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition( + img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, + self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, + float(-100.0)).masked_fill( + attn_mask == 0, float(0.0)) + + for blk in self.blocks: + blk.H, blk.W = H, W + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x, attn_mask) + else: + x = blk(x, v, attn_mask) + if self.downsample is not None: + x_down = self.downsample(x, H, W) + Wh, Ww = (H + 1) // 2, (W + 1) // 2 + return x, H, W, x_down, Wh, Ww + else: + return x, H, W, x, H, W + + +class NewCRF(nn.Module): + + def __init__(self, + input_dim=96, + embed_dim=96, + v_dim=64, + window_size=7, + num_heads=4, + depth=2, + patch_size=4, + in_chans=3, + norm_layer=nn.LayerNorm, + patch_norm=True): + super().__init__() + + self.embed_dim = embed_dim + self.patch_norm = patch_norm + + if input_dim != embed_dim: + self.proj_x = nn.Conv2d(input_dim, embed_dim, 3, padding=1) + else: + self.proj_x = None + + if v_dim != embed_dim: + self.proj_v = nn.Conv2d(v_dim, embed_dim, 3, padding=1) + elif embed_dim % v_dim == 0: + self.proj_v = None + + v_dim = embed_dim + assert v_dim == embed_dim + + self.crf_layer = BasicCRFLayer( + dim=embed_dim, + depth=depth, + num_heads=num_heads, + v_dim=v_dim, + window_size=window_size, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + norm_layer=norm_layer, + downsample=None, + use_checkpoint=False) + + layer = norm_layer(embed_dim) + layer_name = 'norm_crf' + self.add_module(layer_name, layer) + + def forward(self, x, v): + if self.proj_x is not None: + x = self.proj_x(x) + if self.proj_v is not None: + v = self.proj_v(v) + + Wh, Ww = x.size(2), x.size(3) + x = x.flatten(2).transpose(1, 2) + v = v.transpose(1, 2).transpose(2, 3) + + x_out, H, W, x, Wh, Ww = self.crf_layer(x, v, Wh, Ww) + norm_layer = getattr(self, 'norm_crf') + x_out = norm_layer(x_out) + out = x_out.view(-1, H, W, self.embed_dim).permute(0, 3, 1, + 2).contiguous() + + return out diff --git a/modelscope/models/cv/image_depth_estimation/networks/newcrf_utils.py b/modelscope/models/cv/image_depth_estimation/networks/newcrf_utils.py new file mode 100644 index 00000000..aa407602 --- /dev/null +++ b/modelscope/models/cv/image_depth_estimation/networks/newcrf_utils.py @@ -0,0 +1,272 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
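+# Utility functions used by the NeWCRFs networks: a resize wrapper around
+# F.interpolate, weight initialization, distributed-rank helpers and
+# checkpoint/state-dict loading with Swin-specific handling (absolute position
+# embedding reshape and relative position bias table interpolation).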
+import os +import os.path as osp +import pkgutil +import warnings +from collections import OrderedDict +from importlib import import_module + +import torch +import torch.nn as nn +import torchvision +from torch import distributed as dist +from torch.nn import functional as F +from torch.nn.parallel import DataParallel, DistributedDataParallel +from torch.utils import model_zoo + +TORCH_VERSION = torch.__version__ + + +def resize(input, + size=None, + scale_factor=None, + mode='nearest', + align_corners=None, + warning=True): + if warning: + if size is not None and align_corners: + input_h, input_w = tuple(int(x) for x in input.shape[2:]) + output_h, output_w = tuple(int(x) for x in size) + if output_h > input_h or output_w > output_h: + if ((output_h > 1 and output_w > 1 and input_h > 1 + and input_w > 1) and (output_h - 1) % (input_h - 1) + and (output_w - 1) % (input_w - 1)): + warnings.warn( + f'When align_corners={align_corners}, ' + 'the output would more aligned if ' + f'input size {(input_h, input_w)} is `x+1` and ' + f'out size {(output_h, output_w)} is `nx+1`') + if isinstance(size, torch.Size): + size = tuple(int(x) for x in size) + return F.interpolate(input, size, scale_factor, mode, align_corners) + + +def normal_init(module, mean=0, std=1, bias=0): + if hasattr(module, 'weight') and module.weight is not None: + nn.init.normal_(module.weight, mean, std) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def is_module_wrapper(module): + module_wrappers = (DataParallel, DistributedDataParallel) + return isinstance(module, module_wrappers) + + +def get_dist_info(): + if TORCH_VERSION < '1.0': + initialized = dist._initialized + else: + if dist.is_available(): + initialized = dist.is_initialized() + else: + initialized = False + if initialized: + rank = dist.get_rank() + world_size = dist.get_world_size() + else: + rank = 0 + world_size = 1 + return rank, world_size + + +def load_state_dict(module, state_dict, strict=False, logger=None): + """Load state_dict to a module. + + This method is modified from :meth:`torch.nn.Module.load_state_dict`. + Default value for ``strict`` is set to ``False`` and the message for + param mismatch will be shown even if strict is False. + + Args: + module (Module): Module that receives the state_dict. + state_dict (OrderedDict): Weights. + strict (bool): whether to strictly enforce that the keys + in :attr:`state_dict` match the keys returned by this module's + :meth:`~torch.nn.Module.state_dict` function. Default: ``False``. + logger (:obj:`logging.Logger`, optional): Logger to log the error + message. If not specified, print function will be used. 
+ """ + unexpected_keys = [] + all_missing_keys = [] + err_msg = [] + + metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + # use _load_from_state_dict to enable checkpoint version control + def load(module, prefix=''): + # recursively check parallel module in case that the model has a + # complicated structure, e.g., nn.Module(nn.Module(DDP)) + if is_module_wrapper(module): + module = module.module + local_metadata = {} if metadata is None else metadata.get( + prefix[:-1], {}) + module._load_from_state_dict(state_dict, prefix, local_metadata, True, + all_missing_keys, unexpected_keys, + err_msg) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + + load(module) + load = None # break load->load reference cycle + + # ignore "num_batches_tracked" of BN layers + missing_keys = [ + key for key in all_missing_keys if 'num_batches_tracked' not in key + ] + + if unexpected_keys: + err_msg.append('unexpected key in source ' + f'state_dict: {", ".join(unexpected_keys)}\n') + if missing_keys: + err_msg.append( + f'missing keys in source state_dict: {", ".join(missing_keys)}\n') + + rank, _ = get_dist_info() + if len(err_msg) > 0 and rank == 0: + err_msg.insert( + 0, 'The model and loaded state dict do not match exactly\n') + err_msg = '\n'.join(err_msg) + if strict: + raise RuntimeError(err_msg) + elif logger is not None: + logger.warning(err_msg) + else: + print(err_msg) + + +def load_url_dist(url, model_dir=None): + """In distributed setting, this function only download checkpoint at local + rank 0.""" + rank, world_size = get_dist_info() + rank = int(os.environ.get('LOCAL_RANK', rank)) + if rank == 0: + checkpoint = model_zoo.load_url(url, model_dir=model_dir) + if world_size > 1: + torch.distributed.barrier() + if rank > 0: + checkpoint = model_zoo.load_url(url, model_dir=model_dir) + return checkpoint + + +def get_torchvision_models(): + model_urls = dict() + for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__): + if ispkg: + continue + _zoo = import_module(f'torchvision.models.{name}') + if hasattr(_zoo, 'model_urls'): + _urls = getattr(_zoo, 'model_urls') + model_urls.update(_urls) + return model_urls + + +def _load_checkpoint(filename, map_location=None): + """Load checkpoint from somewhere (modelzoo, file, url). + + Args: + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for + details. + map_location (str | None): Same as :func:`torch.load`. Default: None. + + Returns: + dict | OrderedDict: The loaded checkpoint. It can be either an + OrderedDict storing model weights or a dict containing other + information, which depends on the checkpoint. + """ + if filename.startswith('modelzoo://'): + warnings.warn('The URL scheme of "modelzoo://" is deprecated, please ' + 'use "torchvision://" instead') + model_urls = get_torchvision_models() + model_name = filename[11:] + checkpoint = load_url_dist(model_urls[model_name]) + else: + if not osp.isfile(filename): + raise IOError(f'{filename} is not a checkpoint file') + checkpoint = torch.load(filename, map_location=map_location) + return checkpoint + + +def load_checkpoint(model, + filename, + map_location='cpu', + strict=False, + logger=None): + """Load checkpoint from a file or URI. + + Args: + model (Module): Module to load checkpoint. 
+ filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for + details. + map_location (str): Same as :func:`torch.load`. + strict (bool): Whether to allow different params for the model and + checkpoint. + logger (:mod:`logging.Logger` or None): The logger for error message. + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + checkpoint = _load_checkpoint(filename, map_location) + # OrderedDict is a subclass of dict + if not isinstance(checkpoint, dict): + raise RuntimeError( + f'No state_dict found in checkpoint file {filename}') + # get state_dict from checkpoint + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + elif 'model' in checkpoint: + state_dict = checkpoint['model'] + else: + state_dict = checkpoint + # strip prefix of state_dict + if list(state_dict.keys())[0].startswith('module.'): + state_dict = {k[7:]: v for k, v in state_dict.items()} + + # for MoBY, load model of online branch + if sorted(list(state_dict.keys()))[0].startswith('encoder'): + state_dict = { + k.replace('encoder.', ''): v + for k, v in state_dict.items() if k.startswith('encoder.') + } + + # reshape absolute position embedding + if state_dict.get('absolute_pos_embed') is not None: + absolute_pos_embed = state_dict['absolute_pos_embed'] + N1, L, C1 = absolute_pos_embed.size() + N2, C2, H, W = model.absolute_pos_embed.size() + if N1 != N2 or C1 != C2 or L != H * W: + logger.warning('Error in loading absolute_pos_embed, pass') + else: + state_dict['absolute_pos_embed'] = absolute_pos_embed.view( + N2, H, W, C2).permute(0, 3, 1, 2) + + # interpolate position bias table if needed + relative_position_bias_table_keys = [ + k for k in state_dict.keys() if 'relative_position_bias_table' in k + ] + for table_key in relative_position_bias_table_keys: + table_pretrained = state_dict[table_key] + table_current = model.state_dict()[table_key] + L1, nH1 = table_pretrained.size() + L2, nH2 = table_current.size() + if nH1 != nH2: + logger.warning(f'Error in loading {table_key}, pass') + else: + if L1 != L2: + S1 = int(L1**0.5) + S2 = int(L2**0.5) + table_pretrained_resized = F.interpolate( + table_pretrained.permute(1, 0).view(1, nH1, S1, S1), + size=(S2, S2), + mode='bicubic') + state_dict[table_key] = table_pretrained_resized.view( + nH2, L2).permute(1, 0) + + # load state_dict + load_state_dict(model, state_dict, strict, logger) + return checkpoint diff --git a/modelscope/models/cv/image_depth_estimation/networks/swin_transformer.py b/modelscope/models/cv/image_depth_estimation/networks/swin_transformer.py new file mode 100644 index 00000000..ba219b4a --- /dev/null +++ b/modelscope/models/cv/image_depth_estimation/networks/swin_transformer.py @@ -0,0 +1,706 @@ +# The implementation is adopted from Swin Transformer +# made publicly available under the MIT License at https://github.com/microsoft/Swin-Transformer + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ + +from .newcrf_utils import load_checkpoint + + +class Mlp(nn.Module): + """ Multilayer perceptron.""" + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = 
act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, + C) + windows = x.permute(0, 1, 3, 2, 4, + 5).contiguous().view(-1, window_size, window_size, C) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, + window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class WindowAttention(nn.Module): + """ Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + """ + + def __init__(self, + dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), + num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, + None] - coords_flatten[:, + None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute( + 1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, + 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer('relative_position_index', + relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """ Forward function. 
+ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[ + 2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], + -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, + N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class SwinTransformerBlock(nn.Module): + """ Swin Transformer Block. + + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, + dim, + num_heads, + window_size=7, + shift_size=0, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + assert 0 <= self.shift_size < self.window_size, 'shift_size must in 0-window_size' + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + self.H = None + self.W = None + + def forward(self, x, mask_matrix): + """ Forward function. + + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + mask_matrix: Attention mask for cyclic shift. 
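+
+        Returns:
+            Output feature of the block, tensor size (B, H*W, C).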
+ """ + B, L, C = x.shape + H, W = self.H, self.W + assert L == H * W, 'input feature has wrong size' + + shortcut = x + x = self.norm1(x) + x = x.view(B, H, W, C) + + # pad feature maps to multiples of window size + pad_l = pad_t = 0 + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) + _, Hp, Wp, _ = x.shape + + # cyclic shift + if self.shift_size > 0: + shifted_x = torch.roll( + x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + attn_mask = mask_matrix + else: + shifted_x = x + attn_mask = None + + # partition windows + x_windows = window_partition( + shifted_x, self.window_size) # nW*B, window_size, window_size, C + x_windows = x_windows.view(-1, self.window_size * self.window_size, + C) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn( + x_windows, mask=attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, + self.window_size, C) + shifted_x = window_reverse(attn_windows, self.window_size, Hp, + Wp) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll( + shifted_x, + shifts=(self.shift_size, self.shift_size), + dims=(1, 2)) + else: + x = shifted_x + + if pad_r > 0 or pad_b > 0: + x = x[:, :H, :W, :].contiguous() + + x = x.view(B, H * W, C) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + +class PatchMerging(nn.Module): + """ Patch Merging Layer + + Args: + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x, H, W): + """ Forward function. + + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + """ + B, L, C = x.shape + assert L == H * W, 'input feature has wrong size' + + x = x.view(B, H, W, C) + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + +class BasicLayer(nn.Module): + """ A basic Swin Transformer layer for one stage. + + Args: + dim (int): Number of feature channels + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (int): Local window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. 
Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + """ + + def __init__(self, + dim, + depth, + num_heads, + window_size=7, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False): + super().__init__() + self.window_size = window_size + self.shift_size = window_size // 2 + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList([ + SwinTransformerBlock( + dim=dim, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer) for i in range(depth) + ]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x, H, W): + """ Forward function. + + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + """ + + # calculate attention mask for SW-MSA + Hp = int(np.ceil(H / self.window_size)) * self.window_size + Wp = int(np.ceil(W / self.window_size)) * self.window_size + img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, + -self.shift_size), slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, + -self.shift_size), slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition( + img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, + self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, + float(-100.0)).masked_fill( + attn_mask == 0, float(0.0)) + + for blk in self.blocks: + blk.H, blk.W = H, W + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x, attn_mask) + else: + x = blk(x, attn_mask) + if self.downsample is not None: + x_down = self.downsample(x, H, W) + Wh, Ww = (H + 1) // 2, (W + 1) // 2 + return x, H, W, x_down, Wh, Ww + else: + return x, H, W, x, H, W + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + + Args: + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. 
Default: None + """ + + def __init__(self, + patch_size=4, + in_chans=3, + embed_dim=96, + norm_layer=None): + super().__init__() + patch_size = to_2tuple(patch_size) + self.patch_size = patch_size + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + """Forward function.""" + # padding + _, _, H, W = x.size() + if W % self.patch_size[1] != 0: + x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1])) + if H % self.patch_size[0] != 0: + x = F.pad(x, + (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) + + x = self.proj(x) # B C Wh Ww + if self.norm is not None: + Wh, Ww = x.size(2), x.size(3) + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww) + + return x + + +class SwinTransformer(nn.Module): + """ Swin Transformer backbone. + A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + + Args: + pretrain_img_size (int): Input image size for training the pretrained model, + used in absolute postion embedding. Default 224. + patch_size (int | tuple(int)): Patch size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + depths (tuple[int]): Depths of each Swin Transformer stage. + num_heads (tuple[int]): Number of attention head of each stage. + window_size (int): Window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): Dropout rate. + attn_drop_rate (float): Attention dropout rate. Default: 0. + drop_path_rate (float): Stochastic depth rate. Default: 0.2. + norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False. + patch_norm (bool): If True, add normalization after patch embedding. Default: True. + out_indices (Sequence[int]): Output from which stages. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 
+ """ + + def __init__(self, + pretrain_img_size=224, + patch_size=4, + in_chans=3, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + norm_layer=nn.LayerNorm, + ape=False, + patch_norm=True, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + use_checkpoint=False): + super().__init__() + + self.pretrain_img_size = pretrain_img_size + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.out_indices = out_indices + self.frozen_stages = frozen_stages + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + + # absolute position embedding + if self.ape: + pretrain_img_size = to_2tuple(pretrain_img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [ + pretrain_img_size[0] // patch_size[0], + pretrain_img_size[1] // patch_size[1] + ] + + self.absolute_pos_embed = nn.Parameter( + torch.zeros(1, embed_dim, patches_resolution[0], + patches_resolution[1])) + trunc_normal_(self.absolute_pos_embed, std=.02) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=int(embed_dim * 2**i_layer), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging if + (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint) + self.layers.append(layer) + + num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)] + self.num_features = num_features + + # add a norm layer for each output + for i_layer in out_indices: + layer = norm_layer(num_features[i_layer]) + layer_name = f'norm{i_layer}' + self.add_module(layer_name, layer) + + self._freeze_stages() + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + if self.frozen_stages >= 1 and self.ape: + self.absolute_pos_embed.requires_grad = False + + if self.frozen_stages >= 2: + self.pos_drop.eval() + for i in range(0, self.frozen_stages - 1): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
+ """ + + def _init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + if isinstance(pretrained, str): + self.apply(_init_weights) + # logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False) + elif pretrained is None: + self.apply(_init_weights) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Forward function.""" + x = self.patch_embed(x) + + Wh, Ww = x.size(2), x.size(3) + if self.ape: + # interpolate the position embedding to the corresponding size + absolute_pos_embed = F.interpolate( + self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic') + x = (x + absolute_pos_embed).flatten(2).transpose(1, + 2) # B Wh*Ww C + else: + x = x.flatten(2).transpose(1, 2) + x = self.pos_drop(x) + + outs = [] + for i in range(self.num_layers): + layer = self.layers[i] + x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) + + if i in self.out_indices: + norm_layer = getattr(self, f'norm{i}') + x_out = norm_layer(x_out) + + out = x_out.view(-1, H, W, + self.num_features[i]).permute(0, 3, 1, + 2).contiguous() + outs.append(out) + + return tuple(outs) + + def train(self, mode=True): + """Convert the model into training mode while keep layers freezed.""" + super(SwinTransformer, self).train(mode) + self._freeze_stages() diff --git a/modelscope/models/cv/image_depth_estimation/networks/uper_crf_head.py b/modelscope/models/cv/image_depth_estimation/networks/uper_crf_head.py new file mode 100644 index 00000000..93e1edf6 --- /dev/null +++ b/modelscope/models/cv/image_depth_estimation/networks/uper_crf_head.py @@ -0,0 +1,365 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule + +from .newcrf_utils import normal_init, resize + + +class PPM(nn.ModuleList): + """Pooling Pyramid Module used in PSPNet. + + Args: + pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid + Module. + in_channels (int): Input channels. + channels (int): Channels after modules, before conv_seg. + conv_cfg (dict|None): Config of conv layers. + norm_cfg (dict|None): Config of norm layers. + act_cfg (dict): Config of activation layers. + align_corners (bool): align_corners argument of F.interpolate. + """ + + def __init__(self, pool_scales, in_channels, channels, conv_cfg, norm_cfg, + act_cfg, align_corners): + super(PPM, self).__init__() + self.pool_scales = pool_scales + self.align_corners = align_corners + self.in_channels = in_channels + self.channels = channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + for pool_scale in pool_scales: + # == if batch size = 1, BN is not supported, change to GN + if pool_scale == 1: + norm_cfg = dict(type='GN', requires_grad=True, num_groups=256) + self.append( + nn.Sequential( + nn.AdaptiveAvgPool2d(pool_scale), + ConvModule( + self.in_channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=norm_cfg, + act_cfg=self.act_cfg))) + + def forward(self, x): + """Forward function.""" + ppm_outs = [] + for ppm in self: + ppm_out = ppm(x) + upsampled_ppm_out = resize( + ppm_out, + size=x.size()[2:], + mode='bilinear', + align_corners=self.align_corners) + ppm_outs.append(upsampled_ppm_out) + return ppm_outs + + +class BaseDecodeHead(nn.Module): + """Base class for BaseDecodeHead. 
+ + Args: + in_channels (int|Sequence[int]): Input channels. + channels (int): Channels after modules, before conv_seg. + num_classes (int): Number of classes. + dropout_ratio (float): Ratio of dropout layer. Default: 0.1. + conv_cfg (dict|None): Config of conv layers. Default: None. + norm_cfg (dict|None): Config of norm layers. Default: None. + act_cfg (dict): Config of activation layers. + Default: dict(type='ReLU') + in_index (int|Sequence[int]): Input feature index. Default: -1 + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + 'resize_concat': Multiple feature maps will be resize to the + same size as first one and than concat together. + Usually used in FCN head of HRNet. + 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + None: Only one select feature map is allowed. + Default: None. + loss_decode (dict): Config of decode loss. + Default: dict(type='CrossEntropyLoss'). + ignore_index (int | None): The label index to be ignored. When using + masked BCE loss, ignore_index should be set to None. Default: 255 + sampler (dict|None): The config of segmentation map sampler. + Default: None. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + """ + + def __init__(self, + in_channels, + channels, + *, + num_classes, + dropout_ratio=0.1, + conv_cfg=None, + norm_cfg=None, + act_cfg=dict(type='ReLU'), + in_index=-1, + input_transform=None, + loss_decode=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + ignore_index=255, + sampler=None, + align_corners=False): + super(BaseDecodeHead, self).__init__() + self._init_inputs(in_channels, in_index, input_transform) + self.channels = channels + self.num_classes = num_classes + self.dropout_ratio = dropout_ratio + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.in_index = in_index + # self.loss_decode = build_loss(loss_decode) + self.ignore_index = ignore_index + self.align_corners = align_corners + # if sampler is not None: + # self.sampler = build_pixel_sampler(sampler, context=self) + # else: + # self.sampler = None + + # self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1) + # self.conv1 = nn.Conv2d(channels, num_classes, 3, padding=1) + if dropout_ratio > 0: + self.dropout = nn.Dropout2d(dropout_ratio) + else: + self.dropout = None + self.fp16_enabled = False + + def extra_repr(self): + """Extra repr.""" + s = f'input_transform={self.input_transform}, ' \ + f'ignore_index={self.ignore_index}, ' \ + f'align_corners={self.align_corners}' + return s + + def _init_inputs(self, in_channels, in_index, input_transform): + """Check and initialize input transforms. + + The in_channels, in_index and input_transform must match. + Specifically, when input_transform is None, only single feature map + will be selected. So in_channels and in_index must be of type int. + When input_transform + + Args: + in_channels (int|Sequence[int]): Input channels. + in_index (int|Sequence[int]): Input feature index. + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + 'resize_concat': Multiple feature maps will be resize to the + same size as first one and than concat together. + Usually used in FCN head of HRNet. + 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + None: Only one select feature map is allowed. 
+ """ + + if input_transform is not None: + assert input_transform in ['resize_concat', 'multiple_select'] + self.input_transform = input_transform + self.in_index = in_index + if input_transform is not None: + assert isinstance(in_channels, (list, tuple)) + assert isinstance(in_index, (list, tuple)) + assert len(in_channels) == len(in_index) + if input_transform == 'resize_concat': + self.in_channels = sum(in_channels) + else: + self.in_channels = in_channels + else: + assert isinstance(in_channels, int) + assert isinstance(in_index, int) + self.in_channels = in_channels + + def init_weights(self): + """Initialize weights of classification layer.""" + # normal_init(self.conv_seg, mean=0, std=0.01) + # normal_init(self.conv1, mean=0, std=0.01) + + def _transform_inputs(self, inputs): + """Transform inputs for decoder. + + Args: + inputs (list[Tensor]): List of multi-level img features. + + Returns: + Tensor: The transformed inputs + """ + + if self.input_transform == 'resize_concat': + inputs = [inputs[i] for i in self.in_index] + upsampled_inputs = [ + resize( + input=x, + size=inputs[0].shape[2:], + mode='bilinear', + align_corners=self.align_corners) for x in inputs + ] + inputs = torch.cat(upsampled_inputs, dim=1) + elif self.input_transform == 'multiple_select': + inputs = [inputs[i] for i in self.in_index] + else: + inputs = inputs[self.in_index] + + return inputs + + def forward(self, inputs): + """Placeholder of forward function.""" + pass + + def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg): + """Forward function for training. + Args: + inputs (list[Tensor]): List of multi-level img features. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:Collect`. + gt_semantic_seg (Tensor): Semantic segmentation masks + used if the architecture supports semantic segmentation task. + train_cfg (dict): The training config. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + seg_logits = self.forward(inputs) + losses = self.losses(seg_logits, gt_semantic_seg) + return losses + + def forward_test(self, inputs, img_metas, test_cfg): + """Forward function for testing. + + Args: + inputs (list[Tensor]): List of multi-level img features. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:Collect`. + test_cfg (dict): The testing config. + + Returns: + Tensor: Output segmentation map. 
+ """ + return self.forward(inputs) + + +class UPerHead(BaseDecodeHead): + + def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs): + super(UPerHead, self).__init__( + input_transform='multiple_select', **kwargs) + # FPN Module + self.lateral_convs = nn.ModuleList() + self.fpn_convs = nn.ModuleList() + for in_channels in self.in_channels: # skip the top layer + l_conv = ConvModule( + in_channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + inplace=True) + fpn_conv = ConvModule( + self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + inplace=True) + self.lateral_convs.append(l_conv) + self.fpn_convs.append(fpn_conv) + + def forward(self, inputs): + """Forward function.""" + + inputs = self._transform_inputs(inputs) + + # build laterals + laterals = [ + lateral_conv(inputs[i]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + + # laterals.append(self.psp_forward(inputs)) + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + prev_shape = laterals[i - 1].shape[2:] + laterals[i - 1] += resize( + laterals[i], + size=prev_shape, + mode='bilinear', + align_corners=self.align_corners) + + # build outputs + fpn_outs = [ + self.fpn_convs[i](laterals[i]) + for i in range(used_backbone_levels - 1) + ] + # append psp feature + fpn_outs.append(laterals[-1]) + + return fpn_outs[0] + + +class PSP(BaseDecodeHead): + """Unified Perceptual Parsing for Scene Understanding. + + This head is the implementation of `UPerNet + `_. + + Args: + pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid + Module applied on the last feature. Default: (1, 2, 3, 6). + """ + + def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs): + super(PSP, self).__init__(input_transform='multiple_select', **kwargs) + # PSP Module + self.psp_modules = PPM( + pool_scales, + self.in_channels[-1], + self.channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + align_corners=self.align_corners) + self.bottleneck = ConvModule( + self.in_channels[-1] + len(pool_scales) * self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def psp_forward(self, inputs): + """Forward function of PSP module.""" + x = inputs[-1] + psp_outs = [x] + psp_outs.extend(self.psp_modules(x)) + psp_outs = torch.cat(psp_outs, dim=1) + output = self.bottleneck(psp_outs) + + return output + + def forward(self, inputs): + """Forward function.""" + inputs = self._transform_inputs(inputs) + + return self.psp_forward(inputs) diff --git a/modelscope/models/cv/image_depth_estimation/newcrfs_model.py b/modelscope/models/cv/image_depth_estimation/newcrfs_model.py new file mode 100644 index 00000000..4087cb67 --- /dev/null +++ b/modelscope/models/cv/image_depth_estimation/newcrfs_model.py @@ -0,0 +1,53 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os.path as osp
+
+import numpy as np
+import torch
+
+from modelscope.metainfo import Models
+from modelscope.models.base.base_torch_model import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.image_depth_estimation.networks.newcrf_depth import \
+    NewCRFDepth
+from modelscope.outputs import OutputKeys
+from modelscope.utils.constant import ModelFile, Tasks
+
+
+@MODELS.register_module(
+    Tasks.image_depth_estimation, module_name=Models.newcrfs_depth_estimation)
+class DepthEstimation(TorchModel):
+
+    def __init__(self, model_dir: str, **kwargs):
+        """Initialize a NeWCRFs depth estimation model.
+
+        Args:
+            model_dir (str): the model file root directory.
+        """
+        super().__init__(model_dir, **kwargs)
+
+        # build the NeWCRFs network
+        self.model = NewCRFDepth(
+            version='large07', inv_depth=False, max_depth=10)
+
+        # load the checkpoint and strip the 'module.' prefix
+        # left by DataParallel/DistributedDataParallel training
+        model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
+        checkpoint = torch.load(model_path)
+
+        state_dict = {}
+        for k in checkpoint['model'].keys():
+            if k.startswith('module.'):
+                state_dict[k[7:]] = checkpoint['model'][k]
+            else:
+                state_dict[k] = checkpoint['model'][k]
+        self.model.load_state_dict(state_dict)
+        self.model.eval()
+
+    def forward(self, inputs):
+        return self.model(inputs['imgs'])
+
+    def postprocess(self, inputs):
+        depth_result = inputs
+
+        results = {OutputKeys.DEPTHS: depth_result}
+        return results
+
+    def inference(self, data):
+        results = self.forward(data)
+
+        return results
diff --git a/modelscope/models/multi_modal/clip/model.py b/modelscope/models/multi_modal/clip/model.py
index c2d82dca..f6258c36 100644
--- a/modelscope/models/multi_modal/clip/model.py
+++ b/modelscope/models/multi_modal/clip/model.py
@@ -509,8 +509,8 @@ def convert_weights(model: nn.Module):
 @MODELS.register_module(Tasks.multi_modal_embedding, module_name=Models.clip)
 class CLIPForMultiModalEmbedding(TorchModel):
 
-    def __init__(self, model_dir, device_id=-1):
-        super().__init__(model_dir=model_dir, device_id=device_id)
+    def __init__(self, model_dir, *args, **kwargs):
+        super().__init__(model_dir=model_dir, *args, **kwargs)
 
         # Initialize the model.
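For orientation, a rough sketch of how the DepthEstimation model registered above can be driven directly; it mirrors the 'imgs'/DEPTHS dict contract used by the pipeline added later in this patch. The local model directory is a placeholder, and this direct use of Model.from_pretrained is illustrative rather than part of the change.

import torch

from modelscope.models import Model
from modelscope.outputs import OutputKeys

# Placeholder path to a directory containing pytorch_model.pt and configuration.json.
model = Model.from_pretrained('/path/to/cv_newcrfs_image-depth-estimation_indoor')

# The model expects a batched, channel-first float tensor under the 'imgs' key,
# at the 480x640 resolution the pipeline feeds it.
dummy = torch.rand(1, 3, 480, 640)
outputs = model.postprocess(model.inference({'imgs': dummy}))
depth = outputs[OutputKeys.DEPTHS]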
vision_model_config_file = '{}/vision_model_config.json'.format( diff --git a/modelscope/models/nlp/mglm/mglm_for_text_summarization.py b/modelscope/models/nlp/mglm/mglm_for_text_summarization.py index ea1dfb5a..2df11d6c 100644 --- a/modelscope/models/nlp/mglm/mglm_for_text_summarization.py +++ b/modelscope/models/nlp/mglm/mglm_for_text_summarization.py @@ -9,7 +9,6 @@ import numpy as np import torch import torch.nn.functional as F -from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Models from modelscope.models.base import Tensor, TorchModel from modelscope.models.builder import MODELS diff --git a/modelscope/models/science/unifold/data/data_ops.py b/modelscope/models/science/unifold/data/data_ops.py index 637aa0cd..c6acbfe2 100644 --- a/modelscope/models/science/unifold/data/data_ops.py +++ b/modelscope/models/science/unifold/data/data_ops.py @@ -730,7 +730,7 @@ def make_msa_feat_v2(batch): batch['cluster_profile'], deletion_mean_value, ] - batch['msa_feat'] = torch.concat(msa_feat, dim=-1) + batch['msa_feat'] = torch.cat(msa_feat, dim=-1) return batch @@ -1320,7 +1320,7 @@ def get_contiguous_crop_idx( asym_offset + this_start + csz)) asym_offset += ll - return torch.concat(crop_idxs) + return torch.cat(crop_idxs) def get_spatial_crop_idx( diff --git a/modelscope/models/science/unifold/modules/attentions.py b/modelscope/models/science/unifold/modules/attentions.py index d2319079..21d92ffd 100644 --- a/modelscope/models/science/unifold/modules/attentions.py +++ b/modelscope/models/science/unifold/modules/attentions.py @@ -217,7 +217,7 @@ class MSAAttention(nn.Module): if mask is not None else None) outputs.append( self.mha(q=cur_m, k=cur_m, v=cur_m, mask=cur_mask, bias=bias)) - return torch.concat(outputs, dim=-3) + return torch.cat(outputs, dim=-3) def _attn_forward(self, m, mask, bias: Optional[torch.Tensor] = None): m = self.layer_norm_m(m) diff --git a/modelscope/outputs/outputs.py b/modelscope/outputs/outputs.py index e3251e48..949a91b5 100644 --- a/modelscope/outputs/outputs.py +++ b/modelscope/outputs/outputs.py @@ -19,6 +19,7 @@ class OutputKeys(object): BOXES = 'boxes' KEYPOINTS = 'keypoints' MASKS = 'masks' + DEPTHS = 'depths' TEXT = 'text' POLYGONS = 'polygons' OUTPUT = 'output' diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index 86ea6dab..afe05cbe 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -16,7 +16,7 @@ from modelscope.outputs import TASK_OUTPUTS from modelscope.pipeline_inputs import TASK_INPUTS, check_input_type from modelscope.preprocessors import Preprocessor from modelscope.utils.config import Config -from modelscope.utils.constant import Frameworks, ModelFile +from modelscope.utils.constant import Frameworks, Invoke, ModelFile from modelscope.utils.device import (create_device, device_placement, verify_device) from modelscope.utils.hub import read_config, snapshot_download @@ -47,8 +47,10 @@ class Pipeline(ABC): logger.info(f'initiate model from location {model}.') # expecting model has been prefetched to local cache beforehand return Model.from_pretrained( - model, model_prefetched=True, - device=self.device_name) if is_model(model) else model + model, + device=self.device_name, + model_prefetched=True, + invoked_by=Invoke.PIPELINE) if is_model(model) else model else: return model @@ -231,7 +233,7 @@ class Pipeline(ABC): batch_data[k] = value_list for k in batch_data.keys(): if isinstance(batch_data[k][0], torch.Tensor): - batch_data[k] = 
torch.concat(batch_data[k]) + batch_data[k] = torch.cat(batch_data[k]) return batch_data def _process_batch(self, input: List[Input], batch_size, @@ -383,15 +385,12 @@ class DistributedPipeline(Pipeline): preprocessor: Union[Preprocessor, List[Preprocessor]] = None, auto_collate=True, **kwargs): - self.preprocessor = preprocessor + super().__init__(model=model, preprocessor=preprocessor, kwargs=kwargs) self._model_prepare = False self._model_prepare_lock = Lock() self._auto_collate = auto_collate - if os.path.exists(model): - self.model_dir = model - else: - self.model_dir = snapshot_download(model) + self.model_dir = self.model.model_dir self.cfg = read_config(self.model_dir) self.world_size = self.cfg.model.world_size self.model_pool = None diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 8b097bfc..58ec4db5 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -7,7 +7,7 @@ from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Pipelines from modelscope.models.base import Model from modelscope.utils.config import ConfigDict, check_config -from modelscope.utils.constant import DEFAULT_MODEL_REVISION, Tasks +from modelscope.utils.constant import DEFAULT_MODEL_REVISION, Invoke, Tasks from modelscope.utils.hub import read_config from modelscope.utils.registry import Registry, build_from_cfg from .base import Pipeline @@ -147,6 +147,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.image_segmentation: (Pipelines.image_instance_segmentation, 'damo/cv_swin-b_image-instance-segmentation_coco'), + Tasks.image_depth_estimation: + (Pipelines.image_depth_estimation, + 'damo/cv_newcrfs_image-depth-estimation_indoor'), Tasks.image_style_transfer: (Pipelines.image_style_transfer, 'damo/cv_aams_style-transfer_damo'), Tasks.face_image_generation: (Pipelines.face_image_generation, @@ -209,6 +212,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.referring_video_object_segmentation: (Pipelines.referring_video_object_segmentation, 'damo/cv_swin-t_referring_video-object-segmentation'), + Tasks.video_summarization: (Pipelines.video_summarization, + 'damo/cv_googlenet_pgl-video-summarization'), } @@ -220,14 +225,19 @@ def normalize_model_input(model, model_revision): # skip revision download if model is a local directory if not os.path.exists(model): # note that if there is already a local copy, snapshot_download will check and skip downloading - model = snapshot_download(model, revision=model_revision) + model = snapshot_download( + model, + revision=model_revision, + user_agent={Invoke.KEY: Invoke.PIPELINE}) elif isinstance(model, list) and isinstance(model[0], str): for idx in range(len(model)): if is_official_hub_path( model[idx], model_revision) and not os.path.exists(model[idx]): model[idx] = snapshot_download( - model[idx], revision=model_revision) + model[idx], + revision=model_revision, + user_agent={Invoke.KEY: Invoke.PIPELINE}) return model diff --git a/modelscope/pipelines/cv/animal_recognition_pipeline.py b/modelscope/pipelines/cv/animal_recognition_pipeline.py index 6d395a46..251b2fae 100644 --- a/modelscope/pipelines/cv/animal_recognition_pipeline.py +++ b/modelscope/pipelines/cv/animal_recognition_pipeline.py @@ -8,14 +8,13 @@ import torch from PIL import Image from torchvision import transforms -from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Pipelines from modelscope.models.cv.animal_recognition import Bottleneck, ResNet from modelscope.outputs import 
OutputKeys from modelscope.pipelines.base import Input, Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import LoadImage -from modelscope.utils.constant import Tasks +from modelscope.utils.constant import Devices, ModelFile, Tasks from modelscope.utils.logger import get_logger logger = get_logger() @@ -67,15 +66,10 @@ class AnimalRecognitionPipeline(Pipeline): filter_param(src_params, own_state) model.load_state_dict(own_state) - self.model = resnest101(num_classes=8288) - local_model_dir = model - if osp.exists(model): - local_model_dir = model - else: - local_model_dir = snapshot_download(model) - self.local_path = local_model_dir + self.local_path = self.model src_params = torch.load( - osp.join(local_model_dir, 'pytorch_model.pt'), 'cpu') + osp.join(self.local_path, ModelFile.TORCH_MODEL_FILE), Devices.cpu) + self.model = resnest101(num_classes=8288) load_pretrained(self.model, src_params) logger.info('load model done') diff --git a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py index d113fb3c..dbd59e97 100644 --- a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py @@ -120,8 +120,7 @@ class Body3DKeypointsPipeline(Pipeline): """ super().__init__(model=model, **kwargs) - self.keypoint_model_3d = model if isinstance( - model, BodyKeypointsDetection3D) else Model.from_pretrained(model) + self.keypoint_model_3d = self.model self.keypoint_model_3d.eval() # init human body 2D keypoints detection pipeline diff --git a/modelscope/pipelines/cv/easycv_pipelines/base.py b/modelscope/pipelines/cv/easycv_pipelines/base.py index c130aea0..37cae4ce 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/base.py +++ b/modelscope/pipelines/cv/easycv_pipelines/base.py @@ -11,7 +11,7 @@ from PIL import ImageFile from modelscope.hub.snapshot_download import snapshot_download from modelscope.pipelines.util import is_official_hub_path from modelscope.utils.config import Config -from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile +from modelscope.utils.constant import DEFAULT_MODEL_REVISION, Invoke, ModelFile from modelscope.utils.device import create_device @@ -37,7 +37,9 @@ class EasyCVPipeline(object): assert is_official_hub_path( model), 'Only support local model path and official hub path!' model_dir = snapshot_download( - model_id=model, revision=DEFAULT_MODEL_REVISION) + model_id=model, + revision=DEFAULT_MODEL_REVISION, + user_agent={Invoke.KEY: Invoke.PIPELINE}) assert osp.isdir(model_dir) model_files = glob.glob( @@ -48,6 +50,7 @@ class EasyCVPipeline(object): model_path = model_files[0] self.model_path = model_path + self.model_dir = model_dir # get configuration file from source model dir self.config_file = os.path.join(model_dir, ModelFile.CONFIGURATION) diff --git a/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py index 936accbf..903c4106 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py +++ b/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py @@ -24,7 +24,6 @@ class HumanWholebodyKeypointsPipeline(EasyCVPipeline): model (str): model id on modelscope hub or local model path. model_file_pattern (str): model file pattern. 
""" - self.model_dir = model super(HumanWholebodyKeypointsPipeline, self).__init__( model=model, model_file_pattern=model_file_pattern, diff --git a/modelscope/pipelines/cv/general_recognition_pipeline.py b/modelscope/pipelines/cv/general_recognition_pipeline.py index c1136882..a36c3ebe 100644 --- a/modelscope/pipelines/cv/general_recognition_pipeline.py +++ b/modelscope/pipelines/cv/general_recognition_pipeline.py @@ -8,7 +8,6 @@ import torch from PIL import Image from torchvision import transforms -from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Pipelines from modelscope.models.cv.animal_recognition import resnet from modelscope.outputs import OutputKeys @@ -67,16 +66,12 @@ class GeneralRecognitionPipeline(Pipeline): filter_param(src_params, own_state) model.load_state_dict(own_state) - self.model = resnest101(num_classes=54092) - local_model_dir = model device = 'cpu' - if osp.exists(model): - local_model_dir = model - else: - local_model_dir = snapshot_download(model) - self.local_path = local_model_dir + self.local_path = self.model src_params = torch.load( - osp.join(local_model_dir, ModelFile.TORCH_MODEL_FILE), device) + osp.join(self.local_path, ModelFile.TORCH_MODEL_FILE), device) + + self.model = resnest101(num_classes=54092) load_pretrained(self.model, src_params) logger.info('load model done') diff --git a/modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py index bad0c652..63281e80 100644 --- a/modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py @@ -21,7 +21,6 @@ class Hand2DKeypointsPipeline(EasyCVPipeline): model (str): model id on modelscope hub or local model path. model_file_pattern (str): model file pattern. """ - self.model_dir = model super(Hand2DKeypointsPipeline, self).__init__( model=model, model_file_pattern=model_file_pattern, diff --git a/modelscope/pipelines/cv/image_classification_pipeline.py b/modelscope/pipelines/cv/image_classification_pipeline.py index 69dbd1fb..8d4f7694 100644 --- a/modelscope/pipelines/cv/image_classification_pipeline.py +++ b/modelscope/pipelines/cv/image_classification_pipeline.py @@ -1,5 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-from typing import Any, Dict, Union +from typing import Any, Dict, Optional, Union import cv2 import numpy as np @@ -25,22 +25,15 @@ class ImageClassificationPipeline(Pipeline): def __init__(self, model: Union[Model, str], - preprocessor: [Preprocessor] = None, + preprocessor: Optional[Preprocessor] = None, **kwargs): - super().__init__(model=model) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) assert isinstance(model, str) or isinstance(model, Model), \ 'model must be a single str or OfaForAllTasks' - if isinstance(model, str): - pipe_model = Model.from_pretrained(model) - elif isinstance(model, Model): - pipe_model = model - else: - raise NotImplementedError - pipe_model.model.eval() - pipe_model.to(get_device()) - if preprocessor is None and isinstance(pipe_model, OfaForAllTasks): - preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir) - super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) + self.model.eval() + self.model.to(get_device()) + if preprocessor is None and isinstance(self.model, OfaForAllTasks): + self.preprocessor = OfaPreprocessor(model_dir=self.model.model_dir) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs diff --git a/modelscope/pipelines/cv/image_color_enhance_pipeline.py b/modelscope/pipelines/cv/image_color_enhance_pipeline.py index 3a4cf8bc..ca3dacec 100644 --- a/modelscope/pipelines/cv/image_color_enhance_pipeline.py +++ b/modelscope/pipelines/cv/image_color_enhance_pipeline.py @@ -32,10 +32,8 @@ class ImageColorEnhancePipeline(Pipeline): Args: model: model id on modelscope hub. """ - model = model if isinstance( - model, ImageColorEnhance) else Model.from_pretrained(model) - model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.model.eval() if torch.cuda.is_available(): self._device = torch.device('cuda') diff --git a/modelscope/pipelines/cv/image_denoise_pipeline.py b/modelscope/pipelines/cv/image_denoise_pipeline.py index 34ac1e81..82097b19 100644 --- a/modelscope/pipelines/cv/image_denoise_pipeline.py +++ b/modelscope/pipelines/cv/image_denoise_pipeline.py @@ -32,17 +32,14 @@ class ImageDenoisePipeline(Pipeline): Args: model: model id on modelscope hub. """ - model = model if isinstance( - model, NAFNetForImageDenoise) else Model.from_pretrained(model) - model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) - self.config = model.config + self.model.eval() + self.config = self.model.config if torch.cuda.is_available(): self._device = torch.device('cuda') else: self._device = torch.device('cpu') - self.model = model logger.info('load image denoise model done') def preprocess(self, input: Input) -> Dict[str, Any]: diff --git a/modelscope/pipelines/cv/image_depth_estimation_pipeline.py b/modelscope/pipelines/cv/image_depth_estimation_pipeline.py new file mode 100644 index 00000000..d318ebd2 --- /dev/null +++ b/modelscope/pipelines/cv/image_depth_estimation_pipeline.py @@ -0,0 +1,52 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
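For context, a minimal usage sketch of the pipeline added in this new file; the task name, default model id, and output key come from this patch, while the image path is a placeholder.

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Defaults to 'damo/cv_newcrfs_image-depth-estimation_indoor'
# via the DEFAULT_MODEL_FOR_PIPELINE entry added in this patch.
estimator = pipeline(Tasks.image_depth_estimation)
result = estimator('path/to/indoor_scene.jpg')
depths = result[OutputKeys.DEPTHS]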
+from typing import Any, Dict, Union
+
+import cv2
+import numpy as np
+import PIL
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Model, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.image_depth_estimation, module_name=Pipelines.image_depth_estimation)
+class ImageDepthEstimationPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """Use `model` to create an image depth estimation pipeline for prediction.
+
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model, **kwargs)
+
+        logger.info('image depth estimation pipeline initialized')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        img = LoadImage.convert_to_ndarray(input).astype(np.float32)
+        # resize to the network input resolution and scale pixels to [0, 1]
+        H, W = 480, 640
+        img = cv2.resize(img, (W, H))
+        img = img.transpose(2, 0, 1) / 255.0
+        # add the batch dimension expected by the model
+        imgs = img[None, ...]
+        data = {'imgs': imgs}
+
+        return data
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        results = self.model.inference(input)
+        return results
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        results = self.model.postprocess(inputs)
+        outputs = {OutputKeys.DEPTHS: results[OutputKeys.DEPTHS]}
+
+        return outputs
diff --git a/modelscope/pipelines/cv/language_guided_video_summarization_pipeline.py b/modelscope/pipelines/cv/language_guided_video_summarization_pipeline.py
index 059dadb7..2edd59a1 100755
--- a/modelscope/pipelines/cv/language_guided_video_summarization_pipeline.py
+++ b/modelscope/pipelines/cv/language_guided_video_summarization_pipeline.py
@@ -44,7 +44,7 @@ class LanguageGuidedVideoSummarizationPipeline(Pipeline):
         """
         super().__init__(model=model, auto_collate=False, **kwargs)
         logger.info(f'loading model from {model}')
-        self.model_dir = model
+        self.model_dir = self.model.model_dir
 
         self.tmp_dir = kwargs.get('tmp_dir', None)
         if self.tmp_dir is None:
diff --git a/modelscope/pipelines/cv/virtual_try_on_pipeline.py b/modelscope/pipelines/cv/virtual_try_on_pipeline.py
index cd6e7046..1a521345 100644
--- a/modelscope/pipelines/cv/virtual_try_on_pipeline.py
+++ b/modelscope/pipelines/cv/virtual_try_on_pipeline.py
@@ -9,7 +9,6 @@ import PIL
 import torch
 from PIL import Image
 
-from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.metainfo import Pipelines
 from modelscope.models.cv.virual_tryon import SDAFNet_Tryon
 from modelscope.outputs import OutputKeys
@@ -52,17 +51,12 @@ class VirtualTryonPipeline(Pipeline):
             filter_param(src_params, own_state)
             model.load_state_dict(own_state)
 
-        self.model = SDAFNet_Tryon(ref_in_channel=6).to(self.device)
-        local_model_dir = model
-        if osp.exists(model):
-            local_model_dir = model
-        else:
-            local_model_dir = snapshot_download(model)
-        self.local_path = local_model_dir
+        self.local_path = self.model
         src_params = torch.load(
-            osp.join(local_model_dir, ModelFile.TORCH_MODEL_FILE), 'cpu')
+            osp.join(self.local_path, ModelFile.TORCH_MODEL_FILE), 'cpu')
+        self.model = SDAFNet_Tryon(ref_in_channel=6).to(self.device)
         load_pretrained(self.model, src_params)
-        self.model = self.model.eval()
+        self.model.eval()
         self.size = 192
         from torchvision import transforms
         self.test_transforms = transforms.Compose([
diff --git a/modelscope/pipelines/multi_modal/image_captioning_pipeline.py
b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py index 63966ed4..e1d5c769 100644 --- a/modelscope/pipelines/multi_modal/image_captioning_pipeline.py +++ b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py @@ -29,22 +29,13 @@ class ImageCaptioningPipeline(Pipeline): Args: model: model id on modelscope hub. """ - super().__init__(model=model) - assert isinstance(model, str) or isinstance(model, Model), \ - 'model must be a single str or OfaForAllTasks' - if isinstance(model, str): - pipe_model = Model.from_pretrained(model) - elif isinstance(model, Model): - pipe_model = model - else: - raise NotImplementedError - pipe_model.model.eval() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.model.eval() if preprocessor is None: - if isinstance(pipe_model, OfaForAllTasks): - preprocessor = OfaPreprocessor(pipe_model.model_dir) - elif isinstance(pipe_model, MPlugForAllTasks): - preprocessor = MPlugPreprocessor(pipe_model.model_dir) - super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) + if isinstance(self.model, OfaForAllTasks): + self.preprocessor = OfaPreprocessor(self.model.model_dir) + elif isinstance(self.model, MPlugForAllTasks): + self.preprocessor = MPlugPreprocessor(self.model.model_dir) def _batch(self, data): if isinstance(self.model, OfaForAllTasks): @@ -55,17 +46,17 @@ class ImageCaptioningPipeline(Pipeline): batch_data['samples'] = [d['samples'][0] for d in data] batch_data['net_input'] = {} for k in data[0]['net_input'].keys(): - batch_data['net_input'][k] = torch.concat( + batch_data['net_input'][k] = torch.cat( [d['net_input'][k] for d in data]) return batch_data elif isinstance(self.model, MPlugForAllTasks): from transformers.tokenization_utils_base import BatchEncoding batch_data = dict(train=data[0]['train']) - batch_data['image'] = torch.concat([d['image'] for d in data]) + batch_data['image'] = torch.cat([d['image'] for d in data]) question = {} for k in data[0]['question'].keys(): - question[k] = torch.concat([d['question'][k] for d in data]) + question[k] = torch.cat([d['question'][k] for d in data]) batch_data['question'] = BatchEncoding(question) return batch_data else: diff --git a/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py b/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py index 329d79bf..09be8265 100644 --- a/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py +++ b/modelscope/pipelines/multi_modal/image_text_retrieval_pipeline.py @@ -28,19 +28,10 @@ class ImageTextRetrievalPipeline(Pipeline): Args: model: model id on modelscope hub. 
""" - super().__init__(model=model) - assert isinstance(model, str) or isinstance(model, Model), \ - f'model must be a single str or Model, but got {type(model)}' - if isinstance(model, str): - pipe_model = Model.from_pretrained(model) - elif isinstance(model, Model): - pipe_model = model - else: - raise NotImplementedError - pipe_model.model.eval() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.model.eval() if preprocessor is None: - preprocessor = MPlugPreprocessor(pipe_model.model_dir) - super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) + self.preprocessor = MPlugPreprocessor(self.model.model_dir) def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: diff --git a/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py b/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py index 18ee1dbf..79f67a35 100644 --- a/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py +++ b/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py @@ -28,21 +28,14 @@ class MultiModalEmbeddingPipeline(Pipeline): Args: model: model id on modelscope hub. """ - if isinstance(model, str): - pipe_model = Model.from_pretrained(model) - elif isinstance(model, Model): - pipe_model = model - else: - raise NotImplementedError('model must be a single str') - pipe_model.eval() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.model.eval() if preprocessor is None: - if isinstance(pipe_model, CLIPForMultiModalEmbedding): - preprocessor = CLIPPreprocessor(pipe_model.model_dir) + if isinstance(self.model, CLIPForMultiModalEmbedding): + self.preprocessor = CLIPPreprocessor(self.model.model_dir) else: raise NotImplementedError - super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) - def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: return self.model(self.preprocess(input)) diff --git a/modelscope/pipelines/multi_modal/ocr_recognition_pipeline.py b/modelscope/pipelines/multi_modal/ocr_recognition_pipeline.py index c61b38f3..3c4a3c3c 100644 --- a/modelscope/pipelines/multi_modal/ocr_recognition_pipeline.py +++ b/modelscope/pipelines/multi_modal/ocr_recognition_pipeline.py @@ -28,20 +28,11 @@ class OcrRecognitionPipeline(Pipeline): Args: model: model id on modelscope hub. 
""" - super().__init__(model=model) - assert isinstance(model, str) or isinstance(model, Model), \ - 'model must be a single str or OfaForAllTasks' - if isinstance(model, str): - pipe_model = Model.from_pretrained(model) - elif isinstance(model, Model): - pipe_model = model - else: - raise NotImplementedError - pipe_model.model.eval() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.model.eval() if preprocessor is None: - if isinstance(pipe_model, OfaForAllTasks): - preprocessor = OfaPreprocessor(pipe_model.model_dir) - super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) + if isinstance(self.model, OfaForAllTasks): + self.preprocessor = OfaPreprocessor(self.model.model_dir) def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: diff --git a/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py b/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py index 7516c5be..36e761aa 100644 --- a/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py +++ b/modelscope/pipelines/multi_modal/text_to_image_synthesis_pipeline.py @@ -31,18 +31,10 @@ class TextToImageSynthesisPipeline(Pipeline): Args: model: model id on modelscope hub. """ - device_id = 0 if torch.cuda.is_available() else -1 - if isinstance(model, str): - pipe_model = Model.from_pretrained(model, device_id=device_id) - elif isinstance(model, Model): - pipe_model = model - else: - raise NotImplementedError( - f'expecting a Model instance or str, but get {type(model)}.') - if preprocessor is None and isinstance(pipe_model, + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + if preprocessor is None and isinstance(self.model, OfaForTextToImageSynthesis): - preprocessor = OfaPreprocessor(pipe_model.model_dir) - super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) + self.preprocessor = OfaPreprocessor(self.model.model_dir) def preprocess(self, input: Input, **preprocess_params) -> Dict[str, Any]: if self.preprocessor is not None: diff --git a/modelscope/pipelines/multi_modal/visual_entailment_pipeline.py b/modelscope/pipelines/multi_modal/visual_entailment_pipeline.py index 2a7bd1d0..67661b39 100644 --- a/modelscope/pipelines/multi_modal/visual_entailment_pipeline.py +++ b/modelscope/pipelines/multi_modal/visual_entailment_pipeline.py @@ -1,5 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Any, Dict, Union +from typing import Any, Dict, Optional, Union from modelscope.metainfo import Pipelines from modelscope.models.multi_modal import OfaForAllTasks @@ -18,26 +18,17 @@ class VisualEntailmentPipeline(Pipeline): def __init__(self, model: Union[Model, str], - preprocessor: [Preprocessor] = None, + preprocessor: Optional[Preprocessor] = None, **kwargs): """ use `model` and `preprocessor` to create a visual entailment pipeline for prediction Args: model: model id on modelscope hub. 
""" - super().__init__(model=model) - assert isinstance(model, str) or isinstance(model, Model), \ - 'model must be a single str or OfaForAllTasks' - if isinstance(model, str): - pipe_model = Model.from_pretrained(model) - elif isinstance(model, Model): - pipe_model = model - else: - raise NotImplementedError - pipe_model.model.eval() - if preprocessor is None and isinstance(pipe_model, OfaForAllTasks): - preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir) - super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.model.eval() + if preprocessor is None and isinstance(self.model, OfaForAllTasks): + self.preprocessor = OfaPreprocessor(model_dir=self.model.model_dir) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs diff --git a/modelscope/pipelines/multi_modal/visual_grounding_pipeline.py b/modelscope/pipelines/multi_modal/visual_grounding_pipeline.py index 651109d9..f8a79d55 100644 --- a/modelscope/pipelines/multi_modal/visual_grounding_pipeline.py +++ b/modelscope/pipelines/multi_modal/visual_grounding_pipeline.py @@ -1,5 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Any, Dict, Union +from typing import Any, Dict, Optional, Union from modelscope.metainfo import Pipelines from modelscope.models.multi_modal import OfaForAllTasks @@ -18,26 +18,17 @@ class VisualGroundingPipeline(Pipeline): def __init__(self, model: Union[Model, str], - preprocessor: [Preprocessor] = None, + preprocessor: Optional[Preprocessor] = None, **kwargs): """ use `model` and `preprocessor` to create a visual grounding pipeline for prediction Args: model: model id on modelscope hub. """ - super().__init__(model=model) - assert isinstance(model, str) or isinstance(model, Model), \ - 'model must be a single str or OfaForAllTasks' - if isinstance(model, str): - pipe_model = Model.from_pretrained(model) - elif isinstance(model, Model): - pipe_model = model - else: - raise NotImplementedError - pipe_model.model.eval() - if preprocessor is None and isinstance(pipe_model, OfaForAllTasks): - preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir) - super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.model.model.eval() + if preprocessor is None and isinstance(self.model, OfaForAllTasks): + self.preprocessor = OfaPreprocessor(model_dir=self.model.model_dir) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs diff --git a/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py b/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py index 86177074..a30cf1c5 100644 --- a/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py +++ b/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py @@ -31,15 +31,13 @@ class VisualQuestionAnsweringPipeline(Pipeline): model (MPlugForVisualQuestionAnswering): a model instance preprocessor (MPlugVisualQuestionAnsweringPreprocessor): a preprocessor instance """ - model = model if isinstance(model, - Model) else Model.from_pretrained(model) - if preprocessor is None: - if isinstance(model, OfaForAllTasks): - preprocessor = OfaPreprocessor(model.model_dir) - elif isinstance(model, MPlugForAllTasks): - preprocessor = MPlugPreprocessor(model.model_dir) - model.model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) + if 
preprocessor is None: + if isinstance(self.model, OfaForAllTasks): + self.preprocessor = OfaPreprocessor(self.model.model_dir) + elif isinstance(self.model, MPlugForAllTasks): + self.preprocessor = MPlugPreprocessor(self.model.model_dir) + self.model.eval() def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: diff --git a/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py b/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py index 48df0c40..afd5e29f 100644 --- a/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py +++ b/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py @@ -32,12 +32,10 @@ class ConversationalTextToSqlPipeline(Pipeline): preprocessor (ConversationalTextToSqlPreprocessor): a preprocessor instance """ - model = model if isinstance( - model, StarForTextToSql) else Model.from_pretrained(model) - if preprocessor is None: - preprocessor = ConversationalTextToSqlPreprocessor(model.model_dir) - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + if preprocessor is None: + self.preprocessor = ConversationalTextToSqlPreprocessor( + self.model.model_dir) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]: """process the prediction results diff --git a/modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py b/modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py index 70374c50..c803663b 100644 --- a/modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py +++ b/modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py @@ -30,13 +30,11 @@ class DialogIntentPredictionPipeline(Pipeline): or a SpaceForDialogIntent instance. preprocessor (DialogIntentPredictionPreprocessor): An optional preprocessor instance. """ - model = model if isinstance( - model, SpaceForDialogIntent) else Model.from_pretrained(model) - if preprocessor is None: - preprocessor = DialogIntentPredictionPreprocessor(model.model_dir) - self.model = model super().__init__(model=model, preprocessor=preprocessor, **kwargs) - self.categories = preprocessor.categories + if preprocessor is None: + self.preprocessor = DialogIntentPredictionPreprocessor( + self.model.model_dir) + self.categories = self.preprocessor.categories def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]: """process the prediction results diff --git a/modelscope/pipelines/nlp/dialog_modeling_pipeline.py b/modelscope/pipelines/nlp/dialog_modeling_pipeline.py index 3215d765..c0cd52dd 100644 --- a/modelscope/pipelines/nlp/dialog_modeling_pipeline.py +++ b/modelscope/pipelines/nlp/dialog_modeling_pipeline.py @@ -29,13 +29,10 @@ class DialogModelingPipeline(Pipeline): or a SpaceForDialogModeling instance. preprocessor (DialogModelingPreprocessor): An optional preprocessor instance. 
""" - model = model if isinstance( - model, SpaceForDialogModeling) else Model.from_pretrained(model) - self.model = model - if preprocessor is None: - preprocessor = DialogModelingPreprocessor(model.model_dir) super().__init__(model=model, preprocessor=preprocessor, **kwargs) - self.preprocessor = preprocessor + if preprocessor is None: + self.preprocessor = DialogModelingPreprocessor( + self.model.model_dir) def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, str]: """process the prediction results diff --git a/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py b/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py index 9520c06f..b7adf904 100644 --- a/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py +++ b/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py @@ -31,16 +31,13 @@ class DialogStateTrackingPipeline(Pipeline): from the model hub, or a SpaceForDialogStateTracking instance. preprocessor (DialogStateTrackingPreprocessor): An optional preprocessor instance. """ - - model = model if isinstance( - model, SpaceForDST) else Model.from_pretrained(model) - self.model = model - if preprocessor is None: - preprocessor = DialogStateTrackingPreprocessor(model.model_dir) super().__init__(model=model, preprocessor=preprocessor, **kwargs) + if preprocessor is None: + self.preprocessor = DialogStateTrackingPreprocessor( + self.model.model_dir) - self.tokenizer = preprocessor.tokenizer - self.config = preprocessor.config + self.tokenizer = self.preprocessor.tokenizer + self.config = self.preprocessor.config def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]: """process the prediction results diff --git a/modelscope/pipelines/nlp/document_segmentation_pipeline.py b/modelscope/pipelines/nlp/document_segmentation_pipeline.py index 5e8f3ddb..b29dcca7 100644 --- a/modelscope/pipelines/nlp/document_segmentation_pipeline.py +++ b/modelscope/pipelines/nlp/document_segmentation_pipeline.py @@ -31,27 +31,22 @@ class DocumentSegmentationPipeline(Pipeline): model: Union[Model, str], preprocessor: DocumentSegmentationPreprocessor = None, **kwargs): + super().__init__(model=model, preprocessor=preprocessor, **kwargs) - model = model if isinstance(model, - Model) else Model.from_pretrained(model) - - self.model_dir = model.model_dir - self.model_cfg = model.forward() + self.model_dir = self.model.model_dir + self.model_cfg = self.model.forward() if self.model_cfg['type'] == 'bert': - config = BertConfig.from_pretrained(model.model_dir, num_labels=2) + config = BertConfig.from_pretrained(self.model_dir, num_labels=2) elif self.model_cfg['type'] == 'ponet': - config = PoNetConfig.from_pretrained(model.model_dir, num_labels=2) + config = PoNetConfig.from_pretrained(self.model_dir, num_labels=2) - self.document_segmentation_model = model.build_with_config( + self.document_segmentation_model = self.model.build_with_config( config=config) if preprocessor is None: - preprocessor = DocumentSegmentationPreprocessor( - self.model_dir, config) - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - - self.preprocessor = preprocessor + self.preprocessor = DocumentSegmentationPreprocessor( + self.model.model_dir, config) def __call__( self, documents: Union[List[List[str]], List[str], diff --git a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py index 3917f20c..46d75f49 100644 --- a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py +++ 
b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py @@ -21,12 +21,10 @@ class FaqQuestionAnsweringPipeline(Pipeline): model: Union[str, Model], preprocessor: Preprocessor = None, **kwargs): - model = Model.from_pretrained(model) if isinstance(model, - str) else model - if preprocessor is None: - preprocessor = Preprocessor.from_pretrained( - model.model_dir, **kwargs) super().__init__(model=model, preprocessor=preprocessor, **kwargs) + if preprocessor is None: + self.preprocessor = Preprocessor.from_pretrained( + self.model.model_dir, **kwargs) def _sanitize_parameters(self, **pipeline_parameters): return pipeline_parameters, pipeline_parameters, pipeline_parameters @@ -37,11 +35,11 @@ class FaqQuestionAnsweringPipeline(Pipeline): sentence_vecs = sentence_vecs.detach().tolist() return sentence_vecs - def forward(self, inputs: [list, Dict[str, Any]], + def forward(self, inputs: Union[list, Dict[str, Any]], **forward_params) -> Dict[str, Any]: return self.model(inputs) - def postprocess(self, inputs: [list, Dict[str, Any]], + def postprocess(self, inputs: Union[list, Dict[str, Any]], **postprocess_params) -> Dict[str, Any]: scores = inputs['scores'] labels = [] diff --git a/modelscope/pipelines/nlp/feature_extraction_pipeline.py b/modelscope/pipelines/nlp/feature_extraction_pipeline.py index e94e4337..aed78868 100644 --- a/modelscope/pipelines/nlp/feature_extraction_pipeline.py +++ b/modelscope/pipelines/nlp/feature_extraction_pipeline.py @@ -46,21 +46,18 @@ class FeatureExtractionPipeline(Pipeline): """ - model = model if isinstance(model, - Model) else Model.from_pretrained(model) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) if preprocessor is None: - preprocessor = NLPPreprocessor( - model.model_dir, + self.preprocessor = NLPPreprocessor( + self.model.model_dir, padding=kwargs.pop('padding', False), sequence_length=kwargs.pop('sequence_length', 128)) - model.eval() - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.model.eval() - self.preprocessor = preprocessor self.config = Config.from_file( - os.path.join(model.model_dir, ModelFile.CONFIGURATION)) - self.tokenizer = preprocessor.tokenizer + os.path.join(self.model.model_dir, ModelFile.CONFIGURATION)) + self.tokenizer = self.preprocessor.tokenizer def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py index 0f3446e6..d7dc70f8 100644 --- a/modelscope/pipelines/nlp/fill_mask_pipeline.py +++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py @@ -53,22 +53,18 @@ class FillMaskPipeline(Pipeline): If the xlm-roberta(xlm-roberta, veco, etc.) based model is used, the mask token is ''. To view other examples plese check the tests/pipelines/test_fill_mask.py. """ - - fill_mask_model = Model.from_pretrained(model) if isinstance( - model, str) else model - + super().__init__(model=model, preprocessor=preprocessor, **kwargs) if preprocessor is None: - preprocessor = Preprocessor.from_pretrained( - fill_mask_model.model_dir, + self.preprocessor = Preprocessor.from_pretrained( + self.model.model_dir, first_sequence=first_sequence, second_sequence=None, sequence_length=kwargs.pop('sequence_length', 128)) - fill_mask_model.eval() - assert hasattr( - preprocessor, 'mask_id' - ), 'The input preprocessor should have the mask_id attribute.' 
- super().__init__( - model=fill_mask_model, preprocessor=preprocessor, **kwargs) + assert hasattr( + self.preprocessor, 'mask_id' + ), 'The input preprocessor should have the mask_id attribute.' + + self.model.eval() def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: diff --git a/modelscope/pipelines/nlp/information_extraction_pipeline.py b/modelscope/pipelines/nlp/information_extraction_pipeline.py index 8ac85f43..cf96fd36 100644 --- a/modelscope/pipelines/nlp/information_extraction_pipeline.py +++ b/modelscope/pipelines/nlp/information_extraction_pipeline.py @@ -25,15 +25,12 @@ class InformationExtractionPipeline(Pipeline): model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, **kwargs): - - model = model if isinstance(model, - Model) else Model.from_pretrained(model) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) if preprocessor is None: - preprocessor = RelationExtractionPreprocessor( - model.model_dir, + self.preprocessor = RelationExtractionPreprocessor( + self.model.model_dir, sequence_length=kwargs.pop('sequence_length', 512)) - model.eval() - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.model.eval() def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: diff --git a/modelscope/pipelines/nlp/mglm_text_summarization_pipeline.py b/modelscope/pipelines/nlp/mglm_text_summarization_pipeline.py index c6d03077..330331a5 100644 --- a/modelscope/pipelines/nlp/mglm_text_summarization_pipeline.py +++ b/modelscope/pipelines/nlp/mglm_text_summarization_pipeline.py @@ -21,7 +21,7 @@ class MGLMTextSummarizationPipeline(Pipeline): def __init__(self, model: Union[MGLMForTextSummarization, str], - preprocessor: [Preprocessor] = None, + preprocessor: Optional[Preprocessor] = None, *args, **kwargs): model = MGLMForTextSummarization(model) if isinstance(model, diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py index ece75e1b..74b380ec 100644 --- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py +++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py @@ -50,15 +50,12 @@ class NamedEntityRecognitionPipeline(TokenClassificationPipeline): To view other examples plese check the tests/pipelines/test_named_entity_recognition.py. 
""" - - model = model if isinstance(model, - Model) else Model.from_pretrained(model) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) if preprocessor is None: - preprocessor = TokenClassificationPreprocessor( - model.model_dir, + self.preprocessor = TokenClassificationPreprocessor( + self.model.model_dir, sequence_length=kwargs.pop('sequence_length', 128)) - model.eval() - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.model.eval() self.id2label = kwargs.get('id2label') if self.id2label is None and hasattr(self.preprocessor, 'id2label'): self.id2label = self.preprocessor.id2label @@ -73,13 +70,11 @@ class NamedEntityRecognitionThaiPipeline(NamedEntityRecognitionPipeline): model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, **kwargs): - model = model if isinstance(model, - Model) else Model.from_pretrained(model) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) if preprocessor is None: - preprocessor = NERPreprocessorThai( - model.model_dir, + self.preprocessor = NERPreprocessorThai( + self.model.model_dir, sequence_length=kwargs.pop('sequence_length', 512)) - super().__init__(model=model, preprocessor=preprocessor, **kwargs) @PIPELINES.register_module( @@ -91,10 +86,8 @@ class NamedEntityRecognitionVietPipeline(NamedEntityRecognitionPipeline): model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, **kwargs): - model = model if isinstance(model, - Model) else Model.from_pretrained(model) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) if preprocessor is None: - preprocessor = NERPreprocessorViet( - model.model_dir, + self.preprocessor = NERPreprocessorViet( + self.model.model_dir, sequence_length=kwargs.pop('sequence_length', 512)) - super().__init__(model=model, preprocessor=preprocessor, **kwargs) diff --git a/modelscope/pipelines/nlp/sentence_embedding_pipeline.py b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py index cfa5c2f1..adac7f1b 100644 --- a/modelscope/pipelines/nlp/sentence_embedding_pipeline.py +++ b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py @@ -32,14 +32,13 @@ class SentenceEmbeddingPipeline(Pipeline): the model if supplied. sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. """ - model = Model.from_pretrained(model) if isinstance(model, - str) else model + super().__init__(model=model, preprocessor=preprocessor, **kwargs) if preprocessor is None: - preprocessor = Preprocessor.from_pretrained( - model.model_dir if isinstance(model, Model) else model, + self.preprocessor = Preprocessor.from_pretrained( + self.model.model_dir + if isinstance(self.model, Model) else model, first_sequence=first_sequence, sequence_length=kwargs.pop('sequence_length', 128)) - super().__init__(model=model, preprocessor=preprocessor, **kwargs) def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: diff --git a/modelscope/pipelines/nlp/summarization_pipeline.py b/modelscope/pipelines/nlp/summarization_pipeline.py index 30dd4b30..6ea7cd5f 100644 --- a/modelscope/pipelines/nlp/summarization_pipeline.py +++ b/modelscope/pipelines/nlp/summarization_pipeline.py @@ -1,5 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-from typing import Any, Dict, Union +from typing import Any, Dict, Optional, Union from modelscope.metainfo import Pipelines from modelscope.models.multi_modal import OfaForAllTasks @@ -18,7 +18,7 @@ class SummarizationPipeline(Pipeline): def __init__(self, model: Union[Model, str], - preprocessor: [Preprocessor] = None, + preprocessor: Optional[Preprocessor] = None, **kwargs): """Use `model` and `preprocessor` to create a Summarization pipeline for prediction. @@ -27,19 +27,10 @@ class SummarizationPipeline(Pipeline): or a model id from the model hub, or a model instance. preprocessor (Preprocessor): An optional preprocessor instance. """ - super().__init__(model=model) - assert isinstance(model, str) or isinstance(model, Model), \ - 'model must be a single str or OfaForAllTasks' - if isinstance(model, str): - pipe_model = Model.from_pretrained(model) - elif isinstance(model, Model): - pipe_model = model - else: - raise NotImplementedError - pipe_model.model.eval() - if preprocessor is None and isinstance(pipe_model, OfaForAllTasks): - preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir) - super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.model.eval() + if preprocessor is None and isinstance(self.model, OfaForAllTasks): + self.preprocessor = OfaPreprocessor(model_dir=self.model.model_dir) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs diff --git a/modelscope/pipelines/nlp/table_question_answering_pipeline.py b/modelscope/pipelines/nlp/table_question_answering_pipeline.py index bde78196..36f4c08a 100644 --- a/modelscope/pipelines/nlp/table_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/table_question_answering_pipeline.py @@ -41,21 +41,22 @@ class TableQuestionAnsweringPipeline(Pipeline): preprocessor (TableQuestionAnsweringPreprocessor): a preprocessor instance db (Database): a database to store tables in the database """ - model = model if isinstance( - model, TableQuestionAnswering) else Model.from_pretrained(model) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) if preprocessor is None: - preprocessor = TableQuestionAnsweringPreprocessor(model.model_dir) + self.preprocessor = TableQuestionAnsweringPreprocessor( + self.model.model_dir) # initilize tokenizer self.tokenizer = BertTokenizer( - os.path.join(model.model_dir, ModelFile.VOCAB_FILE)) + os.path.join(self.model.model_dir, ModelFile.VOCAB_FILE)) # initialize database if db is None: self.db = Database( tokenizer=self.tokenizer, - table_file_path=os.path.join(model.model_dir, 'table.json'), - syn_dict_file_path=os.path.join(model.model_dir, + table_file_path=os.path.join(self.model.model_dir, + 'table.json'), + syn_dict_file_path=os.path.join(self.model.model_dir, 'synonym.txt')) else: self.db = db @@ -71,8 +72,6 @@ class TableQuestionAnsweringPipeline(Pipeline): self.schema_link_dict = constant.schema_link_dict self.limit_dict = constant.limit_dict - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - def post_process_multi_turn(self, history_sql, result, table): action = self.action_ops[result['action']] headers = table['header_name'] diff --git a/modelscope/pipelines/nlp/text2text_generation_pipeline.py b/modelscope/pipelines/nlp/text2text_generation_pipeline.py index a739df69..9bf226b9 100644 --- a/modelscope/pipelines/nlp/text2text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text2text_generation_pipeline.py @@ -63,16 +63,14 @@ 
class Text2TextGenerationPipeline(Pipeline): To view other examples plese check the tests/pipelines/test_text_generation.py. """ - model = model if isinstance(model, - Model) else Model.from_pretrained(model) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) if preprocessor is None: - preprocessor = Text2TextGenerationPreprocessor( - model.model_dir, + self.preprocessor = Text2TextGenerationPreprocessor( + self.model.model_dir, sequence_length=kwargs.pop('sequence_length', 128)) - self.tokenizer = preprocessor.tokenizer - self.pipeline = model.pipeline.type - model.eval() - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.tokenizer = self.preprocessor.tokenizer + self.pipeline = self.model.pipeline.type + self.model.eval() def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]: """ Provide specific preprocess for text2text generation pipeline in order to handl multi tasks diff --git a/modelscope/pipelines/nlp/text_classification_pipeline.py b/modelscope/pipelines/nlp/text_classification_pipeline.py index 15a318b4..fd223c76 100644 --- a/modelscope/pipelines/nlp/text_classification_pipeline.py +++ b/modelscope/pipelines/nlp/text_classification_pipeline.py @@ -53,25 +53,24 @@ class TextClassificationPipeline(Pipeline): NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' and 'second_sequence' param will have no affection. """ - model = Model.from_pretrained(model) if isinstance(model, - str) else model + super().__init__(model=model, preprocessor=preprocessor, **kwargs) if preprocessor is None: - if model.__class__.__name__ == 'OfaForAllTasks': - preprocessor = Preprocessor.from_pretrained( - model_name_or_path=model.model_dir, + if self.model.__class__.__name__ == 'OfaForAllTasks': + self.preprocessor = Preprocessor.from_pretrained( + model_name_or_path=self.model.model_dir, type=Preprocessors.ofa_tasks_preprocessor, field=Fields.multi_modal) else: first_sequence = kwargs.pop('first_sequence', 'first_sequence') second_sequence = kwargs.pop('second_sequence', None) - preprocessor = Preprocessor.from_pretrained( - model if isinstance(model, str) else model.model_dir, + self.preprocessor = Preprocessor.from_pretrained( + self.model + if isinstance(self.model, str) else self.model.model_dir, first_sequence=first_sequence, second_sequence=second_sequence, sequence_length=kwargs.pop('sequence_length', 512)) - super().__init__(model=model, preprocessor=preprocessor, **kwargs) self.id2label = kwargs.get('id2label') if self.id2label is None and hasattr(self.preprocessor, 'id2label'): self.id2label = self.preprocessor.id2label diff --git a/modelscope/pipelines/nlp/text_error_correction_pipeline.py b/modelscope/pipelines/nlp/text_error_correction_pipeline.py index 8e9bf85d..ee8cb711 100644 --- a/modelscope/pipelines/nlp/text_error_correction_pipeline.py +++ b/modelscope/pipelines/nlp/text_error_correction_pipeline.py @@ -40,14 +40,12 @@ class TextErrorCorrectionPipeline(Pipeline): To view other examples plese check the tests/pipelines/test_text_error_correction.py. 
""" + super().__init__(model=model, preprocessor=preprocessor, **kwargs) - model = model if isinstance( - model, - BartForTextErrorCorrection) else Model.from_pretrained(model) if preprocessor is None: - preprocessor = TextErrorCorrectionPreprocessor(model.model_dir) - self.vocab = preprocessor.vocab - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.preprocessor = TextErrorCorrectionPreprocessor( + self.model.model_dir) + self.vocab = self.preprocessor.vocab def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py index 0490c8e7..bf1162bf 100644 --- a/modelscope/pipelines/nlp/text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text_generation_pipeline.py @@ -51,15 +51,14 @@ class TextGenerationPipeline(Pipeline): To view other examples plese check the tests/pipelines/test_text_generation.py. """ - model = model if isinstance(model, - Model) else Model.from_pretrained(model) - cfg = read_config(model.model_dir) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + cfg = read_config(self.model.model_dir) self.postprocessor = cfg.pop('postprocessor', 'decode') if preprocessor is None: preprocessor_cfg = cfg.preprocessor preprocessor_cfg.update({ 'model_dir': - model.model_dir, + self.model.model_dir, 'first_sequence': first_sequence, 'second_sequence': @@ -67,9 +66,9 @@ class TextGenerationPipeline(Pipeline): 'sequence_length': kwargs.pop('sequence_length', 128) }) - preprocessor = build_preprocessor(preprocessor_cfg, Fields.nlp) - model.eval() - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.preprocessor = build_preprocessor(preprocessor_cfg, + Fields.nlp) + self.model.eval() def _sanitize_parameters(self, **pipeline_parameters): return {}, pipeline_parameters, {} diff --git a/modelscope/pipelines/nlp/text_ranking_pipeline.py b/modelscope/pipelines/nlp/text_ranking_pipeline.py index 9cee327b..fe627e5f 100644 --- a/modelscope/pipelines/nlp/text_ranking_pipeline.py +++ b/modelscope/pipelines/nlp/text_ranking_pipeline.py @@ -32,14 +32,12 @@ class TextRankingPipeline(Pipeline): the model if supplied. sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. """ - model = Model.from_pretrained(model) if isinstance(model, - str) else model + super().__init__(model=model, preprocessor=preprocessor, **kwargs) if preprocessor is None: - preprocessor = Preprocessor.from_pretrained( - model.model_dir, + self.preprocessor = Preprocessor.from_pretrained( + self.model.model_dir, sequence_length=kwargs.pop('sequence_length', 128)) - super().__init__(model=model, preprocessor=preprocessor, **kwargs) def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py index 90cf6116..86cc49b7 100644 --- a/modelscope/pipelines/nlp/token_classification_pipeline.py +++ b/modelscope/pipelines/nlp/token_classification_pipeline.py @@ -39,15 +39,14 @@ class TokenClassificationPipeline(Pipeline): model (str or Model): A model instance or a model local dir or a model id in the model hub. preprocessor (Preprocessor): a preprocessor instance, must not be None. 
""" - model = Model.from_pretrained(model) if isinstance(model, - str) else model + super().__init__(model=model, preprocessor=preprocessor, **kwargs) if preprocessor is None: - preprocessor = Preprocessor.from_pretrained( - model.model_dir, + self.preprocessor = Preprocessor.from_pretrained( + self.model.model_dir, sequence_length=kwargs.pop('sequence_length', 128)) - model.eval() - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.model.eval() + self.id2label = kwargs.get('id2label') if self.id2label is None and hasattr(self.preprocessor, 'id2label'): self.id2label = self.preprocessor.id2label diff --git a/modelscope/pipelines/nlp/translation_quality_estimation_pipeline.py b/modelscope/pipelines/nlp/translation_quality_estimation_pipeline.py index 6ef203b9..57fc646a 100644 --- a/modelscope/pipelines/nlp/translation_quality_estimation_pipeline.py +++ b/modelscope/pipelines/nlp/translation_quality_estimation_pipeline.py @@ -27,10 +27,10 @@ class TranslationQualityEstimationPipeline(Pipeline): def __init__(self, model: str, device: str = 'gpu', **kwargs): super().__init__(model=model, device=device) - model_file = os.path.join(model, ModelFile.TORCH_MODEL_FILE) + model_file = os.path.join(self.model, ModelFile.TORCH_MODEL_FILE) with open(model_file, 'rb') as f: buffer = io.BytesIO(f.read()) - self.tokenizer = XLMRobertaTokenizer.from_pretrained(model) + self.tokenizer = XLMRobertaTokenizer.from_pretrained(self.model) self.model = torch.jit.load( buffer, map_location=self.device).to(self.device) diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py index ac1c4789..9fe2ad93 100644 --- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py +++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py @@ -49,14 +49,13 @@ class WordSegmentationPipeline(TokenClassificationPipeline): To view other examples plese check the tests/pipelines/test_word_segmentation.py. 
""" - model = model if isinstance(model, - Model) else Model.from_pretrained(model) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) if preprocessor is None: - preprocessor = TokenClassificationPreprocessor( - model.model_dir, + self.preprocessor = TokenClassificationPreprocessor( + self.model.model_dir, sequence_length=kwargs.pop('sequence_length', 128)) - model.eval() - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.model.eval() + self.id2label = kwargs.get('id2label') if self.id2label is None and hasattr(self.preprocessor, 'id2label'): self.id2label = self.preprocessor.id2label diff --git a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py index ecd538b9..31b556d7 100644 --- a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py +++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py @@ -59,16 +59,14 @@ class ZeroShotClassificationPipeline(Pipeline): """ assert isinstance(model, str) or isinstance(model, Model), \ 'model must be a single str or Model' - model = model if isinstance(model, - Model) else Model.from_pretrained(model) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) self.entailment_id = 0 self.contradiction_id = 2 if preprocessor is None: - preprocessor = ZeroShotClassificationPreprocessor( - model.model_dir, + self.preprocessor = ZeroShotClassificationPreprocessor( + self.model.model_dir, sequence_length=kwargs.pop('sequence_length', 512)) - model.eval() - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.model.eval() def _sanitize_parameters(self, **kwargs): preprocess_params = {} diff --git a/modelscope/pipelines/science/protein_structure_pipeline.py b/modelscope/pipelines/science/protein_structure_pipeline.py index 1ef9aa29..e326f50b 100644 --- a/modelscope/pipelines/science/protein_structure_pipeline.py +++ b/modelscope/pipelines/science/protein_structure_pipeline.py @@ -105,22 +105,16 @@ class ProteinStructurePipeline(Pipeline): >>> print(pipeline_ins(protein)) """ - import copy - model_path = copy.deepcopy(model) if isinstance(model, str) else None - cfg = read_config(model_path) # only model is str - self.cfg = cfg + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.cfg = read_config(self.model.model_dir) self.config = model_config( - cfg['pipeline']['model_name']) # alphafold config - model = model if isinstance( - model, Model) else Model.from_pretrained(model_path) - self.postprocessor = cfg.pop('postprocessor', None) + self.cfg['pipeline']['model_name']) # alphafold config + self.postprocessor = self.cfg.pop('postprocessor', None) if preprocessor is None: - preprocessor_cfg = cfg.preprocessor - preprocessor = build_preprocessor(preprocessor_cfg, Fields.science) - model.eval() - model.model.inference_mode() - model.model_dir = model_path - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + preprocessor_cfg = self.cfg.preprocessor + self.preprocessor = build_preprocessor(preprocessor_cfg, + Fields.science) + self.model.eval() def _sanitize_parameters(self, **pipeline_parameters): return pipeline_parameters, pipeline_parameters, pipeline_parameters diff --git a/modelscope/preprocessors/base.py b/modelscope/preprocessors/base.py index 38500561..e9b85424 100644 --- a/modelscope/preprocessors/base.py +++ b/modelscope/preprocessors/base.py @@ -6,7 +6,8 @@ from typing import Any, Dict, Optional, Sequence from modelscope.metainfo import Models, 
Preprocessors from modelscope.utils.config import Config, ConfigDict -from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModeKeys, Tasks +from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Invoke, + ModeKeys, Tasks) from modelscope.utils.hub import read_config, snapshot_download from modelscope.utils.logger import get_logger from .builder import build_preprocessor @@ -194,7 +195,9 @@ class Preprocessor(ABC): """ if not os.path.exists(model_name_or_path): model_dir = snapshot_download( - model_name_or_path, revision=revision) + model_name_or_path, + revision=revision, + user_agent={Invoke.KEY: Invoke.PREPROCESSOR}) else: model_dir = model_name_or_path if cfg_dict is None: diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 7ebedce1..6d326df3 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -14,7 +14,8 @@ from modelscope.metainfo import Preprocessors from modelscope.pipelines.base import Input from modelscope.preprocessors import load_image from modelscope.utils.config import Config -from modelscope.utils.constant import Fields, ModeKeys, ModelFile, Tasks +from modelscope.utils.constant import (Fields, Invoke, ModeKeys, ModelFile, + Tasks) from .base import Preprocessor from .builder import PREPROCESSORS from .ofa import * # noqa @@ -57,7 +58,7 @@ class OfaPreprocessor(Preprocessor): Tasks.auto_speech_recognition: OfaASRPreprocessor } model_dir = model_dir if osp.exists(model_dir) else snapshot_download( - model_dir) + model_dir, user_agent={Invoke.KEY: Invoke.PREPROCESSOR}) self.cfg = Config.from_file( osp.join(model_dir, ModelFile.CONFIGURATION)) self.preprocess = preprocess_mapping[self.cfg.task]( @@ -131,7 +132,7 @@ class CLIPPreprocessor(Preprocessor): """ super().__init__(*args, **kwargs) model_dir = model_dir if osp.exists(model_dir) else snapshot_download( - model_dir) + model_dir, user_agent={Invoke.KEY: Invoke.PREPROCESSOR}) self.mode = mode # text tokenizer from modelscope.models.multi_modal.clip.bert_tokenizer import FullTokenizer diff --git a/modelscope/preprocessors/ofa/asr.py b/modelscope/preprocessors/ofa/asr.py index 928698c6..d74c2550 100644 --- a/modelscope/preprocessors/ofa/asr.py +++ b/modelscope/preprocessors/ofa/asr.py @@ -5,6 +5,7 @@ import random from pathlib import Path from typing import Any, Dict +import librosa import soundfile as sf import torch from fairseq.data.audio.feature_transforms import \ @@ -54,9 +55,13 @@ class OfaASRPreprocessor(OfaBasePreprocessor): def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: speed = random.choice([0.9, 1.0, 1.1]) - wav, sr = sf.read(self.column_map['wav']) + wav, sr = librosa.load(data[self.column_map['wav']], 16000, mono=True) fbank = self.prepare_fbank( - torch.tensor([wav], dtype=torch.float32), sr, speed, is_train=True) + torch.tensor([wav], dtype=torch.float32), + sr, + speed, + target_sample_rate=16000, + is_train=True) fbank_mask = torch.tensor([True]) sample = { 'fbank': fbank, @@ -86,11 +91,12 @@ class OfaASRPreprocessor(OfaBasePreprocessor): def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: speed = 1.0 - wav, sr = sf.read(data[self.column_map['wav']]) + wav, sr = librosa.load(data[self.column_map['wav']], 16000, mono=True) fbank = self.prepare_fbank( torch.tensor([wav], dtype=torch.float32), sr, speed, + target_sample_rate=16000, is_train=False) fbank_mask = torch.tensor([True]) diff --git a/modelscope/preprocessors/ofa/base.py 
b/modelscope/preprocessors/ofa/base.py index 64bec9c9..8f18fe7a 100644 --- a/modelscope/preprocessors/ofa/base.py +++ b/modelscope/preprocessors/ofa/base.py @@ -170,10 +170,15 @@ class OfaBasePreprocessor: else load_image(path_or_url_or_pil) return image - def prepare_fbank(self, waveform, sample_rate, speed, is_train): - waveform, _ = torchaudio.sox_effects.apply_effects_tensor( + def prepare_fbank(self, + waveform, + sample_rate, + speed, + target_sample_rate=16000, + is_train=False): + waveform, sample_rate = torchaudio.sox_effects.apply_effects_tensor( waveform, sample_rate, - [['speed', str(speed)], ['rate', str(sample_rate)]]) + [['speed', str(speed)], ['rate', str(target_sample_rate)]]) _waveform, _ = convert_waveform( waveform, sample_rate, to_mono=True, normalize_volume=True) # Kaldi compliance: 16-bit signed integers diff --git a/modelscope/trainers/audio/kws_farfield_trainer.py b/modelscope/trainers/audio/kws_farfield_trainer.py index 9d6013e9..276bf85f 100644 --- a/modelscope/trainers/audio/kws_farfield_trainer.py +++ b/modelscope/trainers/audio/kws_farfield_trainer.py @@ -8,7 +8,6 @@ import torch from torch import nn as nn from torch import optim as optim -from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Trainers from modelscope.models import Model, TorchModel from modelscope.msdatasets.task_datasets.audio import KWSDataLoader, KWSDataset @@ -54,12 +53,8 @@ class KWSFarfieldTrainer(BaseTrainer): **kwargs): if isinstance(model, str): - if os.path.exists(model): - self.model_dir = model if os.path.isdir( - model) else os.path.dirname(model) - else: - self.model_dir = snapshot_download( - model, revision=model_revision) + self.model_dir = self.get_or_download_model_dir( + model, model_revision) if cfg_file is None: cfg_file = os.path.join(self.model_dir, ModelFile.CONFIGURATION) diff --git a/modelscope/trainers/base.py b/modelscope/trainers/base.py index c0bf51f3..a2b655ed 100644 --- a/modelscope/trainers/base.py +++ b/modelscope/trainers/base.py @@ -1,11 +1,14 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
+import os import time from abc import ABC, abstractmethod from typing import Callable, Dict, List, Optional, Tuple, Union +from modelscope.hub.snapshot_download import snapshot_download from modelscope.trainers.builder import TRAINERS from modelscope.utils.config import Config +from modelscope.utils.constant import Invoke from .utils.log_buffer import LogBuffer @@ -32,6 +35,17 @@ class BaseTrainer(ABC): self.log_buffer = LogBuffer() self.timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + def get_or_download_model_dir(self, model, model_revision=None): + if os.path.exists(model): + model_cache_dir = model if os.path.isdir( + model) else os.path.dirname(model) + else: + model_cache_dir = snapshot_download( + model, + revision=model_revision, + user_agent={Invoke.KEY: Invoke.TRAINER}) + return model_cache_dir + @abstractmethod def train(self, *args, **kwargs): """ Train (and evaluate) process diff --git a/modelscope/trainers/multi_modal/clip/clip_trainer.py b/modelscope/trainers/multi_modal/clip/clip_trainer.py index 40c524ac..8ebf00b5 100644 --- a/modelscope/trainers/multi_modal/clip/clip_trainer.py +++ b/modelscope/trainers/multi_modal/clip/clip_trainer.py @@ -20,7 +20,7 @@ from modelscope.trainers.builder import TRAINERS from modelscope.trainers.optimizer.builder import build_optimizer from modelscope.utils.config import Config from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigKeys, - ModeKeys) + Invoke, ModeKeys) from .clip_trainer_utils import get_loss, get_optimizer_params, get_schedule @@ -52,7 +52,8 @@ class CLIPTrainer(EpochBasedTrainer): model_revision: Optional[str] = DEFAULT_MODEL_REVISION, seed: int = 42, **kwargs): - model = Model.from_pretrained(model, revision=model_revision) + model = Model.from_pretrained( + model, revision=model_revision, invoked_by=Invoke.TRAINER) # for training & eval, we convert the model from FP16 back to FP32 # to compatible with modelscope amp training convert_models_to_fp32(model) diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py index e27c23fd..1188fc46 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py @@ -23,7 +23,7 @@ from modelscope.trainers.optimizer.builder import build_optimizer from modelscope.trainers.parallel.utils import is_parallel from modelscope.utils.config import Config from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigKeys, - ModeKeys) + Invoke, ModeKeys) from .ofa_trainer_utils import (AdjustLabelSmoothedCrossEntropyCriterion, get_schedule) @@ -49,7 +49,8 @@ class OFATrainer(EpochBasedTrainer): model_revision: Optional[str] = DEFAULT_MODEL_REVISION, seed: int = 42, **kwargs): - model = Model.from_pretrained(model, revision=model_revision) + model = Model.from_pretrained( + model, revision=model_revision, invoked_by=Invoke.TRAINER) model_dir = model.model_dir self.cfg_modify_fn = cfg_modify_fn cfg = self.rebuild_config(Config.from_file(cfg_file)) diff --git a/modelscope/trainers/multi_modal/team/team_trainer.py b/modelscope/trainers/multi_modal/team/team_trainer.py index 7c557416..acff8044 100644 --- a/modelscope/trainers/multi_modal/team/team_trainer.py +++ b/modelscope/trainers/multi_modal/team/team_trainer.py @@ -7,21 +7,17 @@ from typing import Callable, Dict, Optional import numpy as np import torch import torch.nn as nn -import torchvision.datasets as datasets -import torchvision.transforms as transforms from sklearn.metrics import 
confusion_matrix -from torch.optim import AdamW from torch.utils.data import DataLoader, Dataset from modelscope.metainfo import Trainers from modelscope.models.base import Model -from modelscope.msdatasets import MsDataset from modelscope.trainers.base import BaseTrainer from modelscope.trainers.builder import TRAINERS -from modelscope.trainers.multi_modal.team.team_trainer_utils import ( - get_optimizer, train_mapping, val_mapping) +from modelscope.trainers.multi_modal.team.team_trainer_utils import \ + get_optimizer from modelscope.utils.config import Config -from modelscope.utils.constant import DownloadMode, ModeKeys +from modelscope.utils.constant import Invoke from modelscope.utils.logger import get_logger logger = get_logger() @@ -36,7 +32,7 @@ class TEAMImgClsTrainer(BaseTrainer): super().__init__(cfg_file) self.cfg = Config.from_file(cfg_file) - team_model = Model.from_pretrained(model) + team_model = Model.from_pretrained(model, invoked_by=Invoke.TRAINER) image_model = team_model.model.image_model.vision_transformer classification_model = nn.Sequential( OrderedDict([('encoder', image_model), diff --git a/modelscope/trainers/nlp/csanmt_translation_trainer.py b/modelscope/trainers/nlp/csanmt_translation_trainer.py index 08a3a351..3a654db2 100644 --- a/modelscope/trainers/nlp/csanmt_translation_trainer.py +++ b/modelscope/trainers/nlp/csanmt_translation_trainer.py @@ -24,8 +24,7 @@ logger = get_logger() class CsanmtTranslationTrainer(BaseTrainer): def __init__(self, model: str, cfg_file: str = None, *args, **kwargs): - if not osp.exists(model): - model = snapshot_download(model) + model = self.get_or_download_model_dir(model) tf.reset_default_graph() self.model_dir = model diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py index 5ff6f62f..65e56f9e 100644 --- a/modelscope/trainers/nlp_trainer.py +++ b/modelscope/trainers/nlp_trainer.py @@ -10,7 +10,6 @@ import torch from torch import nn from torch.utils.data import Dataset -from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Trainers from modelscope.metrics.builder import build_metric from modelscope.models.base import Model, TorchModel @@ -478,11 +477,7 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): """ if isinstance(model, str): - if os.path.exists(model): - model_dir = model if os.path.isdir(model) else os.path.dirname( - model) - else: - model_dir = snapshot_download(model, revision=model_revision) + model_dir = self.get_or_download_model_dir(model, model_revision) if cfg_file is None: cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) else: diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 3556badf..db5f6a9c 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -14,7 +14,6 @@ from torch.utils.data import DataLoader, Dataset from torch.utils.data.dataloader import default_collate from torch.utils.data.distributed import DistributedSampler -from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Trainers from modelscope.metrics import build_metric, task_default_metrics from modelscope.models.base import Model, TorchModel @@ -98,12 +97,8 @@ class EpochBasedTrainer(BaseTrainer): self._seed = seed set_random_seed(self._seed) if isinstance(model, str): - if os.path.exists(model): - self.model_dir = model if os.path.isdir( - model) else os.path.dirname(model) - else: - self.model_dir = snapshot_download( - model, revision=model_revision) + 
self.model_dir = self.get_or_download_model_dir(
+                model, model_revision)
             if cfg_file is None:
                 cfg_file = os.path.join(self.model_dir,
                                         ModelFile.CONFIGURATION)
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 007a6174..e14977f9 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -44,6 +44,7 @@ class CVTasks(object):
 
     image_segmentation = 'image-segmentation'
     semantic_segmentation = 'semantic-segmentation'
+    image_depth_estimation = 'image-depth-estimation'
     portrait_matting = 'portrait-matting'
     text_driven_segmentation = 'text-driven-segmentation'
     shop_segmentation = 'shop-segmentation'
@@ -293,6 +294,14 @@ class ModelFile(object):
     TS_MODEL_FILE = 'model.ts'
 
 
+class Invoke(object):
+    KEY = 'invoked_by'
+    PRETRAINED = 'from_pretrained'
+    PIPELINE = 'pipeline'
+    TRAINER = 'trainer'
+    PREPROCESSOR = 'preprocessor'
+
+
 class ConfigFields(object):
     """ First level keyword in configuration file
     """
diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py
index 095c36ec..0ac257e2 100644
--- a/modelscope/utils/cv/image_utils.py
+++ b/modelscope/utils/cv/image_utils.py
@@ -1,6 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
 import cv2
+import matplotlib.pyplot as plt
 import numpy as np
 
 from modelscope.outputs import OutputKeys
@@ -439,3 +440,11 @@ def show_image_object_detection_auto_result(img_path,
     if save_path is not None:
         cv2.imwrite(save_path, img)
     return img
+
+
+def depth_to_color(depth):
+    colormap = plt.get_cmap('plasma')
+    depth_color = (colormap(
+        (depth.max() - depth) / depth.max()) * 2**8).astype(np.uint8)[:, :, :3]
+    depth_color = cv2.cvtColor(depth_color, cv2.COLOR_RGB2BGR)
+    return depth_color
diff --git a/requirements/multi-modal.txt b/requirements/multi-modal.txt
index 54049c56..9c144a99 100644
--- a/requirements/multi-modal.txt
+++ b/requirements/multi-modal.txt
@@ -1,4 +1,5 @@
 ftfy>=6.0.3
+librosa
 ofa>=0.0.2
 pycocoevalcap>=1.2
 pycocotools>=2.0.4
diff --git a/tests/pipelines/test_extractive_summarization.py b/tests/pipelines/test_extractive_summarization.py
index 8bf28fd2..26ac508c 100644
--- a/tests/pipelines/test_extractive_summarization.py
+++ b/tests/pipelines/test_extractive_summarization.py
@@ -28,7 +28,7 @@ class ExtractiveSummarizationTest(unittest.TestCase, DemoCompatibilityCheck):
         result = p(documents=documents)
         return result
 
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_doc(self):
         logger.info(
             'Run doc extractive summarization (PoNet) with one document ...')
@@ -37,7 +37,7 @@ class ExtractiveSummarizationTest(unittest.TestCase, DemoCompatibilityCheck):
             model_id=self.ponet_doc_model_id, documents=self.sentences)
         print(result[OutputKeys.TEXT])
 
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_topic(self):
         logger.info(
             'Run topic extractive summarization (PoNet) with one document ...')
diff --git a/tests/pipelines/test_image_depth_estimation.py b/tests/pipelines/test_image_depth_estimation.py
new file mode 100644
index 00000000..856734f8
--- /dev/null
+++ b/tests/pipelines/test_image_depth_estimation.py
@@ -0,0 +1,35 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import unittest
+
+import cv2
+import numpy as np
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import depth_to_color
+from modelscope.utils.demo_utils import DemoCompatibilityCheck
+from modelscope.utils.test_utils import test_level
+
+
+class ImageDepthEstimationTest(unittest.TestCase, DemoCompatibilityCheck):
+
+    def setUp(self) -> None:
+        self.task = 'image-depth-estimation'
+        self.model_id = 'damo/cv_newcrfs_image-depth-estimation_indoor'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_image_depth_estimation(self):
+        input_location = 'data/test/images/image_depth_estimation.jpg'
+        estimator = pipeline(Tasks.image_depth_estimation, model=self.model_id)
+        result = estimator(input_location)
+        depths = result[OutputKeys.DEPTHS]
+        depth_viz = depth_to_color(depths[0].squeeze().cpu().numpy())
+        cv2.imwrite('result.jpg', depth_viz)
+
+        print('test_image_depth_estimation DONE')
+
+
+if __name__ == '__main__':
+    unittest.main()
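
The pipeline constructors in this patch converge on one shape: call super().__init__() first so the base Pipeline resolves a model id or local path into a Model instance, then build the preprocessor from self.model.model_dir and switch the model to eval mode. Below is a minimal sketch of that shape, not part of the diff: ExampleTaskPipeline, the generic Preprocessor.from_pretrained call and the 128 sequence length are illustrative stand-ins for the task-specific pieces used above.

from typing import Any, Dict, Optional, Union

from modelscope.models import Model
from modelscope.pipelines.base import Pipeline
from modelscope.preprocessors import Preprocessor


class ExampleTaskPipeline(Pipeline):
    """Hypothetical pipeline illustrating the constructor pattern."""

    def __init__(self,
                 model: Union[Model, str],
                 preprocessor: Optional[Preprocessor] = None,
                 **kwargs):
        # The base class turns a model id or local dir into a Model instance,
        # so self.model and self.model.model_dir are available afterwards.
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
        if preprocessor is None:
            # Build the preprocessor lazily from the resolved model directory.
            self.preprocessor = Preprocessor.from_pretrained(
                self.model.model_dir,
                sequence_length=kwargs.pop('sequence_length', 128))
        self.model.eval()

    def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
        # Pass model outputs through unchanged, as SummarizationPipeline does.
        return inputs

Building the preprocessor after super().__init__() avoids resolving the model twice and keeps Model.from_pretrained as the single download path.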
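
The Invoke constants added to modelscope/utils/constant.py tag who triggered a hub download: the trainers above pass invoked_by=Invoke.TRAINER through Model.from_pretrained, while Preprocessor.from_pretrained and BaseTrainer.get_or_download_model_dir hand snapshot_download a user_agent dict keyed by Invoke.KEY. A rough sketch of both call styles, assuming an installed modelscope; the model id is only a placeholder.

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models import Model
from modelscope.utils.constant import Invoke

# Style used by the trainers: forward the tag through Model.from_pretrained.
model = Model.from_pretrained(
    'damo/cv_newcrfs_image-depth-estimation_indoor',
    invoked_by=Invoke.TRAINER)

# Style used by preprocessors and BaseTrainer.get_or_download_model_dir:
# attach the tag directly as a user agent on snapshot_download.
model_dir = snapshot_download(
    'damo/cv_newcrfs_image-depth-estimation_indoor',
    user_agent={Invoke.KEY: Invoke.PREPROCESSOR})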