| @@ -0,0 +1,3 @@ | |||
| version https://git-lfs.github.com/spec/v1 | |||
| oid sha256:3b230497f6ca10be42aed92b86db435d74fd7306746a059b4ad1e0d6b0652806 | |||
| size 35694 | |||
| @@ -36,6 +36,7 @@ class Models(object): | |||
| swinL_semantic_segmentation = 'swinL-semantic-segmentation' | |||
| vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' | |||
| text_driven_segmentation = 'text-driven-segmentation' | |||
| newcrfs_depth_estimation = 'newcrfs-depth-estimation' | |||
| resnet50_bert = 'resnet50-bert' | |||
| referring_video_object_segmentation = 'swinT-referring-video-object-segmentation' | |||
| fer = 'fer' | |||
| @@ -210,6 +211,7 @@ class Pipelines(object): | |||
| video_summarization = 'googlenet_pgl_video_summarization' | |||
| language_guided_video_summarization = 'clip-it-video-summarization' | |||
| image_semantic_segmentation = 'image-semantic-segmentation' | |||
| image_depth_estimation = 'image-depth-estimation' | |||
| image_reid_person = 'passvitb-image-reid-person' | |||
| image_inpainting = 'fft-inpainting' | |||
| text_driven_segmentation = 'text-driven-segmentation' | |||
| @@ -5,10 +5,10 @@ from abc import ABC, abstractmethod | |||
| from typing import Any, Callable, Dict, List, Optional, Union | |||
| from modelscope.hub.snapshot_download import snapshot_download | |||
| from modelscope.models.builder import MODELS, build_model | |||
| from modelscope.models.builder import build_model | |||
| from modelscope.utils.checkpoint import save_checkpoint, save_pretrained | |||
| from modelscope.utils.config import Config | |||
| from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile, Tasks | |||
| from modelscope.utils.constant import DEFAULT_MODEL_REVISION, Invoke, ModelFile | |||
| from modelscope.utils.device import verify_device | |||
| from modelscope.utils.logger import get_logger | |||
| @@ -94,6 +94,10 @@ class Model(ABC): | |||
| if prefetched is not None: | |||
| kwargs.pop('model_prefetched') | |||
| invoked_by = kwargs.get(Invoke.KEY) | |||
| if invoked_by is not None: | |||
| kwargs.pop(Invoke.KEY) | |||
| if osp.exists(model_name_or_path): | |||
| local_model_dir = model_name_or_path | |||
| else: | |||
| @@ -101,7 +105,13 @@ class Model(ABC): | |||
| raise RuntimeError( | |||
| 'Expected the model to be pre-fetched locally, but it was not found.' | |||
| ) | |||
| local_model_dir = snapshot_download(model_name_or_path, revision) | |||
| if invoked_by is not None: | |||
| invoked_by = '%s/%s' % (Invoke.KEY, invoked_by) | |||
| else: | |||
| invoked_by = '%s/%s' % (Invoke.KEY, Invoke.PRETRAINED) | |||
| local_model_dir = snapshot_download( | |||
| model_name_or_path, revision, user_agent=invoked_by) | |||
| logger.info(f'initialize model from {local_model_dir}') | |||
| if cfg_dict is not None: | |||
| cfg = cfg_dict | |||
| @@ -133,6 +143,7 @@ class Model(ABC): | |||
| model.cfg = cfg | |||
| model.name = model_name_or_path | |||
| model.model_dir = local_model_dir | |||
| return model | |||
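| # Editorial note (illustrative, not part of this diff): the lines above thread a | |||
| # download-statistics tag through snapshot_download. Callers may pass Invoke.KEY in | |||
| # kwargs (as a pipeline builder would be expected to do); otherwise the tag defaults | |||
| # to Invoke.PRETRAINED, so a plain call such as | |||
| #     model = Model.from_pretrained('damo/cv_newcrfs_image-depth-estimation') | |||
| # ends up calling snapshot_download(..., user_agent=f'{Invoke.KEY}/{Invoke.PRETRAINED}'). | |||
| # The model id above is only an example. | |||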
| def save_pretrained(self, | |||
| @@ -224,8 +224,8 @@ class BodyKeypointsDetection3D(TorchModel): | |||
| lst_pose2d_cannoical.append(pose2d_canonical[:, | |||
| i - pad:i + pad + 1]) | |||
| input_pose2d_rr = torch.concat(lst_pose2d_cannoical, axis=0) | |||
| input_pose2d_cannoical = torch.concat(lst_pose2d_cannoical, axis=0) | |||
| input_pose2d_rr = torch.cat(lst_pose2d_cannoical, axis=0) | |||
| input_pose2d_cannoical = torch.cat(lst_pose2d_cannoical, axis=0) | |||
| if self.cfg.model.MODEL.USE_CANONICAL_COORDS: | |||
| input_pose2d_abs = input_pose2d_cannoical.clone() | |||
| @@ -0,0 +1 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| @@ -0,0 +1 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| @@ -0,0 +1,215 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from .newcrf_layers import NewCRF | |||
| from .swin_transformer import SwinTransformer | |||
| from .uper_crf_head import PSP | |||
| class NewCRFDepth(nn.Module): | |||
| """ | |||
| Depth network based on neural window FC-CRFs architecture. | |||
| """ | |||
| def __init__(self, | |||
| version=None, | |||
| inv_depth=False, | |||
| pretrained=None, | |||
| frozen_stages=-1, | |||
| min_depth=0.1, | |||
| max_depth=100.0, | |||
| **kwargs): | |||
| super().__init__() | |||
| self.inv_depth = inv_depth | |||
| self.with_auxiliary_head = False | |||
| self.with_neck = False | |||
| norm_cfg = dict(type='BN', requires_grad=True) | |||
| # norm_cfg = dict(type='GN', requires_grad=True, num_groups=8) | |||
| window_size = int(version[-2:]) | |||
| if version[:-2] == 'base': | |||
| embed_dim = 128 | |||
| depths = [2, 2, 18, 2] | |||
| num_heads = [4, 8, 16, 32] | |||
| in_channels = [128, 256, 512, 1024] | |||
| elif version[:-2] == 'large': | |||
| embed_dim = 192 | |||
| depths = [2, 2, 18, 2] | |||
| num_heads = [6, 12, 24, 48] | |||
| in_channels = [192, 384, 768, 1536] | |||
| elif version[:-2] == 'tiny': | |||
| embed_dim = 96 | |||
| depths = [2, 2, 6, 2] | |||
| num_heads = [3, 6, 12, 24] | |||
| in_channels = [96, 192, 384, 768] | |||
| backbone_cfg = dict( | |||
| embed_dim=embed_dim, | |||
| depths=depths, | |||
| num_heads=num_heads, | |||
| window_size=window_size, | |||
| ape=False, | |||
| drop_path_rate=0.3, | |||
| patch_norm=True, | |||
| use_checkpoint=False, | |||
| frozen_stages=frozen_stages) | |||
| embed_dim = 512 | |||
| decoder_cfg = dict( | |||
| in_channels=in_channels, | |||
| in_index=[0, 1, 2, 3], | |||
| pool_scales=(1, 2, 3, 6), | |||
| channels=embed_dim, | |||
| dropout_ratio=0.0, | |||
| num_classes=32, | |||
| norm_cfg=norm_cfg, | |||
| align_corners=False) | |||
| self.backbone = SwinTransformer(**backbone_cfg) | |||
| # v_dim = decoder_cfg['num_classes'] * 4 | |||
| win = 7 | |||
| crf_dims = [128, 256, 512, 1024] | |||
| v_dims = [64, 128, 256, embed_dim] | |||
| self.crf3 = NewCRF( | |||
| input_dim=in_channels[3], | |||
| embed_dim=crf_dims[3], | |||
| window_size=win, | |||
| v_dim=v_dims[3], | |||
| num_heads=32) | |||
| self.crf2 = NewCRF( | |||
| input_dim=in_channels[2], | |||
| embed_dim=crf_dims[2], | |||
| window_size=win, | |||
| v_dim=v_dims[2], | |||
| num_heads=16) | |||
| self.crf1 = NewCRF( | |||
| input_dim=in_channels[1], | |||
| embed_dim=crf_dims[1], | |||
| window_size=win, | |||
| v_dim=v_dims[1], | |||
| num_heads=8) | |||
| self.crf0 = NewCRF( | |||
| input_dim=in_channels[0], | |||
| embed_dim=crf_dims[0], | |||
| window_size=win, | |||
| v_dim=v_dims[0], | |||
| num_heads=4) | |||
| self.decoder = PSP(**decoder_cfg) | |||
| self.disp_head1 = DispHead(input_dim=crf_dims[0]) | |||
| self.up_mode = 'bilinear' | |||
| if self.up_mode == 'mask': | |||
| self.mask_head = nn.Sequential( | |||
| nn.Conv2d(crf_dims[0], 64, 3, padding=1), | |||
| nn.ReLU(inplace=True), nn.Conv2d(64, 16 * 9, 1, padding=0)) | |||
| self.min_depth = min_depth | |||
| self.max_depth = max_depth | |||
| self.init_weights(pretrained=pretrained) | |||
| def init_weights(self, pretrained=None): | |||
| """Initialize the weights in backbone and heads. | |||
| Args: | |||
| pretrained (str, optional): Path to pre-trained weights. | |||
| Defaults to None. | |||
| """ | |||
| # print(f'== Load encoder backbone from: {pretrained}') | |||
| self.backbone.init_weights(pretrained=pretrained) | |||
| self.decoder.init_weights() | |||
| if self.with_auxiliary_head: | |||
| if isinstance(self.auxiliary_head, nn.ModuleList): | |||
| for aux_head in self.auxiliary_head: | |||
| aux_head.init_weights() | |||
| else: | |||
| self.auxiliary_head.init_weights() | |||
| def upsample_mask(self, disp, mask): | |||
| """ Upsample disp [H/4, W/4, 1] -> [H, W, 1] using convex combination """ | |||
| N, _, H, W = disp.shape | |||
| mask = mask.view(N, 1, 9, 4, 4, H, W) | |||
| mask = torch.softmax(mask, dim=2) | |||
| up_disp = F.unfold(disp, kernel_size=3, padding=1) | |||
| up_disp = up_disp.view(N, 1, 9, 1, 1, H, W) | |||
| up_disp = torch.sum(mask * up_disp, dim=2) | |||
| up_disp = up_disp.permute(0, 1, 4, 2, 5, 3) | |||
| return up_disp.reshape(N, 1, 4 * H, 4 * W) | |||
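| # Shape walkthrough (editorial, illustrative): disp is (N, 1, H, W) and mask is | |||
| # (N, 144, H, W), viewed as (N, 1, 9, 4, 4, H, W); softmax over the 9 neighbours gives | |||
| # per-pixel convex weights, unfold gathers each 3x3 neighbourhood of disp, the weighted | |||
| # sum is (N, 1, 4, 4, H, W), and the final permute/reshape yields (N, 1, 4H, 4W). | |||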
| def forward(self, imgs): | |||
| feats = self.backbone(imgs) | |||
| if self.with_neck: | |||
| feats = self.neck(feats) | |||
| ppm_out = self.decoder(feats) | |||
| e3 = self.crf3(feats[3], ppm_out) | |||
| e3 = nn.PixelShuffle(2)(e3) | |||
| e2 = self.crf2(feats[2], e3) | |||
| e2 = nn.PixelShuffle(2)(e2) | |||
| e1 = self.crf1(feats[1], e2) | |||
| e1 = nn.PixelShuffle(2)(e1) | |||
| e0 = self.crf0(feats[0], e1) | |||
| if self.up_mode == 'mask': | |||
| mask = self.mask_head(e0) | |||
| d1 = self.disp_head1(e0, 1) | |||
| d1 = self.upsample_mask(d1, mask) | |||
| else: | |||
| d1 = self.disp_head1(e0, 4) | |||
| depth = d1 * self.max_depth | |||
| return depth | |||
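| # Minimal usage sketch (editorial; assumes weights are loaded separately, e.g. via | |||
| # init_weights or the ModelScope checkpoint loader). 'large07' selects the Swin-Large | |||
| # backbone with window size 7, as parsed at the top of __init__: | |||
| # | |||
| #   model = NewCRFDepth(version='large07', max_depth=10.0).eval() | |||
| #   with torch.no_grad(): | |||
| #       depth = model(torch.randn(1, 3, 480, 640))  # -> (1, 1, 480, 640) metric depth | |||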
| class DispHead(nn.Module): | |||
| def __init__(self, input_dim=100): | |||
| super(DispHead, self).__init__() | |||
| # self.norm1 = nn.BatchNorm2d(input_dim) | |||
| self.conv1 = nn.Conv2d(input_dim, 1, 3, padding=1) | |||
| # self.relu = nn.ReLU(inplace=True) | |||
| self.sigmoid = nn.Sigmoid() | |||
| def forward(self, x, scale): | |||
| # x = self.relu(self.norm1(x)) | |||
| x = self.sigmoid(self.conv1(x)) | |||
| if scale > 1: | |||
| x = upsample(x, scale_factor=scale) | |||
| return x | |||
| class DispUnpack(nn.Module): | |||
| def __init__(self, input_dim=100, hidden_dim=128): | |||
| super(DispUnpack, self).__init__() | |||
| self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1) | |||
| self.conv2 = nn.Conv2d(hidden_dim, 16, 3, padding=1) | |||
| self.relu = nn.ReLU(inplace=True) | |||
| self.sigmoid = nn.Sigmoid() | |||
| self.pixel_shuffle = nn.PixelShuffle(4) | |||
| def forward(self, x, output_size): | |||
| x = self.relu(self.conv1(x)) | |||
| x = self.sigmoid(self.conv2(x)) # [b, 16, h/4, w/4] | |||
| # x = torch.reshape(x, [x.shape[0], 1, x.shape[2]*4, x.shape[3]*4]) | |||
| x = self.pixel_shuffle(x) | |||
| return x | |||
| def upsample(x, scale_factor=2, mode='bilinear', align_corners=False): | |||
| """Upsample input tensor by a factor of 2 | |||
| """ | |||
| return F.interpolate( | |||
| x, scale_factor=scale_factor, mode=mode, align_corners=align_corners) | |||
| @@ -0,0 +1,504 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import numpy as np | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| import torch.utils.checkpoint as checkpoint | |||
| from timm.models.layers import DropPath, to_2tuple, trunc_normal_ | |||
| class Mlp(nn.Module): | |||
| """ Multilayer perceptron.""" | |||
| def __init__(self, | |||
| in_features, | |||
| hidden_features=None, | |||
| out_features=None, | |||
| act_layer=nn.GELU, | |||
| drop=0.): | |||
| super().__init__() | |||
| out_features = out_features or in_features | |||
| hidden_features = hidden_features or in_features | |||
| self.fc1 = nn.Linear(in_features, hidden_features) | |||
| self.act = act_layer() | |||
| self.fc2 = nn.Linear(hidden_features, out_features) | |||
| self.drop = nn.Dropout(drop) | |||
| def forward(self, x): | |||
| x = self.fc1(x) | |||
| x = self.act(x) | |||
| x = self.drop(x) | |||
| x = self.fc2(x) | |||
| x = self.drop(x) | |||
| return x | |||
| def window_partition(x, window_size): | |||
| """ | |||
| Args: | |||
| x: (B, H, W, C) | |||
| window_size (int): window size | |||
| Returns: | |||
| windows: (num_windows*B, window_size, window_size, C) | |||
| """ | |||
| B, H, W, C = x.shape | |||
| x = x.view(B, H // window_size, window_size, W // window_size, window_size, | |||
| C) | |||
| windows = x.permute(0, 1, 3, 2, 4, | |||
| 5).contiguous().view(-1, window_size, window_size, C) | |||
| return windows | |||
| def window_reverse(windows, window_size, H, W): | |||
| """ | |||
| Args: | |||
| windows: (num_windows*B, window_size, window_size, C) | |||
| window_size (int): Window size | |||
| H (int): Height of image | |||
| W (int): Width of image | |||
| Returns: | |||
| x: (B, H, W, C) | |||
| """ | |||
| B = int(windows.shape[0] / (H * W / window_size / window_size)) | |||
| x = windows.view(B, H // window_size, W // window_size, window_size, | |||
| window_size, -1) | |||
| x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) | |||
| return x | |||
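| # Illustrative round trip (editorial): for x of shape (2, 56, 56, 96) and window_size=7, | |||
| # window_partition(x, 7) returns (2*8*8, 7, 7, 96) = (128, 7, 7, 96) windows, and | |||
| # window_reverse(windows, 7, 56, 56) restores the original (2, 56, 56, 96) layout. | |||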
| class WindowAttention(nn.Module): | |||
| """ Window based multi-head self attention (W-MSA) module with relative position bias. | |||
| It supports both shifted and non-shifted windows. | |||
| Args: | |||
| dim (int): Number of input channels. | |||
| window_size (tuple[int]): The height and width of the window. | |||
| num_heads (int): Number of attention heads. | |||
| qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True | |||
| qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set | |||
| attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 | |||
| proj_drop (float, optional): Dropout ratio of output. Default: 0.0 | |||
| """ | |||
| def __init__(self, | |||
| dim, | |||
| window_size, | |||
| num_heads, | |||
| v_dim, | |||
| qkv_bias=True, | |||
| qk_scale=None, | |||
| attn_drop=0., | |||
| proj_drop=0.): | |||
| super().__init__() | |||
| self.dim = dim | |||
| self.window_size = window_size # Wh, Ww | |||
| self.num_heads = num_heads | |||
| head_dim = dim // num_heads | |||
| self.scale = qk_scale or head_dim**-0.5 | |||
| # define a parameter table of relative position bias | |||
| self.relative_position_bias_table = nn.Parameter( | |||
| torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), | |||
| num_heads)) # 2*Wh-1 * 2*Ww-1, nH | |||
| # get pair-wise relative position index for each token inside the window | |||
| coords_h = torch.arange(self.window_size[0]) | |||
| coords_w = torch.arange(self.window_size[1]) | |||
| coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww | |||
| coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww | |||
| relative_coords = coords_flatten[:, :, | |||
| None] - coords_flatten[:, | |||
| None, :] # 2, Wh*Ww, Wh*Ww | |||
| relative_coords = relative_coords.permute( | |||
| 1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 | |||
| relative_coords[:, :, | |||
| 0] += self.window_size[0] - 1 # shift to start from 0 | |||
| relative_coords[:, :, 1] += self.window_size[1] - 1 | |||
| relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 | |||
| relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww | |||
| self.register_buffer('relative_position_index', | |||
| relative_position_index) | |||
| self.qk = nn.Linear(dim, dim * 2, bias=qkv_bias) | |||
| self.attn_drop = nn.Dropout(attn_drop) | |||
| self.proj = nn.Linear(v_dim, v_dim) | |||
| self.proj_drop = nn.Dropout(proj_drop) | |||
| trunc_normal_(self.relative_position_bias_table, std=.02) | |||
| self.softmax = nn.Softmax(dim=-1) | |||
| def forward(self, x, v, mask=None): | |||
| """ Forward function. | |||
| Args: | |||
| x: input features with shape of (num_windows*B, N, C) | |||
| v: value features with shape of (num_windows*B, N, C), taken from the coarser level | |||
| mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None | |||
| """ | |||
| B_, N, C = x.shape | |||
| qk = self.qk(x).reshape(B_, N, 2, self.num_heads, | |||
| C // self.num_heads).permute(2, 0, 3, 1, 4) | |||
| q, k = qk[0], qk[ | |||
| 1] # make torchscript happy (cannot use tensor as tuple) | |||
| q = q * self.scale | |||
| attn = (q @ k.transpose(-2, -1)) | |||
| relative_position_bias = self.relative_position_bias_table[ | |||
| self.relative_position_index.view(-1)].view( | |||
| self.window_size[0] * self.window_size[1], | |||
| self.window_size[0] * self.window_size[1], | |||
| -1) # Wh*Ww,Wh*Ww,nH | |||
| relative_position_bias = relative_position_bias.permute( | |||
| 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww | |||
| attn = attn + relative_position_bias.unsqueeze(0) | |||
| if mask is not None: | |||
| nW = mask.shape[0] | |||
| attn = attn.view(B_ // nW, nW, self.num_heads, N, | |||
| N) + mask.unsqueeze(1).unsqueeze(0) | |||
| attn = attn.view(-1, self.num_heads, N, N) | |||
| attn = self.softmax(attn) | |||
| else: | |||
| attn = self.softmax(attn) | |||
| attn = self.attn_drop(attn) | |||
| # assert self.dim % v.shape[-1] == 0, "self.dim % v.shape[-1] != 0" | |||
| # repeat_num = self.dim // v.shape[-1] | |||
| # v = v.view(B_, N, self.num_heads // repeat_num, -1).transpose(1, 2).repeat(1, repeat_num, 1, 1) | |||
| assert self.dim == v.shape[-1], 'self.dim != v.shape[-1]' | |||
| v = v.view(B_, N, self.num_heads, -1).transpose(1, 2) | |||
| x = (attn @ v).transpose(1, 2).reshape(B_, N, C) | |||
| x = self.proj(x) | |||
| x = self.proj_drop(x) | |||
| return x | |||
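| # Editorial note: unlike the plain Swin self-attention in swin_transformer.py, this | |||
| # module computes only q and k from x (the encoder feature) and takes v from the | |||
| # coarser prediction map, so each window performs one rescoring / message-passing step | |||
| # of the neural window FC-CRF. Illustrative shapes for C = 512, num_heads = 16, | |||
| # window 7: x, v are (nW*B, 49, 512); q, k are (nW*B, 16, 49, 32); attn is | |||
| # (nW*B, 16, 49, 49); (attn @ v) is reshaped back to (nW*B, 49, 512) before proj. | |||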
| class CRFBlock(nn.Module): | |||
| """ CRF Block. | |||
| Args: | |||
| dim (int): Number of input channels. | |||
| num_heads (int): Number of attention heads. | |||
| window_size (int): Window size. | |||
| shift_size (int): Shift size for SW-MSA. | |||
| mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. | |||
| qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True | |||
| qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. | |||
| drop (float, optional): Dropout rate. Default: 0.0 | |||
| attn_drop (float, optional): Attention dropout rate. Default: 0.0 | |||
| drop_path (float, optional): Stochastic depth rate. Default: 0.0 | |||
| act_layer (nn.Module, optional): Activation layer. Default: nn.GELU | |||
| norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm | |||
| """ | |||
| def __init__(self, | |||
| dim, | |||
| num_heads, | |||
| v_dim, | |||
| window_size=7, | |||
| shift_size=0, | |||
| mlp_ratio=4., | |||
| qkv_bias=True, | |||
| qk_scale=None, | |||
| drop=0., | |||
| attn_drop=0., | |||
| drop_path=0., | |||
| act_layer=nn.GELU, | |||
| norm_layer=nn.LayerNorm): | |||
| super().__init__() | |||
| self.dim = dim | |||
| self.num_heads = num_heads | |||
| self.v_dim = v_dim | |||
| self.window_size = window_size | |||
| self.shift_size = shift_size | |||
| self.mlp_ratio = mlp_ratio | |||
| assert 0 <= self.shift_size < self.window_size, 'shift_size must be in [0, window_size)' | |||
| self.norm1 = norm_layer(dim) | |||
| self.attn = WindowAttention( | |||
| dim, | |||
| window_size=to_2tuple(self.window_size), | |||
| num_heads=num_heads, | |||
| v_dim=v_dim, | |||
| qkv_bias=qkv_bias, | |||
| qk_scale=qk_scale, | |||
| attn_drop=attn_drop, | |||
| proj_drop=drop) | |||
| self.drop_path = DropPath( | |||
| drop_path) if drop_path > 0. else nn.Identity() | |||
| self.norm2 = norm_layer(v_dim) | |||
| mlp_hidden_dim = int(v_dim * mlp_ratio) | |||
| self.mlp = Mlp( | |||
| in_features=v_dim, | |||
| hidden_features=mlp_hidden_dim, | |||
| act_layer=act_layer, | |||
| drop=drop) | |||
| self.H = None | |||
| self.W = None | |||
| def forward(self, x, v, mask_matrix): | |||
| """ Forward function. | |||
| Args: | |||
| x: Input feature, tensor size (B, H*W, C). | |||
| v: Value feature from the coarser level, tensor size (B, H, W, C_v). | |||
| H, W: Spatial resolution of the input feature. | |||
| mask_matrix: Attention mask for cyclic shift. | |||
| """ | |||
| B, L, C = x.shape | |||
| H, W = self.H, self.W | |||
| assert L == H * W, 'input feature has wrong size' | |||
| shortcut = x | |||
| x = self.norm1(x) | |||
| x = x.view(B, H, W, C) | |||
| # pad feature maps to multiples of window size | |||
| pad_l = pad_t = 0 | |||
| pad_r = (self.window_size - W % self.window_size) % self.window_size | |||
| pad_b = (self.window_size - H % self.window_size) % self.window_size | |||
| x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) | |||
| v = F.pad(v, (0, 0, pad_l, pad_r, pad_t, pad_b)) | |||
| _, Hp, Wp, _ = x.shape | |||
| # cyclic shift | |||
| if self.shift_size > 0: | |||
| shifted_x = torch.roll( | |||
| x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) | |||
| shifted_v = torch.roll( | |||
| v, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) | |||
| attn_mask = mask_matrix | |||
| else: | |||
| shifted_x = x | |||
| shifted_v = v | |||
| attn_mask = None | |||
| # partition windows | |||
| x_windows = window_partition( | |||
| shifted_x, self.window_size) # nW*B, window_size, window_size, C | |||
| x_windows = x_windows.view(-1, self.window_size * self.window_size, | |||
| C) # nW*B, window_size*window_size, C | |||
| v_windows = window_partition( | |||
| shifted_v, self.window_size) # nW*B, window_size, window_size, C | |||
| v_windows = v_windows.view( | |||
| -1, self.window_size * self.window_size, | |||
| v_windows.shape[-1]) # nW*B, window_size*window_size, C | |||
| # W-MSA/SW-MSA | |||
| attn_windows = self.attn( | |||
| x_windows, v_windows, | |||
| mask=attn_mask) # nW*B, window_size*window_size, C | |||
| # merge windows | |||
| attn_windows = attn_windows.view(-1, self.window_size, | |||
| self.window_size, self.v_dim) | |||
| shifted_x = window_reverse(attn_windows, self.window_size, Hp, | |||
| Wp) # B H' W' C | |||
| # reverse cyclic shift | |||
| if self.shift_size > 0: | |||
| x = torch.roll( | |||
| shifted_x, | |||
| shifts=(self.shift_size, self.shift_size), | |||
| dims=(1, 2)) | |||
| else: | |||
| x = shifted_x | |||
| if pad_r > 0 or pad_b > 0: | |||
| x = x[:, :H, :W, :].contiguous() | |||
| x = x.view(B, H * W, self.v_dim) | |||
| # FFN | |||
| x = shortcut + self.drop_path(x) | |||
| x = x + self.drop_path(self.mlp(self.norm2(x))) | |||
| return x | |||
| class BasicCRFLayer(nn.Module): | |||
| """ A basic NeWCRFs layer for one stage. | |||
| Args: | |||
| dim (int): Number of feature channels | |||
| depth (int): Depths of this stage. | |||
| num_heads (int): Number of attention head. | |||
| window_size (int): Local window size. Default: 7. | |||
| mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. | |||
| qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True | |||
| qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. | |||
| drop (float, optional): Dropout rate. Default: 0.0 | |||
| attn_drop (float, optional): Attention dropout rate. Default: 0.0 | |||
| drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 | |||
| norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm | |||
| downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None | |||
| use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. | |||
| """ | |||
| def __init__(self, | |||
| dim, | |||
| depth, | |||
| num_heads, | |||
| v_dim, | |||
| window_size=7, | |||
| mlp_ratio=4., | |||
| qkv_bias=True, | |||
| qk_scale=None, | |||
| drop=0., | |||
| attn_drop=0., | |||
| drop_path=0., | |||
| norm_layer=nn.LayerNorm, | |||
| downsample=None, | |||
| use_checkpoint=False): | |||
| super().__init__() | |||
| self.window_size = window_size | |||
| self.shift_size = window_size // 2 | |||
| self.depth = depth | |||
| self.use_checkpoint = use_checkpoint | |||
| # build blocks | |||
| self.blocks = nn.ModuleList([ | |||
| CRFBlock( | |||
| dim=dim, | |||
| num_heads=num_heads, | |||
| v_dim=v_dim, | |||
| window_size=window_size, | |||
| shift_size=0 if (i % 2 == 0) else window_size // 2, | |||
| mlp_ratio=mlp_ratio, | |||
| qkv_bias=qkv_bias, | |||
| qk_scale=qk_scale, | |||
| drop=drop, | |||
| attn_drop=attn_drop, | |||
| drop_path=drop_path[i] | |||
| if isinstance(drop_path, list) else drop_path, | |||
| norm_layer=norm_layer) for i in range(depth) | |||
| ]) | |||
| # patch merging layer | |||
| if downsample is not None: | |||
| self.downsample = downsample(dim=dim, norm_layer=norm_layer) | |||
| else: | |||
| self.downsample = None | |||
| def forward(self, x, v, H, W): | |||
| """ Forward function. | |||
| Args: | |||
| x: Input feature, tensor size (B, H*W, C). | |||
| v: Value feature from the coarser level, tensor size (B, H, W, C_v). | |||
| H, W: Spatial resolution of the input feature. | |||
| """ | |||
| # calculate attention mask for SW-MSA | |||
| Hp = int(np.ceil(H / self.window_size)) * self.window_size | |||
| Wp = int(np.ceil(W / self.window_size)) * self.window_size | |||
| img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 | |||
| h_slices = (slice(0, -self.window_size), | |||
| slice(-self.window_size, | |||
| -self.shift_size), slice(-self.shift_size, None)) | |||
| w_slices = (slice(0, -self.window_size), | |||
| slice(-self.window_size, | |||
| -self.shift_size), slice(-self.shift_size, None)) | |||
| cnt = 0 | |||
| for h in h_slices: | |||
| for w in w_slices: | |||
| img_mask[:, h, w, :] = cnt | |||
| cnt += 1 | |||
| mask_windows = window_partition( | |||
| img_mask, self.window_size) # nW, window_size, window_size, 1 | |||
| mask_windows = mask_windows.view(-1, | |||
| self.window_size * self.window_size) | |||
| attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) | |||
| attn_mask = attn_mask.masked_fill(attn_mask != 0, | |||
| float(-100.0)).masked_fill( | |||
| attn_mask == 0, float(0.0)) | |||
| for blk in self.blocks: | |||
| blk.H, blk.W = H, W | |||
| if self.use_checkpoint: | |||
| x = checkpoint.checkpoint(blk, x, v, attn_mask) | |||
| else: | |||
| x = blk(x, v, attn_mask) | |||
| if self.downsample is not None: | |||
| x_down = self.downsample(x, H, W) | |||
| Wh, Ww = (H + 1) // 2, (W + 1) // 2 | |||
| return x, H, W, x_down, Wh, Ww | |||
| else: | |||
| return x, H, W, x, H, W | |||
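| # Editorial sketch of the SW-MSA mask built above (assuming window_size=7, so | |||
| # shift_size=3): the h/w slices split the padded map into 3x3 = 9 labelled regions; | |||
| # after window_partition, token pairs with different labels receive -100 on their | |||
| # attention logits, so windows straddling the cyclic-shift seam cannot attend across | |||
| # originally non-adjacent content. | |||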
| class NewCRF(nn.Module): | |||
| def __init__(self, | |||
| input_dim=96, | |||
| embed_dim=96, | |||
| v_dim=64, | |||
| window_size=7, | |||
| num_heads=4, | |||
| depth=2, | |||
| patch_size=4, | |||
| in_chans=3, | |||
| norm_layer=nn.LayerNorm, | |||
| patch_norm=True): | |||
| super().__init__() | |||
| self.embed_dim = embed_dim | |||
| self.patch_norm = patch_norm | |||
| if input_dim != embed_dim: | |||
| self.proj_x = nn.Conv2d(input_dim, embed_dim, 3, padding=1) | |||
| else: | |||
| self.proj_x = None | |||
| if v_dim != embed_dim: | |||
| self.proj_v = nn.Conv2d(v_dim, embed_dim, 3, padding=1) | |||
| elif embed_dim % v_dim == 0: | |||
| self.proj_v = None | |||
| v_dim = embed_dim | |||
| assert v_dim == embed_dim | |||
| self.crf_layer = BasicCRFLayer( | |||
| dim=embed_dim, | |||
| depth=depth, | |||
| num_heads=num_heads, | |||
| v_dim=v_dim, | |||
| window_size=window_size, | |||
| mlp_ratio=4., | |||
| qkv_bias=True, | |||
| qk_scale=None, | |||
| drop=0., | |||
| attn_drop=0., | |||
| drop_path=0., | |||
| norm_layer=norm_layer, | |||
| downsample=None, | |||
| use_checkpoint=False) | |||
| layer = norm_layer(embed_dim) | |||
| layer_name = 'norm_crf' | |||
| self.add_module(layer_name, layer) | |||
| def forward(self, x, v): | |||
| if self.proj_x is not None: | |||
| x = self.proj_x(x) | |||
| if self.proj_v is not None: | |||
| v = self.proj_v(v) | |||
| Wh, Ww = x.size(2), x.size(3) | |||
| x = x.flatten(2).transpose(1, 2) | |||
| v = v.transpose(1, 2).transpose(2, 3) | |||
| x_out, H, W, x, Wh, Ww = self.crf_layer(x, v, Wh, Ww) | |||
| norm_layer = getattr(self, 'norm_crf') | |||
| x_out = norm_layer(x_out) | |||
| out = x_out.view(-1, H, W, self.embed_dim).permute(0, 3, 1, | |||
| 2).contiguous() | |||
| return out | |||
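| # Illustrative shape sketch (editorial): for the coarsest stage of NewCRFDepth with the | |||
| # 'large' backbone, crf3 = NewCRF(input_dim=1536, embed_dim=1024, window_size=7, | |||
| # v_dim=512, num_heads=32) takes x = feats[3] of shape (B, 1536, H/32, W/32) and | |||
| # v = ppm_out of shape (B, 512, H/32, W/32); both are projected to 1024 channels, the | |||
| # output is (B, 1024, H/32, W/32), and PixelShuffle(2) turns it into (B, 256, H/16, W/16). | |||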
| @@ -0,0 +1,272 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os | |||
| import os.path as osp | |||
| import pkgutil | |||
| import warnings | |||
| from collections import OrderedDict | |||
| from importlib import import_module | |||
| import torch | |||
| import torch.nn as nn | |||
| import torchvision | |||
| from torch import distributed as dist | |||
| from torch.nn import functional as F | |||
| from torch.nn.parallel import DataParallel, DistributedDataParallel | |||
| from torch.utils import model_zoo | |||
| TORCH_VERSION = torch.__version__ | |||
| def resize(input, | |||
| size=None, | |||
| scale_factor=None, | |||
| mode='nearest', | |||
| align_corners=None, | |||
| warning=True): | |||
| if warning: | |||
| if size is not None and align_corners: | |||
| input_h, input_w = tuple(int(x) for x in input.shape[2:]) | |||
| output_h, output_w = tuple(int(x) for x in size) | |||
| if output_h > input_h or output_w > input_w: | |||
| if ((output_h > 1 and output_w > 1 and input_h > 1 | |||
| and input_w > 1) and (output_h - 1) % (input_h - 1) | |||
| and (output_w - 1) % (input_w - 1)): | |||
| warnings.warn( | |||
| f'When align_corners={align_corners}, ' | |||
| 'the output would be more aligned if ' | |||
| f'input size {(input_h, input_w)} is `x+1` and ' | |||
| f'out size {(output_h, output_w)} is `nx+1`') | |||
| if isinstance(size, torch.Size): | |||
| size = tuple(int(x) for x in size) | |||
| return F.interpolate(input, size, scale_factor, mode, align_corners) | |||
| def normal_init(module, mean=0, std=1, bias=0): | |||
| if hasattr(module, 'weight') and module.weight is not None: | |||
| nn.init.normal_(module.weight, mean, std) | |||
| if hasattr(module, 'bias') and module.bias is not None: | |||
| nn.init.constant_(module.bias, bias) | |||
| def is_module_wrapper(module): | |||
| module_wrappers = (DataParallel, DistributedDataParallel) | |||
| return isinstance(module, module_wrappers) | |||
| def get_dist_info(): | |||
| if TORCH_VERSION < '1.0': | |||
| initialized = dist._initialized | |||
| else: | |||
| if dist.is_available(): | |||
| initialized = dist.is_initialized() | |||
| else: | |||
| initialized = False | |||
| if initialized: | |||
| rank = dist.get_rank() | |||
| world_size = dist.get_world_size() | |||
| else: | |||
| rank = 0 | |||
| world_size = 1 | |||
| return rank, world_size | |||
| def load_state_dict(module, state_dict, strict=False, logger=None): | |||
| """Load state_dict to a module. | |||
| This method is modified from :meth:`torch.nn.Module.load_state_dict`. | |||
| Default value for ``strict`` is set to ``False`` and the message for | |||
| param mismatch will be shown even if strict is False. | |||
| Args: | |||
| module (Module): Module that receives the state_dict. | |||
| state_dict (OrderedDict): Weights. | |||
| strict (bool): whether to strictly enforce that the keys | |||
| in :attr:`state_dict` match the keys returned by this module's | |||
| :meth:`~torch.nn.Module.state_dict` function. Default: ``False``. | |||
| logger (:obj:`logging.Logger`, optional): Logger to log the error | |||
| message. If not specified, print function will be used. | |||
| """ | |||
| unexpected_keys = [] | |||
| all_missing_keys = [] | |||
| err_msg = [] | |||
| metadata = getattr(state_dict, '_metadata', None) | |||
| state_dict = state_dict.copy() | |||
| if metadata is not None: | |||
| state_dict._metadata = metadata | |||
| # use _load_from_state_dict to enable checkpoint version control | |||
| def load(module, prefix=''): | |||
| # recursively check parallel module in case that the model has a | |||
| # complicated structure, e.g., nn.Module(nn.Module(DDP)) | |||
| if is_module_wrapper(module): | |||
| module = module.module | |||
| local_metadata = {} if metadata is None else metadata.get( | |||
| prefix[:-1], {}) | |||
| module._load_from_state_dict(state_dict, prefix, local_metadata, True, | |||
| all_missing_keys, unexpected_keys, | |||
| err_msg) | |||
| for name, child in module._modules.items(): | |||
| if child is not None: | |||
| load(child, prefix + name + '.') | |||
| load(module) | |||
| load = None # break load->load reference cycle | |||
| # ignore "num_batches_tracked" of BN layers | |||
| missing_keys = [ | |||
| key for key in all_missing_keys if 'num_batches_tracked' not in key | |||
| ] | |||
| if unexpected_keys: | |||
| err_msg.append('unexpected key in source ' | |||
| f'state_dict: {", ".join(unexpected_keys)}\n') | |||
| if missing_keys: | |||
| err_msg.append( | |||
| f'missing keys in source state_dict: {", ".join(missing_keys)}\n') | |||
| rank, _ = get_dist_info() | |||
| if len(err_msg) > 0 and rank == 0: | |||
| err_msg.insert( | |||
| 0, 'The model and loaded state dict do not match exactly\n') | |||
| err_msg = '\n'.join(err_msg) | |||
| if strict: | |||
| raise RuntimeError(err_msg) | |||
| elif logger is not None: | |||
| logger.warning(err_msg) | |||
| else: | |||
| print(err_msg) | |||
| def load_url_dist(url, model_dir=None): | |||
| """In distributed setting, this function only download checkpoint at local | |||
| rank 0.""" | |||
| rank, world_size = get_dist_info() | |||
| rank = int(os.environ.get('LOCAL_RANK', rank)) | |||
| if rank == 0: | |||
| checkpoint = model_zoo.load_url(url, model_dir=model_dir) | |||
| if world_size > 1: | |||
| torch.distributed.barrier() | |||
| if rank > 0: | |||
| checkpoint = model_zoo.load_url(url, model_dir=model_dir) | |||
| return checkpoint | |||
| def get_torchvision_models(): | |||
| model_urls = dict() | |||
| for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__): | |||
| if ispkg: | |||
| continue | |||
| _zoo = import_module(f'torchvision.models.{name}') | |||
| if hasattr(_zoo, 'model_urls'): | |||
| _urls = getattr(_zoo, 'model_urls') | |||
| model_urls.update(_urls) | |||
| return model_urls | |||
| def _load_checkpoint(filename, map_location=None): | |||
| """Load checkpoint from somewhere (modelzoo, file, url). | |||
| Args: | |||
| filename (str): Accept local filepath, URL, ``torchvision://xxx``, | |||
| ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for | |||
| details. | |||
| map_location (str | None): Same as :func:`torch.load`. Default: None. | |||
| Returns: | |||
| dict | OrderedDict: The loaded checkpoint. It can be either an | |||
| OrderedDict storing model weights or a dict containing other | |||
| information, which depends on the checkpoint. | |||
| """ | |||
| if filename.startswith('modelzoo://'): | |||
| warnings.warn('The URL scheme of "modelzoo://" is deprecated, please ' | |||
| 'use "torchvision://" instead') | |||
| model_urls = get_torchvision_models() | |||
| model_name = filename[11:] | |||
| checkpoint = load_url_dist(model_urls[model_name]) | |||
| else: | |||
| if not osp.isfile(filename): | |||
| raise IOError(f'{filename} is not a checkpoint file') | |||
| checkpoint = torch.load(filename, map_location=map_location) | |||
| return checkpoint | |||
| def load_checkpoint(model, | |||
| filename, | |||
| map_location='cpu', | |||
| strict=False, | |||
| logger=None): | |||
| """Load checkpoint from a file or URI. | |||
| Args: | |||
| model (Module): Module to load checkpoint. | |||
| filename (str): Accept local filepath, URL, ``torchvision://xxx``, | |||
| ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for | |||
| details. | |||
| map_location (str): Same as :func:`torch.load`. | |||
| strict (bool): Whether to allow different params for the model and | |||
| checkpoint. | |||
| logger (:mod:`logging.Logger` or None): The logger for error message. | |||
| Returns: | |||
| dict or OrderedDict: The loaded checkpoint. | |||
| """ | |||
| checkpoint = _load_checkpoint(filename, map_location) | |||
| # OrderedDict is a subclass of dict | |||
| if not isinstance(checkpoint, dict): | |||
| raise RuntimeError( | |||
| f'No state_dict found in checkpoint file {filename}') | |||
| # get state_dict from checkpoint | |||
| if 'state_dict' in checkpoint: | |||
| state_dict = checkpoint['state_dict'] | |||
| elif 'model' in checkpoint: | |||
| state_dict = checkpoint['model'] | |||
| else: | |||
| state_dict = checkpoint | |||
| # strip prefix of state_dict | |||
| if list(state_dict.keys())[0].startswith('module.'): | |||
| state_dict = {k[7:]: v for k, v in state_dict.items()} | |||
| # for MoBY, load model of online branch | |||
| if sorted(list(state_dict.keys()))[0].startswith('encoder'): | |||
| state_dict = { | |||
| k.replace('encoder.', ''): v | |||
| for k, v in state_dict.items() if k.startswith('encoder.') | |||
| } | |||
| # reshape absolute position embedding | |||
| if state_dict.get('absolute_pos_embed') is not None: | |||
| absolute_pos_embed = state_dict['absolute_pos_embed'] | |||
| N1, L, C1 = absolute_pos_embed.size() | |||
| N2, C2, H, W = model.absolute_pos_embed.size() | |||
| if N1 != N2 or C1 != C2 or L != H * W: | |||
| logger.warning('Error in loading absolute_pos_embed, pass') | |||
| else: | |||
| state_dict['absolute_pos_embed'] = absolute_pos_embed.view( | |||
| N2, H, W, C2).permute(0, 3, 1, 2) | |||
| # interpolate position bias table if needed | |||
| relative_position_bias_table_keys = [ | |||
| k for k in state_dict.keys() if 'relative_position_bias_table' in k | |||
| ] | |||
| for table_key in relative_position_bias_table_keys: | |||
| table_pretrained = state_dict[table_key] | |||
| table_current = model.state_dict()[table_key] | |||
| L1, nH1 = table_pretrained.size() | |||
| L2, nH2 = table_current.size() | |||
| if nH1 != nH2: | |||
| logger.warning(f'Error in loading {table_key}, pass') | |||
| else: | |||
| if L1 != L2: | |||
| S1 = int(L1**0.5) | |||
| S2 = int(L2**0.5) | |||
| table_pretrained_resized = F.interpolate( | |||
| table_pretrained.permute(1, 0).view(1, nH1, S1, S1), | |||
| size=(S2, S2), | |||
| mode='bicubic') | |||
| state_dict[table_key] = table_pretrained_resized.view( | |||
| nH2, L2).permute(1, 0) | |||
| # load state_dict | |||
| load_state_dict(model, state_dict, strict, logger) | |||
| return checkpoint | |||
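| # Worked example (editorial): a checkpoint pretrained with window size 7 stores a | |||
| # relative_position_bias_table of L1 = (2*7-1)^2 = 169 rows per head, while a model | |||
| # built with window size 12 expects L2 = (2*12-1)^2 = 529. The loop above reshapes the | |||
| # pretrained table to (1, nH, 13, 13), bicubically resizes it to (23, 23), and flattens | |||
| # it back to (529, nH) before load_state_dict is called. | |||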
| @@ -0,0 +1,706 @@ | |||
| # The implementation is adopted from Swin Transformer | |||
| # made publicly available under the MIT License at https://github.com/microsoft/Swin-Transformer | |||
| import numpy as np | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| import torch.utils.checkpoint as checkpoint | |||
| from timm.models.layers import DropPath, to_2tuple, trunc_normal_ | |||
| from .newcrf_utils import load_checkpoint | |||
| class Mlp(nn.Module): | |||
| """ Multilayer perceptron.""" | |||
| def __init__(self, | |||
| in_features, | |||
| hidden_features=None, | |||
| out_features=None, | |||
| act_layer=nn.GELU, | |||
| drop=0.): | |||
| super().__init__() | |||
| out_features = out_features or in_features | |||
| hidden_features = hidden_features or in_features | |||
| self.fc1 = nn.Linear(in_features, hidden_features) | |||
| self.act = act_layer() | |||
| self.fc2 = nn.Linear(hidden_features, out_features) | |||
| self.drop = nn.Dropout(drop) | |||
| def forward(self, x): | |||
| x = self.fc1(x) | |||
| x = self.act(x) | |||
| x = self.drop(x) | |||
| x = self.fc2(x) | |||
| x = self.drop(x) | |||
| return x | |||
| def window_partition(x, window_size): | |||
| """ | |||
| Args: | |||
| x: (B, H, W, C) | |||
| window_size (int): window size | |||
| Returns: | |||
| windows: (num_windows*B, window_size, window_size, C) | |||
| """ | |||
| B, H, W, C = x.shape | |||
| x = x.view(B, H // window_size, window_size, W // window_size, window_size, | |||
| C) | |||
| windows = x.permute(0, 1, 3, 2, 4, | |||
| 5).contiguous().view(-1, window_size, window_size, C) | |||
| return windows | |||
| def window_reverse(windows, window_size, H, W): | |||
| """ | |||
| Args: | |||
| windows: (num_windows*B, window_size, window_size, C) | |||
| window_size (int): Window size | |||
| H (int): Height of image | |||
| W (int): Width of image | |||
| Returns: | |||
| x: (B, H, W, C) | |||
| """ | |||
| B = int(windows.shape[0] / (H * W / window_size / window_size)) | |||
| x = windows.view(B, H // window_size, W // window_size, window_size, | |||
| window_size, -1) | |||
| x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) | |||
| return x | |||
| class WindowAttention(nn.Module): | |||
| """ Window based multi-head self attention (W-MSA) module with relative position bias. | |||
| It supports both shifted and non-shifted windows. | |||
| Args: | |||
| dim (int): Number of input channels. | |||
| window_size (tuple[int]): The height and width of the window. | |||
| num_heads (int): Number of attention heads. | |||
| qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True | |||
| qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set | |||
| attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 | |||
| proj_drop (float, optional): Dropout ratio of output. Default: 0.0 | |||
| """ | |||
| def __init__(self, | |||
| dim, | |||
| window_size, | |||
| num_heads, | |||
| qkv_bias=True, | |||
| qk_scale=None, | |||
| attn_drop=0., | |||
| proj_drop=0.): | |||
| super().__init__() | |||
| self.dim = dim | |||
| self.window_size = window_size # Wh, Ww | |||
| self.num_heads = num_heads | |||
| head_dim = dim // num_heads | |||
| self.scale = qk_scale or head_dim**-0.5 | |||
| # define a parameter table of relative position bias | |||
| self.relative_position_bias_table = nn.Parameter( | |||
| torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), | |||
| num_heads)) # 2*Wh-1 * 2*Ww-1, nH | |||
| # get pair-wise relative position index for each token inside the window | |||
| coords_h = torch.arange(self.window_size[0]) | |||
| coords_w = torch.arange(self.window_size[1]) | |||
| coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww | |||
| coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww | |||
| relative_coords = coords_flatten[:, :, | |||
| None] - coords_flatten[:, | |||
| None, :] # 2, Wh*Ww, Wh*Ww | |||
| relative_coords = relative_coords.permute( | |||
| 1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 | |||
| relative_coords[:, :, | |||
| 0] += self.window_size[0] - 1 # shift to start from 0 | |||
| relative_coords[:, :, 1] += self.window_size[1] - 1 | |||
| relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 | |||
| relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww | |||
| self.register_buffer('relative_position_index', | |||
| relative_position_index) | |||
| self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) | |||
| self.attn_drop = nn.Dropout(attn_drop) | |||
| self.proj = nn.Linear(dim, dim) | |||
| self.proj_drop = nn.Dropout(proj_drop) | |||
| trunc_normal_(self.relative_position_bias_table, std=.02) | |||
| self.softmax = nn.Softmax(dim=-1) | |||
| def forward(self, x, mask=None): | |||
| """ Forward function. | |||
| Args: | |||
| x: input features with shape of (num_windows*B, N, C) | |||
| mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None | |||
| """ | |||
| B_, N, C = x.shape | |||
| qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, | |||
| C // self.num_heads).permute(2, 0, 3, 1, 4) | |||
| q, k, v = qkv[0], qkv[1], qkv[ | |||
| 2] # make torchscript happy (cannot use tensor as tuple) | |||
| q = q * self.scale | |||
| attn = (q @ k.transpose(-2, -1)) | |||
| relative_position_bias = self.relative_position_bias_table[ | |||
| self.relative_position_index.view(-1)].view( | |||
| self.window_size[0] * self.window_size[1], | |||
| self.window_size[0] * self.window_size[1], | |||
| -1) # Wh*Ww,Wh*Ww,nH | |||
| relative_position_bias = relative_position_bias.permute( | |||
| 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww | |||
| attn = attn + relative_position_bias.unsqueeze(0) | |||
| if mask is not None: | |||
| nW = mask.shape[0] | |||
| attn = attn.view(B_ // nW, nW, self.num_heads, N, | |||
| N) + mask.unsqueeze(1).unsqueeze(0) | |||
| attn = attn.view(-1, self.num_heads, N, N) | |||
| attn = self.softmax(attn) | |||
| else: | |||
| attn = self.softmax(attn) | |||
| attn = self.attn_drop(attn) | |||
| x = (attn @ v).transpose(1, 2).reshape(B_, N, C) | |||
| x = self.proj(x) | |||
| x = self.proj_drop(x) | |||
| return x | |||
| class SwinTransformerBlock(nn.Module): | |||
| """ Swin Transformer Block. | |||
| Args: | |||
| dim (int): Number of input channels. | |||
| num_heads (int): Number of attention heads. | |||
| window_size (int): Window size. | |||
| shift_size (int): Shift size for SW-MSA. | |||
| mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. | |||
| qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True | |||
| qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. | |||
| drop (float, optional): Dropout rate. Default: 0.0 | |||
| attn_drop (float, optional): Attention dropout rate. Default: 0.0 | |||
| drop_path (float, optional): Stochastic depth rate. Default: 0.0 | |||
| act_layer (nn.Module, optional): Activation layer. Default: nn.GELU | |||
| norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm | |||
| """ | |||
| def __init__(self, | |||
| dim, | |||
| num_heads, | |||
| window_size=7, | |||
| shift_size=0, | |||
| mlp_ratio=4., | |||
| qkv_bias=True, | |||
| qk_scale=None, | |||
| drop=0., | |||
| attn_drop=0., | |||
| drop_path=0., | |||
| act_layer=nn.GELU, | |||
| norm_layer=nn.LayerNorm): | |||
| super().__init__() | |||
| self.dim = dim | |||
| self.num_heads = num_heads | |||
| self.window_size = window_size | |||
| self.shift_size = shift_size | |||
| self.mlp_ratio = mlp_ratio | |||
| assert 0 <= self.shift_size < self.window_size, 'shift_size must be in [0, window_size)' | |||
| self.norm1 = norm_layer(dim) | |||
| self.attn = WindowAttention( | |||
| dim, | |||
| window_size=to_2tuple(self.window_size), | |||
| num_heads=num_heads, | |||
| qkv_bias=qkv_bias, | |||
| qk_scale=qk_scale, | |||
| attn_drop=attn_drop, | |||
| proj_drop=drop) | |||
| self.drop_path = DropPath( | |||
| drop_path) if drop_path > 0. else nn.Identity() | |||
| self.norm2 = norm_layer(dim) | |||
| mlp_hidden_dim = int(dim * mlp_ratio) | |||
| self.mlp = Mlp( | |||
| in_features=dim, | |||
| hidden_features=mlp_hidden_dim, | |||
| act_layer=act_layer, | |||
| drop=drop) | |||
| self.H = None | |||
| self.W = None | |||
| def forward(self, x, mask_matrix): | |||
| """ Forward function. | |||
| Args: | |||
| x: Input feature, tensor size (B, H*W, C). | |||
| H, W: Spatial resolution of the input feature. | |||
| mask_matrix: Attention mask for cyclic shift. | |||
| """ | |||
| B, L, C = x.shape | |||
| H, W = self.H, self.W | |||
| assert L == H * W, 'input feature has wrong size' | |||
| shortcut = x | |||
| x = self.norm1(x) | |||
| x = x.view(B, H, W, C) | |||
| # pad feature maps to multiples of window size | |||
| pad_l = pad_t = 0 | |||
| pad_r = (self.window_size - W % self.window_size) % self.window_size | |||
| pad_b = (self.window_size - H % self.window_size) % self.window_size | |||
| x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) | |||
| _, Hp, Wp, _ = x.shape | |||
| # cyclic shift | |||
| if self.shift_size > 0: | |||
| shifted_x = torch.roll( | |||
| x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) | |||
| attn_mask = mask_matrix | |||
| else: | |||
| shifted_x = x | |||
| attn_mask = None | |||
| # partition windows | |||
| x_windows = window_partition( | |||
| shifted_x, self.window_size) # nW*B, window_size, window_size, C | |||
| x_windows = x_windows.view(-1, self.window_size * self.window_size, | |||
| C) # nW*B, window_size*window_size, C | |||
| # W-MSA/SW-MSA | |||
| attn_windows = self.attn( | |||
| x_windows, mask=attn_mask) # nW*B, window_size*window_size, C | |||
| # merge windows | |||
| attn_windows = attn_windows.view(-1, self.window_size, | |||
| self.window_size, C) | |||
| shifted_x = window_reverse(attn_windows, self.window_size, Hp, | |||
| Wp) # B H' W' C | |||
| # reverse cyclic shift | |||
| if self.shift_size > 0: | |||
| x = torch.roll( | |||
| shifted_x, | |||
| shifts=(self.shift_size, self.shift_size), | |||
| dims=(1, 2)) | |||
| else: | |||
| x = shifted_x | |||
| if pad_r > 0 or pad_b > 0: | |||
| x = x[:, :H, :W, :].contiguous() | |||
| x = x.view(B, H * W, C) | |||
| # FFN | |||
| x = shortcut + self.drop_path(x) | |||
| x = x + self.drop_path(self.mlp(self.norm2(x))) | |||
| return x | |||
| class PatchMerging(nn.Module): | |||
| """ Patch Merging Layer | |||
| Args: | |||
| dim (int): Number of input channels. | |||
| norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm | |||
| """ | |||
| def __init__(self, dim, norm_layer=nn.LayerNorm): | |||
| super().__init__() | |||
| self.dim = dim | |||
| self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) | |||
| self.norm = norm_layer(4 * dim) | |||
| def forward(self, x, H, W): | |||
| """ Forward function. | |||
| Args: | |||
| x: Input feature, tensor size (B, H*W, C). | |||
| H, W: Spatial resolution of the input feature. | |||
| """ | |||
| B, L, C = x.shape | |||
| assert L == H * W, 'input feature has wrong size' | |||
| x = x.view(B, H, W, C) | |||
| # padding | |||
| pad_input = (H % 2 == 1) or (W % 2 == 1) | |||
| if pad_input: | |||
| x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) | |||
| x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C | |||
| x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C | |||
| x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C | |||
| x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C | |||
| x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C | |||
| x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C | |||
| x = self.norm(x) | |||
| x = self.reduction(x) | |||
| return x | |||
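| # Illustrative shapes (editorial): with H = W = 56 and C = 96, the four strided views | |||
| # are each (B, 28, 28, 96); concatenation gives (B, 28*28, 384) and the linear | |||
| # reduction maps this to (B, 784, 192), halving resolution while doubling channels. | |||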
| class BasicLayer(nn.Module): | |||
| """ A basic Swin Transformer layer for one stage. | |||
| Args: | |||
| dim (int): Number of feature channels | |||
| depth (int): Depths of this stage. | |||
| num_heads (int): Number of attention head. | |||
| window_size (int): Local window size. Default: 7. | |||
| mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. | |||
| qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True | |||
| qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. | |||
| drop (float, optional): Dropout rate. Default: 0.0 | |||
| attn_drop (float, optional): Attention dropout rate. Default: 0.0 | |||
| drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 | |||
| norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm | |||
| downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None | |||
| use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. | |||
| """ | |||
| def __init__(self, | |||
| dim, | |||
| depth, | |||
| num_heads, | |||
| window_size=7, | |||
| mlp_ratio=4., | |||
| qkv_bias=True, | |||
| qk_scale=None, | |||
| drop=0., | |||
| attn_drop=0., | |||
| drop_path=0., | |||
| norm_layer=nn.LayerNorm, | |||
| downsample=None, | |||
| use_checkpoint=False): | |||
| super().__init__() | |||
| self.window_size = window_size | |||
| self.shift_size = window_size // 2 | |||
| self.depth = depth | |||
| self.use_checkpoint = use_checkpoint | |||
| # build blocks | |||
| self.blocks = nn.ModuleList([ | |||
| SwinTransformerBlock( | |||
| dim=dim, | |||
| num_heads=num_heads, | |||
| window_size=window_size, | |||
| shift_size=0 if (i % 2 == 0) else window_size // 2, | |||
| mlp_ratio=mlp_ratio, | |||
| qkv_bias=qkv_bias, | |||
| qk_scale=qk_scale, | |||
| drop=drop, | |||
| attn_drop=attn_drop, | |||
| drop_path=drop_path[i] | |||
| if isinstance(drop_path, list) else drop_path, | |||
| norm_layer=norm_layer) for i in range(depth) | |||
| ]) | |||
| # patch merging layer | |||
| if downsample is not None: | |||
| self.downsample = downsample(dim=dim, norm_layer=norm_layer) | |||
| else: | |||
| self.downsample = None | |||
| def forward(self, x, H, W): | |||
| """ Forward function. | |||
| Args: | |||
| x: Input feature, tensor size (B, H*W, C). | |||
| H, W: Spatial resolution of the input feature. | |||
| """ | |||
| # calculate attention mask for SW-MSA | |||
| Hp = int(np.ceil(H / self.window_size)) * self.window_size | |||
| Wp = int(np.ceil(W / self.window_size)) * self.window_size | |||
| img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 | |||
| h_slices = (slice(0, -self.window_size), | |||
| slice(-self.window_size, | |||
| -self.shift_size), slice(-self.shift_size, None)) | |||
| w_slices = (slice(0, -self.window_size), | |||
| slice(-self.window_size, | |||
| -self.shift_size), slice(-self.shift_size, None)) | |||
| cnt = 0 | |||
| for h in h_slices: | |||
| for w in w_slices: | |||
| img_mask[:, h, w, :] = cnt | |||
| cnt += 1 | |||
| mask_windows = window_partition( | |||
| img_mask, self.window_size) # nW, window_size, window_size, 1 | |||
| mask_windows = mask_windows.view(-1, | |||
| self.window_size * self.window_size) | |||
| attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) | |||
| attn_mask = attn_mask.masked_fill(attn_mask != 0, | |||
| float(-100.0)).masked_fill( | |||
| attn_mask == 0, float(0.0)) | |||
| for blk in self.blocks: | |||
| blk.H, blk.W = H, W | |||
| if self.use_checkpoint: | |||
| x = checkpoint.checkpoint(blk, x, attn_mask) | |||
| else: | |||
| x = blk(x, attn_mask) | |||
| if self.downsample is not None: | |||
| x_down = self.downsample(x, H, W) | |||
| Wh, Ww = (H + 1) // 2, (W + 1) // 2 | |||
| return x, H, W, x_down, Wh, Ww | |||
| else: | |||
| return x, H, W, x, H, W | |||
| class PatchEmbed(nn.Module): | |||
| """ Image to Patch Embedding | |||
| Args: | |||
| patch_size (int): Patch token size. Default: 4. | |||
| in_chans (int): Number of input image channels. Default: 3. | |||
| embed_dim (int): Number of linear projection output channels. Default: 96. | |||
| norm_layer (nn.Module, optional): Normalization layer. Default: None | |||
| """ | |||
| def __init__(self, | |||
| patch_size=4, | |||
| in_chans=3, | |||
| embed_dim=96, | |||
| norm_layer=None): | |||
| super().__init__() | |||
| patch_size = to_2tuple(patch_size) | |||
| self.patch_size = patch_size | |||
| self.in_chans = in_chans | |||
| self.embed_dim = embed_dim | |||
| self.proj = nn.Conv2d( | |||
| in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) | |||
| if norm_layer is not None: | |||
| self.norm = norm_layer(embed_dim) | |||
| else: | |||
| self.norm = None | |||
| def forward(self, x): | |||
| """Forward function.""" | |||
| # padding | |||
| _, _, H, W = x.size() | |||
| if W % self.patch_size[1] != 0: | |||
| x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1])) | |||
| if H % self.patch_size[0] != 0: | |||
| x = F.pad(x, | |||
| (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) | |||
| x = self.proj(x) # B C Wh Ww | |||
| if self.norm is not None: | |||
| Wh, Ww = x.size(2), x.size(3) | |||
| x = x.flatten(2).transpose(1, 2) | |||
| x = self.norm(x) | |||
| x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww) | |||
| return x | |||
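| # Illustrative shapes (editorial): a 224x224 RGB input with patch_size=4 and | |||
| # embed_dim=96 passes through the strided conv to give (B, 96, 56, 56); if a norm | |||
| # layer is configured it is applied on the flattened (B, 56*56, 96) tokens and the | |||
| # result is reshaped back to (B, 96, 56, 56). | |||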
| class SwinTransformer(nn.Module): | |||
| """ Swin Transformer backbone. | |||
| A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - | |||
| https://arxiv.org/pdf/2103.14030 | |||
| Args: | |||
| pretrain_img_size (int): Input image size for training the pretrained model, | |||
| used in absolute position embedding. Default 224. | |||
| patch_size (int | tuple(int)): Patch size. Default: 4. | |||
| in_chans (int): Number of input image channels. Default: 3. | |||
| embed_dim (int): Number of linear projection output channels. Default: 96. | |||
| depths (tuple[int]): Depths of each Swin Transformer stage. | |||
| num_heads (tuple[int]): Number of attention head of each stage. | |||
| window_size (int): Window size. Default: 7. | |||
| mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. | |||
| qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True | |||
| qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. | |||
| drop_rate (float): Dropout rate. | |||
| attn_drop_rate (float): Attention dropout rate. Default: 0. | |||
| drop_path_rate (float): Stochastic depth rate. Default: 0.2. | |||
| norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. | |||
| ape (bool): If True, add absolute position embedding to the patch embedding. Default: False. | |||
| patch_norm (bool): If True, add normalization after patch embedding. Default: True. | |||
| out_indices (Sequence[int]): Output from which stages. | |||
| frozen_stages (int): Stages to be frozen (stop grad and set eval mode). | |||
| -1 means not freezing any parameters. | |||
| use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. | |||
| """ | |||
| def __init__(self, | |||
| pretrain_img_size=224, | |||
| patch_size=4, | |||
| in_chans=3, | |||
| embed_dim=96, | |||
| depths=[2, 2, 6, 2], | |||
| num_heads=[3, 6, 12, 24], | |||
| window_size=7, | |||
| mlp_ratio=4., | |||
| qkv_bias=True, | |||
| qk_scale=None, | |||
| drop_rate=0., | |||
| attn_drop_rate=0., | |||
| drop_path_rate=0.2, | |||
| norm_layer=nn.LayerNorm, | |||
| ape=False, | |||
| patch_norm=True, | |||
| out_indices=(0, 1, 2, 3), | |||
| frozen_stages=-1, | |||
| use_checkpoint=False): | |||
| super().__init__() | |||
| self.pretrain_img_size = pretrain_img_size | |||
| self.num_layers = len(depths) | |||
| self.embed_dim = embed_dim | |||
| self.ape = ape | |||
| self.patch_norm = patch_norm | |||
| self.out_indices = out_indices | |||
| self.frozen_stages = frozen_stages | |||
| # split image into non-overlapping patches | |||
| self.patch_embed = PatchEmbed( | |||
| patch_size=patch_size, | |||
| in_chans=in_chans, | |||
| embed_dim=embed_dim, | |||
| norm_layer=norm_layer if self.patch_norm else None) | |||
| # absolute position embedding | |||
| if self.ape: | |||
| pretrain_img_size = to_2tuple(pretrain_img_size) | |||
| patch_size = to_2tuple(patch_size) | |||
| patches_resolution = [ | |||
| pretrain_img_size[0] // patch_size[0], | |||
| pretrain_img_size[1] // patch_size[1] | |||
| ] | |||
| self.absolute_pos_embed = nn.Parameter( | |||
| torch.zeros(1, embed_dim, patches_resolution[0], | |||
| patches_resolution[1])) | |||
| trunc_normal_(self.absolute_pos_embed, std=.02) | |||
| self.pos_drop = nn.Dropout(p=drop_rate) | |||
| # stochastic depth | |||
| dpr = [ | |||
| x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) | |||
| ] # stochastic depth decay rule | |||
| # build layers | |||
| self.layers = nn.ModuleList() | |||
| for i_layer in range(self.num_layers): | |||
| layer = BasicLayer( | |||
| dim=int(embed_dim * 2**i_layer), | |||
| depth=depths[i_layer], | |||
| num_heads=num_heads[i_layer], | |||
| window_size=window_size, | |||
| mlp_ratio=mlp_ratio, | |||
| qkv_bias=qkv_bias, | |||
| qk_scale=qk_scale, | |||
| drop=drop_rate, | |||
| attn_drop=attn_drop_rate, | |||
| drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], | |||
| norm_layer=norm_layer, | |||
| downsample=PatchMerging if | |||
| (i_layer < self.num_layers - 1) else None, | |||
| use_checkpoint=use_checkpoint) | |||
| self.layers.append(layer) | |||
| num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)] | |||
| self.num_features = num_features | |||
| # add a norm layer for each output | |||
| for i_layer in out_indices: | |||
| layer = norm_layer(num_features[i_layer]) | |||
| layer_name = f'norm{i_layer}' | |||
| self.add_module(layer_name, layer) | |||
| self._freeze_stages() | |||
| def _freeze_stages(self): | |||
| if self.frozen_stages >= 0: | |||
| self.patch_embed.eval() | |||
| for param in self.patch_embed.parameters(): | |||
| param.requires_grad = False | |||
| if self.frozen_stages >= 1 and self.ape: | |||
| self.absolute_pos_embed.requires_grad = False | |||
| if self.frozen_stages >= 2: | |||
| self.pos_drop.eval() | |||
| for i in range(0, self.frozen_stages - 1): | |||
| m = self.layers[i] | |||
| m.eval() | |||
| for param in m.parameters(): | |||
| param.requires_grad = False | |||
| def init_weights(self, pretrained=None): | |||
| """Initialize the weights in backbone. | |||
| Args: | |||
| pretrained (str, optional): Path to pre-trained weights. | |||
| Defaults to None. | |||
| """ | |||
| def _init_weights(m): | |||
| if isinstance(m, nn.Linear): | |||
| trunc_normal_(m.weight, std=.02) | |||
| if isinstance(m, nn.Linear) and m.bias is not None: | |||
| nn.init.constant_(m.bias, 0) | |||
| elif isinstance(m, nn.LayerNorm): | |||
| nn.init.constant_(m.bias, 0) | |||
| nn.init.constant_(m.weight, 1.0) | |||
| if isinstance(pretrained, str): | |||
| self.apply(_init_weights) | |||
| # logger = get_root_logger() | |||
| load_checkpoint(self, pretrained, strict=False) | |||
| elif pretrained is None: | |||
| self.apply(_init_weights) | |||
| else: | |||
| raise TypeError('pretrained must be a str or None') | |||
| def forward(self, x): | |||
| """Forward function.""" | |||
| x = self.patch_embed(x) | |||
| Wh, Ww = x.size(2), x.size(3) | |||
| if self.ape: | |||
| # interpolate the position embedding to the corresponding size | |||
| absolute_pos_embed = F.interpolate( | |||
| self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic') | |||
| x = (x + absolute_pos_embed).flatten(2).transpose(1, | |||
| 2) # B Wh*Ww C | |||
| else: | |||
| x = x.flatten(2).transpose(1, 2) | |||
| x = self.pos_drop(x) | |||
| outs = [] | |||
| for i in range(self.num_layers): | |||
| layer = self.layers[i] | |||
| x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) | |||
| if i in self.out_indices: | |||
| norm_layer = getattr(self, f'norm{i}') | |||
| x_out = norm_layer(x_out) | |||
| out = x_out.view(-1, H, W, | |||
| self.num_features[i]).permute(0, 3, 1, | |||
| 2).contiguous() | |||
| outs.append(out) | |||
| return tuple(outs) | |||
| def train(self, mode=True): | |||
| """Convert the model into training mode while keep layers freezed.""" | |||
| super(SwinTransformer, self).train(mode) | |||
| self._freeze_stages() | |||
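A hedged usage sketch under the default arguments above: the backbone returns one feature map per entry in out_indices, each with width embed_dim * 2**i and stride 4/8/16/32.

import torch

backbone = SwinTransformer()   # defaults: embed_dim=96, depths=[2, 2, 6, 2], out_indices=(0, 1, 2, 3)
backbone.init_weights()        # no checkpoint path -> truncated-normal / LayerNorm init only
feats = backbone(torch.randn(1, 3, 224, 224))
for f in feats:
    print(f.shape)             # (1, 96, 56, 56), (1, 192, 28, 28), (1, 384, 14, 14), (1, 768, 7, 7)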
| @@ -0,0 +1,365 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from mmcv.cnn import ConvModule | |||
| from .newcrf_utils import normal_init, resize | |||
| class PPM(nn.ModuleList): | |||
| """Pooling Pyramid Module used in PSPNet. | |||
| Args: | |||
| pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid | |||
| Module. | |||
| in_channels (int): Input channels. | |||
| channels (int): Channels after modules, before conv_seg. | |||
| conv_cfg (dict|None): Config of conv layers. | |||
| norm_cfg (dict|None): Config of norm layers. | |||
| act_cfg (dict): Config of activation layers. | |||
| align_corners (bool): align_corners argument of F.interpolate. | |||
| """ | |||
| def __init__(self, pool_scales, in_channels, channels, conv_cfg, norm_cfg, | |||
| act_cfg, align_corners): | |||
| super(PPM, self).__init__() | |||
| self.pool_scales = pool_scales | |||
| self.align_corners = align_corners | |||
| self.in_channels = in_channels | |||
| self.channels = channels | |||
| self.conv_cfg = conv_cfg | |||
| self.norm_cfg = norm_cfg | |||
| self.act_cfg = act_cfg | |||
| for pool_scale in pool_scales: | |||
| # BN cannot normalize the 1x1 pooled branch when batch size is 1, so switch to GN | |||
| if pool_scale == 1: | |||
| norm_cfg = dict(type='GN', requires_grad=True, num_groups=256) | |||
| self.append( | |||
| nn.Sequential( | |||
| nn.AdaptiveAvgPool2d(pool_scale), | |||
| ConvModule( | |||
| self.in_channels, | |||
| self.channels, | |||
| 1, | |||
| conv_cfg=self.conv_cfg, | |||
| norm_cfg=norm_cfg, | |||
| act_cfg=self.act_cfg))) | |||
| def forward(self, x): | |||
| """Forward function.""" | |||
| ppm_outs = [] | |||
| for ppm in self: | |||
| ppm_out = ppm(x) | |||
| upsampled_ppm_out = resize( | |||
| ppm_out, | |||
| size=x.size()[2:], | |||
| mode='bilinear', | |||
| align_corners=self.align_corners) | |||
| ppm_outs.append(upsampled_ppm_out) | |||
| return ppm_outs | |||
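A small sketch of the module above; the BN/ReLU configs are illustrative assumptions, not values mandated by this file. Each pooled branch is projected to `channels` and upsampled back to the input resolution, so the outputs can be concatenated with the input feature, as PSP.psp_forward does further down.

import torch

ppm = PPM(pool_scales=(1, 2, 3, 6), in_channels=768, channels=512,
          conv_cfg=None, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'),
          align_corners=False)
x = torch.randn(2, 768, 15, 20)
outs = ppm(x)                       # four tensors, each (2, 512, 15, 20)
fused = torch.cat([x] + outs, 1)    # (2, 768 + 4 * 512, 15, 20)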
| class BaseDecodeHead(nn.Module): | |||
| """Base class for BaseDecodeHead. | |||
| Args: | |||
| in_channels (int|Sequence[int]): Input channels. | |||
| channels (int): Channels after modules, before conv_seg. | |||
| num_classes (int): Number of classes. | |||
| dropout_ratio (float): Ratio of dropout layer. Default: 0.1. | |||
| conv_cfg (dict|None): Config of conv layers. Default: None. | |||
| norm_cfg (dict|None): Config of norm layers. Default: None. | |||
| act_cfg (dict): Config of activation layers. | |||
| Default: dict(type='ReLU') | |||
| in_index (int|Sequence[int]): Input feature index. Default: -1 | |||
| input_transform (str|None): Transformation type of input features. | |||
| Options: 'resize_concat', 'multiple_select', None. | |||
| 'resize_concat': Multiple feature maps will be resized to the | |||
| same size as the first one and then concatenated together. | |||
| Usually used in the FCN head of HRNet. | |||
| 'multiple_select': Multiple feature maps will be bundled into | |||
| a list and passed into the decode head. | |||
| None: Only one selected feature map is allowed. | |||
| Default: None. | |||
| loss_decode (dict): Config of decode loss. | |||
| Default: dict(type='CrossEntropyLoss'). | |||
| ignore_index (int | None): The label index to be ignored. When using | |||
| masked BCE loss, ignore_index should be set to None. Default: 255 | |||
| sampler (dict|None): The config of segmentation map sampler. | |||
| Default: None. | |||
| align_corners (bool): align_corners argument of F.interpolate. | |||
| Default: False. | |||
| """ | |||
| def __init__(self, | |||
| in_channels, | |||
| channels, | |||
| *, | |||
| num_classes, | |||
| dropout_ratio=0.1, | |||
| conv_cfg=None, | |||
| norm_cfg=None, | |||
| act_cfg=dict(type='ReLU'), | |||
| in_index=-1, | |||
| input_transform=None, | |||
| loss_decode=dict( | |||
| type='CrossEntropyLoss', | |||
| use_sigmoid=False, | |||
| loss_weight=1.0), | |||
| ignore_index=255, | |||
| sampler=None, | |||
| align_corners=False): | |||
| super(BaseDecodeHead, self).__init__() | |||
| self._init_inputs(in_channels, in_index, input_transform) | |||
| self.channels = channels | |||
| self.num_classes = num_classes | |||
| self.dropout_ratio = dropout_ratio | |||
| self.conv_cfg = conv_cfg | |||
| self.norm_cfg = norm_cfg | |||
| self.act_cfg = act_cfg | |||
| self.in_index = in_index | |||
| # self.loss_decode = build_loss(loss_decode) | |||
| self.ignore_index = ignore_index | |||
| self.align_corners = align_corners | |||
| # if sampler is not None: | |||
| # self.sampler = build_pixel_sampler(sampler, context=self) | |||
| # else: | |||
| # self.sampler = None | |||
| # self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1) | |||
| # self.conv1 = nn.Conv2d(channels, num_classes, 3, padding=1) | |||
| if dropout_ratio > 0: | |||
| self.dropout = nn.Dropout2d(dropout_ratio) | |||
| else: | |||
| self.dropout = None | |||
| self.fp16_enabled = False | |||
| def extra_repr(self): | |||
| """Extra repr.""" | |||
| s = f'input_transform={self.input_transform}, ' \ | |||
| f'ignore_index={self.ignore_index}, ' \ | |||
| f'align_corners={self.align_corners}' | |||
| return s | |||
| def _init_inputs(self, in_channels, in_index, input_transform): | |||
| """Check and initialize input transforms. | |||
| The in_channels, in_index and input_transform must match. | |||
| Specifically, when input_transform is None, only a single feature map | |||
| will be selected, so in_channels and in_index must be of type int. | |||
| When input_transform is not None, in_channels and in_index must be | |||
| sequences of the same length. | |||
| Args: | |||
| in_channels (int|Sequence[int]): Input channels. | |||
| in_index (int|Sequence[int]): Input feature index. | |||
| input_transform (str|None): Transformation type of input features. | |||
| Options: 'resize_concat', 'multiple_select', None. | |||
| 'resize_concat': Multiple feature maps will be resized to the | |||
| same size as the first one and then concatenated together. | |||
| Usually used in the FCN head of HRNet. | |||
| 'multiple_select': Multiple feature maps will be bundled into | |||
| a list and passed into the decode head. | |||
| None: Only one selected feature map is allowed. | |||
| """ | |||
| if input_transform is not None: | |||
| assert input_transform in ['resize_concat', 'multiple_select'] | |||
| self.input_transform = input_transform | |||
| self.in_index = in_index | |||
| if input_transform is not None: | |||
| assert isinstance(in_channels, (list, tuple)) | |||
| assert isinstance(in_index, (list, tuple)) | |||
| assert len(in_channels) == len(in_index) | |||
| if input_transform == 'resize_concat': | |||
| self.in_channels = sum(in_channels) | |||
| else: | |||
| self.in_channels = in_channels | |||
| else: | |||
| assert isinstance(in_channels, int) | |||
| assert isinstance(in_index, int) | |||
| self.in_channels = in_channels | |||
| def init_weights(self): | |||
| """Initialize weights of classification layer.""" | |||
| # normal_init(self.conv_seg, mean=0, std=0.01) | |||
| # normal_init(self.conv1, mean=0, std=0.01) | |||
| def _transform_inputs(self, inputs): | |||
| """Transform inputs for decoder. | |||
| Args: | |||
| inputs (list[Tensor]): List of multi-level img features. | |||
| Returns: | |||
| Tensor: The transformed inputs | |||
| """ | |||
| if self.input_transform == 'resize_concat': | |||
| inputs = [inputs[i] for i in self.in_index] | |||
| upsampled_inputs = [ | |||
| resize( | |||
| input=x, | |||
| size=inputs[0].shape[2:], | |||
| mode='bilinear', | |||
| align_corners=self.align_corners) for x in inputs | |||
| ] | |||
| inputs = torch.cat(upsampled_inputs, dim=1) | |||
| elif self.input_transform == 'multiple_select': | |||
| inputs = [inputs[i] for i in self.in_index] | |||
| else: | |||
| inputs = inputs[self.in_index] | |||
| return inputs | |||
| def forward(self, inputs): | |||
| """Placeholder of forward function.""" | |||
| pass | |||
| def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg): | |||
| """Forward function for training. | |||
| Args: | |||
| inputs (list[Tensor]): List of multi-level img features. | |||
| img_metas (list[dict]): List of image info dict where each dict | |||
| has: 'img_shape', 'scale_factor', 'flip', and may also contain | |||
| 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. | |||
| For details on the values of these keys see | |||
| `mmseg/datasets/pipelines/formatting.py:Collect`. | |||
| gt_semantic_seg (Tensor): Semantic segmentation masks | |||
| used if the architecture supports the semantic segmentation task. | |||
| train_cfg (dict): The training config. | |||
| Returns: | |||
| dict[str, Tensor]: a dictionary of loss components | |||
| """ | |||
| seg_logits = self.forward(inputs) | |||
| losses = self.losses(seg_logits, gt_semantic_seg) | |||
| return losses | |||
| def forward_test(self, inputs, img_metas, test_cfg): | |||
| """Forward function for testing. | |||
| Args: | |||
| inputs (list[Tensor]): List of multi-level img features. | |||
| img_metas (list[dict]): List of image info dict where each dict | |||
| has: 'img_shape', 'scale_factor', 'flip', and may also contain | |||
| 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. | |||
| For details on the values of these keys see | |||
| `mmseg/datasets/pipelines/formatting.py:Collect`. | |||
| test_cfg (dict): The testing config. | |||
| Returns: | |||
| Tensor: Output segmentation map. | |||
| """ | |||
| return self.forward(inputs) | |||
| class UPerHead(BaseDecodeHead): | |||
| def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs): | |||
| super(UPerHead, self).__init__( | |||
| input_transform='multiple_select', **kwargs) | |||
| # FPN Module | |||
| self.lateral_convs = nn.ModuleList() | |||
| self.fpn_convs = nn.ModuleList() | |||
| for in_channels in self.in_channels:  # build lateral/fpn convs for every input level | |||
| l_conv = ConvModule( | |||
| in_channels, | |||
| self.channels, | |||
| 1, | |||
| conv_cfg=self.conv_cfg, | |||
| norm_cfg=self.norm_cfg, | |||
| act_cfg=self.act_cfg, | |||
| inplace=True) | |||
| fpn_conv = ConvModule( | |||
| self.channels, | |||
| self.channels, | |||
| 3, | |||
| padding=1, | |||
| conv_cfg=self.conv_cfg, | |||
| norm_cfg=self.norm_cfg, | |||
| act_cfg=self.act_cfg, | |||
| inplace=True) | |||
| self.lateral_convs.append(l_conv) | |||
| self.fpn_convs.append(fpn_conv) | |||
| def forward(self, inputs): | |||
| """Forward function.""" | |||
| inputs = self._transform_inputs(inputs) | |||
| # build laterals | |||
| laterals = [ | |||
| lateral_conv(inputs[i]) | |||
| for i, lateral_conv in enumerate(self.lateral_convs) | |||
| ] | |||
| # laterals.append(self.psp_forward(inputs)) | |||
| # build top-down path | |||
| used_backbone_levels = len(laterals) | |||
| for i in range(used_backbone_levels - 1, 0, -1): | |||
| prev_shape = laterals[i - 1].shape[2:] | |||
| laterals[i - 1] += resize( | |||
| laterals[i], | |||
| size=prev_shape, | |||
| mode='bilinear', | |||
| align_corners=self.align_corners) | |||
| # build outputs | |||
| fpn_outs = [ | |||
| self.fpn_convs[i](laterals[i]) | |||
| for i in range(used_backbone_levels - 1) | |||
| ] | |||
| # append the top-level lateral feature (the PSP branch is removed here) | |||
| fpn_outs.append(laterals[-1]) | |||
| return fpn_outs[0] | |||
| class PSP(BaseDecodeHead): | |||
| """Unified Perceptual Parsing for Scene Understanding. | |||
| This head is the implementation of `UPerNet | |||
| <https://arxiv.org/abs/1807.10221>`_. | |||
| Args: | |||
| pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid | |||
| Module applied on the last feature. Default: (1, 2, 3, 6). | |||
| """ | |||
| def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs): | |||
| super(PSP, self).__init__(input_transform='multiple_select', **kwargs) | |||
| # PSP Module | |||
| self.psp_modules = PPM( | |||
| pool_scales, | |||
| self.in_channels[-1], | |||
| self.channels, | |||
| conv_cfg=self.conv_cfg, | |||
| norm_cfg=self.norm_cfg, | |||
| act_cfg=self.act_cfg, | |||
| align_corners=self.align_corners) | |||
| self.bottleneck = ConvModule( | |||
| self.in_channels[-1] + len(pool_scales) * self.channels, | |||
| self.channels, | |||
| 3, | |||
| padding=1, | |||
| conv_cfg=self.conv_cfg, | |||
| norm_cfg=self.norm_cfg, | |||
| act_cfg=self.act_cfg) | |||
| def psp_forward(self, inputs): | |||
| """Forward function of PSP module.""" | |||
| x = inputs[-1] | |||
| psp_outs = [x] | |||
| psp_outs.extend(self.psp_modules(x)) | |||
| psp_outs = torch.cat(psp_outs, dim=1) | |||
| output = self.bottleneck(psp_outs) | |||
| return output | |||
| def forward(self, inputs): | |||
| """Forward function.""" | |||
| inputs = self._transform_inputs(inputs) | |||
| return self.psp_forward(inputs) | |||
| @@ -0,0 +1,53 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os.path as osp | |||
| import numpy as np | |||
| import torch | |||
| from modelscope.metainfo import Models | |||
| from modelscope.models.base.base_torch_model import TorchModel | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.models.cv.image_depth_estimation.networks.newcrf_depth import \ | |||
| NewCRFDepth | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| @MODELS.register_module( | |||
| Tasks.image_depth_estimation, module_name=Models.newcrfs_depth_estimation) | |||
| class DepthEstimation(TorchModel): | |||
| def __init__(self, model_dir: str, **kwargs): | |||
| """str -- model file root.""" | |||
| super().__init__(model_dir, **kwargs) | |||
| # build model | |||
| self.model = NewCRFDepth( | |||
| version='large07', inv_depth=False, max_depth=10) | |||
| # load model | |||
| model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) | |||
| checkpoint = torch.load(model_path) | |||
| state_dict = {} | |||
| for k in checkpoint['model'].keys(): | |||
| if k.startswith('module.'): | |||
| state_dict[k[7:]] = checkpoint['model'][k] | |||
| else: | |||
| state_dict[k] = checkpoint['model'][k] | |||
| self.model.load_state_dict(state_dict) | |||
| self.model.eval() | |||
| def forward(self, inputs): | |||
| return self.model(inputs['imgs']) | |||
| def postprocess(self, inputs): | |||
| depth_result = inputs | |||
| results = {OutputKeys.DEPTHS: depth_result} | |||
| return results | |||
| def inference(self, data): | |||
| results = self.forward(data) | |||
| return results | |||
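A hedged sketch of driving the registered model directly; the model id is the pipeline default added later in this change set, and the input follows the {'imgs': ...} contract of forward above.

import torch
from modelscope.models.base import Model

model = Model.from_pretrained('damo/cv_newcrfs_image-depth-estimation_indoor')
raw = model.inference({'imgs': torch.randn(1, 3, 480, 640)})   # a normalized RGB batch in practice
result = model.postprocess(raw)                                # {'depths': predicted depth map}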
| @@ -509,8 +509,8 @@ def convert_weights(model: nn.Module): | |||
| @MODELS.register_module(Tasks.multi_modal_embedding, module_name=Models.clip) | |||
| class CLIPForMultiModalEmbedding(TorchModel): | |||
| def __init__(self, model_dir, device_id=-1): | |||
| super().__init__(model_dir=model_dir, device_id=device_id) | |||
| def __init__(self, model_dir, *args, **kwargs): | |||
| super().__init__(model_dir=model_dir, *args, **kwargs) | |||
| # Initialize the model. | |||
| vision_model_config_file = '{}/vision_model_config.json'.format( | |||
| @@ -9,7 +9,6 @@ import numpy as np | |||
| import torch | |||
| import torch.nn.functional as F | |||
| from modelscope.hub.snapshot_download import snapshot_download | |||
| from modelscope.metainfo import Models | |||
| from modelscope.models.base import Tensor, TorchModel | |||
| from modelscope.models.builder import MODELS | |||
| @@ -730,7 +730,7 @@ def make_msa_feat_v2(batch): | |||
| batch['cluster_profile'], | |||
| deletion_mean_value, | |||
| ] | |||
| batch['msa_feat'] = torch.concat(msa_feat, dim=-1) | |||
| batch['msa_feat'] = torch.cat(msa_feat, dim=-1) | |||
| return batch | |||
| @@ -1320,7 +1320,7 @@ def get_contiguous_crop_idx( | |||
| asym_offset + this_start + csz)) | |||
| asym_offset += ll | |||
| return torch.concat(crop_idxs) | |||
| return torch.cat(crop_idxs) | |||
| def get_spatial_crop_idx( | |||
| @@ -217,7 +217,7 @@ class MSAAttention(nn.Module): | |||
| if mask is not None else None) | |||
| outputs.append( | |||
| self.mha(q=cur_m, k=cur_m, v=cur_m, mask=cur_mask, bias=bias)) | |||
| return torch.concat(outputs, dim=-3) | |||
| return torch.cat(outputs, dim=-3) | |||
| def _attn_forward(self, m, mask, bias: Optional[torch.Tensor] = None): | |||
| m = self.layer_norm_m(m) | |||
| @@ -19,6 +19,7 @@ class OutputKeys(object): | |||
| BOXES = 'boxes' | |||
| KEYPOINTS = 'keypoints' | |||
| MASKS = 'masks' | |||
| DEPTHS = 'depths' | |||
| TEXT = 'text' | |||
| POLYGONS = 'polygons' | |||
| OUTPUT = 'output' | |||
| @@ -16,7 +16,7 @@ from modelscope.outputs import TASK_OUTPUTS | |||
| from modelscope.pipeline_inputs import TASK_INPUTS, check_input_type | |||
| from modelscope.preprocessors import Preprocessor | |||
| from modelscope.utils.config import Config | |||
| from modelscope.utils.constant import Frameworks, ModelFile | |||
| from modelscope.utils.constant import Frameworks, Invoke, ModelFile | |||
| from modelscope.utils.device import (create_device, device_placement, | |||
| verify_device) | |||
| from modelscope.utils.hub import read_config, snapshot_download | |||
| @@ -47,8 +47,10 @@ class Pipeline(ABC): | |||
| logger.info(f'initiate model from location {model}.') | |||
| # expecting model has been prefetched to local cache beforehand | |||
| return Model.from_pretrained( | |||
| model, model_prefetched=True, | |||
| device=self.device_name) if is_model(model) else model | |||
| model, | |||
| device=self.device_name, | |||
| model_prefetched=True, | |||
| invoked_by=Invoke.PIPELINE) if is_model(model) else model | |||
| else: | |||
| return model | |||
| @@ -231,7 +233,7 @@ class Pipeline(ABC): | |||
| batch_data[k] = value_list | |||
| for k in batch_data.keys(): | |||
| if isinstance(batch_data[k][0], torch.Tensor): | |||
| batch_data[k] = torch.concat(batch_data[k]) | |||
| batch_data[k] = torch.cat(batch_data[k]) | |||
| return batch_data | |||
| def _process_batch(self, input: List[Input], batch_size, | |||
| @@ -383,15 +385,12 @@ class DistributedPipeline(Pipeline): | |||
| preprocessor: Union[Preprocessor, List[Preprocessor]] = None, | |||
| auto_collate=True, | |||
| **kwargs): | |||
| self.preprocessor = preprocessor | |||
| super().__init__(model=model, preprocessor=preprocessor, kwargs=kwargs) | |||
| self._model_prepare = False | |||
| self._model_prepare_lock = Lock() | |||
| self._auto_collate = auto_collate | |||
| if os.path.exists(model): | |||
| self.model_dir = model | |||
| else: | |||
| self.model_dir = snapshot_download(model) | |||
| self.model_dir = self.model.model_dir | |||
| self.cfg = read_config(self.model_dir) | |||
| self.world_size = self.cfg.model.world_size | |||
| self.model_pool = None | |||
| @@ -7,7 +7,7 @@ from modelscope.hub.snapshot_download import snapshot_download | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.models.base import Model | |||
| from modelscope.utils.config import ConfigDict, check_config | |||
| from modelscope.utils.constant import DEFAULT_MODEL_REVISION, Tasks | |||
| from modelscope.utils.constant import DEFAULT_MODEL_REVISION, Invoke, Tasks | |||
| from modelscope.utils.hub import read_config | |||
| from modelscope.utils.registry import Registry, build_from_cfg | |||
| from .base import Pipeline | |||
| @@ -147,6 +147,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { | |||
| Tasks.image_segmentation: | |||
| (Pipelines.image_instance_segmentation, | |||
| 'damo/cv_swin-b_image-instance-segmentation_coco'), | |||
| Tasks.image_depth_estimation: | |||
| (Pipelines.image_depth_estimation, | |||
| 'damo/cv_newcrfs_image-depth-estimation_indoor'), | |||
| Tasks.image_style_transfer: (Pipelines.image_style_transfer, | |||
| 'damo/cv_aams_style-transfer_damo'), | |||
| Tasks.face_image_generation: (Pipelines.face_image_generation, | |||
| @@ -209,6 +212,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { | |||
| Tasks.referring_video_object_segmentation: | |||
| (Pipelines.referring_video_object_segmentation, | |||
| 'damo/cv_swin-t_referring_video-object-segmentation'), | |||
| Tasks.video_summarization: (Pipelines.video_summarization, | |||
| 'damo/cv_googlenet_pgl-video-summarization'), | |||
| } | |||
| @@ -220,14 +225,19 @@ def normalize_model_input(model, model_revision): | |||
| # skip revision download if model is a local directory | |||
| if not os.path.exists(model): | |||
| # note that if there is already a local copy, snapshot_download will check and skip downloading | |||
| model = snapshot_download(model, revision=model_revision) | |||
| model = snapshot_download( | |||
| model, | |||
| revision=model_revision, | |||
| user_agent={Invoke.KEY: Invoke.PIPELINE}) | |||
| elif isinstance(model, list) and isinstance(model[0], str): | |||
| for idx in range(len(model)): | |||
| if is_official_hub_path( | |||
| model[idx], | |||
| model_revision) and not os.path.exists(model[idx]): | |||
| model[idx] = snapshot_download( | |||
| model[idx], revision=model_revision) | |||
| model[idx], | |||
| revision=model_revision, | |||
| user_agent={Invoke.KEY: Invoke.PIPELINE}) | |||
| return model | |||
| @@ -8,14 +8,13 @@ import torch | |||
| from PIL import Image | |||
| from torchvision import transforms | |||
| from modelscope.hub.snapshot_download import snapshot_download | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.models.cv.animal_recognition import Bottleneck, ResNet | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines.base import Input, Pipeline | |||
| from modelscope.pipelines.builder import PIPELINES | |||
| from modelscope.preprocessors import LoadImage | |||
| from modelscope.utils.constant import Tasks | |||
| from modelscope.utils.constant import Devices, ModelFile, Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| logger = get_logger() | |||
| @@ -67,15 +66,10 @@ class AnimalRecognitionPipeline(Pipeline): | |||
| filter_param(src_params, own_state) | |||
| model.load_state_dict(own_state) | |||
| self.model = resnest101(num_classes=8288) | |||
| local_model_dir = model | |||
| if osp.exists(model): | |||
| local_model_dir = model | |||
| else: | |||
| local_model_dir = snapshot_download(model) | |||
| self.local_path = local_model_dir | |||
| self.local_path = self.model | |||
| src_params = torch.load( | |||
| osp.join(local_model_dir, 'pytorch_model.pt'), 'cpu') | |||
| osp.join(self.local_path, ModelFile.TORCH_MODEL_FILE), Devices.cpu) | |||
| self.model = resnest101(num_classes=8288) | |||
| load_pretrained(self.model, src_params) | |||
| logger.info('load model done') | |||
| @@ -120,8 +120,7 @@ class Body3DKeypointsPipeline(Pipeline): | |||
| """ | |||
| super().__init__(model=model, **kwargs) | |||
| self.keypoint_model_3d = model if isinstance( | |||
| model, BodyKeypointsDetection3D) else Model.from_pretrained(model) | |||
| self.keypoint_model_3d = self.model | |||
| self.keypoint_model_3d.eval() | |||
| # init human body 2D keypoints detection pipeline | |||
| @@ -11,7 +11,7 @@ from PIL import ImageFile | |||
| from modelscope.hub.snapshot_download import snapshot_download | |||
| from modelscope.pipelines.util import is_official_hub_path | |||
| from modelscope.utils.config import Config | |||
| from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile | |||
| from modelscope.utils.constant import DEFAULT_MODEL_REVISION, Invoke, ModelFile | |||
| from modelscope.utils.device import create_device | |||
| @@ -37,7 +37,9 @@ class EasyCVPipeline(object): | |||
| assert is_official_hub_path( | |||
| model), 'Only support local model path and official hub path!' | |||
| model_dir = snapshot_download( | |||
| model_id=model, revision=DEFAULT_MODEL_REVISION) | |||
| model_id=model, | |||
| revision=DEFAULT_MODEL_REVISION, | |||
| user_agent={Invoke.KEY: Invoke.PIPELINE}) | |||
| assert osp.isdir(model_dir) | |||
| model_files = glob.glob( | |||
| @@ -48,6 +50,7 @@ class EasyCVPipeline(object): | |||
| model_path = model_files[0] | |||
| self.model_path = model_path | |||
| self.model_dir = model_dir | |||
| # get configuration file from source model dir | |||
| self.config_file = os.path.join(model_dir, ModelFile.CONFIGURATION) | |||
| @@ -24,7 +24,6 @@ class HumanWholebodyKeypointsPipeline(EasyCVPipeline): | |||
| model (str): model id on modelscope hub or local model path. | |||
| model_file_pattern (str): model file pattern. | |||
| """ | |||
| self.model_dir = model | |||
| super(HumanWholebodyKeypointsPipeline, self).__init__( | |||
| model=model, | |||
| model_file_pattern=model_file_pattern, | |||
| @@ -8,7 +8,6 @@ import torch | |||
| from PIL import Image | |||
| from torchvision import transforms | |||
| from modelscope.hub.snapshot_download import snapshot_download | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.models.cv.animal_recognition import resnet | |||
| from modelscope.outputs import OutputKeys | |||
| @@ -67,16 +66,12 @@ class GeneralRecognitionPipeline(Pipeline): | |||
| filter_param(src_params, own_state) | |||
| model.load_state_dict(own_state) | |||
| self.model = resnest101(num_classes=54092) | |||
| local_model_dir = model | |||
| device = 'cpu' | |||
| if osp.exists(model): | |||
| local_model_dir = model | |||
| else: | |||
| local_model_dir = snapshot_download(model) | |||
| self.local_path = local_model_dir | |||
| self.local_path = self.model | |||
| src_params = torch.load( | |||
| osp.join(local_model_dir, ModelFile.TORCH_MODEL_FILE), device) | |||
| osp.join(self.local_path, ModelFile.TORCH_MODEL_FILE), device) | |||
| self.model = resnest101(num_classes=54092) | |||
| load_pretrained(self.model, src_params) | |||
| logger.info('load model done') | |||
| @@ -21,7 +21,6 @@ class Hand2DKeypointsPipeline(EasyCVPipeline): | |||
| model (str): model id on modelscope hub or local model path. | |||
| model_file_pattern (str): model file pattern. | |||
| """ | |||
| self.model_dir = model | |||
| super(Hand2DKeypointsPipeline, self).__init__( | |||
| model=model, | |||
| model_file_pattern=model_file_pattern, | |||
| @@ -1,5 +1,5 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import Any, Dict, Union | |||
| from typing import Any, Dict, Optional, Union | |||
| import cv2 | |||
| import numpy as np | |||
| @@ -25,22 +25,15 @@ class ImageClassificationPipeline(Pipeline): | |||
| def __init__(self, | |||
| model: Union[Model, str], | |||
| preprocessor: [Preprocessor] = None, | |||
| preprocessor: Optional[Preprocessor] = None, | |||
| **kwargs): | |||
| super().__init__(model=model) | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| assert isinstance(model, str) or isinstance(model, Model), \ | |||
| 'model must be a single str or OfaForAllTasks' | |||
| if isinstance(model, str): | |||
| pipe_model = Model.from_pretrained(model) | |||
| elif isinstance(model, Model): | |||
| pipe_model = model | |||
| else: | |||
| raise NotImplementedError | |||
| pipe_model.model.eval() | |||
| pipe_model.to(get_device()) | |||
| if preprocessor is None and isinstance(pipe_model, OfaForAllTasks): | |||
| preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir) | |||
| super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) | |||
| self.model.eval() | |||
| self.model.to(get_device()) | |||
| if preprocessor is None and isinstance(self.model, OfaForAllTasks): | |||
| self.preprocessor = OfaPreprocessor(model_dir=self.model.model_dir) | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
| return inputs | |||
| @@ -32,10 +32,8 @@ class ImageColorEnhancePipeline(Pipeline): | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| model = model if isinstance( | |||
| model, ImageColorEnhance) else Model.from_pretrained(model) | |||
| model.eval() | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.model.eval() | |||
| if torch.cuda.is_available(): | |||
| self._device = torch.device('cuda') | |||
| @@ -32,17 +32,14 @@ class ImageDenoisePipeline(Pipeline): | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| model = model if isinstance( | |||
| model, NAFNetForImageDenoise) else Model.from_pretrained(model) | |||
| model.eval() | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.config = model.config | |||
| self.model.eval() | |||
| self.config = self.model.config | |||
| if torch.cuda.is_available(): | |||
| self._device = torch.device('cuda') | |||
| else: | |||
| self._device = torch.device('cpu') | |||
| self.model = model | |||
| logger.info('load image denoise model done') | |||
| def preprocess(self, input: Input) -> Dict[str, Any]: | |||
| @@ -0,0 +1,52 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import Any, Dict, Union | |||
| import cv2 | |||
| import numpy as np | |||
| import PIL | |||
| import torch | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines.base import Input, Model, Pipeline | |||
| from modelscope.pipelines.builder import PIPELINES | |||
| from modelscope.preprocessors import LoadImage | |||
| from modelscope.utils.constant import Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| logger = get_logger() | |||
| @PIPELINES.register_module( | |||
| Tasks.image_depth_estimation, module_name=Pipelines.image_depth_estimation) | |||
| class ImageDepthEstimationPipeline(Pipeline): | |||
| def __init__(self, model: str, **kwargs): | |||
| """ | |||
| use `model` to create an image depth estimation pipeline for prediction | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| super().__init__(model=model, **kwargs) | |||
| logger.info('depth estimation model, pipeline init') | |||
| def preprocess(self, input: Input) -> Dict[str, Any]: | |||
| img = LoadImage.convert_to_ndarray(input).astype(np.float32) | |||
| H, W = 480, 640 | |||
| img = cv2.resize(img, [W, H]) | |||
| img = img.transpose(2, 0, 1) / 255.0 | |||
| imgs = img[None, ...] | |||
| data = {'imgs': imgs} | |||
| return data | |||
| def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | |||
| results = self.model.inference(input) | |||
| return results | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
| results = self.model.postprocess(inputs) | |||
| outputs = {OutputKeys.DEPTHS: results[OutputKeys.DEPTHS]} | |||
| return outputs | |||
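A hedged end-to-end usage sketch for the new pipeline; the task constant, output key, and default model id all come from other hunks in this change set.

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

estimator = pipeline(
    Tasks.image_depth_estimation,
    model='damo/cv_newcrfs_image-depth-estimation_indoor')
result = estimator('path/to/indoor_image.jpg')   # any input accepted by LoadImage
depth = result[OutputKeys.DEPTHS]                # predicted depth map(s)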
| @@ -44,7 +44,7 @@ class LanguageGuidedVideoSummarizationPipeline(Pipeline): | |||
| """ | |||
| super().__init__(model=model, auto_collate=False, **kwargs) | |||
| logger.info(f'loading model from {model}') | |||
| self.model_dir = model | |||
| self.model_dir = self.model.model_dir | |||
| self.tmp_dir = kwargs.get('tmp_dir', None) | |||
| if self.tmp_dir is None: | |||
| @@ -9,7 +9,6 @@ import PIL | |||
| import torch | |||
| from PIL import Image | |||
| from modelscope.hub.snapshot_download import snapshot_download | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.models.cv.virual_tryon import SDAFNet_Tryon | |||
| from modelscope.outputs import OutputKeys | |||
| @@ -52,17 +51,12 @@ class VirtualTryonPipeline(Pipeline): | |||
| filter_param(src_params, own_state) | |||
| model.load_state_dict(own_state) | |||
| self.model = SDAFNet_Tryon(ref_in_channel=6).to(self.device) | |||
| local_model_dir = model | |||
| if osp.exists(model): | |||
| local_model_dir = model | |||
| else: | |||
| local_model_dir = snapshot_download(model) | |||
| self.local_path = local_model_dir | |||
| self.local_path = self.model | |||
| src_params = torch.load( | |||
| osp.join(local_model_dir, ModelFile.TORCH_MODEL_FILE), 'cpu') | |||
| osp.join(self.local_path, ModelFile.TORCH_MODEL_FILE), 'cpu') | |||
| self.model = SDAFNet_Tryon(ref_in_channel=6).to(self.device) | |||
| load_pretrained(self.model, src_params) | |||
| self.model = self.model.eval() | |||
| self.model.eval() | |||
| self.size = 192 | |||
| from torchvision import transforms | |||
| self.test_transforms = transforms.Compose([ | |||
| @@ -29,22 +29,13 @@ class ImageCaptioningPipeline(Pipeline): | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| super().__init__(model=model) | |||
| assert isinstance(model, str) or isinstance(model, Model), \ | |||
| 'model must be a single str or OfaForAllTasks' | |||
| if isinstance(model, str): | |||
| pipe_model = Model.from_pretrained(model) | |||
| elif isinstance(model, Model): | |||
| pipe_model = model | |||
| else: | |||
| raise NotImplementedError | |||
| pipe_model.model.eval() | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.model.eval() | |||
| if preprocessor is None: | |||
| if isinstance(pipe_model, OfaForAllTasks): | |||
| preprocessor = OfaPreprocessor(pipe_model.model_dir) | |||
| elif isinstance(pipe_model, MPlugForAllTasks): | |||
| preprocessor = MPlugPreprocessor(pipe_model.model_dir) | |||
| super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) | |||
| if isinstance(self.model, OfaForAllTasks): | |||
| self.preprocessor = OfaPreprocessor(self.model.model_dir) | |||
| elif isinstance(self.model, MPlugForAllTasks): | |||
| self.preprocessor = MPlugPreprocessor(self.model.model_dir) | |||
| def _batch(self, data): | |||
| if isinstance(self.model, OfaForAllTasks): | |||
| @@ -55,17 +46,17 @@ class ImageCaptioningPipeline(Pipeline): | |||
| batch_data['samples'] = [d['samples'][0] for d in data] | |||
| batch_data['net_input'] = {} | |||
| for k in data[0]['net_input'].keys(): | |||
| batch_data['net_input'][k] = torch.concat( | |||
| batch_data['net_input'][k] = torch.cat( | |||
| [d['net_input'][k] for d in data]) | |||
| return batch_data | |||
| elif isinstance(self.model, MPlugForAllTasks): | |||
| from transformers.tokenization_utils_base import BatchEncoding | |||
| batch_data = dict(train=data[0]['train']) | |||
| batch_data['image'] = torch.concat([d['image'] for d in data]) | |||
| batch_data['image'] = torch.cat([d['image'] for d in data]) | |||
| question = {} | |||
| for k in data[0]['question'].keys(): | |||
| question[k] = torch.concat([d['question'][k] for d in data]) | |||
| question[k] = torch.cat([d['question'][k] for d in data]) | |||
| batch_data['question'] = BatchEncoding(question) | |||
| return batch_data | |||
| else: | |||
| @@ -28,19 +28,10 @@ class ImageTextRetrievalPipeline(Pipeline): | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| super().__init__(model=model) | |||
| assert isinstance(model, str) or isinstance(model, Model), \ | |||
| f'model must be a single str or Model, but got {type(model)}' | |||
| if isinstance(model, str): | |||
| pipe_model = Model.from_pretrained(model) | |||
| elif isinstance(model, Model): | |||
| pipe_model = model | |||
| else: | |||
| raise NotImplementedError | |||
| pipe_model.model.eval() | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.model.eval() | |||
| if preprocessor is None: | |||
| preprocessor = MPlugPreprocessor(pipe_model.model_dir) | |||
| super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) | |||
| self.preprocessor = MPlugPreprocessor(self.model.model_dir) | |||
| def forward(self, inputs: Dict[str, Any], | |||
| **forward_params) -> Dict[str, Any]: | |||
| @@ -28,21 +28,14 @@ class MultiModalEmbeddingPipeline(Pipeline): | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| if isinstance(model, str): | |||
| pipe_model = Model.from_pretrained(model) | |||
| elif isinstance(model, Model): | |||
| pipe_model = model | |||
| else: | |||
| raise NotImplementedError('model must be a single str') | |||
| pipe_model.eval() | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.model.eval() | |||
| if preprocessor is None: | |||
| if isinstance(pipe_model, CLIPForMultiModalEmbedding): | |||
| preprocessor = CLIPPreprocessor(pipe_model.model_dir) | |||
| if isinstance(self.model, CLIPForMultiModalEmbedding): | |||
| self.preprocessor = CLIPPreprocessor(self.model.model_dir) | |||
| else: | |||
| raise NotImplementedError | |||
| super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) | |||
| def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | |||
| return self.model(self.preprocess(input)) | |||
| @@ -28,20 +28,11 @@ class OcrRecognitionPipeline(Pipeline): | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| super().__init__(model=model) | |||
| assert isinstance(model, str) or isinstance(model, Model), \ | |||
| 'model must be a single str or OfaForAllTasks' | |||
| if isinstance(model, str): | |||
| pipe_model = Model.from_pretrained(model) | |||
| elif isinstance(model, Model): | |||
| pipe_model = model | |||
| else: | |||
| raise NotImplementedError | |||
| pipe_model.model.eval() | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.model.eval() | |||
| if preprocessor is None: | |||
| if isinstance(pipe_model, OfaForAllTasks): | |||
| preprocessor = OfaPreprocessor(pipe_model.model_dir) | |||
| super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) | |||
| if isinstance(self.model, OfaForAllTasks): | |||
| self.preprocessor = OfaPreprocessor(self.model.model_dir) | |||
| def forward(self, inputs: Dict[str, Any], | |||
| **forward_params) -> Dict[str, Any]: | |||
| @@ -31,18 +31,10 @@ class TextToImageSynthesisPipeline(Pipeline): | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| device_id = 0 if torch.cuda.is_available() else -1 | |||
| if isinstance(model, str): | |||
| pipe_model = Model.from_pretrained(model, device_id=device_id) | |||
| elif isinstance(model, Model): | |||
| pipe_model = model | |||
| else: | |||
| raise NotImplementedError( | |||
| f'expecting a Model instance or str, but get {type(model)}.') | |||
| if preprocessor is None and isinstance(pipe_model, | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| if preprocessor is None and isinstance(self.model, | |||
| OfaForTextToImageSynthesis): | |||
| preprocessor = OfaPreprocessor(pipe_model.model_dir) | |||
| super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) | |||
| self.preprocessor = OfaPreprocessor(self.model.model_dir) | |||
| def preprocess(self, input: Input, **preprocess_params) -> Dict[str, Any]: | |||
| if self.preprocessor is not None: | |||
| @@ -1,5 +1,5 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import Any, Dict, Union | |||
| from typing import Any, Dict, Optional, Union | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.models.multi_modal import OfaForAllTasks | |||
| @@ -18,26 +18,17 @@ class VisualEntailmentPipeline(Pipeline): | |||
| def __init__(self, | |||
| model: Union[Model, str], | |||
| preprocessor: [Preprocessor] = None, | |||
| preprocessor: Optional[Preprocessor] = None, | |||
| **kwargs): | |||
| """ | |||
| use `model` and `preprocessor` to create a visual entailment pipeline for prediction | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| super().__init__(model=model) | |||
| assert isinstance(model, str) or isinstance(model, Model), \ | |||
| 'model must be a single str or OfaForAllTasks' | |||
| if isinstance(model, str): | |||
| pipe_model = Model.from_pretrained(model) | |||
| elif isinstance(model, Model): | |||
| pipe_model = model | |||
| else: | |||
| raise NotImplementedError | |||
| pipe_model.model.eval() | |||
| if preprocessor is None and isinstance(pipe_model, OfaForAllTasks): | |||
| preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir) | |||
| super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.model.eval() | |||
| if preprocessor is None and isinstance(self.model, OfaForAllTasks): | |||
| self.preprocessor = OfaPreprocessor(model_dir=self.model.model_dir) | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
| return inputs | |||
| @@ -1,5 +1,5 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import Any, Dict, Union | |||
| from typing import Any, Dict, Optional, Union | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.models.multi_modal import OfaForAllTasks | |||
| @@ -18,26 +18,17 @@ class VisualGroundingPipeline(Pipeline): | |||
| def __init__(self, | |||
| model: Union[Model, str], | |||
| preprocessor: [Preprocessor] = None, | |||
| preprocessor: Optional[Preprocessor] = None, | |||
| **kwargs): | |||
| """ | |||
| use `model` and `preprocessor` to create a visual grounding pipeline for prediction | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| super().__init__(model=model) | |||
| assert isinstance(model, str) or isinstance(model, Model), \ | |||
| 'model must be a single str or OfaForAllTasks' | |||
| if isinstance(model, str): | |||
| pipe_model = Model.from_pretrained(model) | |||
| elif isinstance(model, Model): | |||
| pipe_model = model | |||
| else: | |||
| raise NotImplementedError | |||
| pipe_model.model.eval() | |||
| if preprocessor is None and isinstance(pipe_model, OfaForAllTasks): | |||
| preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir) | |||
| super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.model.eval() | |||
| if preprocessor is None and isinstance(self.model, OfaForAllTasks): | |||
| self.preprocessor = OfaPreprocessor(model_dir=self.model.model_dir) | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
| return inputs | |||
| @@ -31,15 +31,13 @@ class VisualQuestionAnsweringPipeline(Pipeline): | |||
| model (MPlugForVisualQuestionAnswering): a model instance | |||
| preprocessor (MPlugVisualQuestionAnsweringPreprocessor): a preprocessor instance | |||
| """ | |||
| model = model if isinstance(model, | |||
| Model) else Model.from_pretrained(model) | |||
| if preprocessor is None: | |||
| if isinstance(model, OfaForAllTasks): | |||
| preprocessor = OfaPreprocessor(model.model_dir) | |||
| elif isinstance(model, MPlugForAllTasks): | |||
| preprocessor = MPlugPreprocessor(model.model_dir) | |||
| model.model.eval() | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| if preprocessor is None: | |||
| if isinstance(self.model, OfaForAllTasks): | |||
| self.preprocessor = OfaPreprocessor(self.model.model_dir) | |||
| elif isinstance(self.model, MPlugForAllTasks): | |||
| self.preprocessor = MPlugPreprocessor(self.model.model_dir) | |||
| self.model.eval() | |||
| def forward(self, inputs: Dict[str, Any], | |||
| **forward_params) -> Dict[str, Any]: | |||
| @@ -32,12 +32,10 @@ class ConversationalTextToSqlPipeline(Pipeline): | |||
| preprocessor (ConversationalTextToSqlPreprocessor): | |||
| a preprocessor instance | |||
| """ | |||
| model = model if isinstance( | |||
| model, StarForTextToSql) else Model.from_pretrained(model) | |||
| if preprocessor is None: | |||
| preprocessor = ConversationalTextToSqlPreprocessor(model.model_dir) | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| if preprocessor is None: | |||
| self.preprocessor = ConversationalTextToSqlPreprocessor( | |||
| self.model.model_dir) | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]: | |||
| """process the prediction results | |||
| @@ -30,13 +30,11 @@ class DialogIntentPredictionPipeline(Pipeline): | |||
| or a SpaceForDialogIntent instance. | |||
| preprocessor (DialogIntentPredictionPreprocessor): An optional preprocessor instance. | |||
| """ | |||
| model = model if isinstance( | |||
| model, SpaceForDialogIntent) else Model.from_pretrained(model) | |||
| if preprocessor is None: | |||
| preprocessor = DialogIntentPredictionPreprocessor(model.model_dir) | |||
| self.model = model | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.categories = preprocessor.categories | |||
| if preprocessor is None: | |||
| self.preprocessor = DialogIntentPredictionPreprocessor( | |||
| self.model.model_dir) | |||
| self.categories = self.preprocessor.categories | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]: | |||
| """process the prediction results | |||
| @@ -29,13 +29,10 @@ class DialogModelingPipeline(Pipeline): | |||
| or a SpaceForDialogModeling instance. | |||
| preprocessor (DialogModelingPreprocessor): An optional preprocessor instance. | |||
| """ | |||
| model = model if isinstance( | |||
| model, SpaceForDialogModeling) else Model.from_pretrained(model) | |||
| self.model = model | |||
| if preprocessor is None: | |||
| preprocessor = DialogModelingPreprocessor(model.model_dir) | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.preprocessor = preprocessor | |||
| if preprocessor is None: | |||
| self.preprocessor = DialogModelingPreprocessor( | |||
| self.model.model_dir) | |||
| def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, str]: | |||
| """process the prediction results | |||
| @@ -31,16 +31,13 @@ class DialogStateTrackingPipeline(Pipeline): | |||
| from the model hub, or a SpaceForDialogStateTracking instance. | |||
| preprocessor (DialogStateTrackingPreprocessor): An optional preprocessor instance. | |||
| """ | |||
| model = model if isinstance( | |||
| model, SpaceForDST) else Model.from_pretrained(model) | |||
| self.model = model | |||
| if preprocessor is None: | |||
| preprocessor = DialogStateTrackingPreprocessor(model.model_dir) | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| if preprocessor is None: | |||
| self.preprocessor = DialogStateTrackingPreprocessor( | |||
| self.model.model_dir) | |||
| self.tokenizer = preprocessor.tokenizer | |||
| self.config = preprocessor.config | |||
| self.tokenizer = self.preprocessor.tokenizer | |||
| self.config = self.preprocessor.config | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]: | |||
| """process the prediction results | |||
| @@ -31,27 +31,22 @@ class DocumentSegmentationPipeline(Pipeline): | |||
| model: Union[Model, str], | |||
| preprocessor: DocumentSegmentationPreprocessor = None, | |||
| **kwargs): | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| model = model if isinstance(model, | |||
| Model) else Model.from_pretrained(model) | |||
| self.model_dir = model.model_dir | |||
| self.model_cfg = model.forward() | |||
| self.model_dir = self.model.model_dir | |||
| self.model_cfg = self.model.forward() | |||
| if self.model_cfg['type'] == 'bert': | |||
| config = BertConfig.from_pretrained(model.model_dir, num_labels=2) | |||
| config = BertConfig.from_pretrained(self.model_dir, num_labels=2) | |||
| elif self.model_cfg['type'] == 'ponet': | |||
| config = PoNetConfig.from_pretrained(model.model_dir, num_labels=2) | |||
| config = PoNetConfig.from_pretrained(self.model_dir, num_labels=2) | |||
| self.document_segmentation_model = model.build_with_config( | |||
| self.document_segmentation_model = self.model.build_with_config( | |||
| config=config) | |||
| if preprocessor is None: | |||
| preprocessor = DocumentSegmentationPreprocessor( | |||
| self.model_dir, config) | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.preprocessor = preprocessor | |||
| self.preprocessor = DocumentSegmentationPreprocessor( | |||
| self.model.model_dir, config) | |||
| def __call__( | |||
| self, documents: Union[List[List[str]], List[str], | |||
| @@ -21,12 +21,10 @@ class FaqQuestionAnsweringPipeline(Pipeline): | |||
| model: Union[str, Model], | |||
| preprocessor: Preprocessor = None, | |||
| **kwargs): | |||
| model = Model.from_pretrained(model) if isinstance(model, | |||
| str) else model | |||
| if preprocessor is None: | |||
| preprocessor = Preprocessor.from_pretrained( | |||
| model.model_dir, **kwargs) | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| if preprocessor is None: | |||
| self.preprocessor = Preprocessor.from_pretrained( | |||
| self.model.model_dir, **kwargs) | |||
| def _sanitize_parameters(self, **pipeline_parameters): | |||
| return pipeline_parameters, pipeline_parameters, pipeline_parameters | |||
| @@ -37,11 +35,11 @@ class FaqQuestionAnsweringPipeline(Pipeline): | |||
| sentence_vecs = sentence_vecs.detach().tolist() | |||
| return sentence_vecs | |||
| def forward(self, inputs: [list, Dict[str, Any]], | |||
| def forward(self, inputs: Union[list, Dict[str, Any]], | |||
| **forward_params) -> Dict[str, Any]: | |||
| return self.model(inputs) | |||
| def postprocess(self, inputs: [list, Dict[str, Any]], | |||
| def postprocess(self, inputs: Union[list, Dict[str, Any]], | |||
| **postprocess_params) -> Dict[str, Any]: | |||
| scores = inputs['scores'] | |||
| labels = [] | |||
| @@ -46,21 +46,18 @@ class FeatureExtractionPipeline(Pipeline): | |||
| """ | |||
| model = model if isinstance(model, | |||
| Model) else Model.from_pretrained(model) | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| if preprocessor is None: | |||
| preprocessor = NLPPreprocessor( | |||
| model.model_dir, | |||
| self.preprocessor = NLPPreprocessor( | |||
| self.model.model_dir, | |||
| padding=kwargs.pop('padding', False), | |||
| sequence_length=kwargs.pop('sequence_length', 128)) | |||
| model.eval() | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.model.eval() | |||
| self.preprocessor = preprocessor | |||
| self.config = Config.from_file( | |||
| os.path.join(model.model_dir, ModelFile.CONFIGURATION)) | |||
| self.tokenizer = preprocessor.tokenizer | |||
| os.path.join(self.model.model_dir, ModelFile.CONFIGURATION)) | |||
| self.tokenizer = self.preprocessor.tokenizer | |||
| def forward(self, inputs: Dict[str, Any], | |||
| **forward_params) -> Dict[str, Any]: | |||
| @@ -53,22 +53,18 @@ class FillMaskPipeline(Pipeline): | |||
| If the xlm-roberta(xlm-roberta, veco, etc.) based model is used, the mask token is '<mask>'. | |||
| To view other examples please check the tests/pipelines/test_fill_mask.py. | |||
| """ | |||
| fill_mask_model = Model.from_pretrained(model) if isinstance( | |||
| model, str) else model | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| if preprocessor is None: | |||
| preprocessor = Preprocessor.from_pretrained( | |||
| fill_mask_model.model_dir, | |||
| self.preprocessor = Preprocessor.from_pretrained( | |||
| self.model.model_dir, | |||
| first_sequence=first_sequence, | |||
| second_sequence=None, | |||
| sequence_length=kwargs.pop('sequence_length', 128)) | |||
| fill_mask_model.eval() | |||
| assert hasattr( | |||
| preprocessor, 'mask_id' | |||
| ), 'The input preprocessor should have the mask_id attribute.' | |||
| super().__init__( | |||
| model=fill_mask_model, preprocessor=preprocessor, **kwargs) | |||
| assert hasattr( | |||
| self.preprocessor, 'mask_id' | |||
| ), 'The input preprocessor should have the mask_id attribute.' | |||
| self.model.eval() | |||
| def forward(self, inputs: Dict[str, Any], | |||
| **forward_params) -> Dict[str, Any]: | |||
| @@ -25,15 +25,12 @@ class InformationExtractionPipeline(Pipeline): | |||
| model: Union[Model, str], | |||
| preprocessor: Optional[Preprocessor] = None, | |||
| **kwargs): | |||
| model = model if isinstance(model, | |||
| Model) else Model.from_pretrained(model) | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| if preprocessor is None: | |||
| preprocessor = RelationExtractionPreprocessor( | |||
| model.model_dir, | |||
| self.preprocessor = RelationExtractionPreprocessor( | |||
| self.model.model_dir, | |||
| sequence_length=kwargs.pop('sequence_length', 512)) | |||
| model.eval() | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.model.eval() | |||
| def forward(self, inputs: Dict[str, Any], | |||
| **forward_params) -> Dict[str, Any]: | |||
| @@ -21,7 +21,7 @@ class MGLMTextSummarizationPipeline(Pipeline): | |||
| def __init__(self, | |||
| model: Union[MGLMForTextSummarization, str], | |||
| preprocessor: [Preprocessor] = None, | |||
| preprocessor: Optional[Preprocessor] = None, | |||
| *args, | |||
| **kwargs): | |||
| model = MGLMForTextSummarization(model) if isinstance(model, | |||
| @@ -50,15 +50,12 @@ class NamedEntityRecognitionPipeline(TokenClassificationPipeline): | |||
| To view other examples please check the tests/pipelines/test_named_entity_recognition.py. | |||
| """ | |||
| model = model if isinstance(model, | |||
| Model) else Model.from_pretrained(model) | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| if preprocessor is None: | |||
| preprocessor = TokenClassificationPreprocessor( | |||
| model.model_dir, | |||
| self.preprocessor = TokenClassificationPreprocessor( | |||
| self.model.model_dir, | |||
| sequence_length=kwargs.pop('sequence_length', 128)) | |||
| model.eval() | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.model.eval() | |||
| self.id2label = kwargs.get('id2label') | |||
| if self.id2label is None and hasattr(self.preprocessor, 'id2label'): | |||
| self.id2label = self.preprocessor.id2label | |||
| @@ -73,13 +70,11 @@ class NamedEntityRecognitionThaiPipeline(NamedEntityRecognitionPipeline): | |||
| model: Union[Model, str], | |||
| preprocessor: Optional[Preprocessor] = None, | |||
| **kwargs): | |||
| model = model if isinstance(model, | |||
| Model) else Model.from_pretrained(model) | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| if preprocessor is None: | |||
| preprocessor = NERPreprocessorThai( | |||
| model.model_dir, | |||
| self.preprocessor = NERPreprocessorThai( | |||
| self.model.model_dir, | |||
| sequence_length=kwargs.pop('sequence_length', 512)) | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| @PIPELINES.register_module( | |||
| @@ -91,10 +86,8 @@ class NamedEntityRecognitionVietPipeline(NamedEntityRecognitionPipeline): | |||
| model: Union[Model, str], | |||
| preprocessor: Optional[Preprocessor] = None, | |||
| **kwargs): | |||
| model = model if isinstance(model, | |||
| Model) else Model.from_pretrained(model) | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| if preprocessor is None: | |||
| preprocessor = NERPreprocessorViet( | |||
| model.model_dir, | |||
| self.preprocessor = NERPreprocessorViet( | |||
| self.model.model_dir, | |||
| sequence_length=kwargs.pop('sequence_length', 512)) | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| @@ -32,14 +32,13 @@ class SentenceEmbeddingPipeline(Pipeline): | |||
| the model if supplied. | |||
| sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. | |||
| """ | |||
| model = Model.from_pretrained(model) if isinstance(model, | |||
| str) else model | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| if preprocessor is None: | |||
| preprocessor = Preprocessor.from_pretrained( | |||
| model.model_dir if isinstance(model, Model) else model, | |||
| self.preprocessor = Preprocessor.from_pretrained( | |||
| self.model.model_dir | |||
| if isinstance(self.model, Model) else model, | |||
| first_sequence=first_sequence, | |||
| sequence_length=kwargs.pop('sequence_length', 128)) | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| def forward(self, inputs: Dict[str, Any], | |||
| **forward_params) -> Dict[str, Any]: | |||
| @@ -1,5 +1,5 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from typing import Any, Dict, Union | |||
| from typing import Any, Dict, Optional, Union | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.models.multi_modal import OfaForAllTasks | |||
| @@ -18,7 +18,7 @@ class SummarizationPipeline(Pipeline): | |||
| def __init__(self, | |||
| model: Union[Model, str], | |||
| preprocessor: [Preprocessor] = None, | |||
| preprocessor: Optional[Preprocessor] = None, | |||
| **kwargs): | |||
| """Use `model` and `preprocessor` to create a Summarization pipeline for prediction. | |||
| @@ -27,19 +27,10 @@ class SummarizationPipeline(Pipeline): | |||
| or a model id from the model hub, or a model instance. | |||
| preprocessor (Preprocessor): An optional preprocessor instance. | |||
| """ | |||
| super().__init__(model=model) | |||
| assert isinstance(model, str) or isinstance(model, Model), \ | |||
| 'model must be a single str or OfaForAllTasks' | |||
| if isinstance(model, str): | |||
| pipe_model = Model.from_pretrained(model) | |||
| elif isinstance(model, Model): | |||
| pipe_model = model | |||
| else: | |||
| raise NotImplementedError | |||
| pipe_model.model.eval() | |||
| if preprocessor is None and isinstance(pipe_model, OfaForAllTasks): | |||
| preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir) | |||
| super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.model.eval() | |||
| if preprocessor is None and isinstance(self.model, OfaForAllTasks): | |||
| self.preprocessor = OfaPreprocessor(model_dir=self.model.model_dir) | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
| return inputs | |||
| @@ -41,21 +41,22 @@ class TableQuestionAnsweringPipeline(Pipeline): | |||
| preprocessor (TableQuestionAnsweringPreprocessor): a preprocessor instance | |||
| db (Database): a database instance used to store the tables | |||
| """ | |||
| model = model if isinstance( | |||
| model, TableQuestionAnswering) else Model.from_pretrained(model) | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| if preprocessor is None: | |||
| preprocessor = TableQuestionAnsweringPreprocessor(model.model_dir) | |||
| self.preprocessor = TableQuestionAnsweringPreprocessor( | |||
| self.model.model_dir) | |||
| # initialize tokenizer | |||
| self.tokenizer = BertTokenizer( | |||
| os.path.join(model.model_dir, ModelFile.VOCAB_FILE)) | |||
| os.path.join(self.model.model_dir, ModelFile.VOCAB_FILE)) | |||
| # initialize database | |||
| if db is None: | |||
| self.db = Database( | |||
| tokenizer=self.tokenizer, | |||
| table_file_path=os.path.join(model.model_dir, 'table.json'), | |||
| syn_dict_file_path=os.path.join(model.model_dir, | |||
| table_file_path=os.path.join(self.model.model_dir, | |||
| 'table.json'), | |||
| syn_dict_file_path=os.path.join(self.model.model_dir, | |||
| 'synonym.txt')) | |||
| else: | |||
| self.db = db | |||
| @@ -71,8 +72,6 @@ class TableQuestionAnsweringPipeline(Pipeline): | |||
| self.schema_link_dict = constant.schema_link_dict | |||
| self.limit_dict = constant.limit_dict | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| def post_process_multi_turn(self, history_sql, result, table): | |||
| action = self.action_ops[result['action']] | |||
| headers = table['header_name'] | |||
| @@ -63,16 +63,14 @@ class Text2TextGenerationPipeline(Pipeline): | |||
| To view other examples please check the tests/pipelines/test_text_generation.py. | |||
| """ | |||
| model = model if isinstance(model, | |||
| Model) else Model.from_pretrained(model) | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| if preprocessor is None: | |||
| preprocessor = Text2TextGenerationPreprocessor( | |||
| model.model_dir, | |||
| self.preprocessor = Text2TextGenerationPreprocessor( | |||
| self.model.model_dir, | |||
| sequence_length=kwargs.pop('sequence_length', 128)) | |||
| self.tokenizer = preprocessor.tokenizer | |||
| self.pipeline = model.pipeline.type | |||
| model.eval() | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.tokenizer = self.preprocessor.tokenizer | |||
| self.pipeline = self.model.pipeline.type | |||
| self.model.eval() | |||
| def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]: | |||
| """ Provide specific preprocess for text2text generation pipeline in order to handl multi tasks | |||
| @@ -53,25 +53,24 @@ class TextClassificationPipeline(Pipeline): | |||
| NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' and 'second_sequence' | |||
| params will have no effect. | |||
| """ | |||
| model = Model.from_pretrained(model) if isinstance(model, | |||
| str) else model | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| if preprocessor is None: | |||
| if model.__class__.__name__ == 'OfaForAllTasks': | |||
| preprocessor = Preprocessor.from_pretrained( | |||
| model_name_or_path=model.model_dir, | |||
| if self.model.__class__.__name__ == 'OfaForAllTasks': | |||
| self.preprocessor = Preprocessor.from_pretrained( | |||
| model_name_or_path=self.model.model_dir, | |||
| type=Preprocessors.ofa_tasks_preprocessor, | |||
| field=Fields.multi_modal) | |||
| else: | |||
| first_sequence = kwargs.pop('first_sequence', 'first_sequence') | |||
| second_sequence = kwargs.pop('second_sequence', None) | |||
| preprocessor = Preprocessor.from_pretrained( | |||
| model if isinstance(model, str) else model.model_dir, | |||
| self.preprocessor = Preprocessor.from_pretrained( | |||
| self.model | |||
| if isinstance(self.model, str) else self.model.model_dir, | |||
| first_sequence=first_sequence, | |||
| second_sequence=second_sequence, | |||
| sequence_length=kwargs.pop('sequence_length', 512)) | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.id2label = kwargs.get('id2label') | |||
| if self.id2label is None and hasattr(self.preprocessor, 'id2label'): | |||
| self.id2label = self.preprocessor.id2label | |||
| @@ -40,14 +40,12 @@ class TextErrorCorrectionPipeline(Pipeline): | |||
| To view other examples please check the tests/pipelines/test_text_error_correction.py. | |||
| """ | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| model = model if isinstance( | |||
| model, | |||
| BartForTextErrorCorrection) else Model.from_pretrained(model) | |||
| if preprocessor is None: | |||
| preprocessor = TextErrorCorrectionPreprocessor(model.model_dir) | |||
| self.vocab = preprocessor.vocab | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.preprocessor = TextErrorCorrectionPreprocessor( | |||
| self.model.model_dir) | |||
| self.vocab = self.preprocessor.vocab | |||
| def forward(self, inputs: Dict[str, Any], | |||
| **forward_params) -> Dict[str, Any]: | |||
| @@ -51,15 +51,14 @@ class TextGenerationPipeline(Pipeline): | |||
| To view other examples please check the tests/pipelines/test_text_generation.py. | |||
| """ | |||
| model = model if isinstance(model, | |||
| Model) else Model.from_pretrained(model) | |||
| cfg = read_config(model.model_dir) | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| cfg = read_config(self.model.model_dir) | |||
| self.postprocessor = cfg.pop('postprocessor', 'decode') | |||
| if preprocessor is None: | |||
| preprocessor_cfg = cfg.preprocessor | |||
| preprocessor_cfg.update({ | |||
| 'model_dir': | |||
| model.model_dir, | |||
| self.model.model_dir, | |||
| 'first_sequence': | |||
| first_sequence, | |||
| 'second_sequence': | |||
| @@ -67,9 +66,9 @@ class TextGenerationPipeline(Pipeline): | |||
| 'sequence_length': | |||
| kwargs.pop('sequence_length', 128) | |||
| }) | |||
| preprocessor = build_preprocessor(preprocessor_cfg, Fields.nlp) | |||
| model.eval() | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.preprocessor = build_preprocessor(preprocessor_cfg, | |||
| Fields.nlp) | |||
| self.model.eval() | |||
| def _sanitize_parameters(self, **pipeline_parameters): | |||
| return {}, pipeline_parameters, {} | |||
| @@ -32,14 +32,12 @@ class TextRankingPipeline(Pipeline): | |||
| the model if supplied. | |||
| sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. | |||
| """ | |||
| model = Model.from_pretrained(model) if isinstance(model, | |||
| str) else model | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| if preprocessor is None: | |||
| preprocessor = Preprocessor.from_pretrained( | |||
| model.model_dir, | |||
| self.preprocessor = Preprocessor.from_pretrained( | |||
| self.model.model_dir, | |||
| sequence_length=kwargs.pop('sequence_length', 128)) | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| def forward(self, inputs: Dict[str, Any], | |||
| **forward_params) -> Dict[str, Any]: | |||
| @@ -39,15 +39,14 @@ class TokenClassificationPipeline(Pipeline): | |||
| model (str or Model): A model instance or a model local dir or a model id in the model hub. | |||
| preprocessor (Preprocessor): a preprocessor instance, must not be None. | |||
| """ | |||
| model = Model.from_pretrained(model) if isinstance(model, | |||
| str) else model | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| if preprocessor is None: | |||
| preprocessor = Preprocessor.from_pretrained( | |||
| model.model_dir, | |||
| self.preprocessor = Preprocessor.from_pretrained( | |||
| self.model.model_dir, | |||
| sequence_length=kwargs.pop('sequence_length', 128)) | |||
| model.eval() | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.model.eval() | |||
| self.id2label = kwargs.get('id2label') | |||
| if self.id2label is None and hasattr(self.preprocessor, 'id2label'): | |||
| self.id2label = self.preprocessor.id2label | |||
| @@ -27,10 +27,10 @@ class TranslationQualityEstimationPipeline(Pipeline): | |||
| def __init__(self, model: str, device: str = 'gpu', **kwargs): | |||
| super().__init__(model=model, device=device) | |||
| model_file = os.path.join(model, ModelFile.TORCH_MODEL_FILE) | |||
| model_file = os.path.join(self.model, ModelFile.TORCH_MODEL_FILE) | |||
| with open(model_file, 'rb') as f: | |||
| buffer = io.BytesIO(f.read()) | |||
| self.tokenizer = XLMRobertaTokenizer.from_pretrained(model) | |||
| self.tokenizer = XLMRobertaTokenizer.from_pretrained(self.model) | |||
| self.model = torch.jit.load( | |||
| buffer, map_location=self.device).to(self.device) | |||
| @@ -49,14 +49,13 @@ class WordSegmentationPipeline(TokenClassificationPipeline): | |||
| To view other examples please check the tests/pipelines/test_word_segmentation.py. | |||
| """ | |||
| model = model if isinstance(model, | |||
| Model) else Model.from_pretrained(model) | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| if preprocessor is None: | |||
| preprocessor = TokenClassificationPreprocessor( | |||
| model.model_dir, | |||
| self.preprocessor = TokenClassificationPreprocessor( | |||
| self.model.model_dir, | |||
| sequence_length=kwargs.pop('sequence_length', 128)) | |||
| model.eval() | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.model.eval() | |||
| self.id2label = kwargs.get('id2label') | |||
| if self.id2label is None and hasattr(self.preprocessor, 'id2label'): | |||
| self.id2label = self.preprocessor.id2label | |||
| @@ -59,16 +59,14 @@ class ZeroShotClassificationPipeline(Pipeline): | |||
| """ | |||
| assert isinstance(model, str) or isinstance(model, Model), \ | |||
| 'model must be a single str or Model' | |||
| model = model if isinstance(model, | |||
| Model) else Model.from_pretrained(model) | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.entailment_id = 0 | |||
| self.contradiction_id = 2 | |||
| if preprocessor is None: | |||
| preprocessor = ZeroShotClassificationPreprocessor( | |||
| model.model_dir, | |||
| self.preprocessor = ZeroShotClassificationPreprocessor( | |||
| self.model.model_dir, | |||
| sequence_length=kwargs.pop('sequence_length', 512)) | |||
| model.eval() | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.model.eval() | |||
| def _sanitize_parameters(self, **kwargs): | |||
| preprocess_params = {} | |||
| @@ -105,22 +105,16 @@ class ProteinStructurePipeline(Pipeline): | |||
| >>> print(pipeline_ins(protein)) | |||
| """ | |||
| import copy | |||
| model_path = copy.deepcopy(model) if isinstance(model, str) else None | |||
| cfg = read_config(model_path) # only model is str | |||
| self.cfg = cfg | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| self.cfg = read_config(self.model.model_dir) | |||
| self.config = model_config( | |||
| cfg['pipeline']['model_name']) # alphafold config | |||
| model = model if isinstance( | |||
| model, Model) else Model.from_pretrained(model_path) | |||
| self.postprocessor = cfg.pop('postprocessor', None) | |||
| self.cfg['pipeline']['model_name']) # alphafold config | |||
| self.postprocessor = self.cfg.pop('postprocessor', None) | |||
| if preprocessor is None: | |||
| preprocessor_cfg = cfg.preprocessor | |||
| preprocessor = build_preprocessor(preprocessor_cfg, Fields.science) | |||
| model.eval() | |||
| model.model.inference_mode() | |||
| model.model_dir = model_path | |||
| super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
| preprocessor_cfg = self.cfg.preprocessor | |||
| self.preprocessor = build_preprocessor(preprocessor_cfg, | |||
| Fields.science) | |||
| self.model.eval() | |||
| def _sanitize_parameters(self, **pipeline_parameters): | |||
| return pipeline_parameters, pipeline_parameters, pipeline_parameters | |||
| @@ -6,7 +6,8 @@ from typing import Any, Dict, Optional, Sequence | |||
| from modelscope.metainfo import Models, Preprocessors | |||
| from modelscope.utils.config import Config, ConfigDict | |||
| from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModeKeys, Tasks | |||
| from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Invoke, | |||
| ModeKeys, Tasks) | |||
| from modelscope.utils.hub import read_config, snapshot_download | |||
| from modelscope.utils.logger import get_logger | |||
| from .builder import build_preprocessor | |||
| @@ -194,7 +195,9 @@ class Preprocessor(ABC): | |||
| """ | |||
| if not os.path.exists(model_name_or_path): | |||
| model_dir = snapshot_download( | |||
| model_name_or_path, revision=revision) | |||
| model_name_or_path, | |||
| revision=revision, | |||
| user_agent={Invoke.KEY: Invoke.PREPROCESSOR}) | |||
| else: | |||
| model_dir = model_name_or_path | |||
| if cfg_dict is None: | |||
| @@ -14,7 +14,8 @@ from modelscope.metainfo import Preprocessors | |||
| from modelscope.pipelines.base import Input | |||
| from modelscope.preprocessors import load_image | |||
| from modelscope.utils.config import Config | |||
| from modelscope.utils.constant import Fields, ModeKeys, ModelFile, Tasks | |||
| from modelscope.utils.constant import (Fields, Invoke, ModeKeys, ModelFile, | |||
| Tasks) | |||
| from .base import Preprocessor | |||
| from .builder import PREPROCESSORS | |||
| from .ofa import * # noqa | |||
| @@ -57,7 +58,7 @@ class OfaPreprocessor(Preprocessor): | |||
| Tasks.auto_speech_recognition: OfaASRPreprocessor | |||
| } | |||
| model_dir = model_dir if osp.exists(model_dir) else snapshot_download( | |||
| model_dir) | |||
| model_dir, user_agent={Invoke.KEY: Invoke.PREPROCESSOR}) | |||
| self.cfg = Config.from_file( | |||
| osp.join(model_dir, ModelFile.CONFIGURATION)) | |||
| self.preprocess = preprocess_mapping[self.cfg.task]( | |||
| @@ -131,7 +132,7 @@ class CLIPPreprocessor(Preprocessor): | |||
| """ | |||
| super().__init__(*args, **kwargs) | |||
| model_dir = model_dir if osp.exists(model_dir) else snapshot_download( | |||
| model_dir) | |||
| model_dir, user_agent={Invoke.KEY: Invoke.PREPROCESSOR}) | |||
| self.mode = mode | |||
| # text tokenizer | |||
| from modelscope.models.multi_modal.clip.bert_tokenizer import FullTokenizer | |||
| @@ -5,6 +5,7 @@ import random | |||
| from pathlib import Path | |||
| from typing import Any, Dict | |||
| import librosa | |||
| import soundfile as sf | |||
| import torch | |||
| from fairseq.data.audio.feature_transforms import \ | |||
| @@ -54,9 +55,13 @@ class OfaASRPreprocessor(OfaBasePreprocessor): | |||
| def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: | |||
| speed = random.choice([0.9, 1.0, 1.1]) | |||
| wav, sr = sf.read(self.column_map['wav']) | |||
| wav, sr = librosa.load(data[self.column_map['wav']], 16000, mono=True) | |||
| fbank = self.prepare_fbank( | |||
| torch.tensor([wav], dtype=torch.float32), sr, speed, is_train=True) | |||
| torch.tensor([wav], dtype=torch.float32), | |||
| sr, | |||
| speed, | |||
| target_sample_rate=16000, | |||
| is_train=True) | |||
| fbank_mask = torch.tensor([True]) | |||
| sample = { | |||
| 'fbank': fbank, | |||
| @@ -86,11 +91,12 @@ class OfaASRPreprocessor(OfaBasePreprocessor): | |||
| def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: | |||
| speed = 1.0 | |||
| wav, sr = sf.read(data[self.column_map['wav']]) | |||
| wav, sr = librosa.load(data[self.column_map['wav']], 16000, mono=True) | |||
| fbank = self.prepare_fbank( | |||
| torch.tensor([wav], dtype=torch.float32), | |||
| sr, | |||
| speed, | |||
| target_sample_rate=16000, | |||
| is_train=False) | |||
| fbank_mask = torch.tensor([True]) | |||
| @@ -170,10 +170,15 @@ class OfaBasePreprocessor: | |||
| else load_image(path_or_url_or_pil) | |||
| return image | |||
| def prepare_fbank(self, waveform, sample_rate, speed, is_train): | |||
| waveform, _ = torchaudio.sox_effects.apply_effects_tensor( | |||
| def prepare_fbank(self, | |||
| waveform, | |||
| sample_rate, | |||
| speed, | |||
| target_sample_rate=16000, | |||
| is_train=False): | |||
| waveform, sample_rate = torchaudio.sox_effects.apply_effects_tensor( | |||
| waveform, sample_rate, | |||
| [['speed', str(speed)], ['rate', str(sample_rate)]]) | |||
| [['speed', str(speed)], ['rate', str(target_sample_rate)]]) | |||
| _waveform, _ = convert_waveform( | |||
| waveform, sample_rate, to_mono=True, normalize_volume=True) | |||
| # Kaldi compliance: 16-bit signed integers | |||
| @@ -8,7 +8,6 @@ import torch | |||
| from torch import nn as nn | |||
| from torch import optim as optim | |||
| from modelscope.hub.snapshot_download import snapshot_download | |||
| from modelscope.metainfo import Trainers | |||
| from modelscope.models import Model, TorchModel | |||
| from modelscope.msdatasets.task_datasets.audio import KWSDataLoader, KWSDataset | |||
| @@ -54,12 +53,8 @@ class KWSFarfieldTrainer(BaseTrainer): | |||
| **kwargs): | |||
| if isinstance(model, str): | |||
| if os.path.exists(model): | |||
| self.model_dir = model if os.path.isdir( | |||
| model) else os.path.dirname(model) | |||
| else: | |||
| self.model_dir = snapshot_download( | |||
| model, revision=model_revision) | |||
| self.model_dir = self.get_or_download_model_dir( | |||
| model, model_revision) | |||
| if cfg_file is None: | |||
| cfg_file = os.path.join(self.model_dir, | |||
| ModelFile.CONFIGURATION) | |||
| @@ -1,11 +1,14 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os | |||
| import time | |||
| from abc import ABC, abstractmethod | |||
| from typing import Callable, Dict, List, Optional, Tuple, Union | |||
| from modelscope.hub.snapshot_download import snapshot_download | |||
| from modelscope.trainers.builder import TRAINERS | |||
| from modelscope.utils.config import Config | |||
| from modelscope.utils.constant import Invoke | |||
| from .utils.log_buffer import LogBuffer | |||
| @@ -32,6 +35,17 @@ class BaseTrainer(ABC): | |||
| self.log_buffer = LogBuffer() | |||
| self.timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) | |||
| def get_or_download_model_dir(self, model, model_revision=None): | |||
| if os.path.exists(model): | |||
| model_cache_dir = model if os.path.isdir( | |||
| model) else os.path.dirname(model) | |||
| else: | |||
| model_cache_dir = snapshot_download( | |||
| model, | |||
| revision=model_revision, | |||
| user_agent={Invoke.KEY: Invoke.TRAINER}) | |||
| return model_cache_dir | |||
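For reference, a minimal sketch (not part of the patch) of how a trainer subclass is expected to use this new helper; `MyTrainer` is a hypothetical placeholder and the import paths are assumed from the surrounding modules:

import os

from modelscope.trainers.base import BaseTrainer  # path assumed
from modelscope.utils.constant import ModelFile


class MyTrainer(BaseTrainer):  # hypothetical subclass, for illustration only

    def __init__(self, model: str, cfg_file: str = None,
                 model_revision: str = None, **kwargs):
        # Resolve a local checkpoint path as-is, or download from the hub
        # with the trainer user-agent attached (see get_or_download_model_dir).
        self.model_dir = self.get_or_download_model_dir(model, model_revision)
        if cfg_file is None:
            cfg_file = os.path.join(self.model_dir, ModelFile.CONFIGURATION)
        super().__init__(cfg_file)

This is the same pattern the patch applies to KWSFarfieldTrainer, NlpEpochBasedTrainer and EpochBasedTrainer below.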
| @abstractmethod | |||
| def train(self, *args, **kwargs): | |||
| """ Train (and evaluate) process | |||
| @@ -20,7 +20,7 @@ from modelscope.trainers.builder import TRAINERS | |||
| from modelscope.trainers.optimizer.builder import build_optimizer | |||
| from modelscope.utils.config import Config | |||
| from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigKeys, | |||
| ModeKeys) | |||
| Invoke, ModeKeys) | |||
| from .clip_trainer_utils import get_loss, get_optimizer_params, get_schedule | |||
| @@ -52,7 +52,8 @@ class CLIPTrainer(EpochBasedTrainer): | |||
| model_revision: Optional[str] = DEFAULT_MODEL_REVISION, | |||
| seed: int = 42, | |||
| **kwargs): | |||
| model = Model.from_pretrained(model, revision=model_revision) | |||
| model = Model.from_pretrained( | |||
| model, revision=model_revision, invoked_by=Invoke.TRAINER) | |||
| # for training & eval, we convert the model from FP16 back to FP32 | |||
| # to be compatible with modelscope amp training | |||
| convert_models_to_fp32(model) | |||
| @@ -23,7 +23,7 @@ from modelscope.trainers.optimizer.builder import build_optimizer | |||
| from modelscope.trainers.parallel.utils import is_parallel | |||
| from modelscope.utils.config import Config | |||
| from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigKeys, | |||
| ModeKeys) | |||
| Invoke, ModeKeys) | |||
| from .ofa_trainer_utils import (AdjustLabelSmoothedCrossEntropyCriterion, | |||
| get_schedule) | |||
| @@ -49,7 +49,8 @@ class OFATrainer(EpochBasedTrainer): | |||
| model_revision: Optional[str] = DEFAULT_MODEL_REVISION, | |||
| seed: int = 42, | |||
| **kwargs): | |||
| model = Model.from_pretrained(model, revision=model_revision) | |||
| model = Model.from_pretrained( | |||
| model, revision=model_revision, invoked_by=Invoke.TRAINER) | |||
| model_dir = model.model_dir | |||
| self.cfg_modify_fn = cfg_modify_fn | |||
| cfg = self.rebuild_config(Config.from_file(cfg_file)) | |||
| @@ -7,21 +7,17 @@ from typing import Callable, Dict, Optional | |||
| import numpy as np | |||
| import torch | |||
| import torch.nn as nn | |||
| import torchvision.datasets as datasets | |||
| import torchvision.transforms as transforms | |||
| from sklearn.metrics import confusion_matrix | |||
| from torch.optim import AdamW | |||
| from torch.utils.data import DataLoader, Dataset | |||
| from modelscope.metainfo import Trainers | |||
| from modelscope.models.base import Model | |||
| from modelscope.msdatasets import MsDataset | |||
| from modelscope.trainers.base import BaseTrainer | |||
| from modelscope.trainers.builder import TRAINERS | |||
| from modelscope.trainers.multi_modal.team.team_trainer_utils import ( | |||
| get_optimizer, train_mapping, val_mapping) | |||
| from modelscope.trainers.multi_modal.team.team_trainer_utils import \ | |||
| get_optimizer | |||
| from modelscope.utils.config import Config | |||
| from modelscope.utils.constant import DownloadMode, ModeKeys | |||
| from modelscope.utils.constant import Invoke | |||
| from modelscope.utils.logger import get_logger | |||
| logger = get_logger() | |||
| @@ -36,7 +32,7 @@ class TEAMImgClsTrainer(BaseTrainer): | |||
| super().__init__(cfg_file) | |||
| self.cfg = Config.from_file(cfg_file) | |||
| team_model = Model.from_pretrained(model) | |||
| team_model = Model.from_pretrained(model, invoked_by=Invoke.TRAINER) | |||
| image_model = team_model.model.image_model.vision_transformer | |||
| classification_model = nn.Sequential( | |||
| OrderedDict([('encoder', image_model), | |||
| @@ -24,8 +24,7 @@ logger = get_logger() | |||
| class CsanmtTranslationTrainer(BaseTrainer): | |||
| def __init__(self, model: str, cfg_file: str = None, *args, **kwargs): | |||
| if not osp.exists(model): | |||
| model = snapshot_download(model) | |||
| model = self.get_or_download_model_dir(model) | |||
| tf.reset_default_graph() | |||
| self.model_dir = model | |||
| @@ -10,7 +10,6 @@ import torch | |||
| from torch import nn | |||
| from torch.utils.data import Dataset | |||
| from modelscope.hub.snapshot_download import snapshot_download | |||
| from modelscope.metainfo import Trainers | |||
| from modelscope.metrics.builder import build_metric | |||
| from modelscope.models.base import Model, TorchModel | |||
| @@ -478,11 +477,7 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): | |||
| """ | |||
| if isinstance(model, str): | |||
| if os.path.exists(model): | |||
| model_dir = model if os.path.isdir(model) else os.path.dirname( | |||
| model) | |||
| else: | |||
| model_dir = snapshot_download(model, revision=model_revision) | |||
| model_dir = self.get_or_download_model_dir(model, model_revision) | |||
| if cfg_file is None: | |||
| cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) | |||
| else: | |||
| @@ -14,7 +14,6 @@ from torch.utils.data import DataLoader, Dataset | |||
| from torch.utils.data.dataloader import default_collate | |||
| from torch.utils.data.distributed import DistributedSampler | |||
| from modelscope.hub.snapshot_download import snapshot_download | |||
| from modelscope.metainfo import Trainers | |||
| from modelscope.metrics import build_metric, task_default_metrics | |||
| from modelscope.models.base import Model, TorchModel | |||
| @@ -98,12 +97,8 @@ class EpochBasedTrainer(BaseTrainer): | |||
| self._seed = seed | |||
| set_random_seed(self._seed) | |||
| if isinstance(model, str): | |||
| if os.path.exists(model): | |||
| self.model_dir = model if os.path.isdir( | |||
| model) else os.path.dirname(model) | |||
| else: | |||
| self.model_dir = snapshot_download( | |||
| model, revision=model_revision) | |||
| self.model_dir = self.get_or_download_model_dir( | |||
| model, model_revision) | |||
| if cfg_file is None: | |||
| cfg_file = os.path.join(self.model_dir, | |||
| ModelFile.CONFIGURATION) | |||
| @@ -44,6 +44,7 @@ class CVTasks(object): | |||
| image_segmentation = 'image-segmentation' | |||
| semantic_segmentation = 'semantic-segmentation' | |||
| image_depth_estimation = 'image-depth-estimation' | |||
| portrait_matting = 'portrait-matting' | |||
| text_driven_segmentation = 'text-driven-segmentation' | |||
| shop_segmentation = 'shop-segmentation' | |||
| @@ -293,6 +294,14 @@ class ModelFile(object): | |||
| TS_MODEL_FILE = 'model.ts' | |||
| class Invoke(object): | |||
| KEY = 'invoked_by' | |||
| PRETRAINED = 'from_pretrained' | |||
| PIPELINE = 'pipeline' | |||
| TRAINER = 'trainer' | |||
| PREPROCESSOR = 'preprocessor' | |||
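A brief illustrative sketch (not part of the patch) of how these constants are threaded through the download path; the model id is a placeholder:

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models import Model
from modelscope.utils.constant import Invoke

# A caller such as a trainer tags the download source explicitly ...
model = Model.from_pretrained('damo/some-model',  # placeholder id
                              invoked_by=Invoke.TRAINER)
# ... and Model.from_pretrained forwards it to snapshot_download as a
# user-agent string of the form 'invoked_by/trainer'.

# Preprocessors pass the same information as a dict instead:
model_dir = snapshot_download('damo/some-model',  # placeholder id
                              user_agent={Invoke.KEY: Invoke.PREPROCESSOR})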
| class ConfigFields(object): | |||
| """ First level keyword in configuration file | |||
| """ | |||
| @@ -1,6 +1,7 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import cv2 | |||
| import matplotlib.pyplot as plt | |||
| import numpy as np | |||
| from modelscope.outputs import OutputKeys | |||
| @@ -439,3 +440,11 @@ def show_image_object_detection_auto_result(img_path, | |||
| if save_path is not None: | |||
| cv2.imwrite(save_path, img) | |||
| return img | |||
| def depth_to_color(depth): | |||
| colormap = plt.get_cmap('plasma') | |||
| depth_color = (colormap( | |||
| (depth.max() - depth) / depth.max()) * 2**8).astype(np.uint8)[:, :, :3] | |||
| depth_color = cv2.cvtColor(depth_color, cv2.COLOR_RGB2BGR) | |||
| return depth_color | |||
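An illustrative usage sketch of this helper, mirroring the new test added below:

import cv2

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.cv.image_utils import depth_to_color

# Run the depth estimation pipeline and colorize its first predicted depth map.
estimator = pipeline(Tasks.image_depth_estimation,
                     model='damo/cv_newcrfs_image-depth-estimation_indoor')
result = estimator('data/test/images/image_depth_estimation.jpg')
depth = result[OutputKeys.DEPTHS][0].squeeze().cpu().numpy()
cv2.imwrite('result.jpg', depth_to_color(depth))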
| @@ -1,4 +1,5 @@ | |||
| ftfy>=6.0.3 | |||
| librosa | |||
| ofa>=0.0.2 | |||
| pycocoevalcap>=1.2 | |||
| pycocotools>=2.0.4 | |||
| @@ -28,7 +28,7 @@ class ExtractiveSummarizationTest(unittest.TestCase, DemoCompatibilityCheck): | |||
| result = p(documents=documents) | |||
| return result | |||
| @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') | |||
| @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') | |||
| def test_run_with_doc(self): | |||
| logger.info( | |||
| 'Run doc extractive summarization (PoNet) with one document ...') | |||
| @@ -37,7 +37,7 @@ class ExtractiveSummarizationTest(unittest.TestCase, DemoCompatibilityCheck): | |||
| model_id=self.ponet_doc_model_id, documents=self.sentences) | |||
| print(result[OutputKeys.TEXT]) | |||
| @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') | |||
| @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') | |||
| def test_run_with_topic(self): | |||
| logger.info( | |||
| 'Run topic extractive summarization (PoNet) with one document ...') | |||
| @@ -0,0 +1,35 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import unittest | |||
| import cv2 | |||
| import numpy as np | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines import pipeline | |||
| from modelscope.utils.constant import Tasks | |||
| from modelscope.utils.cv.image_utils import depth_to_color | |||
| from modelscope.utils.demo_utils import DemoCompatibilityCheck | |||
| from modelscope.utils.test_utils import test_level | |||
| class ImageDepthEstimationTest(unittest.TestCase, DemoCompatibilityCheck): | |||
| def setUp(self) -> None: | |||
| self.task = 'image-depth-estimation' | |||
| self.model_id = 'damo/cv_newcrfs_image-depth-estimation_indoor' | |||
| @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') | |||
| def test_image_depth_estimation(self): | |||
| input_location = 'data/test/images/image_depth_estimation.jpg' | |||
| estimator = pipeline(Tasks.image_depth_estimation, model=self.model_id) | |||
| result = estimator(input_location) | |||
| depths = result[OutputKeys.DEPTHS] | |||
| depth_viz = depth_to_color(depths[0].squeeze().cpu().numpy()) | |||
| cv2.imwrite('result.jpg', depth_viz) | |||
| print('test_image_depth_estimation DONE') | |||
| if __name__ == '__main__': | |||
| unittest.main() | |||