tianxi.tl yingda.chen 3 years ago
parent
commit
0424f3c510
23 changed files with 4830 additions and 0 deletions
  1. +3 -0 data/test/images/img2img_input.jpg
  2. +3 -0 data/test/images/img2img_input_mask.png
  3. +3 -0 data/test/images/img2img_input_masked_img.png
  4. +1 -0 modelscope/metainfo.py
  5. +1 -0 modelscope/models/cv/image_to_image_translation/data/__init__.py
  6. +121 -0 modelscope/models/cv/image_to_image_translation/data/transforms.py
  7. +323 -0 modelscope/models/cv/image_to_image_translation/model_translation.py
  8. +2 -0 modelscope/models/cv/image_to_image_translation/models/__init__.py
  9. +412 -0 modelscope/models/cv/image_to_image_translation/models/autoencoder.py
  10. +418 -0 modelscope/models/cv/image_to_image_translation/models/clip.py
  11. +8 -0 modelscope/models/cv/image_to_image_translation/ops/__init__.py
  12. +663 -0 modelscope/models/cv/image_to_image_translation/ops/apps.py
  13. +1074 -0 modelscope/models/cv/image_to_image_translation/ops/degradation.py
  14. +598 -0 modelscope/models/cv/image_to_image_translation/ops/diffusion.py
  15. +35 -0 modelscope/models/cv/image_to_image_translation/ops/losses.py
  16. +126 -0 modelscope/models/cv/image_to_image_translation/ops/metrics.py
  17. +220 -0 modelscope/models/cv/image_to_image_translation/ops/random_color.py
  18. +79 -0 modelscope/models/cv/image_to_image_translation/ops/random_mask.py
  19. +152 -0 modelscope/models/cv/image_to_image_translation/ops/svd.py
  20. +224 -0 modelscope/models/cv/image_to_image_translation/ops/utils.py
  21. +1 -0 modelscope/pipelines/cv/__init__.py
  22. +325 -0 modelscope/pipelines/cv/image_to_image_translation_pipeline.py
  23. +38 -0 tests/pipelines/test_image2image_translation.py

+ 3
- 0
data/test/images/img2img_input.jpg View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7e4cbf844cd16a892a7d2f2764b1537c346675d3b0145016d6836441ba907366
size 9195

+ 3
- 0
data/test/images/img2img_input_mask.png View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:33b3d3076e191fa92511bf69fa76e1222b3b3be0049e711c948a1218b587510c
size 4805

+ 3
- 0
data/test/images/img2img_input_masked_img.png View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:99c2b02a927b86ff194287ea4c5a05349dd800cff2b523212d1dad378c252feb
size 103334

+ 1
- 0
modelscope/metainfo.py View File

@@ -77,6 +77,7 @@ class Pipelines(object):
face_image_generation = 'gan-face-image-generation'
style_transfer = 'AAMS-style-transfer'
image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation'
image2image_translation = 'image-to-image-translation'
live_category = 'live-category'
video_category = 'video-category'



+ 1
- 0
modelscope/models/cv/image_to_image_translation/data/__init__.py View File

@@ -0,0 +1 @@
from .transforms import * # noqa F403

+ 121
- 0
modelscope/models/cv/image_to_image_translation/data/transforms.py View File

@@ -0,0 +1,121 @@
import math
import random

import torchvision.transforms.functional as TF
from PIL import Image, ImageFilter

__all__ = [
'Identity', 'PadToSquare', 'RandomScale', 'RandomRotate',
'RandomGaussianBlur', 'RandomCrop'
]


class Identity(object):

def __call__(self, *args):
if len(args) == 0:
return None
elif len(args) == 1:
return args[0]
else:
return args


class PadToSquare(object):

def __init__(self, fill=(255, 255, 255)):
self.fill = fill

def __call__(self, img):
w, h = img.size
if w != h:
if w > h:
t = (w - h) // 2
b = w - h - t
padding = (0, t, 0, b)
else:
left = (h - w) // 2
right = h - w - left
padding = (left, 0, right, 0)
img = TF.pad(img, padding, fill=self.fill)
return img


class RandomScale(object):

def __init__(self,
min_scale=0.5,
max_scale=2.0,
min_ratio=0.8,
max_ratio=1.25):
self.min_scale = min_scale
self.max_scale = max_scale
self.min_ratio = min_ratio
self.max_ratio = max_ratio

def __call__(self, img):
w, h = img.size
scale = 2**random.uniform(
math.log2(self.min_scale), math.log2(self.max_scale))
ratio = 2**random.uniform(
math.log2(self.min_ratio), math.log2(self.max_ratio))
ow = int(w * scale * math.sqrt(ratio))
oh = int(h * scale / math.sqrt(ratio))
img = img.resize((ow, oh), Image.BILINEAR)
return img


class RandomRotate(object):

def __init__(self,
min_angle=-10.0,
max_angle=10.0,
padding=(255, 255, 255),
p=0.5):
self.min_angle = min_angle
self.max_angle = max_angle
self.padding = padding
self.p = p

def __call__(self, img):
if random.random() < self.p:
angle = random.uniform(self.min_angle, self.max_angle)
img = img.rotate(angle, Image.BILINEAR, fillcolor=self.padding)
return img


class RandomGaussianBlur(object):

def __init__(self, radius=5, p=0.5):
self.radius = radius
self.p = p

def __call__(self, img):
if random.random() < self.p:
img = img.filter(ImageFilter.GaussianBlur(radius=self.radius))
return img


class RandomCrop(object):

def __init__(self, size, padding=(255, 255, 255)):
self.size = size
self.padding = padding

def __call__(self, img):
# pad
w, h = img.size
pad_w = max(0, self.size - w)
pad_h = max(0, self.size - h)
if pad_w > 0 or pad_h > 0:
half_w = pad_w // 2
half_h = pad_h // 2
pad = (half_w, half_h, pad_w - half_w, pad_h - half_h)
img = TF.pad(img, pad, fill=self.padding)

# crop
w, h = img.size
x1 = random.randint(0, w - self.size)
y1 = random.randint(0, h - self.size)
img = img.crop((x1, y1, x1 + self.size, y1 + self.size))
return img
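
A minimal usage sketch of the transforms above, assuming torchvision is available; the particular chain, parameter values, and the file name example.jpg are illustrative and not part of this commit.

import torchvision.transforms as T
from PIL import Image

# hypothetical augmentation chain: pad to square, jitter scale/rotation,
# occasionally blur, then crop to a fixed size
augment = T.Compose([
    PadToSquare(fill=(255, 255, 255)),
    RandomScale(min_scale=0.8, max_scale=1.25),
    RandomRotate(min_angle=-10.0, max_angle=10.0, p=0.5),
    RandomGaussianBlur(radius=5, p=0.3),
    RandomCrop(size=256),
])
img = augment(Image.open('example.jpg').convert('RGB'))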

+ 323
- 0
modelscope/models/cv/image_to_image_translation/model_translation.py View File

@@ -0,0 +1,323 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

__all__ = ['UNet']


def sinusoidal_embedding(timesteps, dim):
# check input
half = dim // 2
timesteps = timesteps.float()

# compute sinusoidal embedding
sinusoid = torch.outer(
timesteps, torch.pow(10000,
-torch.arange(half).to(timesteps).div(half)))
x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
if dim % 2 != 0:
x = torch.cat([x, torch.zeros_like(x[:, :1])], dim=1)
return x


class Resample(nn.Module):

def __init__(self, scale_factor=1.0):
assert scale_factor in [0.5, 1.0, 2.0]
super(Resample, self).__init__()
self.scale_factor = scale_factor

def forward(self, x):
if self.scale_factor == 2.0:
x = F.interpolate(x, scale_factor=2, mode='nearest')
elif self.scale_factor == 0.5:
x = F.avg_pool2d(x, kernel_size=2, stride=2)
return x


class ResidualBlock(nn.Module):

def __init__(self, in_dim, embed_dim, out_dim, dropout=0.0):
super(ResidualBlock, self).__init__()
self.in_dim = in_dim
self.embed_dim = embed_dim
self.out_dim = out_dim

# layers
self.layer1 = nn.Sequential(
nn.GroupNorm(32, in_dim), nn.SiLU(),
nn.Conv2d(in_dim, out_dim, 3, padding=1))
self.embedding = nn.Sequential(nn.SiLU(),
nn.Linear(embed_dim, out_dim))
self.layer2 = nn.Sequential(
nn.GroupNorm(32, out_dim), nn.SiLU(), nn.Dropout(dropout),
nn.Conv2d(out_dim, out_dim, 3, padding=1))
self.shortcut = nn.Identity() if in_dim == out_dim else nn.Conv2d(
in_dim, out_dim, 1)

# zero out the last layer params
nn.init.zeros_(self.layer2[-1].weight)

def forward(self, x, y):
identity = x
x = self.layer1(x)
x = x + self.embedding(y).unsqueeze(-1).unsqueeze(-1)
x = self.layer2(x)
x = x + self.shortcut(identity)
return x


class MultiHeadAttention(nn.Module):

def __init__(self, dim, context_dim=None, num_heads=8, dropout=0.0):
assert dim % num_heads == 0
assert context_dim is None or context_dim % num_heads == 0
context_dim = context_dim or dim
super(MultiHeadAttention, self).__init__()
self.dim = dim
self.context_dim = context_dim
self.num_heads = num_heads
self.head_dim = dim // num_heads
self.scale = math.pow(self.head_dim, -0.25)

# layers
self.q = nn.Linear(dim, dim, bias=False)
self.k = nn.Linear(context_dim, dim, bias=False)
self.v = nn.Linear(context_dim, dim, bias=False)
self.o = nn.Linear(dim, dim)
self.dropout = nn.Dropout(dropout)

def forward(self, x, context=None):
# check inputs
context = x if context is None else context
b, n, c = x.size(0), self.num_heads, self.head_dim

# compute query, key, value
q = self.q(x).view(b, -1, n, c)
k = self.k(context).view(b, -1, n, c)
v = self.v(context).view(b, -1, n, c)

# compute attention
attn = torch.einsum('binc,bjnc->bnij', q * self.scale, k * self.scale)
attn = F.softmax(attn, dim=-1)
attn = self.dropout(attn)

# gather context
x = torch.einsum('bnij,bjnc->binc', attn, v)
x = x.reshape(b, -1, n * c)

# output
x = self.o(x)
x = self.dropout(x)
return x


class GLU(nn.Module):

def __init__(self, in_dim, out_dim):
super(GLU, self).__init__()
self.in_dim = in_dim
self.out_dim = out_dim
self.proj = nn.Linear(in_dim, out_dim * 2)

def forward(self, x):
x, gate = self.proj(x).chunk(2, dim=-1)
return x * F.gelu(gate)


class TransformerBlock(nn.Module):

def __init__(self, dim, context_dim, num_heads, dropout=0.0):
super(TransformerBlock, self).__init__()
self.dim = dim
self.context_dim = context_dim
self.num_heads = num_heads
self.head_dim = dim // num_heads

# input
self.norm1 = nn.GroupNorm(32, dim, eps=1e-6, affine=True)
self.conv1 = nn.Conv2d(dim, dim, 1)

# self attention
self.norm2 = nn.LayerNorm(dim)
self.self_attn = MultiHeadAttention(dim, None, num_heads, dropout)

# cross attention
self.norm3 = nn.LayerNorm(dim)
self.cross_attn = MultiHeadAttention(dim, context_dim, num_heads,
dropout)

# ffn
self.norm4 = nn.LayerNorm(dim)
self.ffn = nn.Sequential(
GLU(dim, dim * 4), nn.Dropout(dropout), nn.Linear(dim * 4, dim))

# output
self.conv2 = nn.Conv2d(dim, dim, 1)

# zero out the last layer params
nn.init.zeros_(self.conv2.weight)

def forward(self, x, context):
b, c, h, w = x.size()
identity = x

# input
x = self.norm1(x)
x = self.conv1(x).view(b, c, -1).transpose(1, 2)

# attention
x = x + self.self_attn(self.norm2(x))
x = x + self.cross_attn(self.norm3(x), context)
x = x + self.ffn(self.norm4(x))

# output
x = x.transpose(1, 2).view(b, c, h, w)
x = self.conv2(x)
return x + identity


class UNet(nn.Module):

def __init__(self,
resolution=64,
in_dim=3,
dim=192,
context_dim=512,
out_dim=3,
dim_mult=[1, 2, 3, 5],
num_heads=1,
head_dim=None,
num_res_blocks=2,
attn_scales=[1 / 2, 1 / 4, 1 / 8],
num_classes=1001,
dropout=0.0):
embed_dim = dim * 4
super(UNet, self).__init__()
self.resolution = resolution
self.in_dim = in_dim
self.dim = dim
self.context_dim = context_dim
self.out_dim = out_dim
self.dim_mult = dim_mult
self.num_heads = num_heads
self.head_dim = head_dim
self.num_res_blocks = num_res_blocks
self.attn_scales = attn_scales
self.num_classes = num_classes

# params
enc_dims = [dim * u for u in [1] + dim_mult]
dec_dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
shortcut_dims = []
scale = 1.0

# embeddings
self.time_embedding = nn.Sequential(
nn.Linear(dim, embed_dim), nn.SiLU(),
nn.Linear(embed_dim, embed_dim))
self.label_embedding = nn.Embedding(num_classes, context_dim)

# encoder
self.encoder = nn.ModuleList(
[nn.Conv2d(self.in_dim, dim, 3, padding=1)])
shortcut_dims.append(dim)
for i, (in_dim,
out_dim) in enumerate(zip(enc_dims[:-1], enc_dims[1:])):
for j in range(num_res_blocks):
# residual (+attention) blocks
block = nn.ModuleList(
[ResidualBlock(in_dim, embed_dim, out_dim, dropout)])
if scale in attn_scales:
block.append(
TransformerBlock(out_dim, context_dim, num_heads))
in_dim = out_dim
self.encoder.append(block)
shortcut_dims.append(out_dim)

# downsample
if i != len(dim_mult) - 1 and j == num_res_blocks - 1:
self.encoder.append(
nn.Conv2d(out_dim, out_dim, 3, stride=2, padding=1))
shortcut_dims.append(out_dim)
scale /= 2.0

# middle
self.middle = nn.ModuleList([
ResidualBlock(out_dim, embed_dim, out_dim, dropout),
TransformerBlock(out_dim, context_dim, num_heads),
ResidualBlock(out_dim, embed_dim, out_dim, dropout)
])

# decoder
self.decoder = nn.ModuleList()
for i, (in_dim,
out_dim) in enumerate(zip(dec_dims[:-1], dec_dims[1:])):
for j in range(num_res_blocks + 1):
# residual (+attention) blocks
block = nn.ModuleList([
ResidualBlock(in_dim + shortcut_dims.pop(), embed_dim,
out_dim, dropout)
])
if scale in attn_scales:
block.append(
TransformerBlock(out_dim, context_dim, num_heads,
dropout))
in_dim = out_dim

# upsample
if i != len(dim_mult) - 1 and j == num_res_blocks:
block.append(
nn.Sequential(
Resample(scale_factor=2.0),
nn.Conv2d(out_dim, out_dim, 3, padding=1)))
scale *= 2.0
self.decoder.append(block)

# head
self.head = nn.Sequential(
nn.GroupNorm(32, out_dim), nn.SiLU(),
nn.Conv2d(out_dim, self.out_dim, 3, padding=1))

# zero out the last layer params
nn.init.zeros_(self.head[-1].weight)

def forward(self, x, t, y, concat=None):
# embeddings
if concat is not None:
x = torch.cat([x, concat], dim=1)
t = self.time_embedding(sinusoidal_embedding(t, self.dim))
y = self.label_embedding(y)

# encoder
xs = []
for block in self.encoder:
x = self._forward_single(block, x, t, y)
xs.append(x)

# middle
for block in self.middle:
x = self._forward_single(block, x, t, y)

# decoder
for block in self.decoder:
x = torch.cat([x, xs.pop()], dim=1)
x = self._forward_single(block, x, t, y)

# head
x = self.head(x)
return x

def _forward_single(self, module, x, t, y):
if isinstance(module, ResidualBlock):
x = module(x, t)
elif isinstance(module, TransformerBlock):
x = module(x, y)
elif isinstance(module, nn.ModuleList):
for block in module:
x = self._forward_single(block, x, t, y)
else:
x = module(x)
return x
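
A minimal forward-pass sketch for the UNet above under its default configuration (64x64 inputs, 1001 label classes); the batch size and shapes are illustrative and the weights are untrained.

import torch

unet = UNet()  # defaults: resolution=64, in_dim=3, dim=192, context_dim=512
x = torch.randn(2, 3, 64, 64)      # noisy images
t = torch.randint(0, 1000, (2, ))  # diffusion timesteps
y = torch.randint(0, 1001, (2, ))  # class labels fed to label_embedding
with torch.no_grad():
    eps = unet(x, t, y)            # predicted noise, shape [2, 3, 64, 64]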

+ 2
- 0
modelscope/models/cv/image_to_image_translation/models/__init__.py View File

@@ -0,0 +1,2 @@
from .autoencoder import * # noqa F403
from .clip import * # noqa F403

+ 412
- 0
modelscope/models/cv/image_to_image_translation/models/autoencoder.py View File

@@ -0,0 +1,412 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

__all__ = ['VQAutoencoder', 'KLAutoencoder', 'PatchDiscriminator']


def group_norm(dim):
return nn.GroupNorm(32, dim, eps=1e-6, affine=True)


class Resample(nn.Module):

def __init__(self, dim, scale_factor):
super(Resample, self).__init__()
self.dim = dim
self.scale_factor = scale_factor

# layers
if scale_factor == 2.0:
self.resample = nn.Sequential(
nn.Upsample(scale_factor=scale_factor, mode='nearest'),
nn.Conv2d(dim, dim, 3, padding=1))
elif scale_factor == 0.5:
self.resample = nn.Sequential(
nn.ZeroPad2d((0, 1, 0, 1)),
nn.Conv2d(dim, dim, 3, stride=2, padding=0))
else:
self.resample = nn.Identity()

def forward(self, x):
return self.resample(x)


class ResidualBlock(nn.Module):

def __init__(self, in_dim, out_dim, dropout=0.0):
super(ResidualBlock, self).__init__()
self.in_dim = in_dim
self.out_dim = out_dim

# layers
self.residual = nn.Sequential(
group_norm(in_dim), nn.SiLU(),
nn.Conv2d(in_dim, out_dim, 3, padding=1), group_norm(out_dim),
nn.SiLU(), nn.Dropout(dropout),
nn.Conv2d(out_dim, out_dim, 3, padding=1))
self.shortcut = nn.Conv2d(in_dim, out_dim,
1) if in_dim != out_dim else nn.Identity()

# zero out the last layer params
nn.init.zeros_(self.residual[-1].weight)

def forward(self, x):
return self.residual(x) + self.shortcut(x)


class AttentionBlock(nn.Module):

def __init__(self, dim):
super(AttentionBlock, self).__init__()
self.dim = dim
self.scale = math.pow(dim, -0.25)

# layers
self.norm = group_norm(dim)
self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
self.proj = nn.Conv2d(dim, dim, 1)

# zero out the last layer params
nn.init.zeros_(self.proj.weight)

def forward(self, x):
identity = x
b, c, h, w = x.size()

# compute query, key, value
x = self.norm(x)
q, k, v = self.to_qkv(x).view(b, c * 3, -1).chunk(3, dim=1)

# compute attention
attn = torch.einsum('bci,bcj->bij', q * self.scale, k * self.scale)
attn = F.softmax(attn, dim=-1)

# gather context
x = torch.einsum('bij,bcj->bci', attn, v)
x = x.reshape(b, c, h, w)

# output
x = self.proj(x)
return x + identity


class Encoder(nn.Module):

def __init__(self,
dim=128,
z_dim=3,
dim_mult=[1, 2, 4],
num_res_blocks=2,
attn_scales=[],
dropout=0.0):
super(Encoder, self).__init__()
self.dim = dim
self.z_dim = z_dim
self.dim_mult = dim_mult
self.num_res_blocks = num_res_blocks
self.attn_scales = attn_scales

# params
dims = [dim * u for u in [1] + dim_mult]
scale = 1.0

# init block
self.conv1 = nn.Conv2d(3, dims[0], 3, padding=1)

# downsample blocks
downsamples = []
for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
# residual (+attention) blocks
for _ in range(num_res_blocks):
downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
if scale in attn_scales:
downsamples.append(AttentionBlock(out_dim))
in_dim = out_dim

# downsample block
if i != len(dim_mult) - 1:
downsamples.append(Resample(out_dim, scale_factor=0.5))
scale /= 2.0
self.downsamples = nn.Sequential(*downsamples)

# middle blocks
self.middle = nn.Sequential(
ResidualBlock(out_dim, out_dim, dropout), AttentionBlock(out_dim),
ResidualBlock(out_dim, out_dim, dropout))

# output blocks
self.head = nn.Sequential(
group_norm(out_dim), nn.SiLU(),
nn.Conv2d(out_dim, z_dim, 3, padding=1))

def forward(self, x):
x = self.conv1(x)
x = self.downsamples(x)
x = self.middle(x)
x = self.head(x)
return x


class Decoder(nn.Module):

def __init__(self,
dim=128,
z_dim=3,
dim_mult=[1, 2, 4],
num_res_blocks=2,
attn_scales=[],
dropout=0.0):
super(Decoder, self).__init__()
self.dim = dim
self.z_dim = z_dim
self.dim_mult = dim_mult
self.num_res_blocks = num_res_blocks
self.attn_scales = attn_scales

# params
dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
scale = 1.0 / 2**(len(dim_mult) - 2)

# init block
self.conv1 = nn.Conv2d(z_dim, dims[0], 3, padding=1)

# middle blocks
self.middle = nn.Sequential(
ResidualBlock(dims[0], dims[0], dropout), AttentionBlock(dims[0]),
ResidualBlock(dims[0], dims[0], dropout))

# upsample blocks
upsamples = []
for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
# residual (+attention) blocks
for _ in range(num_res_blocks + 1):
upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
if scale in attn_scales:
upsamples.append(AttentionBlock(out_dim))
in_dim = out_dim

# upsample block
if i != len(dim_mult) - 1:
upsamples.append(Resample(out_dim, scale_factor=2.0))
scale *= 2.0
self.upsamples = nn.Sequential(*upsamples)

# output blocks
self.head = nn.Sequential(
group_norm(out_dim), nn.SiLU(),
nn.Conv2d(out_dim, 3, 3, padding=1))

def forward(self, x):
x = self.conv1(x)
x = self.middle(x)
x = self.upsamples(x)
x = self.head(x)
return x


class VectorQuantizer(nn.Module):

def __init__(self, codebook_size=8192, z_dim=3, beta=0.25):
super(VectorQuantizer, self).__init__()
self.codebook_size = codebook_size
self.z_dim = z_dim
self.beta = beta

# init codebook
eps = math.sqrt(1.0 / codebook_size)
self.codebook = nn.Parameter(
torch.empty(codebook_size, z_dim).uniform_(-eps, eps))

def forward(self, z):
# preprocess
b, c, h, w = z.size()
flatten = z.permute(0, 2, 3, 1).reshape(-1, c)

# quantization
with torch.no_grad():
tokens = torch.cdist(flatten, self.codebook).argmin(dim=1)
quantized = F.embedding(tokens,
self.codebook).view(b, h, w,
c).permute(0, 3, 1, 2)

# compute loss
codebook_loss = F.mse_loss(quantized, z.detach())
commitment_loss = F.mse_loss(quantized.detach(), z)
loss = codebook_loss + self.beta * commitment_loss

# perplexity
counts = F.one_hot(tokens, self.codebook_size).sum(dim=0).to(z.dtype)
# dist.all_reduce(counts)
p = counts / counts.sum()
perplexity = torch.exp(-torch.sum(p * torch.log(p + 1e-10)))

# postprocess
tokens = tokens.view(b, h, w)
quantized = z + (quantized - z).detach()
return quantized, tokens, loss, perplexity


class VQAutoencoder(nn.Module):

def __init__(self,
dim=128,
z_dim=3,
dim_mult=[1, 2, 4],
num_res_blocks=2,
attn_scales=[],
dropout=0.0,
codebook_size=8192,
beta=0.25):
super(VQAutoencoder, self).__init__()
self.dim = dim
self.z_dim = z_dim
self.dim_mult = dim_mult
self.num_res_blocks = num_res_blocks
self.attn_scales = attn_scales
self.codebook_size = codebook_size
self.beta = beta

# blocks
self.encoder = Encoder(dim, z_dim, dim_mult, num_res_blocks,
attn_scales, dropout)
self.conv1 = nn.Conv2d(z_dim, z_dim, 1)
self.quantizer = VectorQuantizer(codebook_size, z_dim, beta)
self.conv2 = nn.Conv2d(z_dim, z_dim, 1)
self.decoder = Decoder(dim, z_dim, dim_mult, num_res_blocks,
attn_scales, dropout)

def forward(self, x):
z = self.encoder(x)
z = self.conv1(z)
z, tokens, loss, perplexity = self.quantizer(z)
z = self.conv2(z)
x = self.decoder(z)
return x, tokens, loss, perplexity

def encode(self, imgs):
z = self.encoder(imgs)
z = self.conv1(z)
return z

def decode(self, z):
r"""Absort the quantizer in the decoder.
"""
z = self.quantizer(z)[0]
z = self.conv2(z)
imgs = self.decoder(z)
return imgs

@torch.no_grad()
def encode_to_tokens(self, imgs):
# preprocess
z = self.encoder(imgs)
z = self.conv1(z)

# quantization
b, c, h, w = z.size()
flatten = z.permute(0, 2, 3, 1).reshape(-1, c)
tokens = torch.cdist(flatten, self.quantizer.codebook).argmin(dim=1)
return tokens.view(b, -1)

@torch.no_grad()
def decode_from_tokens(self, tokens):
# dequantization
z = F.embedding(tokens, self.quantizer.codebook)

# postprocess
b, l, c = z.size()
h = w = int(math.sqrt(l))
z = z.view(b, h, w, c).permute(0, 3, 1, 2)
z = self.conv2(z)
imgs = self.decoder(z)
return imgs


class KLAutoencoder(nn.Module):

def __init__(self,
dim=128,
z_dim=4,
dim_mult=[1, 2, 4, 4],
num_res_blocks=2,
attn_scales=[],
dropout=0.0):
super(KLAutoencoder, self).__init__()
self.dim = dim
self.z_dim = z_dim
self.dim_mult = dim_mult
self.num_res_blocks = num_res_blocks
self.attn_scales = attn_scales

# blocks
self.encoder = Encoder(dim, z_dim * 2, dim_mult, num_res_blocks,
attn_scales, dropout)
self.conv1 = nn.Conv2d(z_dim * 2, z_dim * 2, 1)
self.conv2 = nn.Conv2d(z_dim, z_dim, 1)
self.decoder = Decoder(dim, z_dim, dim_mult, num_res_blocks,
attn_scales, dropout)

def forward(self, x):
mu, log_var = self.encode(x)
z = self.reparameterize(mu, log_var)
x = self.decode(z)
return x, mu, log_var

def encode(self, x):
x = self.encoder(x)
mu, log_var = self.conv1(x).chunk(2, dim=1)
return mu, log_var

def decode(self, z):
x = self.conv2(z)
x = self.decoder(x)
return x

def reparameterize(self, mu, log_var):
std = torch.exp(0.5 * log_var)
eps = torch.randn_like(std)
return eps * std + mu


class PatchDiscriminator(nn.Module):

def __init__(self, in_dim=3, dim=64, num_layers=3):
super(PatchDiscriminator, self).__init__()
self.in_dim = in_dim
self.dim = dim
self.num_layers = num_layers

# params
dims = [dim * min(8, 2**u) for u in range(num_layers + 1)]

# layers
layers = [
nn.Conv2d(in_dim, dim, 4, stride=2, padding=1),
nn.LeakyReLU(0.2)
]
for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
stride = 1 if i == num_layers - 1 else 2
layers += [
nn.Conv2d(
in_dim, out_dim, 4, stride=stride, padding=1, bias=False),
nn.BatchNorm2d(out_dim),
nn.LeakyReLU(0.2)
]
layers += [nn.Conv2d(out_dim, 1, 4, stride=1, padding=1)]
self.layers = nn.Sequential(*layers)

# initialize weights
self.apply(self.init_weights)

def forward(self, x):
return self.layers(x)

def init_weights(self, m):
if isinstance(m, nn.Conv2d):
nn.init.normal_(m.weight, 0.0, 0.02)
elif isinstance(m, nn.BatchNorm2d):
nn.init.normal_(m.weight, 1.0, 0.02)
nn.init.zeros_(m.bias)
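
A minimal round-trip sketch through VQAutoencoder with random weights; the shapes below follow from dim_mult=[1, 2, 4] (two 2x downsamples) and are illustrative only.

import torch

ae = VQAutoencoder(dim=128, z_dim=3, dim_mult=[1, 2, 4], codebook_size=8192).eval()
imgs = torch.randn(1, 3, 64, 64)
with torch.no_grad():
    recon, tokens, vq_loss, perplexity = ae(imgs)  # tokens: [1, 16, 16]
    token_ids = ae.encode_to_tokens(imgs)          # [1, 256]
    rebuilt = ae.decode_from_tokens(token_ids)     # [1, 3, 64, 64]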

+ 418
- 0
modelscope/models/cv/image_to_image_translation/models/clip.py View File

@@ -0,0 +1,418 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

import modelscope.models.cv.image_to_image_translation.ops as ops # for using differentiable all_gather

__all__ = [
'CLIP', 'clip_vit_b_32', 'clip_vit_b_16', 'clip_vit_l_14',
'clip_vit_l_14_336px', 'clip_vit_h_16'
]


def to_fp16(m):
if isinstance(m, (nn.Linear, nn.Conv2d)):
m.weight.data = m.weight.data.half()
if m.bias is not None:
m.bias.data = m.bias.data.half()
elif hasattr(m, 'head'):
p = getattr(m, 'head')
p.data = p.data.half()


class QuickGELU(nn.Module):

def forward(self, x):
return x * torch.sigmoid(1.702 * x)


class LayerNorm(nn.LayerNorm):
r"""Subclass of nn.LayerNorm to handle fp16.
"""

def forward(self, x):
return super(LayerNorm, self).forward(x.float()).type_as(x)


class SelfAttention(nn.Module):

def __init__(self, dim, num_heads, attn_dropout=0.0, proj_dropout=0.0):
assert dim % num_heads == 0
super(SelfAttention, self).__init__()
self.dim = dim
self.num_heads = num_heads
self.head_dim = dim // num_heads
self.scale = 1.0 / math.sqrt(self.head_dim)

# layers
self.to_qkv = nn.Linear(dim, dim * 3)
self.attn_dropout = nn.Dropout(attn_dropout)
self.proj = nn.Linear(dim, dim)
self.proj_dropout = nn.Dropout(proj_dropout)

def forward(self, x, mask=None):
r"""x: [B, L, C].
mask: [*, L, L].
"""
b, l, _, n = *x.size(), self.num_heads

# compute query, key, and value
q, k, v = self.to_qkv(x.transpose(0, 1)).chunk(3, dim=-1)
q = q.reshape(l, b * n, -1).transpose(0, 1)
k = k.reshape(l, b * n, -1).transpose(0, 1)
v = v.reshape(l, b * n, -1).transpose(0, 1)

# compute attention
attn = self.scale * torch.bmm(q, k.transpose(1, 2))
if mask is not None:
attn = attn.masked_fill(mask[:, :l, :l] == 0, float('-inf'))
attn = F.softmax(attn.float(), dim=-1).type_as(attn)
attn = self.attn_dropout(attn)

# gather context
x = torch.bmm(attn, v)
x = x.view(b, n, l, -1).transpose(1, 2).reshape(b, l, -1)

# output
x = self.proj(x)
x = self.proj_dropout(x)
return x


class AttentionBlock(nn.Module):

def __init__(self, dim, num_heads, attn_dropout=0.0, proj_dropout=0.0):
super(AttentionBlock, self).__init__()
self.dim = dim
self.num_heads = num_heads

# layers
self.norm1 = LayerNorm(dim)
self.attn = SelfAttention(dim, num_heads, attn_dropout, proj_dropout)
self.norm2 = LayerNorm(dim)
self.mlp = nn.Sequential(
nn.Linear(dim, dim * 4), QuickGELU(), nn.Linear(dim * 4, dim),
nn.Dropout(proj_dropout))

def forward(self, x, mask=None):
x = x + self.attn(self.norm1(x), mask)
x = x + self.mlp(self.norm2(x))
return x


class VisionTransformer(nn.Module):

def __init__(self,
image_size=224,
patch_size=16,
dim=768,
out_dim=512,
num_heads=12,
num_layers=12,
attn_dropout=0.0,
proj_dropout=0.0,
embedding_dropout=0.0):
assert image_size % patch_size == 0
super(VisionTransformer, self).__init__()
self.image_size = image_size
self.patch_size = patch_size
self.dim = dim
self.out_dim = out_dim
self.num_heads = num_heads
self.num_layers = num_layers
self.num_patches = (image_size // patch_size)**2

# embeddings
gain = 1.0 / math.sqrt(dim)
self.patch_embedding = nn.Conv2d(
3, dim, kernel_size=patch_size, stride=patch_size, bias=False)
self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
self.pos_embedding = nn.Parameter(
gain * torch.randn(1, self.num_patches + 1, dim))
self.dropout = nn.Dropout(embedding_dropout)

# transformer
self.pre_norm = LayerNorm(dim)
self.transformer = nn.Sequential(*[
AttentionBlock(dim, num_heads, attn_dropout, proj_dropout)
for _ in range(num_layers)
])
self.post_norm = LayerNorm(dim)

# head
self.head = nn.Parameter(gain * torch.randn(dim, out_dim))

def forward(self, x):
b, dtype = x.size(0), self.head.dtype
x = x.type(dtype)

# patch-embedding
x = self.patch_embedding(x).flatten(2).permute(0, 2, 1) # [b, n, c]
x = torch.cat([self.cls_embedding.repeat(b, 1, 1).type(dtype), x],
dim=1)
x = self.dropout(x + self.pos_embedding.type(dtype))
x = self.pre_norm(x)

# transformer
x = self.transformer(x)

# head
x = self.post_norm(x)
x = torch.mm(x[:, 0, :], self.head)
return x

def fp16(self):
return self.apply(to_fp16)


class TextTransformer(nn.Module):

def __init__(self,
vocab_size,
text_len,
dim=512,
out_dim=512,
num_heads=8,
num_layers=12,
attn_dropout=0.0,
proj_dropout=0.0,
embedding_dropout=0.0):
super(TextTransformer, self).__init__()
self.vocab_size = vocab_size
self.text_len = text_len
self.dim = dim
self.out_dim = out_dim
self.num_heads = num_heads
self.num_layers = num_layers

# embeddings
self.token_embedding = nn.Embedding(vocab_size, dim)
self.pos_embedding = nn.Parameter(0.01 * torch.randn(1, text_len, dim))
self.dropout = nn.Dropout(embedding_dropout)

# transformer
self.transformer = nn.ModuleList([
AttentionBlock(dim, num_heads, attn_dropout, proj_dropout)
for _ in range(num_layers)
])
self.norm = LayerNorm(dim)

# head
gain = 1.0 / math.sqrt(dim)
self.head = nn.Parameter(gain * torch.randn(dim, out_dim))

# causal attention mask
self.register_buffer('attn_mask',
torch.tril(torch.ones(1, text_len, text_len)))

def forward(self, x):
eot, dtype = x.argmax(dim=-1), self.head.dtype

# embeddings
x = self.dropout(
self.token_embedding(x).type(dtype)
+ self.pos_embedding.type(dtype))

# transformer
for block in self.transformer:
x = block(x, self.attn_mask)

# head
x = self.norm(x)
x = torch.mm(x[torch.arange(x.size(0)), eot], self.head)
return x

def fp16(self):
return self.apply(to_fp16)


class CLIP(nn.Module):

def __init__(self,
embed_dim=512,
image_size=224,
patch_size=16,
vision_dim=768,
vision_heads=12,
vision_layers=12,
vocab_size=49408,
text_len=77,
text_dim=512,
text_heads=8,
text_layers=12,
attn_dropout=0.0,
proj_dropout=0.0,
embedding_dropout=0.0):
super(CLIP, self).__init__()
self.embed_dim = embed_dim
self.image_size = image_size
self.patch_size = patch_size
self.vision_dim = vision_dim
self.vision_heads = vision_heads
self.vision_layers = vision_layers
self.vocab_size = vocab_size
self.text_len = text_len
self.text_dim = text_dim
self.text_heads = text_heads
self.text_layers = text_layers

# models
self.visual = VisionTransformer(
image_size=image_size,
patch_size=patch_size,
dim=vision_dim,
out_dim=embed_dim,
num_heads=vision_heads,
num_layers=vision_layers,
attn_dropout=attn_dropout,
proj_dropout=proj_dropout,
embedding_dropout=embedding_dropout)
self.textual = TextTransformer(
vocab_size=vocab_size,
text_len=text_len,
dim=text_dim,
out_dim=embed_dim,
num_heads=text_heads,
num_layers=text_layers,
attn_dropout=attn_dropout,
proj_dropout=proj_dropout,
embedding_dropout=embedding_dropout)
self.log_scale = nn.Parameter(math.log(1 / 0.07) * torch.ones([]))

def forward(self, imgs, txt_tokens):
r"""imgs: [B, C, H, W] of torch.float32.
txt_tokens: [B, T] of torch.long.
"""
xi = self.visual(imgs)
xt = self.textual(txt_tokens)

# normalize features
xi = F.normalize(xi, p=2, dim=1)
xt = F.normalize(xt, p=2, dim=1)

# gather features from all ranks
full_xi = ops.diff_all_gather(xi)
full_xt = ops.diff_all_gather(xt)

# logits
scale = self.log_scale.exp()
logits_i2t = scale * torch.mm(xi, full_xt.t())
logits_t2i = scale * torch.mm(xt, full_xi.t())

# labels
labels = torch.arange(
len(xi) * ops.get_rank(),
len(xi) * (ops.get_rank() + 1),
dtype=torch.long,
device=xi.device)
return logits_i2t, logits_t2i, labels

def init_weights(self):
# embeddings
nn.init.normal_(self.textual.token_embedding.weight, std=0.02)
nn.init.normal_(self.visual.patch_embedding.weight, std=0.1)

# attentions
for modality in ['visual', 'textual']:
dim = self.vision_dim if modality == 'visual' else self.text_dim
transformer = getattr(self, modality).transformer
proj_gain = (1.0 / math.sqrt(dim)) * (
1.0 / math.sqrt(2 * len(transformer)))
attn_gain = 1.0 / math.sqrt(dim)
mlp_gain = 1.0 / math.sqrt(2.0 * dim)
for block in transformer:
nn.init.normal_(block.attn.to_qkv.weight, std=attn_gain)
nn.init.normal_(block.attn.proj.weight, std=proj_gain)
nn.init.normal_(block.mlp[0].weight, std=mlp_gain)
nn.init.normal_(block.mlp[2].weight, std=proj_gain)

def param_groups(self):
groups = [{
'params': [
p for n, p in self.named_parameters()
if 'norm' in n or n.endswith('bias')
],
'weight_decay':
0.0
}, {
'params': [
p for n, p in self.named_parameters()
if not ('norm' in n or n.endswith('bias'))
]
}]
return groups

def fp16(self):
return self.apply(to_fp16)


def clip_vit_b_32(**kwargs):
return CLIP(
embed_dim=512,
image_size=224,
patch_size=32,
vision_dim=768,
vision_heads=12,
vision_layers=12,
text_dim=512,
text_heads=8,
text_layers=12,
**kwargs)


def clip_vit_b_16(**kwargs):
return CLIP(
embed_dim=512,
image_size=224,
patch_size=16,
vision_dim=768,
vision_heads=12,
vision_layers=12,
text_dim=512,
text_heads=8,
text_layers=12,
**kwargs)


def clip_vit_l_14(**kwargs):
return CLIP(
embed_dim=768,
image_size=224,
patch_size=14,
vision_dim=1024,
vision_heads=16,
vision_layers=24,
text_dim=768,
text_heads=12,
text_layers=12,
**kwargs)


def clip_vit_l_14_336px(**kwargs):
return CLIP(
embed_dim=768,
image_size=336,
patch_size=14,
vision_dim=1024,
vision_heads=16,
vision_layers=24,
text_dim=768,
text_heads=12,
text_layers=12,
**kwargs)


def clip_vit_h_16(**kwargs):
return CLIP(
embed_dim=1024,
image_size=256,
patch_size=16,
vision_dim=1280,
vision_heads=16,
vision_layers=32,
text_dim=1024,
text_heads=16,
text_layers=24,
**kwargs)
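
A minimal embedding sketch using one of the presets above. CLIP.forward assumes a distributed setup (it calls ops.diff_all_gather), so this sketch calls the visual and textual towers directly; the inputs are random stand-ins for preprocessed images and tokenized text.

import torch
import torch.nn.functional as F

clip = clip_vit_b_32().eval()
imgs = torch.randn(2, 3, 224, 224)             # preprocessed image batch
txt_tokens = torch.randint(0, 49408, (2, 77))  # token ids, sequence length 77
with torch.no_grad():
    xi = F.normalize(clip.visual(imgs), p=2, dim=1)         # [2, 512]
    xt = F.normalize(clip.textual(txt_tokens), p=2, dim=1)  # [2, 512]
    sims = xi @ xt.t()                                      # cosine similarity matrix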

+ 8
- 0
modelscope/models/cv/image_to_image_translation/ops/__init__.py View File

@@ -0,0 +1,8 @@
from .degradation import * # noqa F403
from .diffusion import * # noqa F403
from .losses import * # noqa F403
from .metrics import * # noqa F403
from .random_color import * # noqa F403
from .random_mask import * # noqa F403
from .svd import * # noqa F403
from .utils import * # noqa F403

+ 663
- 0
modelscope/models/cv/image_to_image_translation/ops/apps.py View File

@@ -0,0 +1,663 @@
# APPs that facilitate the use of pretrained neural networks.

import os.path as osp

import artist.data as data
import artist.models as models
import numpy as np
import torch
import torch.cuda.amp as amp
import torch.nn.functional as F
import torchvision.transforms as T
from artist import DOWNLOAD_TO_CACHE
from PIL import Image
from torch.utils.data import DataLoader, Dataset

from .utils import parallel, read_image

__all__ = [
'FeatureExtractor', 'Classifier', 'Text2Image', 'Sole2Shoe', 'ImageParser',
'TextImageMatch', 'taobao_feature_extractor', 'singleton_classifier',
'orientation_classifier', 'fashion_text2image', 'mindalle_text2image',
'sole2shoe', 'sole_parser', 'sod_foreground_parser',
'fashion_text_image_match'
]


class ImageFolder(Dataset):

def __init__(self, paths, transforms=None):
self.paths = paths
self.transforms = transforms

def __getitem__(self, index):
img = read_image(self.paths[index])
if img.mode != 'RGB':
img = img.convert('RGB')
if self.transforms is not None:
img = self.transforms(img)
return img

def __len__(self):
return len(self.paths)


class FeatureExtractor(object):

def __init__(
self,
model='InceptionV1',
checkpoint='models/inception-v1/1218shoes.v9_7.140.0.1520000',
resolution=224,
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225],
batch_size=64,
device=torch.device(
'cuda' if torch.cuda.is_available() else 'cpu')): # noqa E125
self.resolution = resolution
self.batch_size = batch_size
self.device = device

# init model
self.net = getattr(
models,
model)(num_classes=None).eval().requires_grad_(False).to(device)
self.net.load_state_dict(
torch.load(DOWNLOAD_TO_CACHE(checkpoint), map_location=device))

# data transforms
self.transforms = T.Compose([
data.PadToSquare(),
T.Resize(resolution),
T.ToTensor(),
T.Normalize(mean, std)
])

def __call__(self, imgs, num_workers=0):
r"""imgs: Either a PIL.Image or a list of PIL.Image instances.
"""
# preprocess
if isinstance(imgs, Image.Image):
imgs = [imgs]
assert isinstance(imgs,
(tuple, list)) and isinstance(imgs[0], Image.Image)
imgs = torch.stack(parallel(self.transforms, imgs, num_workers), dim=0)

# forward
feats = []
for batch in imgs.split(self.batch_size, dim=0):
batch = batch.to(self.device, non_blocking=True)
feats.append(self.net(batch))
return torch.cat(feats, dim=0)

def batch_process(self, paths):
# init dataloader
dataloader = DataLoader(
dataset=ImageFolder(paths, self.transforms),
batch_size=self.batch_size,
shuffle=False,
drop_last=False,
pin_memory=True,
num_workers=8,
prefetch_factor=2)

# forward
feats = []
for step, batch in enumerate(dataloader, 1):
print(f'Step: {step}/{len(dataloader)}', flush=True)
batch = batch.to(self.device, non_blocking=True)
feats.append(self.net(batch))
return torch.cat(feats)


class Classifier(object):

def __init__(
self,
model='InceptionV1',
checkpoint='models/classifier/shoes+apparel+bag-sgdetect-211230.pth',
num_classes=1,
resolution=224,
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225],
batch_size=64,
device=torch.device(
'cuda' if torch.cuda.is_available() else 'cpu')): # noqa E125
self.num_classes = num_classes
self.resolution = resolution
self.batch_size = batch_size
self.device = device

# init model
self.net = getattr(models, model)(
num_classes=num_classes).eval().requires_grad_(False).to(device)
self.net.load_state_dict(
torch.load(DOWNLOAD_TO_CACHE(checkpoint), map_location=device))

# data transforms
self.transforms = T.Compose([
data.PadToSquare(),
T.Resize(resolution),
T.ToTensor(),
T.Normalize(mean, std)
])

def __call__(self, imgs, num_workers=0):
r"""imgs: Either a PIL.Image or a list of PIL.Image instances.
"""
# preprocess
if isinstance(imgs, Image.Image):
imgs = [imgs]
assert isinstance(imgs,
(tuple, list)) and isinstance(imgs[0], Image.Image)
imgs = torch.stack(parallel(self.transforms, imgs, num_workers), dim=0)

# forward
scores = []
for batch in imgs.split(self.batch_size, dim=0):
batch = batch.to(self.device, non_blocking=True)
logits = self.net(batch)
scores.append(logits.sigmoid() if self.num_classes == # noqa W504
1 else logits.softmax(dim=1))
return torch.cat(scores, dim=0)


class Text2Image(object):

def __init__(
self,
vqgan_dim=128,
vqgan_z_dim=256,
vqgan_dim_mult=[1, 1, 2, 2, 4],
vqgan_num_res_blocks=2,
vqgan_attn_scales=[1.0 / 16],
vqgan_codebook_size=975,
vqgan_beta=0.25,
gpt_txt_vocab_size=21128,
gpt_txt_seq_len=64,
gpt_img_seq_len=1024,
gpt_dim=1024,
gpt_num_heads=16,
gpt_num_layers=24,
vqgan_checkpoint='models/vqgan/vqgan_shoes+apparels_step10k_vocab975.pth',
gpt_checkpoint='models/seq2seq_gpt/text2image_shoes+apparels_step400k.pth',
tokenizer=data.BertTokenizer(name='bert-base-chinese', length=64),
batch_size=16,
device=torch.device(
'cuda' if torch.cuda.is_available() else 'cpu')): # noqa E125
self.tokenizer = tokenizer
self.batch_size = batch_size
self.device = device

# init VQGAN model
self.vqgan = models.VQGAN(
dim=vqgan_dim,
z_dim=vqgan_z_dim,
dim_mult=vqgan_dim_mult,
num_res_blocks=vqgan_num_res_blocks,
attn_scales=vqgan_attn_scales,
codebook_size=vqgan_codebook_size,
beta=vqgan_beta).eval().requires_grad_(False).to(device)
self.vqgan.load_state_dict(
torch.load(
DOWNLOAD_TO_CACHE(vqgan_checkpoint), map_location=device))

# init GPT model
self.gpt = models.Seq2SeqGPT(
src_vocab_size=gpt_txt_vocab_size,
tar_vocab_size=vqgan_codebook_size,
src_seq_len=gpt_txt_seq_len,
tar_seq_len=gpt_img_seq_len,
dim=gpt_dim,
num_heads=gpt_num_heads,
num_layers=gpt_num_layers).eval().requires_grad_(False).to(device)
self.gpt.load_state_dict(
torch.load(DOWNLOAD_TO_CACHE(gpt_checkpoint), map_location=device))

def __call__(self,
txts,
top_k=64,
top_p=None,
temperature=0.6,
use_fp16=True):
# preprocess
if isinstance(txts, str):
txts = [txts]
assert isinstance(txts, (tuple, list)) and isinstance(txts[0], str)
txt_tokens = torch.LongTensor([self.tokenizer(u) for u in txts])

# forward
out_imgs = []
for batch in txt_tokens.split(self.batch_size, dim=0):
# sample
batch = batch.to(self.device, non_blocking=True)
with amp.autocast(enabled=use_fp16):
img_tokens = self.gpt.sample(batch, top_k, top_p, temperature)

# decode
imgs = self.vqgan.decode_from_tokens(img_tokens)
imgs = self._whiten_borders(imgs)
imgs = imgs.clamp_(-1, 1).add_(1).mul_(125.0).permute(
0, 2, 3, 1).cpu().numpy().astype(np.uint8)
imgs = [Image.fromarray(u) for u in imgs]

# append
out_imgs += imgs
return out_imgs

def _whiten_borders(self, imgs):
r"""Remove border artifacts.
"""
imgs[:, :, :18, :] = 1
imgs[:, :, :, :18] = 1
imgs[:, :, -18:, :] = 1
imgs[:, :, :, -18:] = 1
return imgs


class Sole2Shoe(object):

def __init__(
self,
vqgan_dim=128,
vqgan_z_dim=256,
vqgan_dim_mult=[1, 1, 2, 2, 4],
vqgan_num_res_blocks=2,
vqgan_attn_scales=[1.0 / 16],
vqgan_codebook_size=975,
vqgan_beta=0.25,
src_resolution=256,
tar_resolution=512,
gpt_dim=1024,
gpt_num_heads=16,
gpt_num_layers=24,
vqgan_checkpoint='models/vqgan/vqgan_shoes+apparels_step10k_vocab975.pth',
gpt_checkpoint='models/seq2seq_gpt/sole2shoe-step300k-220104.pth',
batch_size=12,
device=torch.device(
'cuda' if torch.cuda.is_available() else 'cpu')): # noqa E125
self.batch_size = batch_size
self.device = device
src_seq_len = (src_resolution // 16)**2
tar_seq_len = (tar_resolution // 16)**2

# init VQGAN model
self.vqgan = models.VQGAN(
dim=vqgan_dim,
z_dim=vqgan_z_dim,
dim_mult=vqgan_dim_mult,
num_res_blocks=vqgan_num_res_blocks,
attn_scales=vqgan_attn_scales,
codebook_size=vqgan_codebook_size,
beta=vqgan_beta).eval().requires_grad_(False).to(device)
self.vqgan.load_state_dict(
torch.load(
DOWNLOAD_TO_CACHE(vqgan_checkpoint), map_location=device))

# init GPT model
self.gpt = models.Seq2SeqGPT(
src_vocab_size=vqgan_codebook_size,
tar_vocab_size=vqgan_codebook_size,
src_seq_len=src_seq_len,
tar_seq_len=tar_seq_len,
dim=gpt_dim,
num_heads=gpt_num_heads,
num_layers=gpt_num_layers).eval().requires_grad_(False).to(device)
self.gpt.load_state_dict(
torch.load(DOWNLOAD_TO_CACHE(gpt_checkpoint), map_location=device))

# data transforms
self.transforms = T.Compose([
data.PadToSquare(),
T.Resize(src_resolution),
T.ToTensor(),
T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

def __call__(self,
sole_imgs,
top_k=64,
top_p=None,
temperature=0.6,
use_fp16=True,
num_workers=0):
# preprocess
if isinstance(sole_imgs, Image.Image):
sole_imgs = [sole_imgs]
assert isinstance(sole_imgs, (tuple, list)) and isinstance(
sole_imgs[0], Image.Image)
sole_imgs = torch.stack(
parallel(self.transforms, sole_imgs, num_workers), dim=0)

# forward
out_imgs = []
for batch in sole_imgs.split(self.batch_size, dim=0):
# sample
batch = batch.to(self.device)
with amp.autocast(enabled=use_fp16):
sole_tokens = self.vqgan.encode_to_tokens(batch)
shoe_tokens = self.gpt.sample(sole_tokens, top_k, top_p,
temperature)

# decode
shoe_imgs = self.vqgan.decode_from_tokens(shoe_tokens)
shoe_imgs = self._whiten_borders(shoe_imgs)
shoe_imgs = shoe_imgs.clamp_(-1, 1).add_(1).mul_(125.0).permute(
0, 2, 3, 1).cpu().numpy().astype(np.uint8)
shoe_imgs = [Image.fromarray(u) for u in shoe_imgs]

# append
out_imgs += shoe_imgs
return out_imgs

def _whiten_borders(self, imgs):
r"""Remove border artifacts.
"""
imgs[:, :, :18, :] = 1
imgs[:, :, :, :18] = 1
imgs[:, :, -18:, :] = 1
imgs[:, :, :, -18:] = 1
return imgs


class ImageParser(object):

def __init__(
self,
model='SPNet',
num_classes=2,
resolution=800,
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225],
model_with_softmax=False,
checkpoint='models/spnet/sole_segmentation_211219.pth',
batch_size=16,
device=torch.device(
'cuda' if torch.cuda.is_available() else 'cpu')): # noqa E125
self.batch_size = batch_size
self.device = device

# init model
if checkpoint.endswith('.pt'):
self.net = torch.jit.load(
DOWNLOAD_TO_CACHE(checkpoint)).eval().to(device)
[p.requires_grad_(False) for p in self.net.parameters()]
else:
self.net = getattr(models, model)(
num_classes=num_classes,
pretrained=False).eval().requires_grad_(False).to(device)
self.net.load_state_dict(
torch.load(DOWNLOAD_TO_CACHE(checkpoint), map_location=device))
self.softmax = (lambda x, dim: x) if model_with_softmax else F.softmax

# data transforms
self.transforms = T.Compose([
data.PadToSquare(),
T.Resize(resolution),
T.ToTensor(),
T.Normalize(mean, std)
])

def __call__(self, imgs, num_workers=0):
# preprocess
if isinstance(imgs, Image.Image):
imgs = [imgs]
assert isinstance(imgs,
(tuple, list)) and isinstance(imgs[0], Image.Image)
sizes = [u.size for u in imgs]
imgs = torch.stack(parallel(self.transforms, imgs, num_workers), dim=0)

# forward
masks = []
for batch in imgs.split(self.batch_size, dim=0):
batch = batch.to(self.device, non_blocking=True)
masks.append(self.softmax(self.net(batch), dim=1))

# postprocess
masks = torch.cat(masks, dim=0).unsqueeze(1)
masks = [
F.interpolate(u, v, mode='bilinear', align_corners=False)
for u, v in zip(masks, sizes)
]
return masks


class TextImageMatch(object):

def __init__(
self,
embed_dim=512,
image_size=224,
patch_size=32,
vision_dim=768,
vision_heads=12,
vision_layers=12,
vocab_size=21128,
text_len=77,
text_dim=512,
text_heads=8,
text_layers=12,
mean=[0.48145466, 0.4578275, 0.40821073],
std=[0.26862954, 0.26130258, 0.27577711],
checkpoint='models/clip/clip_shoes+apparels_step84k_210105.pth',
tokenizer=data.BertTokenizer(name='bert-base-chinese', length=77),
batch_size=64,
device=torch.device(
'cuda' if torch.cuda.is_available() else 'cpu')): # noqa E125
self.tokenizer = tokenizer
self.batch_size = batch_size
self.device = device

# init model
self.clip = models.CLIP(
embed_dim=embed_dim,
image_size=image_size,
patch_size=patch_size,
vision_dim=vision_dim,
vision_heads=vision_heads,
vision_layers=vision_layers,
vocab_size=vocab_size,
text_len=text_len,
text_dim=text_dim,
text_heads=text_heads,
text_layers=text_layers).eval().requires_grad_(False).to(device)
self.clip.load_state_dict(
torch.load(DOWNLOAD_TO_CACHE(checkpoint), map_location=device))

# transforms
scale_size = int(image_size * 8 / 7)
self.transforms = T.Compose([
data.PadToSquare(),
T.Resize(scale_size),
T.CenterCrop(image_size),
T.ToTensor(),
T.Normalize(mean, std)
])

def __call__(self, imgs, txts, num_workers=0):
# preprocess
assert isinstance(imgs,
(tuple, list)) and isinstance(imgs[0], Image.Image)
assert isinstance(txts, (tuple, list)) and isinstance(txts[0], str)
txt_tokens = torch.LongTensor([self.tokenizer(u) for u in txts])
imgs = torch.stack(parallel(self.transforms, imgs, num_workers), dim=0)

# forward
scores = []
for img_batch, txt_batch in zip(
imgs.split(self.batch_size, dim=0),
txt_tokens.split(self.batch_size, dim=0)):
img_batch = img_batch.to(self.device)
txt_batch = txt_batch.to(self.device)
xi = F.normalize(self.clip.visual(img_batch), p=2, dim=1)
xt = F.normalize(self.clip.textual(txt_batch), p=2, dim=1)
scores.append((xi * xt).sum(dim=1))
return torch.cat(scores, dim=0)


def taobao_feature_extractor(category='shoes', **kwargs):
r"""Pretrained taobao-search feature extractors.
"""
assert category in ['softall', 'shoes', 'bag']
checkpoint = osp.join(
'models/inception-v1', {
'softall': '1214softall_10.10.0.5000',
'shoes': '1218shoes.v9_7.140.0.1520000',
'bag': '0926bag.v9_6.29.0.140000'
}[category])
app = FeatureExtractor(
model='InceptionV1',
checkpoint=checkpoint,
resolution=224,
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225],
**kwargs)
return app


def singleton_classifier(**kwargs):
r"""Pretrained classifier that finds single-object images.
Supports shoes, apparel, and bag images.
"""
app = Classifier(
model='InceptionV1',
checkpoint='models/classifier/shoes+apparel+bag-sgdetect-211230.pth',
num_classes=1,
resolution=224,
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225],
**kwargs)
return app


def orientation_classifier(**kwargs):
r"""Shoes orientation classifier.
"""
app = Classifier(
model='InceptionV1',
checkpoint='models/classifier/shoes-oriendetect-20211026.pth',
num_classes=1,
resolution=224,
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225],
**kwargs)
return app


def fashion_text2image(**kwargs):
r"""Fashion text-to-image generator.
Supports shoe and apparel image generation.
"""
app = Text2Image(
vqgan_dim=128,
vqgan_z_dim=256,
vqgan_dim_mult=[1, 1, 2, 2, 4],
vqgan_num_res_blocks=2,
vqgan_attn_scales=[1.0 / 16],
vqgan_codebook_size=975,
vqgan_beta=0.25,
gpt_txt_vocab_size=21128,
gpt_txt_seq_len=64,
gpt_img_seq_len=1024,
gpt_dim=1024,
gpt_num_heads=16,
gpt_num_layers=24,
vqgan_checkpoint= # noqa E251
'models/vqgan/vqgan_shoes+apparels_step10k_vocab975.pth',
gpt_checkpoint= # noqa E251
'models/seq2seq_gpt/text2image_shoes+apparels_step400k.pth',
tokenizer=data.BertTokenizer(name='bert-base-chinese', length=64),
**kwargs)
return app


def mindalle_text2image(**kwargs):
r"""Pretrained text2image generator with weights copied from minDALL-E.
"""
app = Text2Image(
vqgan_dim=128,
vqgan_z_dim=256,
vqgan_dim_mult=[1, 1, 2, 2, 4],
vqgan_num_res_blocks=2,
vqgan_attn_scales=[1.0 / 16],
vqgan_codebook_size=16384,
vqgan_beta=0.25,
gpt_txt_vocab_size=16384,
gpt_txt_seq_len=64,
gpt_img_seq_len=256,
gpt_dim=1536,
gpt_num_heads=24,
gpt_num_layers=42,
vqgan_checkpoint='models/minDALLE/1.3B_vqgan.pth',
gpt_checkpoint='models/minDALLE/1.3B_gpt.pth',
tokenizer=data.BPETokenizer(length=64),
**kwargs)
return app


def sole2shoe(**kwargs):
app = Sole2Shoe(
vqgan_dim=128,
vqgan_z_dim=256,
vqgan_dim_mult=[1, 1, 2, 2, 4],
vqgan_num_res_blocks=2,
vqgan_attn_scales=[1.0 / 16],
vqgan_codebook_size=975,
vqgan_beta=0.25,
src_resolution=256,
tar_resolution=512,
gpt_dim=1024,
gpt_num_heads=16,
gpt_num_layers=24,
vqgan_checkpoint= # noqa E251
'models/vqgan/vqgan_shoes+apparels_step10k_vocab975.pth',
gpt_checkpoint='models/seq2seq_gpt/sole2shoe-step300k-220104.pth',
**kwargs)
return app


def sole_parser(**kwargs):
app = ImageParser(
model='SPNet',
num_classes=2,
resolution=800,
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225],
model_with_softmax=False,
checkpoint='models/spnet/sole_segmentation_211219.pth',
**kwargs)
return app


def sod_foreground_parser(**kwargs):
app = ImageParser(
model=None,
num_classes=None,
resolution=448,
mean=[0.488431, 0.466275, 0.403686],
std=[0.222627, 0.21949, 0.22549],
model_with_softmax=True,
checkpoint='models/semseg/sod_model_20201228.pt',
**kwargs)
return app


def fashion_text_image_match(**kwargs):
app = TextImageMatch(
embed_dim=512,
image_size=224,
patch_size=32,
vision_dim=768,
vision_heads=12,
vision_layers=12,
vocab_size=21128,
text_len=77,
text_dim=512,
text_heads=8,
text_layers=12,
mean=[0.48145466, 0.4578275, 0.40821073],
std=[0.26862954, 0.26130258, 0.27577711],
checkpoint='models/clip/clip_shoes+apparels_step84k_210105.pth',
tokenizer=data.BertTokenizer(name='bert-base-chinese', length=77),
**kwargs)
return app
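
The wrappers above assume the external artist package and access to the referenced checkpoints; under those assumptions, usage reduces to building an app and calling it. A hypothetical sketch (prompts and file names are placeholders):

from PIL import Image

t2i = fashion_text2image(batch_size=4)
generated = t2i(['a red running shoe'], top_k=64, temperature=0.6)

matcher = fashion_text_image_match()
scores = matcher([Image.open('shoe.jpg').convert('RGB')], ['red running shoe'])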

+ 1074
- 0
modelscope/models/cv/image_to_image_translation/ops/degradation.py
File diff suppressed because it is too large
View File


+ 598
- 0
modelscope/models/cv/image_to_image_translation/ops/diffusion.py View File

@@ -0,0 +1,598 @@
import math

import torch

from .losses import discretized_gaussian_log_likelihood, kl_divergence

__all__ = ['GaussianDiffusion', 'beta_schedule']


def _i(tensor, t, x):
r"""Index tensor using t and format the output according to x.
"""
shape = (x.size(0), ) + (1, ) * (x.ndim - 1)
return tensor[t].view(shape).to(x)


def beta_schedule(schedule,
num_timesteps=1000,
init_beta=None,
last_beta=None):
if schedule == 'linear':
scale = 1000.0 / num_timesteps
init_beta = init_beta or scale * 0.0001
last_beta = last_beta or scale * 0.02
return torch.linspace(
init_beta, last_beta, num_timesteps, dtype=torch.float64)
elif schedule == 'quadratic':
init_beta = init_beta or 0.0015
last_beta = last_beta or 0.0195
return torch.linspace(
init_beta**0.5, last_beta**0.5, num_timesteps,
dtype=torch.float64)**2
elif schedule == 'cosine':
betas = []
for step in range(num_timesteps):
t1 = step / num_timesteps
t2 = (step + 1) / num_timesteps

# fn = lambda u: math.cos((u + 0.008) / 1.008 * math.pi / 2)**2
def fn(u):
return math.cos((u + 0.008) / 1.008 * math.pi / 2)**2

betas.append(min(1.0 - fn(t2) / fn(t1), 0.999))
return torch.tensor(betas, dtype=torch.float64)
else:
raise ValueError(f'Unsupported schedule: {schedule}')
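
# Usage sketch (illustrative, not part of the commit): the schedules above pair
# with GaussianDiffusion defined below, e.g.
#   betas = beta_schedule('cosine', num_timesteps=1000)
#   diffusion = GaussianDiffusion(betas, mean_type='eps', var_type='fixed_small')
#   xt = diffusion.q_sample(x0, t)  # noised version of clean images x0 at steps t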


class GaussianDiffusion(object):

def __init__(self,
betas,
mean_type='eps',
var_type='learned_range',
loss_type='mse',
rescale_timesteps=False):
# check input
if not isinstance(betas, torch.DoubleTensor):
betas = torch.tensor(betas, dtype=torch.float64)
assert min(betas) > 0 and max(betas) <= 1
assert mean_type in ['x0', 'x_{t-1}', 'eps']
assert var_type in [
'learned', 'learned_range', 'fixed_large', 'fixed_small'
]
assert loss_type in [
'mse', 'rescaled_mse', 'kl', 'rescaled_kl', 'l1', 'rescaled_l1'
]
self.betas = betas
self.num_timesteps = len(betas)
self.mean_type = mean_type
self.var_type = var_type
self.loss_type = loss_type
self.rescale_timesteps = rescale_timesteps

# alphas
alphas = 1 - self.betas
self.alphas_cumprod = torch.cumprod(alphas, dim=0)
self.alphas_cumprod_prev = torch.cat(
[alphas.new_ones([1]), self.alphas_cumprod[:-1]])
self.alphas_cumprod_next = torch.cat(
[self.alphas_cumprod[1:],
alphas.new_zeros([1])])

# q(x_t | x_{t-1})
self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod)
self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0
- self.alphas_cumprod)
self.log_one_minus_alphas_cumprod = torch.log(1.0
- self.alphas_cumprod)
self.sqrt_recip_alphas_cumprod = torch.sqrt(1.0 / self.alphas_cumprod)
self.sqrt_recipm1_alphas_cumprod = torch.sqrt(1.0 / self.alphas_cumprod
- 1)

# q(x_{t-1} | x_t, x_0)
self.posterior_variance = betas * (1.0 - self.alphas_cumprod_prev) / (
1.0 - self.alphas_cumprod)
self.posterior_log_variance_clipped = torch.log(
self.posterior_variance.clamp(1e-20))
self.posterior_mean_coef1 = betas * torch.sqrt(
self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
self.posterior_mean_coef2 = (
1.0 - self.alphas_cumprod_prev) * torch.sqrt(alphas) / (
1.0 - self.alphas_cumprod)

def q_sample(self, x0, t, noise=None):
r"""Sample from q(x_t | x_0).
"""
noise = torch.randn_like(x0) if noise is None else noise
return _i(self.sqrt_alphas_cumprod, t, x0) * x0 + _i(
self.sqrt_one_minus_alphas_cumprod, t, x0) * noise

def q_mean_variance(self, x0, t):
r"""Distribution of q(x_t | x_0).
"""
mu = _i(self.sqrt_alphas_cumprod, t, x0) * x0
var = _i(1.0 - self.alphas_cumprod, t, x0)
log_var = _i(self.log_one_minus_alphas_cumprod, t, x0)
return mu, var, log_var

def q_posterior_mean_variance(self, x0, xt, t):
r"""Distribution of q(x_{t-1} | x_t, x_0).
"""
mu = _i(self.posterior_mean_coef1, t, xt) * x0 + _i(
self.posterior_mean_coef2, t, xt) * xt
var = _i(self.posterior_variance, t, xt)
log_var = _i(self.posterior_log_variance_clipped, t, xt)
return mu, var, log_var

@torch.no_grad()
def p_sample(self,
xt,
t,
model,
model_kwargs={},
clamp=None,
percentile=None,
condition_fn=None,
guide_scale=None):
r"""Sample from p(x_{t-1} | x_t).
- condition_fn: for classifier-based guidance (guided-diffusion).
- guide_scale: for classifier-free guidance (glide/dalle-2).
"""
# predict distribution of p(x_{t-1} | x_t)
mu, var, log_var, x0 = self.p_mean_variance(xt, t, model, model_kwargs,
clamp, percentile,
guide_scale)

# random sample (with optional conditional function)
noise = torch.randn_like(xt)
# no noise when t == 0
mask = t.ne(0).float().view(-1, *((1, ) * (xt.ndim - 1)))
if condition_fn is not None:
grad = condition_fn(xt, self._scale_timesteps(t), **model_kwargs)
mu = mu.float() + var * grad.float()
xt_1 = mu + mask * torch.exp(0.5 * log_var) * noise
return xt_1, x0

@torch.no_grad()
def p_sample_loop(self,
noise,
model,
model_kwargs={},
clamp=None,
percentile=None,
condition_fn=None,
guide_scale=None):
r"""Sample from p(x_{t-1} | x_t) p(x_{t-2} | x_{t-1}) ... p(x_0 | x_1).
"""
# prepare input
b, c, h, w = noise.size()
xt = noise

# diffusion process
for step in torch.arange(self.num_timesteps).flip(0):
t = torch.full((b, ), step, dtype=torch.long, device=xt.device)
xt, _ = self.p_sample(xt, t, model, model_kwargs, clamp,
percentile, condition_fn, guide_scale)
return xt

def p_mean_variance(self,
xt,
t,
model,
model_kwargs={},
clamp=None,
percentile=None,
guide_scale=None):
r"""Distribution of p(x_{t-1} | x_t).
"""
# predict distribution
if guide_scale is None:
out = model(xt, self._scale_timesteps(t), **model_kwargs)
else:
# classifier-free guidance
# (model_kwargs[0]: conditional kwargs; model_kwargs[1]: non-conditional kwargs)
assert isinstance(model_kwargs, list) and len(model_kwargs) == 2
assert self.mean_type == 'eps'
y_out = model(xt, self._scale_timesteps(t), **model_kwargs[0])
u_out = model(xt, self._scale_timesteps(t), **model_kwargs[1])
out = torch.cat(
[
u_out[:, :3] + guide_scale * # noqa W504
(y_out[:, :3] - u_out[:, :3]),
y_out[:, 3:]
],
dim=1)

# compute variance
if self.var_type == 'learned':
out, log_var = out.chunk(2, dim=1)
var = torch.exp(log_var)
elif self.var_type == 'learned_range':
out, fraction = out.chunk(2, dim=1)
min_log_var = _i(self.posterior_log_variance_clipped, t, xt)
max_log_var = _i(torch.log(self.betas), t, xt)
fraction = (fraction + 1) / 2.0
log_var = fraction * max_log_var + (1 - fraction) * min_log_var
var = torch.exp(log_var)
elif self.var_type == 'fixed_large':
var = _i(
torch.cat([self.posterior_variance[1:2], self.betas[1:]]), t,
xt)
log_var = torch.log(var)
elif self.var_type == 'fixed_small':
var = _i(self.posterior_variance, t, xt)
log_var = _i(self.posterior_log_variance_clipped, t, xt)

# compute mean and x0
if self.mean_type == 'x_{t-1}':
mu = out # x_{t-1}
x0 = _i(1.0 / self.posterior_mean_coef1, t, xt) * mu - _i(
self.posterior_mean_coef2 / self.posterior_mean_coef1, t,
xt) * xt
elif self.mean_type == 'x0':
x0 = out
mu, _, _ = self.q_posterior_mean_variance(x0, xt, t)
elif self.mean_type == 'eps':
x0 = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - _i(
self.sqrt_recipm1_alphas_cumprod, t, xt) * out
mu, _, _ = self.q_posterior_mean_variance(x0, xt, t)

# restrict the range of x0
if percentile is not None:
assert percentile > 0 and percentile <= 1 # e.g., 0.995
s = torch.quantile(
x0.flatten(1).abs(), percentile,
dim=1).clamp_(1.0).view(-1, 1, 1, 1)
x0 = torch.min(s, torch.max(-s, x0)) / s
elif clamp is not None:
x0 = x0.clamp(-clamp, clamp)
return mu, var, log_var, x0

@torch.no_grad()
def ddim_sample(self,
xt,
t,
model,
model_kwargs={},
clamp=None,
percentile=None,
condition_fn=None,
guide_scale=None,
ddim_timesteps=20,
eta=0.0):
stride = self.num_timesteps // ddim_timesteps

# predict distribution of p(x_{t-1} | x_t)
_, _, _, x0 = self.p_mean_variance(xt, t, model, model_kwargs, clamp,
percentile, guide_scale)
if condition_fn is not None:
# x0 -> eps
alpha = _i(self.alphas_cumprod, t, xt)
eps = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0) / _i(
self.sqrt_recipm1_alphas_cumprod, t, xt)
eps = eps - (1 - alpha).sqrt() * condition_fn(
xt, self._scale_timesteps(t), **model_kwargs)

# eps -> x0
x0 = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - _i(
self.sqrt_recipm1_alphas_cumprod, t, xt) * eps

# derive variables
eps = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0) / _i(
self.sqrt_recipm1_alphas_cumprod, t, xt)
alphas = _i(self.alphas_cumprod, t, xt)
alphas_prev = _i(self.alphas_cumprod, (t - stride).clamp(0), xt)
sigmas = eta * torch.sqrt((1 - alphas_prev) / # noqa W504
(1 - alphas) * # noqa W504
(1 - alphas / alphas_prev))

# random sample
noise = torch.randn_like(xt)
direction = torch.sqrt(1 - alphas_prev - sigmas**2) * eps
mask = t.ne(0).float().view(-1, *((1, ) * (xt.ndim - 1)))
xt_1 = torch.sqrt(alphas_prev) * x0 + direction + mask * sigmas * noise
return xt_1, x0

@torch.no_grad()
def ddim_sample_loop(self,
noise,
model,
model_kwargs={},
clamp=None,
percentile=None,
condition_fn=None,
guide_scale=None,
ddim_timesteps=20,
eta=0.0):
# prepare input
b, c, h, w = noise.size()
xt = noise

# diffusion process (TODO: clamp is inaccurate! Consider replacing the stride by explicit prev/next steps)
steps = (1 + torch.arange(0, self.num_timesteps,
self.num_timesteps // ddim_timesteps)).clamp(
0, self.num_timesteps - 1).flip(0)
for step in steps:
t = torch.full((b, ), step, dtype=torch.long, device=xt.device)
xt, _ = self.ddim_sample(xt, t, model, model_kwargs, clamp,
percentile, condition_fn, guide_scale,
ddim_timesteps, eta)
return xt

@torch.no_grad()
def ddim_reverse_sample(self,
xt,
t,
model,
model_kwargs={},
clamp=None,
percentile=None,
guide_scale=None,
ddim_timesteps=20):
r"""Sample from p(x_{t+1} | x_t) using DDIM reverse ODE (deterministic).
"""
stride = self.num_timesteps // ddim_timesteps

# predict distribution of p(x_{t-1} | x_t)
_, _, _, x0 = self.p_mean_variance(xt, t, model, model_kwargs, clamp,
percentile, guide_scale)

# derive variables
eps = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0) / _i(
self.sqrt_recipm1_alphas_cumprod, t, xt)
alphas_next = _i(
torch.cat(
[self.alphas_cumprod,
self.alphas_cumprod.new_zeros([1])]),
(t + stride).clamp(0, self.num_timesteps), xt)

# reverse sample
mu = torch.sqrt(alphas_next) * x0 + torch.sqrt(1 - alphas_next) * eps
return mu, x0

@torch.no_grad()
def ddim_reverse_sample_loop(self,
x0,
model,
model_kwargs={},
clamp=None,
percentile=None,
guide_scale=None,
ddim_timesteps=20):
# prepare input
b, c, h, w = x0.size()
xt = x0

# reconstruction steps
steps = torch.arange(0, self.num_timesteps,
self.num_timesteps // ddim_timesteps)
for step in steps:
t = torch.full((b, ), step, dtype=torch.long, device=xt.device)
xt, _ = self.ddim_reverse_sample(xt, t, model, model_kwargs, clamp,
percentile, guide_scale,
ddim_timesteps)
return xt

@torch.no_grad()
def plms_sample(self,
xt,
t,
model,
model_kwargs={},
clamp=None,
percentile=None,
condition_fn=None,
guide_scale=None,
                    plms_timesteps=20,
                    eps_cache=None):
r"""Sample from p(x_{t-1} | x_t) using PLMS.
- condition_fn: for classifier-based guidance (guided-diffusion).
- guide_scale: for classifier-free guidance (glide/dalle-2).
"""
        stride = self.num_timesteps // plms_timesteps
        eps_cache = [] if eps_cache is None else eps_cache

        # helper to compute eps
def compute_eps(xt, t):
# predict distribution of p(x_{t-1} | x_t)
_, _, _, x0 = self.p_mean_variance(xt, t, model, model_kwargs,
clamp, percentile, guide_scale)

# condition
if condition_fn is not None:
# x0 -> eps
alpha = _i(self.alphas_cumprod, t, xt)
eps = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt
- x0) / _i(self.sqrt_recipm1_alphas_cumprod, t, xt)
eps = eps - (1 - alpha).sqrt() * condition_fn(
xt, self._scale_timesteps(t), **model_kwargs)

# eps -> x0
x0 = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - _i(
self.sqrt_recipm1_alphas_cumprod, t, xt) * eps

# derive eps
eps = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0) / _i(
self.sqrt_recipm1_alphas_cumprod, t, xt)
return eps

        # helper to compute x_0 and x_{t-1}
def compute_x0(eps, t):
# eps -> x0
x0 = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - _i(
self.sqrt_recipm1_alphas_cumprod, t, xt) * eps

# deterministic sample
alphas_prev = _i(self.alphas_cumprod, (t - stride).clamp(0), xt)
direction = torch.sqrt(1 - alphas_prev) * eps
# mask = t.ne(0).float().view(-1, *((1, ) * (xt.ndim - 1)))
xt_1 = torch.sqrt(alphas_prev) * x0 + direction
return xt_1, x0

# PLMS sample
eps = compute_eps(xt, t)
if len(eps_cache) == 0:
# 2nd order pseudo improved Euler
xt_1, x0 = compute_x0(eps, t)
eps_next = compute_eps(xt_1, (t - stride).clamp(0))
eps_prime = (eps + eps_next) / 2.0
elif len(eps_cache) == 1:
# 2nd order pseudo linear multistep (Adams-Bashforth)
eps_prime = (3 * eps - eps_cache[-1]) / 2.0
elif len(eps_cache) == 2:
            # 3rd order pseudo linear multistep (Adams-Bashforth)
eps_prime = (23 * eps - 16 * eps_cache[-1]
+ 5 * eps_cache[-2]) / 12.0
elif len(eps_cache) >= 3:
            # 4th order pseudo linear multistep (Adams-Bashforth)
eps_prime = (55 * eps - 59 * eps_cache[-1] + 37 * eps_cache[-2]
- 9 * eps_cache[-3]) / 24.0
xt_1, x0 = compute_x0(eps_prime, t)
return xt_1, x0, eps

@torch.no_grad()
def plms_sample_loop(self,
noise,
model,
model_kwargs={},
clamp=None,
percentile=None,
condition_fn=None,
guide_scale=None,
plms_timesteps=20):
# prepare input
b, c, h, w = noise.size()
xt = noise

# diffusion process
steps = (1 + torch.arange(0, self.num_timesteps,
self.num_timesteps // plms_timesteps)).clamp(
0, self.num_timesteps - 1).flip(0)
eps_cache = []
for step in steps:
# PLMS sampling step
t = torch.full((b, ), step, dtype=torch.long, device=xt.device)
xt, _, eps = self.plms_sample(xt, t, model, model_kwargs, clamp,
percentile, condition_fn,
guide_scale, plms_timesteps,
eps_cache)

# update eps cache
eps_cache.append(eps)
if len(eps_cache) >= 4:
eps_cache.pop(0)
return xt

def loss(self, x0, t, model, model_kwargs={}, noise=None):
noise = torch.randn_like(x0) if noise is None else noise
xt = self.q_sample(x0, t, noise=noise)

# compute loss
if self.loss_type in ['kl', 'rescaled_kl']:
loss, _ = self.variational_lower_bound(x0, xt, t, model,
model_kwargs)
if self.loss_type == 'rescaled_kl':
loss = loss * self.num_timesteps
elif self.loss_type in ['mse', 'rescaled_mse', 'l1', 'rescaled_l1']:
out = model(xt, self._scale_timesteps(t), **model_kwargs)

            # VLB term for learning the variance
loss_vlb = 0.0
if self.var_type in ['learned', 'learned_range']:
out, var = out.chunk(2, dim=1)
frozen = torch.cat([
out.detach(), var
], dim=1) # learn var without affecting the prediction of mean
loss_vlb, _ = self.variational_lower_bound(
x0, xt, t, model=lambda *args, **kwargs: frozen)
if self.loss_type.startswith('rescaled_'):
loss_vlb = loss_vlb * self.num_timesteps / 1000.0

# MSE/L1 for x0/eps
target = {
'eps': noise,
'x0': x0,
'x_{t-1}': self.q_posterior_mean_variance(x0, xt, t)[0]
}[self.mean_type]
loss = (out - target).pow(1 if self.loss_type.endswith('l1') else 2
).abs().flatten(1).mean(dim=1)

# total loss
loss = loss + loss_vlb
return loss

def variational_lower_bound(self,
x0,
xt,
t,
model,
model_kwargs={},
clamp=None,
percentile=None):
# compute groundtruth and predicted distributions
mu1, _, log_var1 = self.q_posterior_mean_variance(x0, xt, t)
mu2, _, log_var2, x0 = self.p_mean_variance(xt, t, model, model_kwargs,
clamp, percentile)

# compute KL loss
kl = kl_divergence(mu1, log_var1, mu2, log_var2)
kl = kl.flatten(1).mean(dim=1) / math.log(2.0)

# compute discretized NLL loss (for p(x0 | x1) only)
nll = -discretized_gaussian_log_likelihood(
x0, mean=mu2, log_scale=0.5 * log_var2)
nll = nll.flatten(1).mean(dim=1) / math.log(2.0)

# NLL for p(x0 | x1) and KL otherwise
vlb = torch.where(t == 0, nll, kl)
return vlb, x0

@torch.no_grad()
def variational_lower_bound_loop(self,
x0,
model,
model_kwargs={},
clamp=None,
percentile=None):
r"""Compute the entire variational lower bound, measured in bits-per-dim.
"""
# prepare input and output
b, c, h, w = x0.size()
metrics = {'vlb': [], 'mse': [], 'x0_mse': []}

# loop
for step in torch.arange(self.num_timesteps).flip(0):
# compute VLB
t = torch.full((b, ), step, dtype=torch.long, device=x0.device)
noise = torch.randn_like(x0)
xt = self.q_sample(x0, t, noise)
vlb, pred_x0 = self.variational_lower_bound(
x0, xt, t, model, model_kwargs, clamp, percentile)

# predict eps from x0
eps = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0) / _i(
self.sqrt_recipm1_alphas_cumprod, t, xt)

# collect metrics
metrics['vlb'].append(vlb)
metrics['x0_mse'].append(
(pred_x0 - x0).square().flatten(1).mean(dim=1))
metrics['mse'].append(
(eps - noise).square().flatten(1).mean(dim=1))
metrics = {k: torch.stack(v, dim=1) for k, v in metrics.items()}

# compute the prior KL term for VLB, measured in bits-per-dim
mu, _, log_var = self.q_mean_variance(x0, t)
kl_prior = kl_divergence(mu, log_var, torch.zeros_like(mu),
torch.zeros_like(log_var))
kl_prior = kl_prior.flatten(1).mean(dim=1) / math.log(2.0)

# update metrics
metrics['prior_bits_per_dim'] = kl_prior
metrics['total_bits_per_dim'] = metrics['vlb'].sum(dim=1) + kl_prior
return metrics

def _scale_timesteps(self, t):
if self.rescale_timesteps:
return t.float() * 1000.0 / self.num_timesteps
return t
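
A minimal usage sketch of the sampler above (not part of the diff): with a guide_scale, p_mean_variance expects model_kwargs as a [conditional, unconditional] pair, exactly as the pipeline later in this commit passes it. The toy model, the linear beta schedule and the constructor keywords are illustrative assumptions only.

import torch

import modelscope.models.cv.image_to_image_translation.ops as ops


class ToyEpsModel(torch.nn.Module):
    # hypothetical stand-in with the model(xt, t, **kwargs) interface assumed by GaussianDiffusion
    def forward(self, xt, t, y=None):
        return torch.zeros_like(xt)  # always predicts eps = 0


betas = torch.linspace(1e-4, 0.02, 1000)  # assumed linear schedule; the pipeline uses ops.beta_schedule
diffusion = ops.GaussianDiffusion(
    betas=betas,
    mean_type='eps',
    var_type='fixed_small',
    loss_type='mse',
    rescale_timesteps=False)
x0 = diffusion.ddim_sample_loop(
    noise=torch.randn(2, 3, 16, 16),
    model=ToyEpsModel(),
    model_kwargs=[{'y': torch.zeros(2, dtype=torch.long)},          # conditional kwargs
                  {'y': torch.full((2, ), 10, dtype=torch.long)}],  # "null"-class kwargs
    guide_scale=3.0,
    ddim_timesteps=10,
    eta=0.0)
print(x0.shape)  # torch.Size([2, 3, 16, 16])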

+ 35
- 0
modelscope/models/cv/image_to_image_translation/ops/losses.py View File

@@ -0,0 +1,35 @@
import math

import torch

__all__ = ['kl_divergence', 'discretized_gaussian_log_likelihood']


def kl_divergence(mu1, logvar1, mu2, logvar2):
return 0.5 * (
-1.0 + logvar2 - logvar1 + torch.exp(logvar1 - logvar2) + # noqa W504
((mu1 - mu2)**2) * torch.exp(-logvar2))


def standard_normal_cdf(x):
r"""A fast approximation of the cumulative distribution function of the standard normal.
"""
return 0.5 * (1.0 + torch.tanh(
math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))


def discretized_gaussian_log_likelihood(x0, mean, log_scale):
assert x0.shape == mean.shape == log_scale.shape
cx = x0 - mean
inv_stdv = torch.exp(-log_scale)
cdf_plus = standard_normal_cdf(inv_stdv * (cx + 1.0 / 255.0))
cdf_min = standard_normal_cdf(inv_stdv * (cx - 1.0 / 255.0))
log_cdf_plus = torch.log(cdf_plus.clamp(min=1e-12))
log_one_minus_cdf_min = torch.log((1.0 - cdf_min).clamp(min=1e-12))
cdf_delta = cdf_plus - cdf_min
log_probs = torch.where(
x0 < -0.999, log_cdf_plus,
torch.where(x0 > 0.999, log_one_minus_cdf_min,
torch.log(cdf_delta.clamp(min=1e-12))))
assert log_probs.shape == x0.shape
return log_probs
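
A quick sanity check of the closed-form KL above (import path follows this diff; both helpers work elementwise on log-variances): two unit-variance Gaussians whose means differ by 1 give 0.5 nats per dimension.

import torch

from modelscope.models.cv.image_to_image_translation.ops.losses import (
    discretized_gaussian_log_likelihood, kl_divergence)

mu1, logvar1 = torch.zeros(4), torch.zeros(4)
mu2, logvar2 = torch.ones(4), torch.zeros(4)
kl = kl_divergence(mu1, logvar1, mu2, logvar2)
assert torch.allclose(kl, torch.full((4, ), 0.5))  # reduces to 0.5 * (mu1 - mu2) ** 2 here

# per-pixel log-probability of x0 under a narrow discretized Gaussian centred on it
x0 = torch.zeros(1, 3, 8, 8)
ll = discretized_gaussian_log_likelihood(
    x0, mean=x0, log_scale=torch.full_like(x0, -5.0))
print(ll.mean())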

+ 126
- 0
modelscope/models/cv/image_to_image_translation/ops/metrics.py View File

@@ -0,0 +1,126 @@
import numpy as np
import scipy.linalg as linalg
import torch
from scipy.stats import entropy

__all__ = [
'get_fid_net', 'get_is_net', 'compute_fid', 'compute_prdc', 'compute_is'
]


def get_fid_net(resize_input=True, normalize_input=True):
r"""InceptionV3 network for the evaluation of Fréchet Inception Distance (FID).

Args:
resize_input: whether or not to resize the input to (299, 299).
        normalize_input: whether or not to normalize the input from range (0, 1) to range (-1, 1).
"""
from artist.models import InceptionV3
return InceptionV3(
output_blocks=(3, ),
resize_input=resize_input,
normalize_input=normalize_input,
requires_grad=False,
use_fid_inception=True).eval().requires_grad_(False)


def get_is_net(resize_input=True, normalize_input=True):
r"""InceptionV3 network for the evaluation of Inception Score (IS).

Args:
resize_input: whether or not to resize the input to (299, 299).
        normalize_input: whether or not to normalize the input from range (0, 1) to range (-1, 1).
"""
from artist.models import InceptionV3
return InceptionV3(
output_blocks=(4, ),
resize_input=resize_input,
normalize_input=normalize_input,
requires_grad=False,
use_fid_inception=False).eval().requires_grad_(False)


@torch.no_grad()
def compute_fid(real_feats, fake_feats, eps=1e-6):
r"""Compute Fréchet Inception Distance (FID).

Args:
real_feats: [N, C].
fake_feats: [N, C].
"""
# check inputs
if isinstance(real_feats, torch.Tensor):
real_feats = real_feats.cpu().numpy().astype(np.float_)
if isinstance(fake_feats, torch.Tensor):
fake_feats = fake_feats.cpu().numpy().astype(np.float_)

# real statistics
mu1 = np.mean(real_feats, axis=0)
sigma1 = np.cov(real_feats, rowvar=False)

# fake statistics
mu2 = np.mean(fake_feats, axis=0)
sigma2 = np.cov(fake_feats, rowvar=False)

# compute covmean
covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
if not np.isfinite(covmean).all():
print(
f'FID calculation produces singular product; adding {eps} to diagonal of cov',
flush=True)
offset = np.eye(sigma1.shape[0]) * eps
covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))

# numerical error might give slight imaginary component
if np.iscomplexobj(covmean):
if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
m = np.max(np.abs(covmean.imag))
raise ValueError('Imaginary component {}'.format(m))
covmean = covmean.real

# compute Fréchet distance
diff = mu1 - mu2
fid = diff.dot(diff) + np.trace(sigma1) + np.trace(
sigma2) - 2 * np.trace(covmean)
return fid.item()


@torch.no_grad()
def compute_prdc(real_feats, fake_feats, knn=5):
r"""Compute precision, recall, density, and coverage given two manifolds.

Args:
real_feats: [N, C].
fake_feats: [N, C].
knn: the number of nearest neighbors to consider.
"""
# distances
real_kth = -(-torch.cdist(real_feats, real_feats)).topk(
k=knn, dim=1)[0][:, -1]
fake_kth = -(-torch.cdist(fake_feats, fake_feats)).topk(
k=knn, dim=1)[0][:, -1]
dists = torch.cdist(real_feats, fake_feats)

# metrics
precision = (dists < real_kth.unsqueeze(1)).any(
dim=0).float().mean().item()
recall = (dists < fake_kth.unsqueeze(0)).any(dim=1).float().mean().item()
density = (dists < real_kth.unsqueeze(1)).float().sum(
dim=0).mean().item() / knn
coverage = (dists.min(dim=1)[0] < real_kth).float().mean().item()
return precision, recall, density, coverage


@torch.no_grad()
def compute_is(logits, num_splits=10):
preds = logits.softmax(dim=1).cpu().numpy()
split_scores = []
for k in range(num_splits):
part = preds[k * (len(logits) // num_splits):(k + 1)
* (len(logits) // num_splits), :]
py = np.mean(part, axis=0)
scores = []
for i in range(part.shape[0]):
pyx = part[i, :]
scores.append(entropy(pyx, py))
split_scores.append(np.exp(np.mean(scores)))
return np.mean(split_scores), np.std(split_scores)
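
The distance-based metrics above can be smoke-tested on random features as below (import path per this diff); note that get_fid_net/get_is_net additionally require the external artist package.

import torch

from modelscope.models.cv.image_to_image_translation.ops.metrics import (
    compute_fid, compute_prdc)

real = torch.randn(256, 64)                 # stand-in Inception features, [N, C]
fake = real + 0.05 * torch.randn(256, 64)   # nearly identical feature set
print('fid:', compute_fid(real, fake))      # close to 0 for similar distributions
print('prdc:', compute_prdc(real, fake, knn=5))  # (precision, recall, density, coverage)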

+ 220
- 0
modelscope/models/cv/image_to_image_translation/ops/random_color.py View File

@@ -0,0 +1,220 @@
import colorsys
import random

__all__ = ['RandomColor', 'rand_color']

COLORMAP = {
'blue': {
'hue_range': [179, 257],
'lower_bounds': [[20, 100], [30, 86], [40, 80], [50, 74], [60, 60],
[70, 52], [80, 44], [90, 39], [100, 35]]
},
'green': {
'hue_range': [63, 178],
'lower_bounds': [[30, 100], [40, 90], [50, 85], [60, 81], [70, 74],
[80, 64], [90, 50], [100, 40]]
},
'monochrome': {
'hue_range': [0, 0],
'lower_bounds': [[0, 0], [100, 0]]
},
'orange': {
'hue_range': [19, 46],
'lower_bounds': [[20, 100], [30, 93], [40, 88], [50, 86], [60, 85],
[70, 70], [100, 70]]
},
'pink': {
'hue_range': [283, 334],
'lower_bounds': [[20, 100], [30, 90], [40, 86], [60, 84], [80, 80],
[90, 75], [100, 73]]
},
'purple': {
'hue_range': [258, 282],
'lower_bounds': [[20, 100], [30, 87], [40, 79], [50, 70], [60, 65],
[70, 59], [80, 52], [90, 45], [100, 42]]
},
'red': {
'hue_range': [-26, 18],
'lower_bounds': [[20, 100], [30, 92], [40, 89], [50, 85], [60, 78],
[70, 70], [80, 60], [90, 55], [100, 50]]
},
'yellow': {
'hue_range': [47, 62],
'lower_bounds': [[25, 100], [40, 94], [50, 89], [60, 86], [70, 84],
[80, 82], [90, 80], [100, 75]]
}
}


class RandomColor(object):

def __init__(self, seed=None):
self.colormap = COLORMAP
self.random = random.Random(seed)

for color_name, color_attrs in self.colormap.items():
lower_bounds = color_attrs['lower_bounds']
s_min = lower_bounds[0][0]
s_max = lower_bounds[len(lower_bounds) - 1][0]

b_min = lower_bounds[len(lower_bounds) - 1][1]
b_max = lower_bounds[0][1]

self.colormap[color_name]['saturation_range'] = [s_min, s_max]
self.colormap[color_name]['brightness_range'] = [b_min, b_max]

def generate(self, hue=None, luminosity=None, count=1, format_='hex'):
colors = []
for _ in range(count):
# First we pick a hue (H)
H = self.pick_hue(hue)

# Then use H to determine saturation (S)
S = self.pick_saturation(H, hue, luminosity)

# Then use S and H to determine brightness (B).
B = self.pick_brightness(H, S, luminosity)

# Then we return the HSB color in the desired format
colors.append(self.set_format([H, S, B], format_))

return colors

def pick_hue(self, hue):
hue_range = self.get_hue_range(hue)
hue = self.random_within(hue_range)

        # Instead of storing red as two separate ranges,
# we group them, using negative numbers
if (hue < 0):
hue += 360

return hue

def pick_saturation(self, hue, hue_name, luminosity):

if luminosity == 'random':
return self.random_within([0, 100])

if hue_name == 'monochrome':
return 0

saturation_range = self.get_saturation_range(hue)

s_min = saturation_range[0]
s_max = saturation_range[1]

if luminosity == 'bright':
s_min = 55
elif luminosity == 'dark':
s_min = s_max - 10
elif luminosity == 'light':
s_max = 55

return self.random_within([s_min, s_max])

def pick_brightness(self, H, S, luminosity):
b_min = self.get_minimum_brightness(H, S)
b_max = 100

if luminosity == 'dark':
b_max = b_min + 20
elif luminosity == 'light':
b_min = (b_max + b_min) / 2
elif luminosity == 'random':
b_min = 0
b_max = 100

return self.random_within([b_min, b_max])

def set_format(self, hsv, format_):
if 'hsv' in format_:
color = hsv
elif 'rgb' in format_:
color = self.hsv_to_rgb(hsv)
elif 'hex' in format_:
r, g, b = self.hsv_to_rgb(hsv)
return '#%02x%02x%02x' % (r, g, b)
else:
return 'unrecognized format'

if 'Array' in format_ or format_ == 'hex':
return color
else:
prefix = format_[:3]
color_values = [str(x) for x in color]
return '%s(%s)' % (prefix, ', '.join(color_values))

def get_minimum_brightness(self, H, S):
lower_bounds = self.get_color_info(H)['lower_bounds']

for i in range(len(lower_bounds) - 1):
s1 = lower_bounds[i][0]
v1 = lower_bounds[i][1]

s2 = lower_bounds[i + 1][0]
v2 = lower_bounds[i + 1][1]

if s1 <= S <= s2:
m = (v2 - v1) / (s2 - s1)
b = v1 - m * s1

return m * S + b

return 0

def get_hue_range(self, color_input):
if color_input and color_input.isdigit():
number = int(color_input)

if 0 < number < 360:
return [number, number]

elif color_input and color_input in self.colormap:
color = self.colormap[color_input]
if 'hue_range' in color:
return color['hue_range']

else:
return [0, 360]

def get_saturation_range(self, hue):
return self.get_color_info(hue)['saturation_range']

def get_color_info(self, hue):
# Maps red colors to make picking hue easier
if 334 <= hue <= 360:
hue -= 360

for color_name, color in self.colormap.items():
if color['hue_range'] and color['hue_range'][0] <= hue <= color[
'hue_range'][1]:
return self.colormap[color_name]

# this should probably raise an exception
return 'Color not found'

def random_within(self, r):
return self.random.randint(int(r[0]), int(r[1]))

@classmethod
def hsv_to_rgb(cls, hsv):
h, s, v = hsv
h = 1 if h == 0 else h
h = 359 if h == 360 else h

h = float(h) / 360
s = float(s) / 100
v = float(v) / 100

rgb = colorsys.hsv_to_rgb(h, s, v)
return [int(c * 255) for c in rgb]


def rand_color():
generator = RandomColor()
hue = random.choice(list(COLORMAP.keys()))
color = generator.generate(hue=hue, count=1, format_='rgb')[0]
color = color[color.find('(') + 1:color.find(')')]
color = tuple([int(u) for u in color.split(',')])
return color
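
A short usage sketch of the generator above (import path per this diff); the concrete values depend on the seed.

from modelscope.models.cv.image_to_image_translation.ops.random_color import (
    RandomColor, rand_color)

gen = RandomColor(seed=42)
print(gen.generate(hue='green', luminosity='bright', count=2, format_='hex'))  # two hex strings
print(gen.generate(hue='monochrome', count=1, format_='rgb'))  # a greyscale 'rgb(...)' string
print(rand_color())  # (r, g, b) tuple drawn from a random hue family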

+ 79
- 0
modelscope/models/cv/image_to_image_translation/ops/random_mask.py View File

@@ -0,0 +1,79 @@
import cv2
import numpy as np

__all__ = ['make_irregular_mask', 'make_rectangle_mask', 'make_uncrop']


def make_irregular_mask(w,
h,
max_angle=4,
max_length=200,
max_width=100,
min_strokes=1,
max_strokes=5,
mode='line'):
# initialize mask
assert mode in ['line', 'circle', 'square']
mask = np.zeros((h, w), np.float32)

# draw strokes
num_strokes = np.random.randint(min_strokes, max_strokes + 1)
for i in range(num_strokes):
x1 = np.random.randint(w)
y1 = np.random.randint(h)
for j in range(1 + np.random.randint(5)):
angle = 0.01 + np.random.randint(max_angle)
if i % 2 == 0:
angle = 2 * 3.1415926 - angle
length = 10 + np.random.randint(max_length)
radius = 5 + np.random.randint(max_width)
x2 = np.clip((x1 + length * np.sin(angle)).astype(np.int32), 0, w)
y2 = np.clip((y1 + length * np.cos(angle)).astype(np.int32), 0, h)
if mode == 'line':
cv2.line(mask, (x1, y1), (x2, y2), 1.0, radius)
elif mode == 'circle':
cv2.circle(
mask, (x1, y1), radius=radius, color=1.0, thickness=-1)
elif mode == 'square':
radius = radius // 2
mask[y1 - radius:y1 + radius, x1 - radius:x1 + radius] = 1
x1, y1 = x2, y2
return mask


def make_rectangle_mask(w,
h,
margin=10,
min_size=30,
max_size=150,
min_strokes=1,
max_strokes=4):
# initialize mask
mask = np.zeros((h, w), np.float32)

# draw rectangles
num_strokes = np.random.randint(min_strokes, max_strokes + 1)
for i in range(num_strokes):
box_w = np.random.randint(min_size, max_size)
box_h = np.random.randint(min_size, max_size)
x1 = np.random.randint(margin, w - margin - box_w + 1)
y1 = np.random.randint(margin, h - margin - box_h + 1)
mask[y1:y1 + box_h, x1:x1 + box_w] = 1
return mask


def make_uncrop(w, h):
# initialize mask
mask = np.zeros((h, w), np.float32)

# randomly halve the image
side = np.random.choice([0, 1, 2, 3])
if side == 0:
mask[:h // 2, :] = 1
elif side == 1:
mask[h // 2:, :] = 1
elif side == 2:
mask[:, :w // 2] = 1
    elif side == 3:
mask[:, w // 2:] = 1
return mask
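
A quick smoke test of the three mask generators above (import path per this diff); each returns a float32 array of shape (h, w) with 1s marking the masked region.

import numpy as np

from modelscope.models.cv.image_to_image_translation.ops.random_mask import (
    make_irregular_mask, make_rectangle_mask, make_uncrop)

w, h = 256, 256
for mask in (make_irregular_mask(w, h, mode='line'),
             make_rectangle_mask(w, h),
             make_uncrop(w, h)):
    assert mask.shape == (h, w) and mask.dtype == np.float32
    print('masked fraction:', mask.mean())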

+ 152
- 0
modelscope/models/cv/image_to_image_translation/ops/svd.py View File

@@ -0,0 +1,152 @@
r"""SVD of linear degradation matrices described in the paper
``Denoising Diffusion Restoration Models.''
@article{kawar2022denoising,
title={Denoising Diffusion Restoration Models},
author={Bahjat Kawar and Michael Elad and Stefano Ermon and Jiaming Song},
year={2022},
journal={arXiv preprint arXiv:2201.11793},
}
"""
import torch

__all__ = ['SVD', 'IdentitySVD', 'DenoiseSVD', 'ColorizationSVD']


class SVD(object):
r"""SVD decomposition of a matrix, i.e., H = UDV^T.
NOTE: assume that all inputs (i.e., h, x) are of shape [B, CHW].
"""

def __init__(self, h):
self.u, self.d, self.v = torch.svd(h, some=False)
self.ut = self.u.t()
self.vt = self.v.t()
self.d[self.d < 1e-3] = 0

def U(self, x):
return torch.matmul(self.u, x)

def Ut(self, x):
return torch.matmul(self.ut, x)

def V(self, x):
return torch.matmul(self.v, x)

def Vt(self, x):
return torch.matmul(self.vt, x)

@property
def D(self):
return self.d

def H(self, x):
return self.U(self.D * self.Vt(x)[:, :self.D.size(0)])

def Ht(self, x):
return self.V(self._pad(self.D * self.Ut(x)[:, :self.D.size(0)]))

def Hinv(self, x):
r"""Multiplies x by the pseudo inverse of H.
"""
x = self.Ut(x)
x[:, :self.D.size(0)] = x[:, :self.D.size(0)] / self.D
return self.V(self._pad(x))

def _pad(self, x):
o = x.new_zeros(x.size(0), self.v.size(0))
o[:, :self.u.size(0)] = x.view(x.size(0), -1)
return o

def to(self, *args, **kwargs):
r"""Update the data type and device of UDV matrices.
"""
for k, v in self.__dict__.items():
if isinstance(v, torch.Tensor):
setattr(self, k, v.to(*args, **kwargs))
return self


class IdentitySVD(SVD):

def __init__(self, c, h, w):
self.d = torch.ones(c * h * w)

def U(self, x):
return x.clone()

def Ut(self, x):
return x.clone()

def V(self, x):
return x.clone()

def Vt(self, x):
return x.clone()

def H(self, x):
return x.clone()

def Ht(self, x):
return x.clone()

def Hinv(self, x):
return x.clone()

def _pad(self, x):
return x.clone()


class DenoiseSVD(SVD):

def __init__(self, c, h, w):
self.num_entries = c * h * w
self.d = torch.ones(self.num_entries)

def U(self, x):
return x.clone()

def Ut(self, x):
return x.clone()

def V(self, x):
return x.clone()

def Vt(self, x):
return x.clone()

def _pad(self, x):
return x.clone()


class ColorizationSVD(SVD):

def __init__(self, c, h, w):
self.color_dim = c
self.num_pixels = h * w
self.u, self.d, self.v = torch.svd(torch.ones(1, c) / c, some=False)
self.vt = self.v.t()

def U(self, x):
return self.u[0, 0] * x

def Ut(self, x):
return self.u[0, 0] * x

def V(self, x):
return torch.einsum('ij,bjn->bin', self.v,
x.view(x.size(0), self.color_dim,
self.num_pixels)).flatten(1)

def Vt(self, x):
return torch.einsum('ij,bjn->bin', self.vt,
x.view(x.size(0), self.color_dim,
self.num_pixels)).flatten(1)

@property
def D(self):
return self.d.repeat(self.num_pixels)

def _pad(self, x):
o = x.new_zeros(x.size(0), self.color_dim * self.num_pixels)
o[:, :self.num_pixels] = x
return o
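
A small numerical check of the colourization operator above, assuming flattened [B, CHW] inputs as stated in the SVD docstring: H averages the colour channels per pixel, and H(Hinv(y)) reproduces y.

import torch

from modelscope.models.cv.image_to_image_translation.ops.svd import ColorizationSVD

c, h, w = 3, 4, 4
svd = ColorizationSVD(c, h, w)
x = torch.rand(2, c * h * w)   # two flattened RGB images
gray = svd.H(x)                # [B, h * w]: per-pixel channel average
assert torch.allclose(gray, x.view(2, c, h * w).mean(dim=1), atol=1e-5)
assert torch.allclose(svd.H(svd.Hinv(gray)), gray, atol=1e-5)  # H @ pinv(H) = I on its range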

+ 224
- 0
modelscope/models/cv/image_to_image_translation/ops/utils.py View File

@@ -0,0 +1,224 @@
import base64
import binascii
import hashlib
import math
import os
import os.path as osp
import zipfile
from io import BytesIO
from multiprocessing.pool import ThreadPool as Pool

import cv2
import json
import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image

from .random_color import rand_color

__all__ = [
'ceil_divide', 'to_device', 'rand_name', 'ema', 'parallel', 'unzip',
'load_state_dict', 'inverse_indices', 'detect_duplicates', 'md5', 'rope',
'format_state', 'breakup_grid', 'viz_anno_geometry', 'image_to_base64'
]

TFS_CLIENT = None


def ceil_divide(a, b):
return int(math.ceil(a / b))


def to_device(batch, device, non_blocking=False):
if isinstance(batch, (list, tuple)):
return type(batch)([to_device(u, device, non_blocking) for u in batch])
elif isinstance(batch, dict):
return type(batch)([(k, to_device(v, device, non_blocking))
for k, v in batch.items()])
elif isinstance(batch, torch.Tensor):
return batch.to(device, non_blocking=non_blocking)
return batch


def rand_name(length=8, suffix=''):
name = binascii.b2a_hex(os.urandom(length)).decode('utf-8')
if suffix:
if not suffix.startswith('.'):
suffix = '.' + suffix
name += suffix
return name


@torch.no_grad()
def ema(net_ema, net, beta, copy_buffer=False):
assert 0.0 <= beta <= 1.0
for p_ema, p in zip(net_ema.parameters(), net.parameters()):
p_ema.copy_(p.lerp(p_ema, beta))
if copy_buffer:
for b_ema, b in zip(net_ema.buffers(), net.buffers()):
b_ema.copy_(b)


def parallel(func, args_list, num_workers=32, timeout=None):
assert isinstance(args_list, list)
if not isinstance(args_list[0], tuple):
args_list = [(args, ) for args in args_list]
if num_workers == 0:
return [func(*args) for args in args_list]
with Pool(processes=num_workers) as pool:
results = [pool.apply_async(func, args) for args in args_list]
results = [res.get(timeout=timeout) for res in results]
return results


def unzip(filename, dst_dir=None):
if dst_dir is None:
dst_dir = osp.dirname(filename)
with zipfile.ZipFile(filename, 'r') as zip_ref:
zip_ref.extractall(dst_dir)


def load_state_dict(module, state_dict, drop_prefix=''):
# find incompatible key-vals
src, dst = state_dict, module.state_dict()
if drop_prefix:
src = type(src)([
(k[len(drop_prefix):] if k.startswith(drop_prefix) else k, v)
for k, v in src.items()
])
missing = [k for k in dst if k not in src]
unexpected = [k for k in src if k not in dst]
unmatched = [
k for k in src.keys() & dst.keys() if src[k].shape != dst[k].shape
]

# keep only compatible key-vals
incompatible = set(unexpected + unmatched)
src = type(src)([(k, v) for k, v in src.items() if k not in incompatible])
module.load_state_dict(src, strict=False)

# report incompatible key-vals
if len(missing) != 0:
print(' Missing: ' + ', '.join(missing), flush=True)
if len(unexpected) != 0:
print(' Unexpected: ' + ', '.join(unexpected), flush=True)
if len(unmatched) != 0:
print(' Shape unmatched: ' + ', '.join(unmatched), flush=True)


def inverse_indices(indices):
r"""Inverse map of indices.
E.g., if A[indices] == B, then B[inv_indices] == A.
"""
inv_indices = torch.empty_like(indices)
inv_indices[indices] = torch.arange(len(indices)).to(indices)
return inv_indices


def detect_duplicates(feats, thr=0.9):
assert feats.ndim == 2

# compute simmat
feats = F.normalize(feats, p=2, dim=1)
simmat = torch.mm(feats, feats.T)
simmat.triu_(1)
torch.cuda.synchronize()

# detect duplicates
mask = ~simmat.gt(thr).any(dim=0)
return torch.where(mask)[0]


def md5(filename):
with open(filename, 'rb') as f:
return hashlib.md5(f.read()).hexdigest()


def rope(x):
r"""Apply rotary position embedding on x of shape [B, *(spatial dimensions), C].
"""
# reshape
shape = x.shape
x = x.view(x.size(0), -1, x.size(-1))
l, c = x.shape[-2:]
assert c % 2 == 0
half = c // 2

# apply rotary position embedding on x
sinusoid = torch.outer(
torch.arange(l).to(x),
torch.pow(10000, -torch.arange(half).to(x).div(half)))
sin, cos = torch.sin(sinusoid), torch.cos(sinusoid)
x1, x2 = x.chunk(2, dim=-1)
x = torch.cat([x1 * cos - x2 * sin, x2 * cos + x1 * sin], dim=-1)

# reshape back
return x.view(shape)


def format_state(state, filename=None):
r"""For comparing/aligning state_dict.
"""
content = '\n'.join([f'{k}\t{tuple(v.shape)}' for k, v in state.items()])
    if filename:
        with open(filename, 'w') as f:
            f.write(content)
    return content


def breakup_grid(img, grid_size):
r"""The inverse operator of ``torchvision.utils.make_grid``.
"""
# params
nrow = img.height // grid_size
ncol = img.width // grid_size
wrow = wcol = 2 # NOTE: use default values here

# collect grids
grids = []
for i in range(nrow):
for j in range(ncol):
x1 = j * grid_size + (j + 1) * wcol
y1 = i * grid_size + (i + 1) * wrow
grids.append(img.crop((x1, y1, x1 + grid_size, y1 + grid_size)))
return grids


def viz_anno_geometry(item):
r"""Visualize an annotation item from SmartLabel.
"""
if isinstance(item, str):
item = json.loads(item)
assert isinstance(item, dict)

# read image
orig_img = read_image(item['image_url'], retry=100)
img = cv2.cvtColor(np.asarray(orig_img), cv2.COLOR_BGR2RGB)

# loop over geometries
for geometry in item['sd_result']['items']:
# params
poly_img = img.copy()
color = rand_color()
points = np.array(geometry['meta']['geometry']).round().astype(int)
line_color = tuple([int(u * 0.55) for u in color])

# draw polygons
poly_img = cv2.fillPoly(poly_img, pts=[points], color=color)
poly_img = cv2.polylines(
poly_img,
pts=[points],
isClosed=True,
color=line_color,
thickness=2)

# mixing
img = np.clip(0.25 * img + 0.75 * poly_img, 0, 255).astype(np.uint8)
return orig_img, Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))


def image_to_base64(img, format='JPEG'):
buffer = BytesIO()
img.save(buffer, format=format)
code = base64.b64encode(buffer.getvalue()).decode('utf-8')
return code
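
Two of the helpers above in action (import path per this diff): rope keeps the input shape, and inverse_indices undoes an index permutation.

import torch

from modelscope.models.cv.image_to_image_translation.ops.utils import (
    ceil_divide, inverse_indices, rope)

x = torch.randn(2, 16, 64)       # [B, tokens, C] with an even channel dimension
assert rope(x).shape == x.shape  # rotary position embedding is shape-preserving

idx = torch.randperm(10)
a = torch.arange(10)
assert torch.equal(a[idx][inverse_indices(idx)], a)
assert ceil_divide(10, 3) == 4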

+ 1
- 0
modelscope/pipelines/cv/__init__.py View File

@@ -15,6 +15,7 @@ if TYPE_CHECKING:
from .image_color_enhance_pipeline import ImageColorEnhancePipeline
from .image_colorization_pipeline import ImageColorizationPipeline
from .image_instance_segmentation_pipeline import ImageInstanceSegmentationPipeline
from .image_to_image_translation_pipeline import Image2ImageTranslationPipeline
from .video_category_pipeline import VideoCategoryPipeline
from .image_matting_pipeline import ImageMattingPipeline
from .image_super_resolution_pipeline import ImageSuperResolutionPipeline


+ 325
- 0
modelscope/pipelines/cv/image_to_image_translation_pipeline.py View File

@@ -0,0 +1,325 @@
import io
import os.path as osp
import sys
from typing import Any, Dict

import cv2
import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.utils import save_image

import modelscope.models.cv.image_to_image_translation.data as data
import modelscope.models.cv.image_to_image_translation.models as models
import modelscope.models.cv.image_to_image_translation.ops as ops
from modelscope.fileio import File
from modelscope.metainfo import Pipelines
from modelscope.models.cv.image_to_image_translation.model_translation import \
UNet
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import load_image
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()


def save_grid(imgs, filename, nrow=5):
save_image(
imgs.clamp(-1, 1), filename, range=(-1, 1), normalize=True, nrow=nrow)


@PIPELINES.register_module(
Tasks.image_generation, module_name=Pipelines.image2image_translation)
class Image2ImageTranslationPipeline(Pipeline):

def __init__(self, model: str, **kwargs):
"""
        use `model` to create an image-to-image translation pipeline for prediction
Args:
model: model id on modelscope hub.
"""
super().__init__(model=model)
config_path = osp.join(self.model, ModelFile.CONFIGURATION)
logger.info(f'loading config from {config_path}')
self.cfg = Config.from_file(config_path)
if torch.cuda.is_available():
self._device = torch.device('cuda')
else:
self._device = torch.device('cpu')
self.repetition = 4
# load autoencoder model
ae_model_path = osp.join(self.model, self.cfg.ModelPath.ae_model_path)
logger.info(f'loading autoencoder model from {ae_model_path}')
self.autoencoder = models.VQAutoencoder(
dim=self.cfg.Params.ae.ae_dim,
z_dim=self.cfg.Params.ae.ae_z_dim,
dim_mult=self.cfg.Params.ae.ae_dim_mult,
attn_scales=self.cfg.Params.ae.ae_attn_scales,
codebook_size=self.cfg.Params.ae.ae_codebook_size).eval(
).requires_grad_(False).to(self._device) # noqa E123
self.autoencoder.load_state_dict(
torch.load(ae_model_path, map_location=self._device))
logger.info('load autoencoder model done')

# load palette model
palette_model_path = osp.join(self.model, ModelFile.TORCH_MODEL_FILE)
logger.info(f'loading palette model from {palette_model_path}')
self.palette = UNet(
resolution=self.cfg.Params.unet.unet_resolution,
in_dim=self.cfg.Params.unet.unet_in_dim,
dim=self.cfg.Params.unet.unet_dim,
context_dim=self.cfg.Params.unet.unet_context_dim,
out_dim=self.cfg.Params.unet.unet_out_dim,
dim_mult=self.cfg.Params.unet.unet_dim_mult,
num_heads=self.cfg.Params.unet.unet_num_heads,
head_dim=None,
num_res_blocks=self.cfg.Params.unet.unet_res_blocks,
attn_scales=self.cfg.Params.unet.unet_attn_scales,
num_classes=self.cfg.Params.unet.unet_num_classes + 1,
dropout=self.cfg.Params.unet.unet_dropout).eval().requires_grad_(
False).to(self._device)
self.palette.load_state_dict(
torch.load(palette_model_path, map_location=self._device))
logger.info('load palette model done')

# diffusion
        logger.info('Initializing diffusion ...')
betas = ops.beta_schedule(self.cfg.Params.diffusion.schedule,
self.cfg.Params.diffusion.num_timesteps)
self.diffusion = ops.GaussianDiffusion(
betas=betas,
mean_type=self.cfg.Params.diffusion.mean_type,
var_type=self.cfg.Params.diffusion.var_type,
loss_type=self.cfg.Params.diffusion.loss_type,
rescale_timesteps=False)

self.transforms = T.Compose([
data.PadToSquare(),
T.Resize(
self.cfg.DATA.scale_size,
interpolation=T.InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=self.cfg.DATA.mean, std=self.cfg.DATA.std)
])

def preprocess(self, input: Input) -> Dict[str, Any]:
if len(input) == 3: # colorization
_, input_type, save_path = input
elif len(input) == 4: # uncropping or in-painting
_, meta, input_type, save_path = input
if input_type == 0: # uncropping
assert meta in ['up', 'down', 'left', 'right']
direction = meta

list_ = []
for i in range(len(input) - 2):
input_img = input[i]
if input_img in ['up', 'down', 'left', 'right']:
continue
if isinstance(input_img, str):
if input_type == 2 and i == 0:
                    logger.info('Loading raw image (keeping alpha channel) ...')
bytes = File.read(input_img)
img = Image.open(io.BytesIO(bytes))
assert len(img.split()) == 4
else:
img = load_image(input_img)
            elif isinstance(input_img, Image.Image):
img = input_img.convert('RGB')
elif isinstance(input_img, np.ndarray):
if len(input_img.shape) == 2:
input_img = cv2.cvtColor(input_img, cv2.COLOR_GRAY2BGR)
img = input_img[:, :, ::-1]
img = Image.fromarray(img.astype('uint8')).convert('RGB')
else:
raise TypeError(f'input should be either str, PIL.Image,'
f' np.array, but got {type(input)}')
list_.append(img)
img_list = []
if input_type != 2:
for img in list_:
img = self.transforms(img)
imgs = torch.unsqueeze(img, 0)
imgs = imgs.to(self._device)
img_list.append(imgs)
elif input_type == 2:
mask, masked_img = list_[0], list_[1]
img = self.transforms(masked_img.convert('RGB'))
mask = torch.from_numpy(
np.array(
mask.resize((img.shape[2], img.shape[1])),
dtype=np.float32)[:, :, -1] / 255.0).unsqueeze(0)
img = (1 - mask) * img + mask * torch.randn_like(img).clamp_(-1, 1)
imgs = img.unsqueeze(0).to(self._device)
b, c, h, w = imgs.shape
y = torch.LongTensor([self.cfg.Classes.class_id]).to(self._device)

if input_type == 0:
assert len(img_list) == 1
result = {
'image_data': img_list[0],
'c': c,
'h': h,
'w': w,
'direction': direction,
'type': input_type,
'y': y,
'save_path': save_path
}
elif input_type == 1:
assert len(img_list) == 1
result = {
'image_data': img_list[0],
'c': c,
'h': h,
'w': w,
'type': input_type,
'y': y,
'save_path': save_path
}
elif input_type == 2:
result = {
'image_data': imgs,
# 'image_mask': mask,
'c': c,
'h': h,
'w': w,
'type': input_type,
'y': y,
'save_path': save_path
}
return result

@torch.no_grad()
def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
type_ = input['type']
if type_ == 0:
# Uncropping
img = input['image_data']
direction = input['direction']
y = input['y']

# fix seed
torch.manual_seed(1 * 8888)
torch.cuda.manual_seed(1 * 8888)

logger.info(f'Processing {direction} uncropping')
img = img.clone()
i_y = y.repeat(self.repetition, 1)
if direction == 'up':
img[:, :, input['h'] // 2:, :] = torch.randn_like(
img[:, :, input['h'] // 2:, :])
elif direction == 'down':
img[:, :, :input['h'] // 2, :] = torch.randn_like(
img[:, :, :input['h'] // 2, :])
elif direction == 'left':
img[:, :, :,
input['w'] // 2:] = torch.randn_like(img[:, :, :,
input['w'] // 2:])
elif direction == 'right':
img[:, :, :, :input['w'] // 2] = torch.randn_like(
img[:, :, :, :input['w'] // 2])
i_concat = self.autoencoder.encode(img).repeat(
self.repetition, 1, 1, 1)

# sample images
x0 = self.diffusion.ddim_sample_loop(
noise=torch.randn_like(i_concat),
model=self.palette,
model_kwargs=[{
'y': i_y,
'concat': i_concat
}, {
'y':
torch.full_like(i_y,
self.cfg.Params.unet.unet_num_classes),
'concat':
i_concat
}],
guide_scale=1.0,
clamp=None,
ddim_timesteps=50,
eta=1.0)
i_gen_imgs = self.autoencoder.decode(x0)
save_grid(i_gen_imgs, input['save_path'], nrow=4)
return {OutputKeys.OUTPUT_IMG: i_gen_imgs}

elif type_ == 1:
# Colorization #
img = input['image_data']
y = input['y']
# fix seed
torch.manual_seed(1 * 8888)
torch.cuda.manual_seed(1 * 8888)

logger.info('Processing Colorization')
img = img.clone()
img = img.mean(dim=1, keepdim=True).repeat(1, 3, 1, 1)
i_concat = self.autoencoder.encode(img).repeat(
self.repetition, 1, 1, 1)
i_y = y.repeat(self.repetition, 1)

# sample images
x0 = self.diffusion.ddim_sample_loop(
noise=torch.randn_like(i_concat),
model=self.palette,
model_kwargs=[{
'y': i_y,
'concat': i_concat
}, {
'y':
torch.full_like(i_y,
self.cfg.Params.unet.unet_num_classes),
'concat':
i_concat
}],
guide_scale=1.0,
clamp=None,
ddim_timesteps=50,
eta=0.0)
i_gen_imgs = self.autoencoder.decode(x0)
save_grid(i_gen_imgs, input['save_path'], nrow=4)
return {OutputKeys.OUTPUT_IMG: i_gen_imgs}
elif type_ == 2:
# Combination #
logger.info('Processing Combination')

# prepare inputs
img = input['image_data']
concat = self.autoencoder.encode(img).repeat(
self.repetition, 1, 1, 1)
y = torch.LongTensor([126]).unsqueeze(0).to(self._device).repeat(
self.repetition, 1)

# sample images
x0 = self.diffusion.ddim_sample_loop(
noise=torch.randn_like(concat),
model=self.palette,
model_kwargs=[{
'y': y,
'concat': concat
}, {
'y':
torch.full_like(y, self.cfg.Params.unet.unet_num_classes),
'concat':
concat
}],
guide_scale=1.0,
clamp=None,
ddim_timesteps=50,
eta=1.0)
i_gen_imgs = self.autoencoder.decode(x0)
save_grid(i_gen_imgs, input['save_path'], nrow=4)
return {OutputKeys.OUTPUT_IMG: i_gen_imgs}
else:
raise TypeError(
                f'input type should be 0 (Uncropping), 1 (Colorization), 2 (Combination)'
                f' but got {type_}')

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs

+ 38
- 0
tests/pipelines/test_image2image_translation.py View File

@@ -0,0 +1,38 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os.path as osp
import shutil
import unittest

from modelscope.fileio import File
from modelscope.msdatasets import MsDataset
from modelscope.pipelines import pipeline
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.test_utils import test_level


class Image2ImageTranslationTest(unittest.TestCase):

@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_modelhub(self):
r"""We provide three translation modes, i.e., uncropping, colorization and combination.
        You can pass the following parameters for the different modes.
1. Uncropping Mode:
result = img2img_gen_pipeline(('data/test/images/img2img_input.jpg', 'left', 0, 'result.jpg'))
2. Colorization Mode:
result = img2img_gen_pipeline(('data/test/images/img2img_input.jpg', 1, 'result.jpg'))
3. Combination Mode:
just like the following code.
"""
img2img_gen_pipeline = pipeline(
Tasks.image_generation,
model='damo/cv_latent_diffusion_image2image_translation')
result = img2img_gen_pipeline(
('data/test/images/img2img_input_mask.png',
'data/test/images/img2img_input_masked_img.png', 2,
'result.jpg')) # combination mode

print(f'output: {result}.')


if __name__ == '__main__':
unittest.main()
