
Merge remote-tracking branch 'origin' into feat/fill_mask

Conflicts:
	modelscope/models/nlp/__init__.py
	modelscope/pipelines/builder.py
	modelscope/pipelines/outputs.py
	modelscope/preprocessors/nlp.py
	requirements/nlp.txt
master
suluyan 3 years ago
parent commit cf75f2aeec
100 changed files with 11830 additions and 220 deletions
  1. +3 -0 .gitattributes
  2. +5 -1 .gitignore
  3. +3 -0 data/test/images/image1.jpg
  4. +3 -0 data/test/images/image_captioning.png
  5. +3 -0 data/test/images/image_matting.png
  6. +3 -0 data/test/images/ocr_detection.jpg
  7. +49 -0 docs/source/develop.md
  8. +12 -0 docs/source/faq.md
  9. +4 -0 docs/source/quick_start.md
  10. +3 -0 modelscope/models/__init__.py
  11. +0 -0 modelscope/models/audio/__init__.py
  12. +0 -0 modelscope/models/audio/layers/__init__.py
  13. +60 -0 modelscope/models/audio/layers/activations.py
  14. +78 -0 modelscope/models/audio/layers/affine_transform.py
  15. +178 -0 modelscope/models/audio/layers/deep_fsmn.py
  16. +50 -0 modelscope/models/audio/layers/layer_base.py
  17. +482 -0 modelscope/models/audio/layers/uni_deep_fsmn.py
  18. +0 -0 modelscope/models/audio/network/__init__.py
  19. +394 -0 modelscope/models/audio/network/loss.py
  20. +248 -0 modelscope/models/audio/network/modulation_loss.py
  21. +483 -0 modelscope/models/audio/network/se_net.py
  22. +0 -0 modelscope/models/audio/tts/__init__.py
  23. +1 -0 modelscope/models/audio/tts/am/__init__.py
  24. +8 -0 modelscope/models/audio/tts/am/models/__init__.py
  25. +82 -0 modelscope/models/audio/tts/am/models/compat.py
  26. +273 -0 modelscope/models/audio/tts/am/models/fsmn.py
  27. +178 -0 modelscope/models/audio/tts/am/models/fsmn_encoder.py
  28. +160 -0 modelscope/models/audio/tts/am/models/helpers.py
  29. +461 -0 modelscope/models/audio/tts/am/models/modules.py
  30. +174 -0 modelscope/models/audio/tts/am/models/position.py
  31. +155 -0 modelscope/models/audio/tts/am/models/reducer.py
  32. +240 -0 modelscope/models/audio/tts/am/models/rnn_wrappers.py
  33. +760 -0 modelscope/models/audio/tts/am/models/robutrans.py
  34. +817 -0 modelscope/models/audio/tts/am/models/self_attention_decoder.py
  35. +182 -0 modelscope/models/audio/tts/am/models/self_attention_encoder.py
  36. +1157 -0 modelscope/models/audio/tts/am/models/transformer.py
  37. +255 -0 modelscope/models/audio/tts/am/sambert_hifi_16k.py
  38. +0 -0 modelscope/models/audio/tts/am/text/__init__.py
  39. +89 -0 modelscope/models/audio/tts/am/text/cleaners.py
  40. +64 -0 modelscope/models/audio/tts/am/text/cmudict.py
  41. +70 -0 modelscope/models/audio/tts/am/text/numbers.py
  42. +95 -0 modelscope/models/audio/tts/am/text/symbols.py
  43. +200 -0 modelscope/models/audio/tts/am/text/symbols_dict.py
  44. +1 -0 modelscope/models/audio/tts/frontend/__init__.py
  45. +39 -0 modelscope/models/audio/tts/frontend/generic_text_to_speech_frontend.py
  46. +1 -0 modelscope/models/audio/tts/vocoder/__init__.py
  47. +73 -0 modelscope/models/audio/tts/vocoder/hifigan16k.py
  48. +1 -0 modelscope/models/audio/tts/vocoder/models/__init__.py
  49. +516 -0 modelscope/models/audio/tts/vocoder/models/models.py
  50. +59 -0 modelscope/models/audio/tts/vocoder/models/utils.py
  51. +2 -0 modelscope/models/base.py
  52. +1 -0 modelscope/models/multi_model/__init__.py
  53. +80 -0 modelscope/models/multi_model/image_captioning_model.py
  54. +4 -3 modelscope/models/nlp/__init__.py
  55. +0 -0 modelscope/models/nlp/bert_for_sequence_classification.py
  56. +43 -0 modelscope/models/nlp/palm_for_text_generation.py
  57. +0 -0 modelscope/models/nlp/sbert_for_sentence_similarity.py
  58. +56 -0 modelscope/models/nlp/sbert_for_token_classification.py
  59. +0 -52 modelscope/models/nlp/text_generation_model.py
  60. +1 -1 modelscope/pipelines/__init__.py
  61. +2 -0 modelscope/pipelines/audio/__init__.py
  62. +160 -0 modelscope/pipelines/audio/linear_aec_pipeline.py
  63. +46 -0 modelscope/pipelines/audio/text_to_speech_pipeline.py
  64. +9 -4 modelscope/pipelines/builder.py
  65. +1 -0 modelscope/pipelines/cv/__init__.py
  66. +167 -0 modelscope/pipelines/cv/ocr_detection_pipeline.py
  67. +0 -0 modelscope/pipelines/cv/ocr_utils/__init__.py
  68. +158 -0 modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py
  69. +1098 -0 modelscope/pipelines/cv/ocr_utils/ops.py
  70. +432 -0 modelscope/pipelines/cv/ocr_utils/resnet18_v1.py
  71. +231 -0 modelscope/pipelines/cv/ocr_utils/resnet_utils.py
  72. +108 -0 modelscope/pipelines/cv/ocr_utils/utils.py
  73. +1 -1 modelscope/pipelines/multi_modal/__init__.py
  74. +33 -0 modelscope/pipelines/multi_modal/image_captioning_pipeline.py
  75. +1 -0 modelscope/pipelines/nlp/__init__.py
  76. +0 -3 modelscope/pipelines/nlp/sentence_similarity_pipeline.py
  77. +0 -3 modelscope/pipelines/nlp/sequence_classification_pipeline.py
  78. +22 -19 modelscope/pipelines/nlp/text_generation_pipeline.py
  79. +69 -0 modelscope/pipelines/nlp/word_segmentation_pipeline.py
  80. +26 -0 modelscope/pipelines/outputs.py
  81. +3 -0 modelscope/preprocessors/__init__.py
  82. +231 -0 modelscope/preprocessors/audio.py
  83. +41 -48 modelscope/preprocessors/multi_model.py
  84. +78 -22 modelscope/preprocessors/nlp.py
  85. +51 -0 modelscope/preprocessors/text_to_speech.py
  86. +22 -0 modelscope/pydatasets/config.py
  87. +323 -58 modelscope/pydatasets/py_dataset.py
  88. +0 -0 modelscope/pydatasets/utils/__init__.py
  89. +66 -0 modelscope/pydatasets/utils/ms_api.py
  90. +0 -0 modelscope/utils/audio/__init__.py
  91. +42 -0 modelscope/utils/audio/tts_exceptions.py
  92. +2 -0 modelscope/utils/constant.py
  93. +0 -1 modelscope/utils/registry.py
  94. +15 -0 modelscope/utils/test_utils.py
  95. +1 -0 requirements.txt
  96. +26 -0 requirements/audio.txt
  97. +1 -0 requirements/cv.txt
  98. +2 -1 requirements/runtime.txt
  99. +2 -1 setup.cfg
  100. +1 -2 tests/pipelines/test_base.py

+3 -0 .gitattributes

@@ -0,0 +1,3 @@
*.png filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
*.mp4 filter=lfs diff=lfs merge=lfs -text

+5 -1 .gitignore

@@ -24,6 +24,7 @@ wheels/
.installed.cfg
*.egg
/package
/temp
MANIFEST

# PyInstaller
@@ -104,7 +105,6 @@ venv.bak/
# mypy
.mypy_cache/

data
.vscode
.idea

@@ -124,3 +124,7 @@ replace.sh

# Pytorch
*.pth


# audio
*.wav

+3 -0 data/test/images/image1.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:78094cc48fbcfd9b6d321fe13619ecc72b65e006fc1b4c4458409ade9979486d
size 129862

+3 -0 data/test/images/image_captioning.png

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:af83a94899a6d23339c3ecc5c4c58c57c835af57b531a2f4c50461184f820141
size 603621

+3 -0 data/test/images/image_matting.png

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:af83a94899a6d23339c3ecc5c4c58c57c835af57b531a2f4c50461184f820141
size 603621

+3 -0 data/test/images/ocr_detection.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5c8435db5583400be5d11a2c17910c96133b462c8a99ccaf0e19f4aac34e0a94
size 141149

+49 -0 docs/source/develop.md

@@ -91,6 +91,55 @@ make tests

4. Daily regression tests will run all cases at 0:00 each day using the master branch.

### 2.3 Test data storage

Since we need a lot of data for testing, including images, videos, and models, we use git lfs
to store those large files.

1. Install git-lfs.
For macOS:
```bash
brew install git-lfs
git lfs install
```

For CentOS, download the rpm from the git-lfs GitHub releases [website](https://github.com/git-lfs/git-lfs/releases/tag/v3.2.0):
```bash
wget http://101374-public.oss-cn-hangzhou-zmf.aliyuncs.com/git-lfs-3.2.0-1.el7.x86_64.rpm
sudo rpm -ivh git-lfs-3.2.0-1.el7.x86_64.rpm
git lfs install
```

For Ubuntu:
```bash
curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
sudo apt-get install git-lfs
git lfs install
```

2. Track your data type with git lfs. For example, to track png files:
```bash
git lfs track "*.png"
```

3. Add your test files to the `data/test/` folder; you can create subdirectories as needed.
```bash
git add data/test/test.png
```

4. Commit your test data to the remote branch.
```bash
git commit -m "xxx"
```

To pull data from the remote repo, use the same workflow as pulling regular git files.
```bash
git pull origin branch_name
```




## Code Review

1. Run the following command to create an aone CR, replacing `TARGET_BRANCH` and `CR_NAME` with the ones you want.


+12 -0 docs/source/faq.md

@@ -29,3 +29,15 @@ reference: [https://huggingface.co/docs/tokenizers/installation#installation-fro
> ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.

Version conflicts may occur because of incompatibilities between dependency libraries; in most cases this does not affect normal operation.

### 3. Version errors when installing pytorch

> ERROR: Ignored the following versions that require a different python version: 1.1.0 Requires-Python >=3.8; 1.1.0rc1 Requires-Python >=3.8; 1.1.1 Requires-Python >=3.8
> ERROR: Could not find a version that satisfies the requirement torch==1.8.1+cu111 (from versions: 1.0.0, 1.0.1, 1.0.1.post2, 1.1.0, 1.2.0, 1.3.0, 1.3.1, 1.4.0, 1.5.0, 1.5.1, 1.6.0, 1.7.0, 1.7.1, 1.8.0, 1.8.1, 1.9.0, 1.9.1, 1.10.0, 1.10.1, 1.10.2, 1.11.0)
> ERROR: No matching distribution found for torch==1.8.1+cu111

Use the following command when installing:

```shell
pip install -f https://download.pytorch.org/whl/torch_stable.html -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt
```

+4 -0 docs/source/quick_start.md

@@ -25,6 +25,10 @@ The ModelScope Library currently supports both tensorflow and pytorch for model training and inference
* [Pytorch安装指导](https://pytorch.org/get-started/locally/)
* [Tensorflow安装指导](https://www.tensorflow.org/install/pip)

Some third-party dependencies require numpy to be installed beforehand
```
pip install numpy
```

## ModelScope library installation



+3 -0 modelscope/models/__init__.py

@@ -1,5 +1,8 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from .audio.tts.am import SambertNetHifi16k
from .audio.tts.vocoder import Hifigan16k
from .base import Model
from .builder import MODELS, build_model
from .multi_model import OfaForImageCaptioning
from .nlp import BertForSequenceClassification, SbertForSentenceSimilarity

+0 -0 modelscope/models/audio/__init__.py


+0 -0 modelscope/models/audio/layers/__init__.py


+60 -0 modelscope/models/audio/layers/activations.py

@@ -0,0 +1,60 @@
import torch.nn as nn

from .layer_base import LayerBase


class RectifiedLinear(LayerBase):

def __init__(self, input_dim, output_dim):
super(RectifiedLinear, self).__init__()
self.dim = input_dim
self.relu = nn.ReLU()

def forward(self, input):
return self.relu(input)

def to_kaldi_nnet(self):
re_str = ''
re_str += '<RectifiedLinear> %d %d\n' % (self.dim, self.dim)
return re_str

def load_kaldi_nnet(self, instr):
return instr


class LogSoftmax(LayerBase):

def __init__(self, input_dim, output_dim):
super(LogSoftmax, self).__init__()
self.dim = input_dim
self.ls = nn.LogSoftmax()

def forward(self, input):
return self.ls(input)

def to_kaldi_nnet(self):
re_str = ''
re_str += '<Softmax> %d %d\n' % (self.dim, self.dim)
return re_str

def load_kaldi_nnet(self, instr):
return instr


class Sigmoid(LayerBase):

def __init__(self, input_dim, output_dim):
super(Sigmoid, self).__init__()
self.dim = input_dim
self.sig = nn.Sigmoid()

def forward(self, input):
return self.sig(input)

def to_kaldi_nnet(self):
re_str = ''
re_str += '<Sigmoid> %d %d\n' % (self.dim, self.dim)
return re_str

def load_kaldi_nnet(self, instr):
return instr

+78 -0 modelscope/models/audio/layers/affine_transform.py

@@ -0,0 +1,78 @@
import numpy as np
import torch as th
import torch.nn as nn

from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number,
to_kaldi_matrix)


class AffineTransform(LayerBase):

def __init__(self, input_dim, output_dim):
super(AffineTransform, self).__init__()
self.input_dim = input_dim
self.output_dim = output_dim
self.linear = nn.Linear(input_dim, output_dim)

def forward(self, input):
return self.linear(input)

def to_kaldi_nnet(self):
re_str = ''
re_str += '<AffineTransform> %d %d\n' % (self.output_dim,
self.input_dim)
re_str += '<LearnRateCoef> 1 <BiasLearnRateCoef> 1 <MaxNorm> 0\n'
linear_weights = self.state_dict()['linear.weight']
x = linear_weights.squeeze().numpy()
re_str += to_kaldi_matrix(x)
linear_bias = self.state_dict()['linear.bias']
x = linear_bias.squeeze().numpy()
re_str += to_kaldi_matrix(x)
return re_str

def to_raw_nnet(self, fid):
linear_weights = self.state_dict()['linear.weight']
x = linear_weights.squeeze().numpy()
x.tofile(fid)

linear_bias = self.state_dict()['linear.bias']
x = linear_bias.squeeze().numpy()
x.tofile(fid)

def load_kaldi_nnet(self, instr):
output = expect_token_number(
instr,
'<LearnRateCoef>',
)
if output is None:
raise Exception('AffineTransform format error for <LearnRateCoef>')
instr, lr = output

output = expect_token_number(instr, '<BiasLearnRateCoef>')
if output is None:
raise Exception(
'AffineTransform format error for <BiasLearnRateCoef>')
instr, lr = output

output = expect_token_number(instr, '<MaxNorm>')
if output is None:
raise Exception('AffineTransform format error for <MaxNorm>')
instr, lr = output

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('AffineTransform format error for parsing matrix')
instr, mat = output

print(mat.shape)
self.linear.weight = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('AffineTransform format error for parsing matrix')
instr, mat = output
mat = np.squeeze(mat)
self.linear.bias = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))
return instr
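
A minimal, hedged usage sketch (illustrative only, not part of the diff) for the layer above; it assumes the repo and its audio requirements are installed, and the Kaldi text export relies on the print options set in `to_kaldi_matrix`:

```python
# Sketch: AffineTransform is a thin nn.Linear wrapper with Kaldi nnet1
# text-format (de)serialization via to_kaldi_nnet()/load_kaldi_nnet().
import torch

from modelscope.models.audio.layers.affine_transform import AffineTransform

layer = AffineTransform(input_dim=4, output_dim=2)
x = torch.randn(3, 4)                           # (batch, input_dim)
y = layer(x)                                    # (batch, output_dim)
header = layer.to_kaldi_nnet().splitlines()[0]  # component tag + (outdim, indim)
print(y.shape, header)                          # torch.Size([3, 2]) <AffineTransform> 2 4
```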

+178 -0 modelscope/models/audio/layers/deep_fsmn.py

@@ -0,0 +1,178 @@
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F

from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number,
to_kaldi_matrix)


class DeepFsmn(LayerBase):

def __init__(self,
input_dim,
output_dim,
lorder=None,
rorder=None,
hidden_size=None,
layer_norm=False,
dropout=0):
super(DeepFsmn, self).__init__()

self.input_dim = input_dim
self.output_dim = output_dim

if lorder is None:
return

self.lorder = lorder
self.rorder = rorder
self.hidden_size = hidden_size
self.layer_norm = layer_norm

self.linear = nn.Linear(input_dim, hidden_size)
self.norm = nn.LayerNorm(hidden_size)
self.drop1 = nn.Dropout(p=dropout)
self.drop2 = nn.Dropout(p=dropout)
self.project = nn.Linear(hidden_size, output_dim, bias=False)

self.conv1 = nn.Conv2d(
output_dim,
output_dim, [lorder, 1], [1, 1],
groups=output_dim,
bias=False)
self.conv2 = nn.Conv2d(
output_dim,
output_dim, [rorder, 1], [1, 1],
groups=output_dim,
bias=False)

def forward(self, input):

f1 = F.relu(self.linear(input))

f1 = self.drop1(f1)
if self.layer_norm:
f1 = self.norm(f1)

p1 = self.project(f1)

x = th.unsqueeze(p1, 1)

x_per = x.permute(0, 3, 2, 1)

y = F.pad(x_per, [0, 0, self.lorder - 1, 0])
yr = F.pad(x_per, [0, 0, 0, self.rorder])
yr = yr[:, :, 1:, :]

out = x_per + self.conv1(y) + self.conv2(yr)
out = self.drop2(out)

out1 = out.permute(0, 3, 2, 1)

return input + out1.squeeze()

def to_kaldi_nnet(self):
re_str = ''
re_str += '<UniDeepFsmn> %d %d\n'\
% (self.output_dim, self.input_dim)
re_str += '<LearnRateCoef> %d <HidSize> %d <LOrder> %d <LStride> %d <MaxNorm> 0\n'\
% (1, self.hidden_size, self.lorder, 1)
lfiters = self.state_dict()['conv1.weight']
x = np.flipud(lfiters.squeeze().numpy().T)
re_str += to_kaldi_matrix(x)
proj_weights = self.state_dict()['project.weight']
x = proj_weights.squeeze().numpy()
re_str += to_kaldi_matrix(x)
linear_weights = self.state_dict()['linear.weight']
x = linear_weights.squeeze().numpy()
re_str += to_kaldi_matrix(x)
linear_bias = self.state_dict()['linear.bias']
x = linear_bias.squeeze().numpy()
re_str += to_kaldi_matrix(x)
return re_str

def load_kaldi_nnet(self, instr):
output = expect_token_number(
instr,
'<LearnRateCoef>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <LearnRateCoef>')
instr, lr = output

output = expect_token_number(
instr,
'<HidSize>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <HidSize>')
instr, hiddensize = output
self.hidden_size = int(hiddensize)

output = expect_token_number(
instr,
'<LOrder>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <LOrder>')
instr, lorder = output
self.lorder = int(lorder)

output = expect_token_number(
instr,
'<LStride>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <LStride>')
instr, lstride = output
self.lstride = lstride

output = expect_token_number(
instr,
'<MaxNorm>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <MaxNorm>')

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output
mat1 = np.fliplr(mat.T).copy()
self.conv1 = nn.Conv2d(
self.output_dim,
self.output_dim, [self.lorder, 1], [1, 1],
groups=self.output_dim,
bias=False)
mat_th = th.from_numpy(mat1).type(th.FloatTensor)
mat_th = mat_th.unsqueeze(1)
mat_th = mat_th.unsqueeze(3)
self.conv1.weight = th.nn.Parameter(mat_th)

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output

self.project = nn.Linear(self.hidden_size, self.output_dim, bias=False)
self.linear = nn.Linear(self.input_dim, self.hidden_size)

self.project.weight = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output
self.linear.weight = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output
self.linear.bias = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

return instr

+50 -0 modelscope/models/audio/layers/layer_base.py

@@ -0,0 +1,50 @@
import abc
import re

import numpy as np
import torch.nn as nn


def expect_token_number(instr, token):
first_token = re.match(r'^\s*' + token, instr)
if first_token is None:
return None
instr = instr[first_token.end():]
lr = re.match(r'^\s*(-?\d+\.?\d*e?-?\d*?)', instr)
if lr is None:
return None
return instr[lr.end():], lr.groups()[0]


def expect_kaldi_matrix(instr):
pos2 = instr.find('[', 0)
pos3 = instr.find(']', pos2)
mat = []
for stt in instr[pos2 + 1:pos3].split('\n'):
tmp_mat = np.fromstring(stt, dtype=np.float32, sep=' ')
if tmp_mat.size > 0:
mat.append(tmp_mat)
return instr[pos3 + 1:], np.array(mat)


def to_kaldi_matrix(np_mat):
"""
function that transform as str numpy mat to standard kaldi str matrix
:param np_mat: numpy mat
:return: str
"""
np.set_printoptions(threshold=np.inf, linewidth=np.nan, suppress=True)
out_str = str(np_mat)
out_str = out_str.replace('[', '')
out_str = out_str.replace(']', '')
return '[ %s ]\n' % out_str


class LayerBase(nn.Module, metaclass=abc.ABCMeta):

def __init__(self):
super(LayerBase, self).__init__()

@abc.abstractmethod
def to_kaldi_nnet(self):
pass
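
A hedged round-trip sketch (illustrative only, not part of the diff) for the Kaldi text-matrix helpers above; it assumes the repo is importable and that the installed numpy accepts the print options `to_kaldi_matrix` sets:

```python
import numpy as np

from modelscope.models.audio.layers.layer_base import (expect_kaldi_matrix,
                                                       to_kaldi_matrix)

mat = np.arange(6, dtype=np.float32).reshape(2, 3)
text = to_kaldi_matrix(mat)              # Kaldi text form, e.g. '[ 0. 1. 2.\n 3. 4. 5. ]\n'
remainder, parsed = expect_kaldi_matrix(text)
assert np.allclose(parsed, mat)          # the text form parses back losslessly
```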

+482 -0 modelscope/models/audio/layers/uni_deep_fsmn.py

@@ -0,0 +1,482 @@
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F

from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number,
to_kaldi_matrix)


class SepConv(nn.Module):

def __init__(self,
in_channels,
filters,
out_channels,
kernel_size=(5, 2),
dilation=(1, 1)):
""" :param kernel_size (time, frequency)

"""
super(SepConv, self).__init__()
# depthwise + pointwise
self.dconv = nn.Conv2d(
in_channels,
in_channels * filters,
kernel_size,
dilation=dilation,
groups=in_channels)
self.pconv = nn.Conv2d(
in_channels * filters, out_channels, kernel_size=1)
self.padding = dilation[0] * (kernel_size[0] - 1)

def forward(self, input):
''' input: [B, C, T, F]
'''
x = F.pad(input, [0, 0, self.padding, 0])
x = self.dconv(x)
x = self.pconv(x)
return x


class Conv2d(nn.Module):

def __init__(self,
input_dim,
output_dim,
lorder=20,
rorder=0,
groups=1,
bias=False,
skip_connect=True):
super(Conv2d, self).__init__()
self.lorder = lorder
self.conv = nn.Conv2d(
input_dim, output_dim, [lorder, 1], groups=groups, bias=bias)
self.rorder = rorder
if self.rorder:
self.conv2 = nn.Conv2d(
input_dim, output_dim, [rorder, 1], groups=groups, bias=bias)
self.skip_connect = skip_connect

def forward(self, input):
# [B, 1, T, F]
x = th.unsqueeze(input, 1)
# [B, F, T, 1]
x_per = x.permute(0, 3, 2, 1)
y = F.pad(x_per, [0, 0, self.lorder - 1, 0])
out = self.conv(y)
if self.rorder:
yr = F.pad(x_per, [0, 0, 0, self.rorder])
yr = yr[:, :, 1:, :]
out += self.conv2(yr)
out = out.permute(0, 3, 2, 1).squeeze(1)
if self.skip_connect:
out = out + input
return out


class SelfAttLayer(nn.Module):

def __init__(self, input_dim, output_dim, lorder=None, hidden_size=None):
super(SelfAttLayer, self).__init__()

self.input_dim = input_dim
self.output_dim = output_dim

if lorder is None:
return

self.lorder = lorder
self.hidden_size = hidden_size

self.linear = nn.Linear(input_dim, hidden_size)

self.project = nn.Linear(hidden_size, output_dim, bias=False)

self.att = nn.Linear(input_dim, lorder, bias=False)

def forward(self, input):

f1 = F.relu(self.linear(input))

p1 = self.project(f1)

x = th.unsqueeze(p1, 1)

x_per = x.permute(0, 3, 2, 1)

y = F.pad(x_per, [0, 0, self.lorder - 1, 0])

# z [B, F, T, lorder]
z = x_per
for i in range(1, self.lorder):
z = th.cat([z, y[:, :, self.lorder - 1 - i:-i, :]], axis=-1)

# [B, T, lorder]
att = F.softmax(self.att(input), dim=-1)
att = th.unsqueeze(att, 1)
z = th.sum(z * att, axis=-1)

out1 = z.permute(0, 2, 1)

return input + out1


class TFFsmn(nn.Module):

def __init__(self,
input_dim,
output_dim,
lorder=None,
hidden_size=None,
dilation=1,
layer_norm=False,
dropout=0,
skip_connect=True):
super(TFFsmn, self).__init__()

self.skip_connect = skip_connect

self.linear = nn.Linear(input_dim, hidden_size)
self.norm = nn.Identity()
if layer_norm:
self.norm = nn.LayerNorm(input_dim)
self.act = nn.ReLU()
self.project = nn.Linear(hidden_size, output_dim, bias=False)

self.conv1 = nn.Conv2d(
output_dim,
output_dim, [lorder, 1],
dilation=[dilation, 1],
groups=output_dim,
bias=False)
self.padding_left = dilation * (lorder - 1)
dorder = 5
self.conv2 = nn.Conv2d(1, 1, [dorder, 1], bias=False)
self.padding_freq = dorder - 1

def forward(self, input):
return self.compute1(input)

def compute1(self, input):
''' linear-dconv-relu(norm)-linear-dconv
'''
x = self.linear(input)
# [B, 1, F, T]
x = th.unsqueeze(x, 1).permute(0, 1, 3, 2)
z = F.pad(x, [0, 0, self.padding_freq, 0])
z = self.conv2(z) + x
x = z.permute(0, 3, 2, 1).squeeze(-1)
x = self.act(x)
x = self.norm(x)
x = self.project(x)
x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
# [B, F, T+lorder-1, 1]
y = F.pad(x, [0, 0, self.padding_left, 0])
out = self.conv1(y)
if self.skip_connect:
out = out + x
out = out.permute(0, 3, 2, 1).squeeze()

return input + out


class CNNFsmn(nn.Module):
''' use cnn to reduce parameters
'''

def __init__(self,
input_dim,
output_dim,
lorder=None,
hidden_size=None,
dilation=1,
layer_norm=False,
dropout=0,
skip_connect=True):
super(CNNFsmn, self).__init__()

self.input_dim = input_dim
self.output_dim = output_dim
self.skip_connect = skip_connect

if lorder is None:
return

self.lorder = lorder
self.hidden_size = hidden_size

self.linear = nn.Linear(input_dim, hidden_size)
self.act = nn.ReLU()
kernel_size = (3, 8)
stride = (1, 4)
self.conv = nn.Sequential(
nn.ConstantPad2d((stride[1], 0, kernel_size[0] - 1, 0), 0),
nn.Conv2d(1, stride[1], kernel_size=kernel_size, stride=stride))

self.dconv = nn.Conv2d(
output_dim,
output_dim, [lorder, 1],
dilation=[dilation, 1],
groups=output_dim,
bias=False)
self.padding_left = dilation * (lorder - 1)

def forward(self, input):
return self.compute2(input)

def compute1(self, input):
''' linear-relu(norm)-conv2d-relu?-dconv
'''
# [B, T, F]
x = self.linear(input)
x = self.act(x)
x = th.unsqueeze(x, 1)
x = self.conv(x)
# [B, C, T, F] -> [B, 1, T, F]
b, c, t, f = x.shape
x = x.view([b, 1, t, -1])
x = x.permute(0, 3, 2, 1)
# [B, F, T+lorder-1, 1]
y = F.pad(x, [0, 0, self.padding_left, 0])
out = self.dconv(y)
if self.skip_connect:
out = out + x
out = out.permute(0, 3, 2, 1).squeeze()
return input + out

def compute2(self, input):
''' conv2d-relu-linear-relu?-dconv
'''
x = th.unsqueeze(input, 1)
x = self.conv(x)
x = self.act(x)
# [B, C, T, F] -> [B, T, F]
b, c, t, f = x.shape
x = x.view([b, t, -1])
x = self.linear(x)
x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
y = F.pad(x, [0, 0, self.padding_left, 0])
out = self.dconv(y)
if self.skip_connect:
out = out + x
out = out.permute(0, 3, 2, 1).squeeze()
return input + out


class UniDeepFsmn(LayerBase):

def __init__(self,
input_dim,
output_dim,
lorder=None,
hidden_size=None,
dilation=1,
layer_norm=False,
dropout=0,
skip_connect=True):
super(UniDeepFsmn, self).__init__()

self.input_dim = input_dim
self.output_dim = output_dim
self.skip_connect = skip_connect

if lorder is None:
return

self.lorder = lorder
self.hidden_size = hidden_size

self.linear = nn.Linear(input_dim, hidden_size)
self.norm = nn.Identity()
if layer_norm:
self.norm = nn.LayerNorm(input_dim)
self.act = nn.ReLU()
self.project = nn.Linear(hidden_size, output_dim, bias=False)

self.conv1 = nn.Conv2d(
output_dim,
output_dim, [lorder, 1],
dilation=[dilation, 1],
groups=output_dim,
bias=False)
self.padding_left = dilation * (lorder - 1)

def forward(self, input):
return self.compute1(input)

def compute1(self, input):
''' linear-relu(norm)-linear-dconv
'''
# [B, T, F]
x = self.linear(input)
x = self.act(x)
x = self.norm(x)
x = self.project(x)
x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
# [B, F, T+lorder-1, 1]
y = F.pad(x, [0, 0, self.padding_left, 0])
out = self.conv1(y)
if self.skip_connect:
out = out + x
out = out.permute(0, 3, 2, 1).squeeze()

return input + out

def compute2(self, input):
''' linear-dconv-linear-relu(norm)
'''
x = self.project(input)
x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
y = F.pad(x, [0, 0, self.padding_left, 0])
out = self.conv1(y)
if self.skip_connect:
out = out + x
out = out.permute(0, 3, 2, 1).squeeze()
x = self.linear(out)
x = self.act(x)
x = self.norm(x)

return input + x

def compute3(self, input):
''' dconv-linear-relu(norm)-linear
'''
x = th.unsqueeze(input, 1).permute(0, 3, 2, 1)
y = F.pad(x, [0, 0, self.padding_left, 0])
out = self.conv1(y)
if self.skip_connect:
out = out + x
out = out.permute(0, 3, 2, 1).squeeze()
x = self.linear(out)
x = self.act(x)
x = self.norm(x)
x = self.project(x)

return input + x

def to_kaldi_nnet(self):
re_str = ''
re_str += '<UniDeepFsmn> %d %d\n' \
% (self.output_dim, self.input_dim)
re_str += '<LearnRateCoef> %d <HidSize> %d <LOrder> %d <LStride> %d <MaxNorm> 0\n' \
% (1, self.hidden_size, self.lorder, 1)
lfiters = self.state_dict()['conv1.weight']
x = np.flipud(lfiters.squeeze().numpy().T)
re_str += to_kaldi_matrix(x)
proj_weights = self.state_dict()['project.weight']
x = proj_weights.squeeze().numpy()
re_str += to_kaldi_matrix(x)
linear_weights = self.state_dict()['linear.weight']
x = linear_weights.squeeze().numpy()
re_str += to_kaldi_matrix(x)
linear_bias = self.state_dict()['linear.bias']
x = linear_bias.squeeze().numpy()
re_str += to_kaldi_matrix(x)
return re_str

def to_raw_nnet(self, fid):
lfiters = self.state_dict()['conv1.weight']
x = np.flipud(lfiters.squeeze().numpy().T)
x.tofile(fid)

proj_weights = self.state_dict()['project.weight']
x = proj_weights.squeeze().numpy()
x.tofile(fid)

linear_weights = self.state_dict()['linear.weight']
x = linear_weights.squeeze().numpy()
x.tofile(fid)

linear_bias = self.state_dict()['linear.bias']
x = linear_bias.squeeze().numpy()
x.tofile(fid)

def load_kaldi_nnet(self, instr):
output = expect_token_number(
instr,
'<LearnRateCoef>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <LearnRateCoef>')
instr, lr = output

output = expect_token_number(
instr,
'<HidSize>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <HidSize>')
instr, hiddensize = output
self.hidden_size = int(hiddensize)

output = expect_token_number(
instr,
'<LOrder>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <LOrder>')
instr, lorder = output
self.lorder = int(lorder)

output = expect_token_number(
instr,
'<LStride>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <LStride>')
instr, lstride = output
self.lstride = lstride

output = expect_token_number(
instr,
'<MaxNorm>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <MaxNorm>')

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output
mat1 = np.fliplr(mat.T).copy()

self.conv1 = nn.Conv2d(
self.output_dim,
self.output_dim, [self.lorder, 1], [1, 1],
groups=self.output_dim,
bias=False)

mat_th = th.from_numpy(mat1).type(th.FloatTensor)
mat_th = mat_th.unsqueeze(1)
mat_th = mat_th.unsqueeze(3)
self.conv1.weight = th.nn.Parameter(mat_th)

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output

self.project = nn.Linear(self.hidden_size, self.output_dim, bias=False)
self.linear = nn.Linear(self.input_dim, self.hidden_size)

self.project.weight = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output
self.linear.weight = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output
mat = np.squeeze(mat)
self.linear.bias = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

return instr
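
For orientation, a hedged sketch (illustrative only, not part of the diff) of a single UniDeepFsmn block; the dimensions are arbitrary examples, not a shipped configuration:

```python
# The depthwise Conv2d over the time axis, combined with left-only padding,
# gives each frame a causal memory of `lorder` past frames; the block output
# is added back onto its input as a residual.
import torch

from modelscope.models.audio.layers.uni_deep_fsmn import UniDeepFsmn

block = UniDeepFsmn(input_dim=64, output_dim=64, lorder=20, hidden_size=96)
x = torch.randn(2, 100, 64)   # (batch, frames, features)
y = block(x)                  # same shape as the input
print(y.shape)                # torch.Size([2, 100, 64])
```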

+0 -0 modelscope/models/audio/network/__init__.py


+394 -0 modelscope/models/audio/network/loss.py

@@ -0,0 +1,394 @@
import torch
import torch.nn.functional as F

from .modulation_loss import (GaborSTRFConv, MelScale,
ModulationDomainLossModule)

EPS = 1e-8


def compute_mask(mixed_spec, clean_spec, mask_type='psmiam', clip=1):
'''
stft: (batch, ..., 2) or complex(batch, ...)
y = x + n
'''
if torch.is_complex(mixed_spec):
yr, yi = mixed_spec.real, mixed_spec.imag
else:
yr, yi = mixed_spec[..., 0], mixed_spec[..., 1]
if torch.is_complex(clean_spec):
xr, xi = clean_spec.real, clean_spec.imag
else:
xr, xi = clean_spec[..., 0], clean_spec[..., 1]

if mask_type == 'iam':
ymag = torch.sqrt(yr**2 + yi**2)
xmag = torch.sqrt(xr**2 + xi**2)
iam = xmag / (ymag + EPS)
return torch.clamp(iam, 0, 1)

elif mask_type == 'psm':
ypow = yr**2 + yi**2
psm = (xr * yr + xi * yi) / (ypow + EPS)
return torch.clamp(psm, 0, 1)

elif mask_type == 'psmiam':
ypow = yr**2 + yi**2
psm = (xr * yr + xi * yi) / (ypow + EPS)
ymag = torch.sqrt(yr**2 + yi**2)
xmag = torch.sqrt(xr**2 + xi**2)
iam = xmag / (ymag + EPS)
psmiam = psm * iam
return torch.clamp(psmiam, 0, 1)

elif mask_type == 'crm':
ypow = yr**2 + yi**2
mr = (xr * yr + xi * yi) / (ypow + EPS)
mi = (xi * yr - xr * yi) / (ypow + EPS)
mr = torch.clamp(mr, -clip, clip)
mi = torch.clamp(mi, -clip, clip)
return mr, mi


def energy_vad(spec,
thdhigh=320 * 600 * 600 * 2,
thdlow=320 * 300 * 300 * 2,
int16=True):
'''
energy based vad should be accurate enough
spec: (batch, bins, frames, 2)
returns (batch, frames)
'''
energy = torch.sum(spec[..., 0]**2 + spec[..., 1]**2, dim=1)
vad = (energy > thdhigh).float()  # float, so the 0.5 'uncertain' label below is preserved
idx = torch.logical_and(vad == 0, energy > thdlow)
vad[idx] = 0.5
return vad


def modulation_loss_init(n_fft):
gabor_strf_parameters = torch.load(
'./network/gabor_strf_parameters.pt')['state_dict']
gabor_modulation_kernels = GaborSTRFConv(supn=30, supk=30, nkern=60)
gabor_modulation_kernels.load_state_dict(gabor_strf_parameters)

modulation_loss_module = ModulationDomainLossModule(
gabor_modulation_kernels.eval())
for param in modulation_loss_module.parameters():
param.requires_grad = False

stft2mel = MelScale(
n_mels=80, sample_rate=16000, n_stft=n_fft // 2 + 1).cuda()

return modulation_loss_module, stft2mel


def mask_loss_function(
loss_func='psm_loss',
loss_type='mse', # ['mse', 'mae', 'comb']
mask_type='psmiam',
use_mod_loss=False,
use_wav2vec_loss=False,
n_fft=640,
hop_length=320,
EPS=1e-8,
weight=None):
if weight is not None:
print(f'Use loss weight: {weight}')
winlen = n_fft
window = torch.hamming_window(winlen, periodic=False)

def stft(x, return_complex=False):
# returns [batch, bins, frames, 2]
return torch.stft(
x,
n_fft,
hop_length,
winlen,
window=window.to(x.device),
center=False,
return_complex=return_complex)

def istft(x, slen):
return torch.istft(
x,
n_fft,
hop_length,
winlen,
window=window.to(x.device),
center=False,
length=slen)

def mask_loss(targets, masks, nframes):
''' [Batch, Time, Frequency]
'''
with torch.no_grad():
mask_for_loss = torch.ones_like(targets)
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
masks = masks * mask_for_loss
targets = targets * mask_for_loss

if weight is None:
alpha = 1
else: # for aec ST
alpha = weight - targets

if loss_type == 'mse':
loss = 0.5 * torch.sum(alpha * torch.pow(targets - masks, 2))
elif loss_type == 'mae':
loss = torch.sum(alpha * torch.abs(targets - masks))
else: # mse(mask), mae(mask) approx 1:2
loss = 0.5 * torch.sum(alpha * torch.pow(targets - masks, 2)
+ 0.1 * alpha * torch.abs(targets - masks))
loss /= torch.sum(nframes)
return loss

def spectrum_loss(targets, spec, nframes):
''' [Batch, Time, Frequency, 2]
'''
with torch.no_grad():
mask_for_loss = torch.ones_like(targets[..., 0])
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
xr = spec[..., 0] * mask_for_loss
xi = spec[..., 1] * mask_for_loss
yr = targets[..., 0] * mask_for_loss
yi = targets[..., 1] * mask_for_loss
xmag = torch.sqrt(spec[..., 0]**2 + spec[..., 1]**2) * mask_for_loss
ymag = torch.sqrt(targets[..., 0]**2
+ targets[..., 1]**2) * mask_for_loss

loss1 = torch.sum(torch.pow(xr - yr, 2) + torch.pow(xi - yi, 2))
loss2 = torch.sum(torch.pow(xmag - ymag, 2))

loss = (loss1 + loss2) / torch.sum(nframes)
return loss

def sa_loss_dlen(mixed, clean, masks, nframes):
yspec = stft(mixed).permute([0, 2, 1, 3]) / 32768
xspec = stft(clean).permute([0, 2, 1, 3]) / 32768
with torch.no_grad():
mask_for_loss = torch.ones_like(xspec[..., 0])
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
emag = ((yspec[..., 0]**2 + yspec[..., 1]**2)**0.15) * (masks**0.3)
xmag = (xspec[..., 0]**2 + xspec[..., 1]**2)**0.15
emag = emag * mask_for_loss
xmag = xmag * mask_for_loss

loss = torch.sum(torch.pow(emag - xmag, 2)) / torch.sum(nframes)
return loss

def psm_vad_loss_dlen(mixed, clean, masks, nframes, subtask=None):
mixed_spec = stft(mixed)
clean_spec = stft(clean)
targets = compute_mask(mixed_spec, clean_spec, mask_type)
# [B, T, F]
targets = targets.permute(0, 2, 1)

loss = mask_loss(targets, masks, nframes)

if subtask is not None:
vadtargets = energy_vad(clean_spec)
with torch.no_grad():
mask_for_loss = torch.ones_like(targets[:, :, 0])
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:] = 0
subtask = subtask[:, :, 0] * mask_for_loss
vadtargets = vadtargets * mask_for_loss

loss_vad = F.binary_cross_entropy(subtask, vadtargets)
return loss + loss_vad
return loss

def modulation_loss(mixed, clean, masks, nframes, subtask=None):
mixed_spec = stft(mixed, True)
clean_spec = stft(clean, True)
enhanced_mag = torch.abs(mixed_spec)
clean_mag = torch.abs(clean_spec)
with torch.no_grad():
mask_for_loss = torch.ones_like(clean_mag)
for idx, num in enumerate(nframes):
mask_for_loss[idx, :, num:] = 0
clean_mag = clean_mag * mask_for_loss
enhanced_mag = enhanced_mag * mask_for_loss * masks.permute([0, 2, 1])

# Covert to log-mel representation
# (B,T,#mel_channels)
clean_log_mel = torch.log(
torch.transpose(stft2mel(clean_mag**2), 2, 1) + 1e-8)
enhanced_log_mel = torch.log(
torch.transpose(stft2mel(enhanced_mag**2), 2, 1) + 1e-8)

alpha = compute_mask(mixed_spec, clean_spec, mask_type)
alpha = alpha.permute(0, 2, 1)
loss = 0.05 * modulation_loss_module(enhanced_log_mel, clean_log_mel,
alpha)
loss2 = psm_vad_loss_dlen(mixed, clean, masks, nframes, subtask)
# print(loss.item(), loss2.item()) #approx 1:4
loss = loss + loss2
return loss

def wav2vec_loss(mixed, clean, masks, nframes, subtask=None):
mixed /= 32768
clean /= 32768
mixed_spec = stft(mixed)
with torch.no_grad():
mask_for_loss = torch.ones_like(masks)
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
masks_est = masks * mask_for_loss

estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3)
est_clean = istft(estimate, clean.shape[1])
loss = wav2vec_loss_module(est_clean, clean)
return loss

def sisdr_loss_dlen(mixed,
clean,
masks,
nframes,
subtask=None,
zero_mean=True):
mixed_spec = stft(mixed)
with torch.no_grad():
mask_for_loss = torch.ones_like(masks)
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
masks_est = masks * mask_for_loss

estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3)
est_clean = istft(estimate, clean.shape[1])
flen = min(clean.shape[1], est_clean.shape[1])
clean = clean[:, :flen]
est_clean = est_clean[:, :flen]

# follow asteroid/losses/sdr.py
if zero_mean:
clean = clean - torch.mean(clean, dim=1, keepdim=True)
est_clean = est_clean - torch.mean(est_clean, dim=1, keepdim=True)

dot = torch.sum(est_clean * clean, dim=1, keepdim=True)
s_clean_energy = torch.sum(clean**2, dim=1, keepdim=True) + EPS
scaled_clean = dot * clean / s_clean_energy
e_noise = est_clean - scaled_clean

# [batch]
sisdr = torch.sum(
scaled_clean**2, dim=1) / (
torch.sum(e_noise**2, dim=1) + EPS)
sisdr = -10 * torch.log10(sisdr + EPS)
loss = sisdr.mean()
return loss

def sisdr_freq_loss_dlen(mixed, clean, masks, nframes, subtask=None):
mixed_spec = stft(mixed)
clean_spec = stft(clean)
with torch.no_grad():
mask_for_loss = torch.ones_like(masks)
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
masks_est = masks * mask_for_loss

estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3)

dot_real = estimate[..., 0] * clean_spec[..., 0] + \
estimate[..., 1] * clean_spec[..., 1]
dot_imag = estimate[..., 0] * clean_spec[..., 1] - \
estimate[..., 1] * clean_spec[..., 0]
dot = torch.cat([dot_real.unsqueeze(3), dot_imag.unsqueeze(3)], dim=-1)
s_clean_energy = clean_spec[..., 0] ** 2 + \
clean_spec[..., 1] ** 2 + EPS
scaled_clean = dot * clean_spec / s_clean_energy.unsqueeze(3)
e_noise = estimate - scaled_clean

# [batch]
scaled_clean_energy = torch.sum(
scaled_clean[..., 0]**2 + scaled_clean[..., 1]**2, dim=1)
e_noise_energy = torch.sum(
e_noise[..., 0]**2 + e_noise[..., 1]**2, dim=1)
sisdr = torch.sum(
scaled_clean_energy, dim=1) / (
torch.sum(e_noise_energy, dim=1) + EPS)
sisdr = -10 * torch.log10(sisdr + EPS)
loss = sisdr.mean()
return loss

def crm_loss_dlen(mixed, clean, masks, nframes, subtask=None):
mixed_spec = stft(mixed).permute([0, 2, 1, 3])
clean_spec = stft(clean).permute([0, 2, 1, 3])
mixed_spec = mixed_spec / 32768
clean_spec = clean_spec / 32768
tgt_mr, tgt_mi = compute_mask(mixed_spec, clean_spec, mask_type='crm')

D = int(masks.shape[2] / 2)
with torch.no_grad():
mask_for_loss = torch.ones_like(clean_spec[..., 0])
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
mr = masks[..., :D] * mask_for_loss
mi = masks[..., D:] * mask_for_loss
tgt_mr = tgt_mr * mask_for_loss
tgt_mi = tgt_mi * mask_for_loss

if weight is None:
alpha = 1
else:
alpha = weight - tgt_mr
# signal approximation
yr = mixed_spec[..., 0]
yi = mixed_spec[..., 1]
loss1 = torch.sum(alpha * torch.pow((mr * yr - mi * yi) - clean_spec[..., 0], 2)) \
+ torch.sum(alpha * torch.pow((mr * yi + mi * yr) - clean_spec[..., 1], 2))
# mask approximation
loss2 = torch.sum(alpha * torch.pow(mr - tgt_mr, 2)) \
+ torch.sum(alpha * torch.pow(mi - tgt_mi, 2))
loss = 0.5 * (loss1 + loss2) / torch.sum(nframes)
return loss

def crm_miso_loss_dlen(mixed, clean, masks, nframes):
return crm_loss_dlen(mixed[..., 0], clean[..., 0], masks, nframes)

def mimo_loss_dlen(mixed, clean, masks, nframes):
chs = mixed.shape[-1]
D = masks.shape[2] // chs
loss = psm_vad_loss_dlen(mixed[..., 0], clean[..., 0], masks[..., :D],
nframes)
for ch in range(1, chs):
loss1 = psm_vad_loss_dlen(mixed[..., ch], clean[..., ch],
masks[..., ch * D:ch * D + D], nframes)
loss = loss + loss1
return loss / chs

def spec_loss_dlen(mixed, clean, spec, nframes):
clean_spec = stft(clean).permute([0, 2, 1, 3])
clean_spec = clean_spec / 32768

D = spec.shape[2] // 2
spec_est = torch.cat([spec[..., :D, None], spec[..., D:, None]],
dim=-1)
loss = spectrum_loss(clean_spec, spec_est, nframes)
return loss

if loss_func == 'psm_vad_loss_dlen':
return psm_vad_loss_dlen
elif loss_func == 'sisdr_loss_dlen':
return sisdr_loss_dlen
elif loss_func == 'sisdr_freq_loss_dlen':
return sisdr_freq_loss_dlen
elif loss_func == 'crm_loss_dlen':
return crm_loss_dlen
elif loss_func == 'modulation_loss':
return modulation_loss
elif loss_func == 'wav2vec_loss':
return wav2vec_loss
elif loss_func == 'mimo_loss_dlen':
return mimo_loss_dlen
elif loss_func == 'spec_loss_dlen':
return spec_loss_dlen
elif loss_func == 'sa_loss_dlen':
return sa_loss_dlen
else:
print('error loss func')
return None
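
A hedged sketch (illustrative only, not part of the diff) of the `compute_mask` helper above on stand-in waveforms; it assumes the audio requirements (including torchaudio) are installed, since `loss.py` imports `MelScale` at module level:

```python
import torch

from modelscope.models.audio.network.loss import compute_mask

n_fft, hop = 640, 320
window = torch.hamming_window(n_fft, periodic=False)
noisy = torch.randn(2, 16000)        # (batch, samples), stand-in signals
clean = 0.5 * noisy                  # pretend the clean part is an attenuated mixture

noisy_spec = torch.stft(noisy, n_fft, hop, n_fft, window=window,
                        center=False, return_complex=True)
clean_spec = torch.stft(clean, n_fft, hop, n_fft, window=window,
                        center=False, return_complex=True)

# 'psmiam' multiplies the phase-sensitive mask by the ideal amplitude mask
# and clips the result to [0, 1], matching the training targets used above.
mask = compute_mask(noisy_spec, clean_spec, mask_type='psmiam')
print(mask.shape)                    # (batch, n_fft // 2 + 1, frames)
```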

+248 -0 modelscope/models/audio/network/modulation_loss.py

@@ -0,0 +1,248 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchaudio.transforms import MelScale


class ModulationDomainLossModule(torch.nn.Module):
"""Modulation-domain loss function developed in [1] for supervised speech enhancement

In our paper, we used the gabor-based STRF kernels as the modulation kernels and used the log-mel spectrogram
as the input spectrogram representation.
Specific parameter details are in the paper and in the example below

Parameters
----------
modulation_kernels: nn.Module
Differentiable module that transforms a spectrogram representation to the modulation domain

modulation_domain = modulation_kernels(input_tf_representation)
Input Spectrogram representation (B, T, F) ---> |(M) modulation_kernels|--->Modulation Domain(B, M, T', F')

norm: boolean
Normalizes the modulation domain representation to be 0 mean across time

[1] T. Vuong, Y. Xia, and R. M. Stern, “A modulation-domain loss for neural-network-based real-time
speech enhancement”
Accepted ICASSP 2021, https://arxiv.org/abs/2102.07330


"""

def __init__(self, modulation_kernels, norm=True):
super(ModulationDomainLossModule, self).__init__()

self.modulation_kernels = modulation_kernels
self.mse = nn.MSELoss(reduce=False)
self.norm = norm

def forward(self, enhanced_spect, clean_spect, weight=None):
"""Calculate modulation-domain loss
Args:
enhanced_spect (Tensor): spectrogram representation of enhanced signal (B, #frames, #freq_channels).
clean_spect (Tensor): spectrogram representation of clean ground-truth signal (B, #frames, #freq_channels).
Returns:
Tensor: Modulation-domain loss value.
"""

clean_mod = self.modulation_kernels(clean_spect)
enhanced_mod = self.modulation_kernels(enhanced_spect)

if self.norm:
mean_clean_mod = torch.mean(clean_mod, dim=2)
mean_enhanced_mod = torch.mean(enhanced_mod, dim=2)

clean_mod = clean_mod - mean_clean_mod.unsqueeze(2)
enhanced_mod = enhanced_mod - mean_enhanced_mod.unsqueeze(2)

if weight is None:
alpha = 1
else: # TF-mask weight
alpha = 1 + torch.sum(weight, dim=-1, keepdim=True).unsqueeze(1)
mod_mse_loss = self.mse(enhanced_mod, clean_mod) * alpha
mod_mse_loss = torch.mean(
torch.sum(mod_mse_loss, dim=(1, 2, 3))
/ torch.sum(clean_mod**2, dim=(1, 2, 3)))

return mod_mse_loss


class ModulationDomainNCCLossModule(torch.nn.Module):
"""Modulation-domain loss function developed in [1] for supervised speech enhancement

# Speech Intelligibility Prediction Using Spectro-Temporal Modulation Analysis - based off of this

In our paper, we used the gabor-based STRF kernels as the modulation kernels and used the log-mel spectrogram
as the input spectrogram representation.
Specific parameter details are in the paper and in the example below

Parameters
----------
modulation_kernels: nn.Module
Differentiable module that transforms a spectrogram representation to the modulation domain

modulation_domain = modulation_kernels(input_tf_representation)
Input Spectrogram representation(B, T, F) --- (M) modulation_kernels---> Modulation Domain(B, M, T', F')

[1]

"""

def __init__(self, modulation_kernels):
super(ModulationDomainNCCLossModule, self).__init__()

self.modulation_kernels = modulation_kernels
self.mse = nn.MSELoss(reduce=False)

def forward(self, enhanced_spect, clean_spect):
"""Calculate modulation-domain loss
Args:
enhanced_spect (Tensor): spectrogram representation of enhanced signal (B, #frames, #freq_channels).
clean_spect (Tensor): spectrogram representation of clean ground-truth signal (B, #frames, #freq_channels).
Returns:
Tensor: Modulation-domain loss value.
"""

clean_mod = self.modulation_kernels(clean_spect)
enhanced_mod = self.modulation_kernels(enhanced_spect)
mean_clean_mod = torch.mean(clean_mod, dim=2)
mean_enhanced_mod = torch.mean(enhanced_mod, dim=2)

normalized_clean = clean_mod - mean_clean_mod.unsqueeze(2)
normalized_enhanced = enhanced_mod - mean_enhanced_mod.unsqueeze(2)

inner_product = torch.sum(
normalized_clean * normalized_enhanced, dim=2)
normalized_denom = (torch.sum(
normalized_clean * normalized_clean, dim=2))**.5 * (torch.sum(
normalized_enhanced * normalized_enhanced, dim=2))**.5

ncc = inner_product / normalized_denom
mod_mse_loss = torch.mean((ncc - 1.0)**2)

return mod_mse_loss


class GaborSTRFConv(nn.Module):
"""Gabor-STRF-based cross-correlation kernel."""

def __init__(self,
supn,
supk,
nkern,
rates=None,
scales=None,
norm_strf=True,
real_only=False):
"""Instantiate a Gabor-based STRF convolution layer.
Parameters
----------
supn: int
Time support in number of frames. Also the window length.
supk: int
Frequency support in number of channels. Also the window length.
nkern: int
Number of kernels, each with a learnable rate and scale.
rates: list of float, None
Initial values for temporal modulation.
scales: list of float, None
Initial values for spectral modulation.
norm_strf: Boolean
Normalize STRF kernels to be unit length
real_only: Boolean
If True, nkern REAL gabor-STRF kernels
If False, nkern//2 REAL and nkern//2 IMAGINARY gabor-STRF kernels
"""
super(GaborSTRFConv, self).__init__()
self.numN = supn
self.numK = supk
self.numKern = nkern
self.real_only = real_only
self.norm_strf = norm_strf

if not real_only:
nkern = nkern // 2

if supk % 2 == 0: # force odd number
supk += 1
self.supk = torch.arange(supk, dtype=torch.float32)
if supn % 2 == 0: # force odd number
supn += 1
self.supn = torch.arange(supn, dtype=self.supk.dtype)
self.padding = (supn // 2, supk // 2)
# Set up learnable parameters
# for param in (rates, scales):
# assert (not param) or len(param) == nkern
if not rates:

rates = torch.rand(nkern) * math.pi / 2.0

if not scales:

scales = (torch.rand(nkern) * 2.0 - 1.0) * math.pi / 2.0

self.rates_ = nn.Parameter(torch.Tensor(rates))
self.scales_ = nn.Parameter(torch.Tensor(scales))

def strfs(self):
"""Make STRFs using the current parameters."""

if self.supn.device != self.rates_.device: # for first run
self.supn = self.supn.to(self.rates_.device)
self.supk = self.supk.to(self.rates_.device)
n0, k0 = self.padding

nwind = .5 - .5 * \
torch.cos(2 * math.pi * (self.supn + 1) / (len(self.supn) + 1))
kwind = .5 - .5 * \
torch.cos(2 * math.pi * (self.supk + 1) / (len(self.supk) + 1))

new_wind = torch.matmul((nwind).unsqueeze(-1), (kwind).unsqueeze(0))

n_n_0 = self.supn - n0
k_k_0 = self.supk - k0
n_mult = torch.matmul(
n_n_0.unsqueeze(1),
torch.ones((1, len(self.supk))).type(torch.FloatTensor).to(
self.rates_.device))
k_mult = torch.matmul(
torch.ones((len(self.supn),
1)).type(torch.FloatTensor).to(self.rates_.device),
k_k_0.unsqueeze(0))

inside = self.rates_.unsqueeze(1).unsqueeze(
1) * n_mult + self.scales_.unsqueeze(1).unsqueeze(1) * k_mult
real_strf = torch.cos(inside) * new_wind.unsqueeze(0)

if self.real_only:
final_strf = real_strf

else:
imag_strf = torch.sin(inside) * new_wind.unsqueeze(0)
final_strf = torch.cat([real_strf, imag_strf], dim=0)

if self.norm_strf:
final_strf = final_strf / (torch.sum(
final_strf**2, dim=(1, 2)).unsqueeze(1).unsqueeze(2))**.5

return final_strf

def forward(self, sigspec):
"""Forward pass a batch of (real) spectra [Batch x Time x Frequency]."""
if len(sigspec.shape) == 2: # expand batch dimension if single eg
sigspec = sigspec.unsqueeze(0)
strfs = self.strfs().unsqueeze(1).type_as(sigspec)
out = F.conv2d(sigspec.unsqueeze(1), strfs, padding=self.padding)
return out

def __repr__(self):
"""Gabor filter"""
report = """
+++++ Gabor Filter Kernels [{}], supn[{}], supk[{}] real only [{}] norm strf [{}] +++++

""".format(self.numKern, self.numN, self.numK, self.real_only,
self.norm_strf)

return report
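
A hedged sketch (illustrative only, not part of the diff) wiring `GaborSTRFConv` into `ModulationDomainLossModule` on random log-mel spectrograms; note that the commit's `modulation_loss_init` in `loss.py` instead loads pretrained Gabor-STRF parameters from disk:

```python
import torch

from modelscope.models.audio.network.modulation_loss import (
    GaborSTRFConv, ModulationDomainLossModule)

kernels = GaborSTRFConv(supn=30, supk=30, nkern=60)   # randomly initialized here
mod_loss = ModulationDomainLossModule(kernels.eval())
for p in mod_loss.parameters():                       # loss module is not trained
    p.requires_grad = False

clean_log_mel = torch.randn(4, 100, 80)               # (B, T, #mel_channels)
enhanced_log_mel = clean_log_mel + 0.1 * torch.randn(4, 100, 80)
loss = mod_loss(enhanced_log_mel, clean_log_mel)
print(loss.item())
```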

+483 -0 modelscope/models/audio/network/se_net.py

@@ -0,0 +1,483 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from ..layers.activations import RectifiedLinear, Sigmoid
from ..layers.affine_transform import AffineTransform
from ..layers.deep_fsmn import DeepFsmn
from ..layers.uni_deep_fsmn import Conv2d, UniDeepFsmn


class MaskNet(nn.Module):

def __init__(self,
indim,
outdim,
layers=9,
hidden_dim=128,
hidden_dim2=None,
lorder=20,
rorder=0,
dilation=1,
layer_norm=False,
dropout=0,
crm=False,
vad=False,
linearout=False):
super(MaskNet, self).__init__()

self.linear1 = AffineTransform(indim, hidden_dim)
self.relu = RectifiedLinear(hidden_dim, hidden_dim)
if hidden_dim2 is None:
hidden_dim2 = hidden_dim

if rorder == 0:
repeats = [
UniDeepFsmn(
hidden_dim,
hidden_dim,
lorder,
hidden_dim2,
dilation=dilation,
layer_norm=layer_norm,
dropout=dropout) for i in range(layers)
]
else:
repeats = [
DeepFsmn(
hidden_dim,
hidden_dim,
lorder,
rorder,
hidden_dim2,
layer_norm=layer_norm,
dropout=dropout) for i in range(layers)
]
self.deepfsmn = nn.Sequential(*repeats)

self.linear2 = AffineTransform(hidden_dim, outdim)

self.crm = crm
if self.crm:
self.sig = nn.Tanh()
else:
self.sig = Sigmoid(outdim, outdim)

self.vad = vad
if self.vad:
self.linear3 = AffineTransform(hidden_dim, 1)

self.layers = layers
self.linearout = linearout
if self.linearout and self.vad:
print('Warning: not supported nnet')

def forward(self, feat, ctl=None):
x1 = self.linear1(feat)
x2 = self.relu(x1)
if ctl is not None:
ctl = min(ctl, self.layers - 1)
for i in range(ctl):
x2 = self.deepfsmn[i](x2)
mask = self.sig(self.linear2(x2))
if self.vad:
vad = torch.sigmoid(self.linear3(x2))
return mask, vad
else:
return mask
x3 = self.deepfsmn(x2)
if self.linearout:
return self.linear2(x3)
mask = self.sig(self.linear2(x3))
if self.vad:
vad = torch.sigmoid(self.linear3(x3))
return mask, vad
else:
return mask

def to_kaldi_nnet(self):
re_str = ''
re_str += '<Nnet>\n'
re_str += self.linear1.to_kaldi_nnet()
re_str += self.relu.to_kaldi_nnet()
for dfsmn in self.deepfsmn:
re_str += dfsmn.to_kaldi_nnet()
re_str += self.linear2.to_kaldi_nnet()
re_str += self.sig.to_kaldi_nnet()
re_str += '</Nnet>\n'

return re_str

def to_raw_nnet(self, fid):
self.linear1.to_raw_nnet(fid)
for dfsmn in self.deepfsmn:
dfsmn.to_raw_nnet(fid)
self.linear2.to_raw_nnet(fid)


class StageNet(nn.Module):

def __init__(self,
indim,
outdim,
layers=9,
layers2=6,
hidden_dim=128,
lorder=20,
rorder=0,
layer_norm=False,
dropout=0,
crm=False,
vad=False,
linearout=False):
super(StageNet, self).__init__()

self.stage1 = nn.ModuleList()
self.stage2 = nn.ModuleList()
layer = nn.Sequential(nn.Linear(indim, hidden_dim), nn.ReLU())
self.stage1.append(layer)
for i in range(layers):
layer = UniDeepFsmn(
hidden_dim,
hidden_dim,
lorder,
hidden_dim,
layer_norm=layer_norm,
dropout=dropout)
self.stage1.append(layer)
layer = nn.Sequential(nn.Linear(hidden_dim, 321), nn.Sigmoid())
self.stage1.append(layer)
# stage2
layer = nn.Sequential(nn.Linear(321 + indim, hidden_dim), nn.ReLU())
self.stage2.append(layer)
for i in range(layers2):
layer = UniDeepFsmn(
hidden_dim,
hidden_dim,
lorder,
hidden_dim,
layer_norm=layer_norm,
dropout=dropout)
self.stage2.append(layer)
layer = nn.Sequential(
nn.Linear(hidden_dim, outdim),
nn.Sigmoid() if not crm else nn.Tanh())
self.stage2.append(layer)
self.crm = crm
self.vad = vad
self.linearout = linearout
self.window = torch.hamming_window(640, periodic=False).cuda()
self.freezed = False

def freeze(self):
if not self.freezed:
for param in self.stage1.parameters():
param.requires_grad = False
self.freezed = True
print('freezed stage1')

def forward(self, feat, mixture, ctl=None):
if ctl == 'off':
x = feat
for i in range(len(self.stage1)):
x = self.stage1[i](x)
return x
else:
self.freeze()
x = feat
for i in range(len(self.stage1)):
x = self.stage1[i](x)

spec = torch.stft(
mixture / 32768,
640,
320,
640,
self.window,
center=False,
return_complex=True)
spec = torch.view_as_real(spec).permute([0, 2, 1, 3])
specmag = torch.sqrt(spec[..., 0]**2 + spec[..., 1]**2)
est = x * specmag
y = torch.cat([est, feat], dim=-1)
for i in range(len(self.stage2)):
y = self.stage2[i](y)
return y


class Unet(nn.Module):

def __init__(self,
indim,
outdim,
layers=9,
dims=[256] * 4,
lorder=20,
rorder=0,
dilation=1,
layer_norm=False,
dropout=0,
crm=False,
vad=False,
linearout=False):
super(Unet, self).__init__()

self.linear1 = AffineTransform(indim, dims[0])
self.relu = RectifiedLinear(dims[0], dims[0])

self.encoder = nn.ModuleList()
self.decoder = nn.ModuleList()
for i in range(len(dims) - 1):
layer = nn.Sequential(
nn.Linear(dims[i], dims[i + 1]), nn.ReLU(),
nn.Linear(dims[i + 1], dims[i + 1], bias=False),
Conv2d(
dims[i + 1],
dims[i + 1],
lorder,
groups=dims[i + 1],
skip_connect=True))
self.encoder.append(layer)
for i in range(len(dims) - 1, 0, -1):
layer = nn.Sequential(
nn.Linear(dims[i] * 2, dims[i - 1]), nn.ReLU(),
nn.Linear(dims[i - 1], dims[i - 1], bias=False),
Conv2d(
dims[i - 1],
dims[i - 1],
lorder,
groups=dims[i - 1],
skip_connect=True))
self.decoder.append(layer)
self.tf = nn.ModuleList()
for i in range(layers - 2 * (len(dims) - 1)):
layer = nn.Sequential(
nn.Linear(dims[-1], dims[-1]), nn.ReLU(),
nn.Linear(dims[-1], dims[-1], bias=False),
Conv2d(
dims[-1],
dims[-1],
lorder,
groups=dims[-1],
skip_connect=True))
self.tf.append(layer)

self.linear2 = AffineTransform(dims[0], outdim)
self.crm = crm
self.act = nn.Tanh() if self.crm else nn.Sigmoid()
self.vad = False
self.layers = layers
self.linearout = linearout

def forward(self, x, ctl=None):
x = self.linear1(x)
x = self.relu(x)

encoder_out = []
for i in range(len(self.encoder)):
x = self.encoder[i](x)
encoder_out.append(x)
for i in range(len(self.tf)):
x = self.tf[i](x)
for i in range(len(self.decoder)):
x = torch.cat([x, encoder_out[-1 - i]], dim=-1)
x = self.decoder[i](x)

x = self.linear2(x)
if self.linearout:
return x
return self.act(x)


class BranchNet(nn.Module):

def __init__(self,
indim,
outdim,
layers=9,
hidden_dim=256,
lorder=20,
rorder=0,
dilation=1,
layer_norm=False,
dropout=0,
crm=False,
vad=False,
linearout=False):
super(BranchNet, self).__init__()

self.linear1 = AffineTransform(indim, hidden_dim)
self.relu = RectifiedLinear(hidden_dim, hidden_dim)

self.convs = nn.ModuleList()
self.deepfsmn = nn.ModuleList()
self.FREQ = nn.ModuleList()
self.TIME = nn.ModuleList()
self.br1 = nn.ModuleList()
self.br2 = nn.ModuleList()
for i in range(layers):
'''
layer = nn.Sequential(
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim, bias=False),
Conv2d(hidden_dim, hidden_dim, lorder,
groups=hidden_dim, skip_connect=True)
)
self.deepfsmn.append(layer)
'''
layer = nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.ReLU())
self.FREQ.append(layer)
'''
layer = nn.GRU(hidden_dim, hidden_dim,
batch_first=True,
bidirectional=False)
self.TIME.append(layer)

layer = nn.Sequential(
nn.Linear(hidden_dim, hidden_dim//2, bias=False),
Conv2d(hidden_dim//2, hidden_dim//2, lorder,
groups=hidden_dim//2, skip_connect=True)
)
self.br1.append(layer)
layer = nn.GRU(hidden_dim, hidden_dim//2,
batch_first=True,
bidirectional=False)
self.br2.append(layer)
'''

self.linear2 = AffineTransform(hidden_dim, outdim)
self.crm = crm
self.act = nn.Tanh() if self.crm else nn.Sigmoid()
self.vad = False
self.layers = layers
self.linearout = linearout

def forward(self, x, ctl=None):
return self.forward_branch(x)

def forward_sepconv(self, x):
x = torch.unsqueeze(x, 1)
for i in range(len(self.convs)):
x = self.convs[i](x)
x = F.relu(x)
B, C, H, W = x.shape
x = x.permute(0, 2, 1, 3)
x = torch.reshape(x, [B, H, C * W])
x = self.linear1(x)
x = self.relu(x)
for i in range(self.layers):
x = self.deepfsmn[i](x) + x
x = self.linear2(x)
return self.act(x)

def forward_branch(self, x):
x = self.linear1(x)
x = self.relu(x)
for i in range(self.layers):
z = self.FREQ[i](x)
x = z + x
x = self.linear2(x)
if self.linearout:
return x
return self.act(x)


class TACNet(nn.Module):
    ''' Transform-average-concatenate (TAC) for ad hoc dr
    '''

def __init__(self,
indim,
outdim,
layers=9,
hidden_dim=128,
lorder=20,
rorder=0,
crm=False,
vad=False,
linearout=False):
super(TACNet, self).__init__()

self.linear1 = AffineTransform(indim, hidden_dim)
self.relu = RectifiedLinear(hidden_dim, hidden_dim)

if rorder == 0:
repeats = [
UniDeepFsmn(hidden_dim, hidden_dim, lorder, hidden_dim)
for i in range(layers)
]
else:
repeats = [
DeepFsmn(hidden_dim, hidden_dim, lorder, rorder, hidden_dim)
for i in range(layers)
]
self.deepfsmn = nn.Sequential(*repeats)

self.ch_transform = nn.ModuleList([])
self.ch_average = nn.ModuleList([])
self.ch_concat = nn.ModuleList([])
for i in range(layers):
self.ch_transform.append(
nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.PReLU()))
self.ch_average.append(
nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.PReLU()))
self.ch_concat.append(
nn.Sequential(
nn.Linear(hidden_dim * 2, hidden_dim), nn.PReLU()))

self.linear2 = AffineTransform(hidden_dim, outdim)

self.crm = crm
if self.crm:
self.sig = nn.Tanh()
else:
self.sig = Sigmoid(outdim, outdim)

self.vad = vad
if self.vad:
self.linear3 = AffineTransform(hidden_dim, 1)

self.layers = layers
self.linearout = linearout
if self.linearout and self.vad:
            print('Warning: linearout combined with vad is not supported')

def forward(self, feat, ctl=None):
B, T, F = feat.shape
# assume 4ch
ch = 4
zlist = []
for c in range(ch):
z = self.linear1(feat[..., c * (F // 4):(c + 1) * (F // 4)])
z = self.relu(z)
zlist.append(z)
for i in range(self.layers):
# forward
for c in range(ch):
zlist[c] = self.deepfsmn[i](zlist[c])

# transform
olist = []
for c in range(ch):
z = self.ch_transform[i](zlist[c])
olist.append(z)
# average
avg = 0
for c in range(ch):
avg = avg + olist[c]
avg = avg / ch
avg = self.ch_average[i](avg)
            # concatenate
for c in range(ch):
tac = torch.cat([olist[c], avg], dim=-1)
tac = self.ch_concat[i](tac)
zlist[c] = zlist[c] + tac

for c in range(ch):
zlist[c] = self.sig(self.linear2(zlist[c]))
mask = torch.cat(zlist, dim=-1)
return mask

def to_kaldi_nnet(self):
pass
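
TACNet.forward above splits the feature dimension into four channel slices and, per layer, applies one transform-average-concatenate (TAC) step across channels. A minimal sketch of a single TAC step with assumed shapes (hidden size 16, 4 channels), kept separate from the module itself:

import torch
from torch import nn

hidden = 16
transform = nn.Sequential(nn.Linear(hidden, hidden), nn.PReLU())
average = nn.Sequential(nn.Linear(hidden, hidden), nn.PReLU())
concat = nn.Sequential(nn.Linear(hidden * 2, hidden), nn.PReLU())

z = [torch.randn(2, 50, hidden) for _ in range(4)]   # 4 channels, [B, T, H]
o = [transform(zc) for zc in z]                      # per-channel transform
avg = average(sum(o) / len(o))                       # cross-channel average
# concatenate the average back onto each channel and add it as a residual
z = [zc + concat(torch.cat([oc, avg], dim=-1)) for zc, oc in zip(z, o)]
print(z[0].shape)                                    # torch.Size([2, 50, 16])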

+ 0
- 0
modelscope/models/audio/tts/__init__.py View File


+ 1
- 0
modelscope/models/audio/tts/am/__init__.py View File

@@ -0,0 +1 @@
from .sambert_hifi_16k import * # noqa F403

+ 8
- 0
modelscope/models/audio/tts/am/models/__init__.py View File

@@ -0,0 +1,8 @@
from .robutrans import RobuTrans


def create_model(name, hparams):
if name == 'robutrans':
return RobuTrans(hparams)
else:
raise Exception('Unknown model: ' + name)

+ 82
- 0
modelscope/models/audio/tts/am/models/compat.py View File

@@ -0,0 +1,82 @@
"""Functions for compatibility with different TensorFlow versions."""

import tensorflow as tf


def is_tf2():
"""Returns ``True`` if running TensorFlow 2.0."""
return tf.__version__.startswith('2')


def tf_supports(symbol):
"""Returns ``True`` if TensorFlow defines :obj:`symbol`."""
return _string_to_tf_symbol(symbol) is not None


def tf_any(*symbols):
"""Returns the first supported symbol."""
for symbol in symbols:
module = _string_to_tf_symbol(symbol)
if module is not None:
return module
return None


def tf_compat(v2=None, v1=None): # pylint: disable=invalid-name
"""Returns the compatible symbol based on the current TensorFlow version.

Args:
v2: The candidate v2 symbol name.
v1: The candidate v1 symbol name.

Returns:
A TensorFlow symbol.

Raises:
ValueError: if no symbol can be found.
"""
candidates = []
if v2 is not None:
candidates.append(v2)
if v1 is not None:
candidates.append(v1)
candidates.append('compat.v1.%s' % v1)
symbol = tf_any(*candidates)
if symbol is None:
raise ValueError('Failure to resolve the TensorFlow symbol')
return symbol


def name_from_variable_scope(name=''):
"""Creates a name prefixed by the current variable scope."""
var_scope = tf_compat(v1='get_variable_scope')().name
compat_name = ''
if name:
compat_name = '%s/' % name
if var_scope:
compat_name = '%s/%s' % (var_scope, compat_name)
return compat_name


def reuse():
"""Returns ``True`` if the current variable scope is marked for reuse."""
return tf_compat(v1='get_variable_scope')().reuse


def _string_to_tf_symbol(symbol):
modules = symbol.split('.')
namespace = tf
for module in modules:
namespace = getattr(namespace, module, None)
if namespace is None:
return None
return namespace


# pylint: disable=invalid-name
gfile_copy = tf_compat(v2='io.gfile.copy', v1='gfile.Copy')
gfile_exists = tf_compat(v2='io.gfile.exists', v1='gfile.Exists')
gfile_open = tf_compat(v2='io.gfile.GFile', v1='gfile.GFile')
is_tensor = tf_compat(v2='is_tensor', v1='contrib.framework.is_tensor')
logging = tf_compat(v1='logging')
nest = tf_compat(v2='nest', v1='contrib.framework.nest')
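
The module resolves each symbol by trying the candidate names in order (the v2 name, the v1 name, then 'compat.v1.' + the v1 name), so the same call works under TF1 and TF2. A hedged usage sketch, assuming TensorFlow is installed and the functions above are in scope:

# With only a v1 name, the candidates tried are 'get_variable_scope' and then
# 'compat.v1.get_variable_scope', so the lookup succeeds under both TF1 and TF2.
get_variable_scope = tf_compat(v1='get_variable_scope')
print(get_variable_scope().name)   # '' when called outside any variable scope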

+ 273
- 0
modelscope/models/audio/tts/am/models/fsmn.py View File

@@ -0,0 +1,273 @@
import tensorflow as tf


def build_sequence_mask(sequence_length,
maximum_length=None,
dtype=tf.float32):
"""Builds the dot product mask.

Args:
sequence_length: The sequence length.
maximum_length: Optional size of the returned time dimension. Otherwise
it is the maximum of :obj:`sequence_length`.
dtype: The type of the mask tensor.

Returns:
A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape
``[batch_size, max_length]``.
"""
mask = tf.sequence_mask(
sequence_length, maxlen=maximum_length, dtype=dtype)

return mask


def norm(inputs):
"""Layer normalizes :obj:`inputs`."""
return tf.contrib.layers.layer_norm(inputs, begin_norm_axis=-1)


def pad_in_time(x, padding_shape):
"""Helper function to pad a tensor in the time dimension and retain the static depth dimension.

    Args:
        x: [Batch, Time, Frequency]
        padding_shape: [left, right] padding sizes (constant value 0) applied to the time dimension

    Returns:
        padded x
"""

depth = x.get_shape().as_list()[-1]
x = tf.pad(x, [[0, 0], padding_shape, [0, 0]])
x.set_shape((None, None, depth))

return x


def pad_in_time_right(x, padding_length):
"""Helper function to pad a tensor in the time dimension and retain the static depth dimension.

    Args:
        x: [Batch, Time, Frequency]
        padding_length: padding size (constant value 0) appended after the time dimension

    Returns:
        padded x
"""
depth = x.get_shape().as_list()[-1]
x = tf.pad(x, [[0, 0], [0, padding_length], [0, 0]])
x.set_shape((None, None, depth))

return x


def feed_forward(x, ffn_dim, memory_units, mode, dropout=0.0):
"""Implements the Transformer's "Feed Forward" layer.

.. math::

ffn(x) = max(0, x*W_1 + b_1)*W_2

Args:
x: The input.
ffn_dim: The number of units of the nonlinear transformation.
memory_units: the number of units of linear transformation
mode: A ``tf.estimator.ModeKeys`` mode.
dropout: The probability to drop units from the inner transformation.

Returns:
The transformed input.
"""
inner = tf.layers.conv1d(x, ffn_dim, 1, activation=tf.nn.relu)
inner = tf.layers.dropout(
inner, rate=dropout, training=mode == tf.estimator.ModeKeys.TRAIN)
outer = tf.layers.conv1d(inner, memory_units, 1, use_bias=False)

return outer


def drop_and_add(inputs, outputs, mode, dropout=0.0):
"""Drops units in the outputs and adds the previous values.

Args:
inputs: The input of the previous layer.
outputs: The output of the previous layer.
mode: A ``tf.estimator.ModeKeys`` mode.
dropout: The probability to drop units in :obj:`outputs`.

Returns:
The residual and normalized output.
"""
outputs = tf.layers.dropout(outputs, rate=dropout, training=mode)

input_dim = inputs.get_shape().as_list()[-1]
output_dim = outputs.get_shape().as_list()[-1]

if input_dim == output_dim:
outputs += inputs

return outputs


def MemoryBlock(
inputs,
filter_size,
mode,
mask=None,
dropout=0.0,
):
"""
Define the bidirectional memory block in FSMN

    Args:
        inputs: The output of the previous layer. [Batch, Time, Frequency]
        filter_size: memory block filter size
        mode: Training or Evaluation
        mask: A ``tf.Tensor`` applied to the memory block output
        dropout: dropout rate applied to the memory block output

    Returns:
        output: 3-D tensor ([Batch, Time, Frequency])
"""
static_shape = inputs.get_shape().as_list()
depth = static_shape[-1]
inputs = tf.expand_dims(inputs, axis=1) # [Batch, 1, Time, Frequency]
depthwise_filter = tf.get_variable(
'depth_conv_w',
shape=[1, filter_size, depth, 1],
initializer=tf.glorot_uniform_initializer(),
dtype=tf.float32)
memory = tf.nn.depthwise_conv2d(
input=inputs,
filter=depthwise_filter,
strides=[1, 1, 1, 1],
padding='SAME',
rate=[1, 1],
data_format='NHWC')
memory = memory + inputs
output = tf.layers.dropout(memory, rate=dropout, training=mode)
output = tf.reshape(
output,
[tf.shape(output)[0], tf.shape(output)[2], depth])
if mask is not None:
output = output * tf.expand_dims(mask, -1)

return output


def MemoryBlockV2(
inputs,
filter_size,
mode,
shift=0,
mask=None,
dropout=0.0,
):
"""
Define the bidirectional memory block in FSMN

    Args:
        inputs: The output of the previous layer. [Batch, Time, Frequency]
        filter_size: memory block filter size
        mode: Training or Evaluation
        shift: extra left padding, to control delay
        mask: A ``tf.Tensor`` applied to the memory block output
        dropout: dropout rate applied to the memory block output

    Returns:
        output: 3-D tensor ([Batch, Time, Frequency])
"""
if mask is not None:
inputs = inputs * tf.expand_dims(mask, -1)

static_shape = inputs.get_shape().as_list()
depth = static_shape[-1]
# padding
left_padding = int(round((filter_size - 1) / 2))
right_padding = int((filter_size - 1) / 2)
if shift > 0:
left_padding = left_padding + shift
right_padding = right_padding - shift
pad_inputs = pad_in_time(inputs, [left_padding, right_padding])
pad_inputs = tf.expand_dims(
pad_inputs, axis=1) # [Batch, 1, Time, Frequency]
depthwise_filter = tf.get_variable(
'depth_conv_w',
shape=[1, filter_size, depth, 1],
initializer=tf.glorot_uniform_initializer(),
dtype=tf.float32)
memory = tf.nn.depthwise_conv2d(
input=pad_inputs,
filter=depthwise_filter,
strides=[1, 1, 1, 1],
padding='VALID',
rate=[1, 1],
data_format='NHWC')
memory = tf.reshape(
memory,
[tf.shape(memory)[0], tf.shape(memory)[2], depth])
memory = memory + inputs
output = tf.layers.dropout(memory, rate=dropout, training=mode)
if mask is not None:
output = output * tf.expand_dims(mask, -1)

return output


def UniMemoryBlock(
inputs,
filter_size,
mode,
cache=None,
mask=None,
dropout=0.0,
):
"""
Define the unidirectional memory block in FSMN

    Args:
        inputs: The output of the previous layer. [Batch, Time, Frequency]
        filter_size: memory block filter size
        cache: query cache used for streaming inference
        mode: Training or Evaluation
        mask: A ``tf.Tensor`` applied to the memory block output
        dropout: dropout factor

    Returns:
        output: 3-D tensor ([Batch, Time, Frequency])
"""
if cache is not None:
static_shape = cache['queries'].get_shape().as_list()
depth = static_shape[-1]
queries = tf.slice(cache['queries'], [0, 1, 0], [
tf.shape(cache['queries'])[0],
tf.shape(cache['queries'])[1] - 1, depth
])
queries = tf.concat([queries, inputs], axis=1)
cache['queries'] = queries
else:
padding_length = filter_size - 1
queries = pad_in_time(inputs, [padding_length, 0])

queries = tf.expand_dims(queries, axis=1) # [Batch, 1, Time, Frequency]
static_shape = queries.get_shape().as_list()
depth = static_shape[-1]
depthwise_filter = tf.get_variable(
'depth_conv_w',
shape=[1, filter_size, depth, 1],
initializer=tf.glorot_uniform_initializer(),
dtype=tf.float32)
memory = tf.nn.depthwise_conv2d(
input=queries,
filter=depthwise_filter,
strides=[1, 1, 1, 1],
padding='VALID',
rate=[1, 1],
data_format='NHWC')
memory = tf.reshape(
memory,
[tf.shape(memory)[0], tf.shape(memory)[2], depth])
memory = memory + inputs
output = tf.layers.dropout(memory, rate=dropout, training=mode)
if mask is not None:
output = output * tf.expand_dims(mask, -1)

return output
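
In MemoryBlockV2, for an odd filter_size the left and right paddings always sum to filter_size - 1, so the 'VALID' depthwise convolution preserves the time length; shift only trades lookahead for extra past context. A quick numeric check in plain Python (no TensorFlow required):

def fsmn_padding(filter_size, shift=0):
    # mirrors the padding arithmetic in MemoryBlockV2 above (odd filter_size assumed)
    left = int(round((filter_size - 1) / 2))
    right = (filter_size - 1) // 2
    if shift > 0:
        left, right = left + shift, right - shift
    return left, right

print(fsmn_padding(11))      # (5, 5)  -> symmetric context
print(fsmn_padding(11, 3))   # (8, 2)  -> more past context, less lookahead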

+ 178
- 0
modelscope/models/audio/tts/am/models/fsmn_encoder.py View File

@@ -0,0 +1,178 @@
import tensorflow as tf

from . import fsmn


class FsmnEncoder():
"""Encoder using Fsmn
"""

def __init__(self,
filter_size,
fsmn_num_layers,
dnn_num_layers,
num_memory_units=512,
ffn_inner_dim=2048,
dropout=0.0,
position_encoder=None):
"""Initializes the parameters of the encoder.

Args:
filter_size: the total order of memory block
fsmn_num_layers: The number of fsmn layers.
dnn_num_layers: The number of dnn layers
            num_memory_units: The number of memory units.
ffn_inner_dim: The number of units of the inner linear transformation
in the feed forward layer.
dropout: The probability to drop units from the outputs.
position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
apply on inputs or ``None``.
"""
super(FsmnEncoder, self).__init__()
self.filter_size = filter_size
self.fsmn_num_layers = fsmn_num_layers
self.dnn_num_layers = dnn_num_layers
self.num_memory_units = num_memory_units
self.ffn_inner_dim = ffn_inner_dim
self.dropout = dropout
self.position_encoder = position_encoder

def encode(self, inputs, sequence_length=None, mode=True):
if self.position_encoder is not None:
inputs = self.position_encoder(inputs)

inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)

mask = fsmn.build_sequence_mask(
sequence_length, maximum_length=tf.shape(inputs)[1])

state = ()

for layer in range(self.fsmn_num_layers):
with tf.variable_scope('fsmn_layer_{}'.format(layer)):
with tf.variable_scope('ffn'):
context = fsmn.feed_forward(
inputs,
self.ffn_inner_dim,
self.num_memory_units,
mode,
dropout=self.dropout)

with tf.variable_scope('memory'):
memory = fsmn.MemoryBlock(
context,
self.filter_size,
mode,
mask=mask,
dropout=self.dropout)

memory = fsmn.drop_and_add(
inputs, memory, mode, dropout=self.dropout)

inputs = memory
state += (tf.reduce_mean(inputs, axis=1), )

for layer in range(self.dnn_num_layers):
with tf.variable_scope('dnn_layer_{}'.format(layer)):
transformed = fsmn.feed_forward(
inputs,
self.ffn_inner_dim,
self.num_memory_units,
mode,
dropout=self.dropout)

inputs = transformed
state += (tf.reduce_mean(inputs, axis=1), )

outputs = inputs
return (outputs, state, sequence_length)


class FsmnEncoderV2():
"""Encoder using Fsmn
"""

def __init__(self,
filter_size,
fsmn_num_layers,
dnn_num_layers,
num_memory_units=512,
ffn_inner_dim=2048,
dropout=0.0,
shift=0,
position_encoder=None):
"""Initializes the parameters of the encoder.

Args:
filter_size: the total order of memory block
fsmn_num_layers: The number of fsmn layers.
dnn_num_layers: The number of dnn layers
            num_memory_units: The number of memory units.
ffn_inner_dim: The number of units of the inner linear transformation
in the feed forward layer.
dropout: The probability to drop units from the outputs.
shift: left padding, to control delay
position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
apply on inputs or ``None``.
"""
super(FsmnEncoderV2, self).__init__()
self.filter_size = filter_size
self.fsmn_num_layers = fsmn_num_layers
self.dnn_num_layers = dnn_num_layers
self.num_memory_units = num_memory_units
self.ffn_inner_dim = ffn_inner_dim
self.dropout = dropout
self.shift = shift
if not isinstance(shift, list):
self.shift = [shift for _ in range(self.fsmn_num_layers)]
self.position_encoder = position_encoder

def encode(self, inputs, sequence_length=None, mode=True):
if self.position_encoder is not None:
inputs = self.position_encoder(inputs)

inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)

mask = fsmn.build_sequence_mask(
sequence_length, maximum_length=tf.shape(inputs)[1])

state = ()
for layer in range(self.fsmn_num_layers):
with tf.variable_scope('fsmn_layer_{}'.format(layer)):
with tf.variable_scope('ffn'):
context = fsmn.feed_forward(
inputs,
self.ffn_inner_dim,
self.num_memory_units,
mode,
dropout=self.dropout)

with tf.variable_scope('memory'):
memory = fsmn.MemoryBlockV2(
context,
self.filter_size,
mode,
shift=self.shift[layer],
mask=mask,
dropout=self.dropout)

memory = fsmn.drop_and_add(
inputs, memory, mode, dropout=self.dropout)

inputs = memory
state += (tf.reduce_mean(inputs, axis=1), )

for layer in range(self.dnn_num_layers):
with tf.variable_scope('dnn_layer_{}'.format(layer)):
transformed = fsmn.feed_forward(
inputs,
self.ffn_inner_dim,
self.num_memory_units,
mode,
dropout=self.dropout)

inputs = transformed
state += (tf.reduce_mean(inputs, axis=1), )

outputs = inputs
return (outputs, state, sequence_length)
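
FsmnEncoderV2 broadcasts a scalar shift into one value per FSMN layer, while a list is used as given. A constructor-only sketch (the hyperparameter values are illustrative, not a real configuration; TensorFlow must be importable because the module imports it at load time):

encoder = FsmnEncoderV2(
    filter_size=11,
    fsmn_num_layers=4,
    dnn_num_layers=1,
    num_memory_units=256,
    ffn_inner_dim=1024,
    dropout=0.1,
    shift=2)
print(encoder.shift)   # [2, 2, 2, 2]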

+ 160
- 0
modelscope/models/audio/tts/am/models/helpers.py View File

@@ -0,0 +1,160 @@
import numpy as np
import tensorflow as tf
from tensorflow.contrib.seq2seq import Helper


class VarTestHelper(Helper):

def __init__(self, batch_size, inputs, dim):
with tf.name_scope('VarTestHelper'):
self._batch_size = batch_size
self._inputs = inputs
self._dim = dim

num_steps = tf.shape(self._inputs)[1]
self._lengths = tf.tile([num_steps], [self._batch_size])

self._inputs = tf.roll(inputs, shift=-1, axis=1)
self._init_inputs = inputs[:, 0, :]

@property
def batch_size(self):
return self._batch_size

@property
def sample_ids_shape(self):
return tf.TensorShape([])

@property
def sample_ids_dtype(self):
return np.int32

def initialize(self, name=None):
return (tf.tile([False], [self._batch_size]),
_go_frames(self._batch_size, self._dim, self._init_inputs))

def sample(self, time, outputs, state, name=None):
return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them

def next_inputs(self, time, outputs, state, sample_ids, name=None):
with tf.name_scope('VarTestHelper'):
finished = (time + 1 >= self._lengths)
next_inputs = tf.concat([outputs, self._inputs[:, time, :]],
axis=-1)
return (finished, next_inputs, state)


class VarTrainingHelper(Helper):

def __init__(self, targets, inputs, dim):
with tf.name_scope('VarTrainingHelper'):
self._targets = targets # [N, T_in, 1]
self._batch_size = tf.shape(inputs)[0] # N
self._inputs = inputs
self._dim = dim

num_steps = tf.shape(self._targets)[1]
self._lengths = tf.tile([num_steps], [self._batch_size])

self._inputs = tf.roll(inputs, shift=-1, axis=1)
self._init_inputs = inputs[:, 0, :]

@property
def batch_size(self):
return self._batch_size

@property
def sample_ids_shape(self):
return tf.TensorShape([])

@property
def sample_ids_dtype(self):
return np.int32

def initialize(self, name=None):
return (tf.tile([False], [self._batch_size]),
_go_frames(self._batch_size, self._dim, self._init_inputs))

def sample(self, time, outputs, state, name=None):
return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them

def next_inputs(self, time, outputs, state, sample_ids, name=None):
with tf.name_scope(name or 'VarTrainingHelper'):
finished = (time + 1 >= self._lengths)
next_inputs = tf.concat(
[self._targets[:, time, :], self._inputs[:, time, :]], axis=-1)
return (finished, next_inputs, state)


class VarTrainingSSHelper(Helper):

def __init__(self, targets, inputs, dim, global_step, schedule_begin,
alpha, decay_steps):
with tf.name_scope('VarTrainingSSHelper'):
self._targets = targets # [N, T_in, 1]
self._batch_size = tf.shape(inputs)[0] # N
self._inputs = inputs
self._dim = dim

num_steps = tf.shape(self._targets)[1]
self._lengths = tf.tile([num_steps], [self._batch_size])

self._inputs = tf.roll(inputs, shift=-1, axis=1)
self._init_inputs = inputs[:, 0, :]

# for schedule sampling
self._global_step = global_step
self._schedule_begin = schedule_begin
self._alpha = alpha
self._decay_steps = decay_steps

@property
def batch_size(self):
return self._batch_size

@property
def sample_ids_shape(self):
return tf.TensorShape([])

@property
def sample_ids_dtype(self):
return np.int32

def initialize(self, name=None):
self._ratio = _tf_decay(self._global_step, self._schedule_begin,
self._alpha, self._decay_steps)
return (tf.tile([False], [self._batch_size]),
_go_frames(self._batch_size, self._dim, self._init_inputs))

def sample(self, time, outputs, state, name=None):
return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them

def next_inputs(self, time, outputs, state, sample_ids, name=None):
with tf.name_scope(name or 'VarTrainingHelper'):
finished = (time + 1 >= self._lengths)
next_inputs_tmp = tf.cond(
tf.less(
tf.random_uniform([], minval=0, maxval=1,
dtype=tf.float32), self._ratio),
lambda: self._targets[:, time, :], lambda: outputs)
next_inputs = tf.concat(
[next_inputs_tmp, self._inputs[:, time, :]], axis=-1)
return (finished, next_inputs, state)


def _go_frames(batch_size, dim, init_inputs):
'''Returns all-zero <GO> frames for a given batch size and output dimension'''
return tf.concat([tf.tile([[0.0]], [batch_size, dim]), init_inputs],
axis=-1)


def _tf_decay(global_step, schedule_begin, alpha, decay_steps):
tfr = tf.train.exponential_decay(
1.0,
global_step=global_step - schedule_begin,
decay_steps=decay_steps,
decay_rate=alpha,
name='tfr_decay')
final_tfr = tf.cond(
tf.less(global_step, schedule_begin), lambda: 1.0, lambda: tfr)
return final_tfr
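
The scheduled-sampling ratio in VarTrainingSSHelper stays at 1.0 (pure teacher forcing) until schedule_begin, then decays exponentially with base alpha every decay_steps steps. A plain-Python equivalent of _tf_decay for a quick sanity check (the step values below are made up):

def teacher_forcing_ratio(global_step, schedule_begin, alpha, decay_steps):
    if global_step < schedule_begin:
        return 1.0
    return alpha ** ((global_step - schedule_begin) / decay_steps)

print(teacher_forcing_ratio(5000, 10000, 0.9, 20000))            # 1.0
print(teacher_forcing_ratio(30000, 10000, 0.9, 20000))           # 0.9
print(round(teacher_forcing_ratio(50000, 10000, 0.9, 20000), 4))  # 0.81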

+ 461
- 0
modelscope/models/audio/tts/am/models/modules.py View File

@@ -0,0 +1,461 @@
import tensorflow as tf
from tensorflow.contrib.cudnn_rnn import CudnnLSTM
from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
from tensorflow.contrib.rnn import LSTMBlockCell


def encoder_prenet(inputs,
n_conv_layers,
filters,
kernel_size,
dense_units,
is_training,
mask=None,
scope='encoder_prenet'):
x = inputs
with tf.variable_scope(scope):
for i in range(n_conv_layers):
x = conv1d(
x,
filters,
kernel_size,
is_training,
activation=tf.nn.relu,
dropout=True,
mask=mask,
scope='conv1d_{}'.format(i))
x = tf.layers.dense(
x, units=dense_units, activation=None, name='dense')
return x


def decoder_prenet(inputs,
prenet_units,
dense_units,
is_training,
scope='decoder_prenet'):
x = inputs
with tf.variable_scope(scope):
for i, units in enumerate(prenet_units):
x = tf.layers.dense(
x,
units=units,
activation=tf.nn.relu,
name='dense_{}'.format(i))
x = tf.layers.dropout(
x, rate=0.5, training=is_training, name='dropout_{}'.format(i))
x = tf.layers.dense(
x, units=dense_units, activation=None, name='dense')
return x


def encoder(inputs,
input_lengths,
n_conv_layers,
filters,
kernel_size,
lstm_units,
is_training,
embedded_inputs_speaker,
mask=None,
scope='encoder'):
with tf.variable_scope(scope):
x = conv_and_lstm(
inputs,
input_lengths,
n_conv_layers,
filters,
kernel_size,
lstm_units,
is_training,
embedded_inputs_speaker,
mask=mask)
return x


def prenet(inputs, prenet_units, is_training, scope='prenet'):
x = inputs
with tf.variable_scope(scope):
for i, units in enumerate(prenet_units):
x = tf.layers.dense(
x,
units=units,
activation=tf.nn.relu,
name='dense_{}'.format(i))
x = tf.layers.dropout(
x, rate=0.5, training=is_training, name='dropout_{}'.format(i))
return x


def postnet_residual_ulstm(inputs,
n_conv_layers,
filters,
kernel_size,
lstm_units,
output_units,
is_training,
scope='postnet_residual_ulstm'):
with tf.variable_scope(scope):
x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size,
lstm_units, is_training)
x = conv1d(
x,
output_units,
kernel_size,
is_training,
activation=None,
dropout=False,
scope='conv1d_{}'.format(n_conv_layers - 1))
return x


def postnet_residual_lstm(inputs,
n_conv_layers,
filters,
kernel_size,
lstm_units,
output_units,
is_training,
scope='postnet_residual_lstm'):
with tf.variable_scope(scope):
x = conv_and_lstm(inputs, None, n_conv_layers, filters, kernel_size,
lstm_units, is_training)
x = conv1d(
x,
output_units,
kernel_size,
is_training,
activation=None,
dropout=False,
scope='conv1d_{}'.format(n_conv_layers - 1))
return x


def postnet_linear_ulstm(inputs,
n_conv_layers,
filters,
kernel_size,
lstm_units,
output_units,
is_training,
scope='postnet_linear'):
with tf.variable_scope(scope):
x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size,
lstm_units, is_training)
x = tf.layers.dense(x, units=output_units)
return x


def postnet_linear_lstm(inputs,
n_conv_layers,
filters,
kernel_size,
lstm_units,
output_units,
output_lengths,
is_training,
embedded_inputs_speaker2,
mask=None,
scope='postnet_linear'):
with tf.variable_scope(scope):
x = conv_and_lstm_dec(
inputs,
output_lengths,
n_conv_layers,
filters,
kernel_size,
lstm_units,
is_training,
embedded_inputs_speaker2,
mask=mask)
x = tf.layers.dense(x, units=output_units)
return x


def postnet_linear(inputs,
n_conv_layers,
filters,
kernel_size,
lstm_units,
output_units,
output_lengths,
is_training,
embedded_inputs_speaker2,
mask=None,
scope='postnet_linear'):
with tf.variable_scope(scope):
x = conv_dec(
inputs,
output_lengths,
n_conv_layers,
filters,
kernel_size,
lstm_units,
is_training,
embedded_inputs_speaker2,
mask=mask)
return x


def conv_and_lstm(inputs,
sequence_lengths,
n_conv_layers,
filters,
kernel_size,
lstm_units,
is_training,
embedded_inputs_speaker,
mask=None,
scope='conv_and_lstm'):
x = inputs
with tf.variable_scope(scope):
for i in range(n_conv_layers):
x = conv1d(
x,
filters,
kernel_size,
is_training,
activation=tf.nn.relu,
dropout=True,
mask=mask,
scope='conv1d_{}'.format(i))

x = tf.concat([x, embedded_inputs_speaker], axis=2)

outputs, states = tf.nn.bidirectional_dynamic_rnn(
LSTMBlockCell(lstm_units),
LSTMBlockCell(lstm_units),
x,
sequence_length=sequence_lengths,
dtype=tf.float32)
x = tf.concat(outputs, axis=-1)

return x


def conv_and_lstm_dec(inputs,
sequence_lengths,
n_conv_layers,
filters,
kernel_size,
lstm_units,
is_training,
embedded_inputs_speaker2,
mask=None,
scope='conv_and_lstm'):
x = inputs
with tf.variable_scope(scope):
for i in range(n_conv_layers):
x = conv1d(
x,
filters,
kernel_size,
is_training,
activation=tf.nn.relu,
dropout=True,
mask=mask,
scope='conv1d_{}'.format(i))

x = tf.concat([x, embedded_inputs_speaker2], axis=2)

outputs, states = tf.nn.bidirectional_dynamic_rnn(
LSTMBlockCell(lstm_units),
LSTMBlockCell(lstm_units),
x,
sequence_length=sequence_lengths,
dtype=tf.float32)
x = tf.concat(outputs, axis=-1)
return x


def conv_dec(inputs,
sequence_lengths,
n_conv_layers,
filters,
kernel_size,
lstm_units,
is_training,
embedded_inputs_speaker2,
mask=None,
scope='conv_and_lstm'):
x = inputs
with tf.variable_scope(scope):
for i in range(n_conv_layers):
x = conv1d(
x,
filters,
kernel_size,
is_training,
activation=tf.nn.relu,
dropout=True,
mask=mask,
scope='conv1d_{}'.format(i))
x = tf.concat([x, embedded_inputs_speaker2], axis=2)
return x


def conv_and_ulstm(inputs,
sequence_lengths,
n_conv_layers,
filters,
kernel_size,
lstm_units,
is_training,
scope='conv_and_ulstm'):
x = inputs
with tf.variable_scope(scope):
for i in range(n_conv_layers):
x = conv1d(
x,
filters,
kernel_size,
is_training,
activation=tf.nn.relu,
dropout=True,
scope='conv1d_{}'.format(i))

outputs, states = tf.nn.dynamic_rnn(
LSTMBlockCell(lstm_units),
x,
sequence_length=sequence_lengths,
dtype=tf.float32)

return outputs


def conv1d(inputs,
filters,
kernel_size,
is_training,
activation=None,
dropout=False,
mask=None,
scope='conv1d'):
with tf.variable_scope(scope):
if mask is not None:
inputs = inputs * tf.expand_dims(mask, -1)
x = tf.layers.conv1d(
inputs, filters=filters, kernel_size=kernel_size, padding='same')
if mask is not None:
x = x * tf.expand_dims(mask, -1)

x = tf.layers.batch_normalization(x, training=is_training)
if activation is not None:
x = activation(x)
if dropout:
x = tf.layers.dropout(x, rate=0.5, training=is_training)
return x


def conv1d_dp(inputs,
filters,
kernel_size,
is_training,
activation=None,
dropout=False,
dropoutrate=0.5,
mask=None,
scope='conv1d'):
with tf.variable_scope(scope):
if mask is not None:
inputs = inputs * tf.expand_dims(mask, -1)
x = tf.layers.conv1d(
inputs, filters=filters, kernel_size=kernel_size, padding='same')
if mask is not None:
x = x * tf.expand_dims(mask, -1)

x = tf.contrib.layers.layer_norm(x)
if activation is not None:
x = activation(x)
if dropout:
x = tf.layers.dropout(x, rate=dropoutrate, training=is_training)
return x


def duration_predictor(inputs,
n_conv_layers,
filters,
kernel_size,
lstm_units,
input_lengths,
is_training,
embedded_inputs_speaker,
mask=None,
scope='duration_predictor'):
with tf.variable_scope(scope):
x = inputs
for i in range(n_conv_layers):
x = conv1d_dp(
x,
filters,
kernel_size,
is_training,
activation=tf.nn.relu,
dropout=True,
dropoutrate=0.1,
mask=mask,
scope='conv1d_{}'.format(i))

x = tf.concat([x, embedded_inputs_speaker], axis=2)

outputs, states = tf.nn.bidirectional_dynamic_rnn(
LSTMBlockCell(lstm_units),
LSTMBlockCell(lstm_units),
x,
sequence_length=input_lengths,
dtype=tf.float32)
x = tf.concat(outputs, axis=-1)

x = tf.layers.dense(x, units=1)
x = tf.nn.relu(x)
return x


def duration_predictor2(inputs,
n_conv_layers,
filters,
kernel_size,
input_lengths,
is_training,
mask=None,
scope='duration_predictor'):
with tf.variable_scope(scope):
x = inputs
for i in range(n_conv_layers):
x = conv1d_dp(
x,
filters,
kernel_size,
is_training,
activation=tf.nn.relu,
dropout=True,
dropoutrate=0.1,
mask=mask,
scope='conv1d_{}'.format(i))

x = tf.layers.dense(x, units=1)
x = tf.nn.relu(x)
return x


def conv_prenet(inputs,
n_conv_layers,
filters,
kernel_size,
is_training,
mask=None,
scope='conv_prenet'):
x = inputs
with tf.variable_scope(scope):
for i in range(n_conv_layers):
x = conv1d(
x,
filters,
kernel_size,
is_training,
activation=tf.nn.relu,
dropout=True,
mask=mask,
scope='conv1d_{}'.format(i))

return x

+ 174
- 0
modelscope/models/audio/tts/am/models/position.py View File

@@ -0,0 +1,174 @@
"""Define position encoder classes."""

import abc
import math

import tensorflow as tf

from .reducer import SumReducer


class PositionEncoder(tf.keras.layers.Layer):
"""Base class for position encoders."""

def __init__(self, reducer=None, **kwargs):
"""Initializes the position encoder.
Args:
reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position
encodings. Defaults to :class:`opennmt.layers.SumReducer`.
**kwargs: Additional layer keyword arguments.
"""
super(PositionEncoder, self).__init__(**kwargs)
if reducer is None:
reducer = SumReducer(dtype=kwargs.get('dtype'))
self.reducer = reducer

def call(self, inputs, position=None): # pylint: disable=arguments-differ
"""Add position encodings to :obj:`inputs`.
Args:
inputs: The inputs to encode.
position: The single position to encode, to use when this layer is called
step by step.
Returns:
A ``tf.Tensor`` whose shape depends on the configured ``reducer``.
"""
batch_size = tf.shape(inputs)[0]
timesteps = tf.shape(inputs)[1]
input_dim = inputs.shape[-1].value
positions = tf.range(timesteps) + 1 if position is None else [position]
position_encoding = self._encode([positions], input_dim)
position_encoding = tf.tile(position_encoding, [batch_size, 1, 1])
return self.reducer([inputs, position_encoding])

@abc.abstractmethod
def _encode(self, positions, depth):
"""Creates position encodings.
Args:
positions: The positions to encode of shape :math:`[B, ...]`.
depth: The encoding depth :math:`D`.
Returns:
A ``tf.Tensor`` of shape :math:`[B, ..., D]`.
"""
raise NotImplementedError()


class PositionEmbedder(PositionEncoder):
"""Encodes position with a lookup table."""

def __init__(self, maximum_position=128, reducer=None, **kwargs):
"""Initializes the position encoder.
Args:
maximum_position: The maximum position to embed. Positions greater
than this value will be set to :obj:`maximum_position`.
reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position
encodings. Defaults to :class:`opennmt.layers.SumReducer`.
**kwargs: Additional layer keyword arguments.
"""
super(PositionEmbedder, self).__init__(reducer=reducer, **kwargs)
self.maximum_position = maximum_position
self.embedding = None

def build(self, input_shape):
shape = [self.maximum_position + 1, input_shape[-1]]
self.embedding = self.add_weight('position_embedding', shape)
super(PositionEmbedder, self).build(input_shape)

def _encode(self, positions, depth):
positions = tf.minimum(positions, self.maximum_position)
return tf.nn.embedding_lookup(self.embedding, positions)


class SinusoidalPositionEncoder(PositionEncoder):
"""Encodes positions with sine waves as described in
https://arxiv.org/abs/1706.03762.
"""

def _encode(self, positions, depth):
if depth % 2 != 0:
raise ValueError(
                'SinusoidalPositionEncoder expects the depth to be divisible '
'by 2 but got %d' % depth)

batch_size = tf.shape(positions)[0]
positions = tf.cast(positions, tf.float32)

log_timescale_increment = math.log(10000) / (depth / 2 - 1)
inv_timescales = tf.exp(
tf.range(depth / 2, dtype=tf.float32) * -log_timescale_increment)
inv_timescales = tf.reshape(
tf.tile(inv_timescales, [batch_size]), [batch_size, depth // 2])
scaled_time = tf.expand_dims(positions, -1) * tf.expand_dims(
inv_timescales, 1)
encoding = tf.concat(
[tf.sin(scaled_time), tf.cos(scaled_time)], axis=2)
return tf.cast(encoding, self.dtype)


class SinusodalPositionalEncoding(tf.keras.layers.Layer):

def __init__(self, name='SinusodalPositionalEncoding'):
super(SinusodalPositionalEncoding, self).__init__(name=name)

@staticmethod
def positional_encoding(len, dim, step=1.):
"""
:param len: int scalar
:param dim: int scalar
        :param step: position step size
:return: position embedding
"""
pos_mat = tf.tile(
tf.expand_dims(
tf.range(0, tf.cast(len, dtype=tf.float32), dtype=tf.float32)
* step,
axis=-1), [1, dim])
dim_mat = tf.tile(
tf.expand_dims(
tf.range(0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32),
axis=0), [len, 1])
dim_mat_int = tf.cast(dim_mat, dtype=tf.int32)
pos_encoding = tf.where( # [time, dims]
tf.math.equal(tf.math.mod(dim_mat_int, 2), 0),
x=tf.math.sin(
pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))),
y=tf.math.cos(pos_mat
/ tf.pow(10000.,
(dim_mat - 1) / tf.cast(dim, tf.float32))))
return pos_encoding


class BatchSinusodalPositionalEncoding(tf.keras.layers.Layer):

def __init__(self, name='BatchSinusodalPositionalEncoding'):
super(BatchSinusodalPositionalEncoding, self).__init__(name=name)

@staticmethod
def positional_encoding(batch_size, len, dim, pos_mat, step=1.):
"""
        :param batch_size: int scalar
        :param len: int scalar
        :param dim: int scalar
        :param step: position step size
        :param pos_mat: [B, len] per-frame position indices
        :return: position embedding
"""
pos_mat = tf.tile(
tf.expand_dims(tf.cast(pos_mat, dtype=tf.float32) * step, axis=-1),
[1, 1, dim]) # [B, len, dim]

dim_mat = tf.tile(
tf.expand_dims(
tf.expand_dims(
tf.range(
0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32),
axis=0),
axis=0), [batch_size, len, 1]) # [B, len, dim]

dim_mat_int = tf.cast(dim_mat, dtype=tf.int32)
pos_encoding = tf.where( # [B, time, dims]
tf.math.equal(tf.mod(dim_mat_int, 2), 0),
x=tf.math.sin(
pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))),
y=tf.math.cos(pos_mat
/ tf.pow(10000.,
(dim_mat - 1) / tf.cast(dim, tf.float32))))
return pos_encoding
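
The encoding computed by SinusodalPositionalEncoding.positional_encoding pairs sine on even feature indices with cosine on odd ones, both driven by 10000 ** (k / dim) timescales. A NumPy cross-check sketch of the same formula (not code from this diff):

import numpy as np

def positional_encoding(length, dim, step=1.0):
    pos = np.arange(length, dtype=np.float32)[:, None] * step   # [T, 1]
    k = np.arange(dim, dtype=np.float32)[None, :]                # [1, D]
    angle = np.where(k % 2 == 0,
                     pos / np.power(10000.0, k / dim),
                     pos / np.power(10000.0, (k - 1) / dim))
    return np.where(k % 2 == 0, np.sin(angle), np.cos(angle))

print(positional_encoding(4, 8).shape)   # (4, 8)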

+ 155
- 0
modelscope/models/audio/tts/am/models/reducer.py View File

@@ -0,0 +1,155 @@
"""Define reducers: objects that merge inputs."""

import abc
import functools

import tensorflow as tf


def pad_in_time(x, padding_length):
"""Helper function to pad a tensor in the time dimension and retain the static depth dimension."""
return tf.pad(x, [[0, 0], [0, padding_length], [0, 0]])


def align_in_time(x, length):
"""Aligns the time dimension of :obj:`x` with :obj:`length`."""
time_dim = tf.shape(x)[1]
return tf.cond(
tf.less(time_dim, length),
true_fn=lambda: pad_in_time(x, length - time_dim),
false_fn=lambda: x[:, :length])


def pad_with_identity(x,
sequence_length,
max_sequence_length,
identity_values=0,
maxlen=None):
"""Pads a tensor with identity values up to :obj:`max_sequence_length`.
Args:
x: A ``tf.Tensor`` of shape ``[batch_size, time, depth]``.
sequence_length: The true sequence length of :obj:`x`.
        max_sequence_length: The sequence length up to which the tensor must contain
            :obj:`identity_values`.
        identity_values: The identity value.
        maxlen: Size of the output time dimension. Default is the maximum value in
            :obj:`max_sequence_length`.
Returns:
A ``tf.Tensor`` of shape ``[batch_size, maxlen, depth]``.
"""
if maxlen is None:
maxlen = tf.reduce_max(max_sequence_length)

mask = tf.sequence_mask(sequence_length, maxlen=maxlen, dtype=x.dtype)
mask = tf.expand_dims(mask, axis=-1)
mask_combined = tf.sequence_mask(
max_sequence_length, maxlen=maxlen, dtype=x.dtype)
mask_combined = tf.expand_dims(mask_combined, axis=-1)

identity_mask = mask_combined * (1.0 - mask)

x = pad_in_time(x, maxlen - tf.shape(x)[1])
x = x * mask + (identity_mask * identity_values)

return x


def pad_n_with_identity(inputs, sequence_lengths, identity_values=0):
"""Pads each input tensors with identity values up to
``max(sequence_lengths)`` for each batch.
Args:
inputs: A list of ``tf.Tensor``.
sequence_lengths: A list of sequence length.
identity_values: The identity value.
Returns:
A tuple ``(padded, max_sequence_length)`` which are respectively a list of
``tf.Tensor`` where each tensor are padded with identity and the combined
sequence length.
"""
max_sequence_length = tf.reduce_max(sequence_lengths, axis=0)
maxlen = tf.reduce_max([tf.shape(x)[1] for x in inputs])
padded = [
pad_with_identity(
x,
length,
max_sequence_length,
identity_values=identity_values,
maxlen=maxlen) for x, length in zip(inputs, sequence_lengths)
]
return padded, max_sequence_length


class Reducer(tf.keras.layers.Layer):
"""Base class for reducers."""

def zip_and_reduce(self, x, y):
"""Zips the :obj:`x` with :obj:`y` structures together and reduces all
elements. If the structures are nested, they will be flattened first.
Args:
x: The first structure.
y: The second structure.
Returns:
The same structure as :obj:`x` and :obj:`y` where each element from
:obj:`x` is reduced with the correspond element from :obj:`y`.
Raises:
ValueError: if the two structures are not the same.
"""
tf.nest.assert_same_structure(x, y)
x_flat = tf.nest.flatten(x)
y_flat = tf.nest.flatten(y)
reduced = list(map(self, zip(x_flat, y_flat)))
return tf.nest.pack_sequence_as(x, reduced)

def call(self, inputs, sequence_length=None): # pylint: disable=arguments-differ
"""Reduces all input elements.
Args:
inputs: A list of ``tf.Tensor``.
sequence_length: The length of each input, if reducing sequences.
Returns:
If :obj:`sequence_length` is set, a tuple
``(reduced_input, reduced_length)``, otherwise a reduced ``tf.Tensor``
only.
"""
if sequence_length is None:
return self.reduce(inputs)
else:
return self.reduce_sequence(
inputs, sequence_lengths=sequence_length)

@abc.abstractmethod
def reduce(self, inputs):
"""See :meth:`opennmt.layers.Reducer.__call__`."""
raise NotImplementedError()

@abc.abstractmethod
def reduce_sequence(self, inputs, sequence_lengths):
"""See :meth:`opennmt.layers.Reducer.__call__`."""
raise NotImplementedError()


class SumReducer(Reducer):
"""A reducer that sums the inputs."""

def reduce(self, inputs):
if len(inputs) == 1:
return inputs[0]
if len(inputs) == 2:
return inputs[0] + inputs[1]
return tf.add_n(inputs)

def reduce_sequence(self, inputs, sequence_lengths):
padded, combined_length = pad_n_with_identity(
inputs, sequence_lengths, identity_values=0)
return self.reduce(padded), combined_length


class MultiplyReducer(Reducer):
"""A reducer that multiplies the inputs."""

def reduce(self, inputs):
return functools.reduce(lambda a, x: a * x, inputs)

def reduce_sequence(self, inputs, sequence_lengths):
padded, combined_length = pad_n_with_identity(
inputs, sequence_lengths, identity_values=1)
return self.reduce(padded), combined_length
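
SumReducer pads every input with the identity value (0 for sums) up to the longest sequence before adding, so a shorter input cannot corrupt the tail of a longer one. A hedged usage sketch, assuming TF2 eager execution and that the classes above are in scope:

import tensorflow as tf

a = tf.random.normal([2, 5, 4])
b = tf.random.normal([2, 3, 4])
lengths_a = tf.constant([5, 4])
lengths_b = tf.constant([3, 2])
summed, combined = SumReducer()([a, b], sequence_length=[lengths_a, lengths_b])
print(summed.shape, combined.numpy())   # (2, 5, 4) [5 4]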

+ 240
- 0
modelscope/models/audio/tts/am/models/rnn_wrappers.py View File

@@ -0,0 +1,240 @@
import numpy as np
import tensorflow as tf
from tensorflow.contrib.rnn import RNNCell
from tensorflow.contrib.seq2seq import AttentionWrapperState
from tensorflow.python.ops import rnn_cell_impl

from .modules import prenet


class VarPredictorCell(RNNCell):
'''Wrapper wrapper knock knock.'''

def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
super(VarPredictorCell, self).__init__()
self._var_predictor_cell = var_predictor_cell
self._is_training = is_training
self._dim = dim
self._prenet_units = prenet_units

@property
def state_size(self):
return tuple([self.output_size, self._var_predictor_cell.state_size])

@property
def output_size(self):
return self._dim

def zero_state(self, batch_size, dtype):
return tuple([
rnn_cell_impl._zero_state_tensors(self.output_size, batch_size,
dtype),
self._var_predictor_cell.zero_state(batch_size, dtype)
])

def call(self, inputs, state):
'''Run the Tacotron2 super decoder cell.'''
super_cell_out, decoder_state = state

# split
prenet_input = inputs[:, 0:self._dim]
encoder_output = inputs[:, self._dim:]

# prenet and concat
prenet_output = prenet(
prenet_input,
self._prenet_units,
self._is_training,
scope='var_prenet')
decoder_input = tf.concat([prenet_output, encoder_output], axis=-1)

# decoder LSTM/GRU
new_super_cell_out, new_decoder_state = self._var_predictor_cell(
decoder_input, decoder_state)

# projection
new_super_cell_out = tf.layers.dense(
new_super_cell_out, units=self._dim)

new_states = tuple([new_super_cell_out, new_decoder_state])

return new_super_cell_out, new_states


class DurPredictorCell(RNNCell):
'''Wrapper wrapper knock knock.'''

def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
super(DurPredictorCell, self).__init__()
self._var_predictor_cell = var_predictor_cell
self._is_training = is_training
self._dim = dim
self._prenet_units = prenet_units

@property
def state_size(self):
return tuple([self.output_size, self._var_predictor_cell.state_size])

@property
def output_size(self):
return self._dim

def zero_state(self, batch_size, dtype):
return tuple([
rnn_cell_impl._zero_state_tensors(self.output_size, batch_size,
dtype),
self._var_predictor_cell.zero_state(batch_size, dtype)
])

def call(self, inputs, state):
'''Run the Tacotron2 super decoder cell.'''
super_cell_out, decoder_state = state

# split
prenet_input = inputs[:, 0:self._dim]
encoder_output = inputs[:, self._dim:]

# prenet and concat
prenet_output = prenet(
prenet_input,
self._prenet_units,
self._is_training,
scope='dur_prenet')
decoder_input = tf.concat([prenet_output, encoder_output], axis=-1)

# decoder LSTM/GRU
new_super_cell_out, new_decoder_state = self._var_predictor_cell(
decoder_input, decoder_state)

# projection
new_super_cell_out = tf.layers.dense(
new_super_cell_out, units=self._dim)
new_super_cell_out = tf.nn.relu(new_super_cell_out)
# new_super_cell_out = tf.log(tf.cast(tf.round(tf.exp(new_super_cell_out) - 1), tf.float32) + 1)

new_states = tuple([new_super_cell_out, new_decoder_state])

return new_super_cell_out, new_states


class DurPredictorCECell(RNNCell):
'''Wrapper wrapper knock knock.'''

def __init__(self, var_predictor_cell, is_training, dim, prenet_units,
max_dur, dur_embedding_dim):
super(DurPredictorCECell, self).__init__()
self._var_predictor_cell = var_predictor_cell
self._is_training = is_training
self._dim = dim
self._prenet_units = prenet_units
self._max_dur = max_dur
self._dur_embedding_dim = dur_embedding_dim

@property
def state_size(self):
return tuple([self.output_size, self._var_predictor_cell.state_size])

@property
def output_size(self):
return self._max_dur

def zero_state(self, batch_size, dtype):
return tuple([
rnn_cell_impl._zero_state_tensors(self.output_size, batch_size,
dtype),
self._var_predictor_cell.zero_state(batch_size, dtype)
])

def call(self, inputs, state):
'''Run the Tacotron2 super decoder cell.'''
super_cell_out, decoder_state = state

# split
prenet_input = tf.squeeze(
tf.cast(inputs[:, 0:self._dim], tf.int32), axis=-1) # [N]
prenet_input = tf.one_hot(
prenet_input, self._max_dur, on_value=1.0, off_value=0.0,
axis=-1) # [N, 120]
prenet_input = tf.layers.dense(
prenet_input, units=self._dur_embedding_dim)
encoder_output = inputs[:, self._dim:]

# prenet and concat
prenet_output = prenet(
prenet_input,
self._prenet_units,
self._is_training,
scope='dur_prenet')
decoder_input = tf.concat([prenet_output, encoder_output], axis=-1)

# decoder LSTM/GRU
new_super_cell_out, new_decoder_state = self._var_predictor_cell(
decoder_input, decoder_state)

# projection
new_super_cell_out = tf.layers.dense(
new_super_cell_out, units=self._max_dur) # [N, 120]
new_super_cell_out = tf.nn.softmax(new_super_cell_out) # [N, 120]

new_states = tuple([new_super_cell_out, new_decoder_state])

return new_super_cell_out, new_states


class VarPredictorCell2(RNNCell):
'''Wrapper wrapper knock knock.'''

def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
super(VarPredictorCell2, self).__init__()
self._var_predictor_cell = var_predictor_cell
self._is_training = is_training
self._dim = dim
self._prenet_units = prenet_units

@property
def state_size(self):
return tuple([self.output_size, self._var_predictor_cell.state_size])

@property
def output_size(self):
return self._dim

def zero_state(self, batch_size, dtype):
return tuple([
rnn_cell_impl._zero_state_tensors(self.output_size, batch_size,
dtype),
self._var_predictor_cell.zero_state(batch_size, dtype)
])

def call(self, inputs, state):
'''Run the Tacotron2 super decoder cell.'''
super_cell_out, decoder_state = state

# split
prenet_input = inputs[:, 0:self._dim]
encoder_output = inputs[:, self._dim:]

# prenet and concat
prenet_output = prenet(
prenet_input,
self._prenet_units,
self._is_training,
scope='var_prenet')
decoder_input = tf.concat([prenet_output, encoder_output], axis=-1)

# decoder LSTM/GRU
new_super_cell_out, new_decoder_state = self._var_predictor_cell(
decoder_input, decoder_state)

# projection
new_super_cell_out = tf.layers.dense(
new_super_cell_out, units=self._dim)

# split and relu
new_super_cell_out = tf.concat([
tf.nn.relu(new_super_cell_out[:, 0:1]), new_super_cell_out[:, 1:]
], axis=-1) # yapf:disable

new_states = tuple([new_super_cell_out, new_decoder_state])

return new_super_cell_out, new_states

+ 760
- 0
modelscope/models/audio/tts/am/models/robutrans.py View File

@@ -0,0 +1,760 @@
import tensorflow as tf
from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell
from tensorflow.contrib.seq2seq import BasicDecoder
from tensorflow.python.ops.ragged.ragged_util import repeat

from .fsmn_encoder import FsmnEncoderV2
from .helpers import VarTestHelper, VarTrainingHelper
from .modules import conv_prenet, decoder_prenet, encoder_prenet
from .position import (BatchSinusodalPositionalEncoding,
SinusodalPositionalEncoding)
from .rnn_wrappers import DurPredictorCell, VarPredictorCell
from .self_attention_decoder import SelfAttentionDecoder
from .self_attention_encoder import SelfAttentionEncoder


class RobuTrans():

def __init__(self, hparams):
self._hparams = hparams

def initialize(self,
inputs,
inputs_emotion,
inputs_speaker,
input_lengths,
output_lengths=None,
mel_targets=None,
durations=None,
pitch_contours=None,
uv_masks=None,
pitch_scales=None,
duration_scales=None,
energy_contours=None,
energy_scales=None):
'''Initializes the model for inference.

Sets "mel_outputs", "linear_outputs", "stop_token_outputs", and "alignments" fields.

Args:
inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
steps in the input time series, and values are character IDs
input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
of each sequence in inputs.
output_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
of each sequence in outputs.
mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
of steps in the output time series, M is num_mels, and values are entries in the mel
spectrogram. Only needed for training.
'''
with tf.variable_scope('inference') as _:
is_training = mel_targets is not None
batch_size = tf.shape(inputs)[0]
hp = self._hparams

input_mask = None
if input_lengths is not None and is_training:
input_mask = tf.sequence_mask(
input_lengths, tf.shape(inputs)[1], dtype=tf.float32)

if input_mask is not None:
inputs = inputs * tf.expand_dims(input_mask, -1)

# speaker embedding
embedded_inputs_speaker = tf.layers.dense(
inputs_speaker,
32,
activation=None,
use_bias=False,
kernel_initializer=tf.truncated_normal_initializer(stddev=0.5))

# emotion embedding
embedded_inputs_emotion = tf.layers.dense(
inputs_emotion,
32,
activation=None,
use_bias=False,
kernel_initializer=tf.truncated_normal_initializer(stddev=0.5))

# symbol embedding
with tf.variable_scope('Embedding'):
embedded_inputs = tf.layers.dense(
inputs,
hp.embedding_dim,
activation=None,
use_bias=False,
kernel_initializer=tf.truncated_normal_initializer(
stddev=0.5))

# Encoder
with tf.variable_scope('Encoder'):
Encoder = SelfAttentionEncoder(
num_layers=hp.encoder_num_layers,
num_units=hp.encoder_num_units,
num_heads=hp.encoder_num_heads,
ffn_inner_dim=hp.encoder_ffn_inner_dim,
dropout=hp.encoder_dropout,
attention_dropout=hp.encoder_attention_dropout,
relu_dropout=hp.encoder_relu_dropout)
encoder_outputs, state_mo, sequence_length_mo, attns = Encoder.encode(
embedded_inputs,
sequence_length=input_lengths,
mode=is_training)
encoder_outputs = tf.layers.dense(
encoder_outputs,
hp.encoder_projection_units,
activation=None,
use_bias=False,
kernel_initializer=tf.truncated_normal_initializer(
stddev=0.5))

# pitch and energy
var_inputs = tf.concat([
encoder_outputs, embedded_inputs_speaker,
embedded_inputs_emotion
], 2)
if input_mask is not None:
var_inputs = var_inputs * tf.expand_dims(input_mask, -1)

with tf.variable_scope('Pitch_Predictor'):
Pitch_Predictor_FSMN = FsmnEncoderV2(
filter_size=hp.predictor_filter_size,
fsmn_num_layers=hp.predictor_fsmn_num_layers,
dnn_num_layers=hp.predictor_dnn_num_layers,
num_memory_units=hp.predictor_num_memory_units,
ffn_inner_dim=hp.predictor_ffn_inner_dim,
dropout=hp.predictor_dropout,
shift=hp.predictor_shift,
position_encoder=None)
pitch_contour_outputs, _, _ = Pitch_Predictor_FSMN.encode(
tf.concat([
encoder_outputs, embedded_inputs_speaker,
embedded_inputs_emotion
], 2),
sequence_length=input_lengths,
mode=is_training)
pitch_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
LSTMBlockCell(hp.predictor_lstm_units),
LSTMBlockCell(hp.predictor_lstm_units),
pitch_contour_outputs,
sequence_length=input_lengths,
dtype=tf.float32)
pitch_contour_outputs = tf.concat(
pitch_contour_outputs, axis=-1)
pitch_contour_outputs = tf.layers.dense(
pitch_contour_outputs, units=1) # [N, T_in, 1]
pitch_contour_outputs = tf.squeeze(
pitch_contour_outputs, axis=2) # [N, T_in]

with tf.variable_scope('Energy_Predictor'):
Energy_Predictor_FSMN = FsmnEncoderV2(
filter_size=hp.predictor_filter_size,
fsmn_num_layers=hp.predictor_fsmn_num_layers,
dnn_num_layers=hp.predictor_dnn_num_layers,
num_memory_units=hp.predictor_num_memory_units,
ffn_inner_dim=hp.predictor_ffn_inner_dim,
dropout=hp.predictor_dropout,
shift=hp.predictor_shift,
position_encoder=None)
energy_contour_outputs, _, _ = Energy_Predictor_FSMN.encode(
tf.concat([
encoder_outputs, embedded_inputs_speaker,
embedded_inputs_emotion
], 2),
sequence_length=input_lengths,
mode=is_training)
energy_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
LSTMBlockCell(hp.predictor_lstm_units),
LSTMBlockCell(hp.predictor_lstm_units),
energy_contour_outputs,
sequence_length=input_lengths,
dtype=tf.float32)
energy_contour_outputs = tf.concat(
energy_contour_outputs, axis=-1)
energy_contour_outputs = tf.layers.dense(
energy_contour_outputs, units=1) # [N, T_in, 1]
energy_contour_outputs = tf.squeeze(
energy_contour_outputs, axis=2) # [N, T_in]

if is_training:
pitch_embeddings = tf.expand_dims(
pitch_contours, axis=2) # [N, T_in, 1]
pitch_embeddings = tf.layers.conv1d(
pitch_embeddings,
filters=hp.encoder_projection_units,
kernel_size=9,
padding='same',
name='pitch_embeddings') # [N, T_in, 32]

energy_embeddings = tf.expand_dims(
energy_contours, axis=2) # [N, T_in, 1]
energy_embeddings = tf.layers.conv1d(
energy_embeddings,
filters=hp.encoder_projection_units,
kernel_size=9,
padding='same',
name='energy_embeddings') # [N, T_in, 32]
else:
pitch_contour_outputs *= pitch_scales
pitch_embeddings = tf.expand_dims(
pitch_contour_outputs, axis=2) # [N, T_in, 1]
pitch_embeddings = tf.layers.conv1d(
pitch_embeddings,
filters=hp.encoder_projection_units,
kernel_size=9,
padding='same',
name='pitch_embeddings') # [N, T_in, 32]

energy_contour_outputs *= energy_scales
energy_embeddings = tf.expand_dims(
energy_contour_outputs, axis=2) # [N, T_in, 1]
energy_embeddings = tf.layers.conv1d(
energy_embeddings,
filters=hp.encoder_projection_units,
kernel_size=9,
padding='same',
name='energy_embeddings') # [N, T_in, 32]

encoder_outputs_ = encoder_outputs + pitch_embeddings + energy_embeddings

# duration
dur_inputs = tf.concat([
encoder_outputs_, embedded_inputs_speaker,
embedded_inputs_emotion
], 2)
if input_mask is not None:
dur_inputs = dur_inputs * tf.expand_dims(input_mask, -1)
with tf.variable_scope('Duration_Predictor'):
duration_predictor_cell = MultiRNNCell([
LSTMBlockCell(hp.predictor_lstm_units),
LSTMBlockCell(hp.predictor_lstm_units)
], state_is_tuple=True) # yapf:disable
duration_output_cell = DurPredictorCell(
duration_predictor_cell, is_training, 1,
hp.predictor_prenet_units)
duration_predictor_init_state = duration_output_cell.zero_state(
batch_size=batch_size, dtype=tf.float32)
if is_training:
duration_helper = VarTrainingHelper(
tf.expand_dims(
tf.log(tf.cast(durations, tf.float32) + 1),
axis=2), dur_inputs, 1)
else:
duration_helper = VarTestHelper(batch_size, dur_inputs, 1)
(
duration_outputs, _
), final_duration_predictor_state, _ = tf.contrib.seq2seq.dynamic_decode(
BasicDecoder(duration_output_cell, duration_helper,
duration_predictor_init_state),
maximum_iterations=1000)
duration_outputs = tf.squeeze(
duration_outputs, axis=2) # [N, T_in]
if input_mask is not None:
duration_outputs = duration_outputs * input_mask
duration_outputs_ = tf.exp(duration_outputs) - 1

# Length Regulator
with tf.variable_scope('Length_Regulator'):
if is_training:
i = tf.constant(1)
# position embedding
j = tf.constant(1)
dur_len = tf.shape(durations)[-1]
embedded_position_i = tf.range(1, durations[0, 0] + 1)

def condition_pos(j, e):
return tf.less(j, dur_len)

def loop_body_pos(j, embedded_position_i):
embedded_position_i = tf.concat([
embedded_position_i,
tf.range(1, durations[0, j] + 1)
], axis=0) # yapf:disable
return [j + 1, embedded_position_i]

j, embedded_position_i = tf.while_loop(
condition_pos,
loop_body_pos, [j, embedded_position_i],
shape_invariants=[
j.get_shape(),
tf.TensorShape([None])
])
embedded_position = tf.reshape(embedded_position_i,
(1, -1))

# others
LR_outputs = repeat(
encoder_outputs_[0:1, :, :], durations[0, :], axis=1)
embedded_outputs_speaker = repeat(
embedded_inputs_speaker[0:1, :, :],
durations[0, :],
axis=1)
embedded_outputs_emotion = repeat(
embedded_inputs_emotion[0:1, :, :],
durations[0, :],
axis=1)

def condition(i, pos, layer, s, e):
return tf.less(i, tf.shape(mel_targets)[0])

def loop_body(i, embedded_position, LR_outputs,
embedded_outputs_speaker,
embedded_outputs_emotion):
# position embedding
jj = tf.constant(1)
embedded_position_i = tf.range(1, durations[i, 0] + 1)

def condition_pos_i(j, e):
return tf.less(j, dur_len)

def loop_body_pos_i(j, embedded_position_i):
embedded_position_i = tf.concat([
embedded_position_i,
tf.range(1, durations[i, j] + 1)
], axis=0) # yapf:disable
return [j + 1, embedded_position_i]

jj, embedded_position_i = tf.while_loop(
condition_pos_i,
loop_body_pos_i, [jj, embedded_position_i],
shape_invariants=[
jj.get_shape(),
tf.TensorShape([None])
])
embedded_position = tf.concat([
embedded_position,
tf.reshape(embedded_position_i, (1, -1))
], 0)

# others
LR_outputs = tf.concat([
LR_outputs,
repeat(
encoder_outputs_[i:i + 1, :, :],
durations[i, :],
axis=1)
], 0)
embedded_outputs_speaker = tf.concat([
embedded_outputs_speaker,
repeat(
embedded_inputs_speaker[i:i + 1, :, :],
durations[i, :],
axis=1)
], 0)
embedded_outputs_emotion = tf.concat([
embedded_outputs_emotion,
repeat(
embedded_inputs_emotion[i:i + 1, :, :],
durations[i, :],
axis=1)
], 0)
return [
i + 1, embedded_position, LR_outputs,
embedded_outputs_speaker, embedded_outputs_emotion
]

                    (i, embedded_position, LR_outputs,
                     embedded_outputs_speaker,
                     embedded_outputs_emotion) = tf.while_loop(
condition,
loop_body, [
i, embedded_position, LR_outputs,
embedded_outputs_speaker, embedded_outputs_emotion
],
shape_invariants=[
i.get_shape(),
tf.TensorShape([None, None]),
tf.TensorShape([None, None, None]),
tf.TensorShape([None, None, None]),
tf.TensorShape([None, None, None])
],
parallel_iterations=hp.batch_size)

ori_framenum = tf.shape(mel_targets)[1]
else:
# position
j = tf.constant(1)
dur_len = tf.shape(duration_outputs_)[-1]
embedded_position_i = tf.range(
1,
tf.cast(tf.round(duration_outputs_)[0, 0], tf.int32)
+ 1)

def condition_pos(j, e):
return tf.less(j, dur_len)

def loop_body_pos(j, embedded_position_i):
embedded_position_i = tf.concat([
embedded_position_i,
tf.range(
1,
tf.cast(
tf.round(duration_outputs_)[0, j],
tf.int32) + 1)
], axis=0) # yapf:disable
return [j + 1, embedded_position_i]

j, embedded_position_i = tf.while_loop(
condition_pos,
loop_body_pos, [j, embedded_position_i],
shape_invariants=[
j.get_shape(),
tf.TensorShape([None])
])
embedded_position = tf.reshape(embedded_position_i,
(1, -1))
# others
duration_outputs_ *= duration_scales
LR_outputs = repeat(
encoder_outputs_[0:1, :, :],
tf.cast(tf.round(duration_outputs_)[0, :], tf.int32),
axis=1)
embedded_outputs_speaker = repeat(
embedded_inputs_speaker[0:1, :, :],
tf.cast(tf.round(duration_outputs_)[0, :], tf.int32),
axis=1)
embedded_outputs_emotion = repeat(
embedded_inputs_emotion[0:1, :, :],
tf.cast(tf.round(duration_outputs_)[0, :], tf.int32),
axis=1)
ori_framenum = tf.shape(LR_outputs)[1]

left = hp.outputs_per_step - tf.mod(
ori_framenum, hp.outputs_per_step)
LR_outputs = tf.cond(
tf.equal(left,
hp.outputs_per_step), lambda: LR_outputs,
lambda: tf.pad(LR_outputs, [[0, 0], [0, left], [0, 0]],
'CONSTANT'))
embedded_outputs_speaker = tf.cond(
tf.equal(left, hp.outputs_per_step),
lambda: embedded_outputs_speaker, lambda: tf.pad(
embedded_outputs_speaker, [[0, 0], [0, left],
[0, 0]], 'CONSTANT'))
embedded_outputs_emotion = tf.cond(
tf.equal(left, hp.outputs_per_step),
lambda: embedded_outputs_emotion, lambda: tf.pad(
embedded_outputs_emotion, [[0, 0], [0, left],
[0, 0]], 'CONSTANT'))
embedded_position = tf.cond(
tf.equal(left, hp.outputs_per_step),
lambda: embedded_position,
lambda: tf.pad(embedded_position, [[0, 0], [0, left]],
'CONSTANT'))

# Pos_Embedding
with tf.variable_scope('Position_Embedding'):
Pos_Embedding = BatchSinusodalPositionalEncoding()
position_embeddings = Pos_Embedding.positional_encoding(
batch_size,
tf.shape(LR_outputs)[1], hp.encoder_projection_units,
embedded_position)
LR_outputs += position_embeddings

# multi-frame
LR_outputs = tf.reshape(LR_outputs, [
batch_size, -1,
hp.outputs_per_step * hp.encoder_projection_units
])
embedded_outputs_speaker = tf.reshape(
embedded_outputs_speaker,
[batch_size, -1, hp.outputs_per_step * 32])[:, :, :32]
embedded_outputs_emotion = tf.reshape(
embedded_outputs_emotion,
[batch_size, -1, hp.outputs_per_step * 32])[:, :, :32]
# [N, T_out, D_LR_outputs] (D_LR_outputs = hp.outputs_per_step * hp.encoder_projection_units + 64)
LR_outputs = tf.concat([
LR_outputs, embedded_outputs_speaker, embedded_outputs_emotion
], -1)

# auto bandwidth
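            # The band width is derived from the longest (masked) phone duration,
            # measured in decoder steps (max duration / outputs_per_step); it bounds
            # the banded self- and memory-attention masks used in the decoder.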
if is_training:
durations_mask = tf.cast(durations,
tf.float32) * input_mask # [N, T_in]
else:
durations_mask = duration_outputs_
X_band_width = tf.cast(
tf.round(tf.reduce_max(durations_mask) / hp.outputs_per_step),
tf.int32)
H_band_width = X_band_width

with tf.variable_scope('Decoder'):
Decoder = SelfAttentionDecoder(
num_layers=hp.decoder_num_layers,
num_units=hp.decoder_num_units,
num_heads=hp.decoder_num_heads,
ffn_inner_dim=hp.decoder_ffn_inner_dim,
dropout=hp.decoder_dropout,
attention_dropout=hp.decoder_attention_dropout,
relu_dropout=hp.decoder_relu_dropout,
prenet_units=hp.prenet_units,
dense_units=hp.prenet_proj_units,
num_mels=hp.num_mels,
outputs_per_step=hp.outputs_per_step,
X_band_width=X_band_width,
H_band_width=H_band_width,
position_encoder=None)
if is_training:
if hp.free_run:
r = hp.outputs_per_step
init_decoder_input = tf.expand_dims(
tf.tile([[0.0]], [batch_size, hp.num_mels]),
axis=1) # [N, 1, hp.num_mels]
decoder_input_lengths = tf.cast(
output_lengths / r, tf.int32)
decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search(
init_decoder_input,
maximum_iterations=tf.shape(LR_outputs)[1],
mode=is_training,
memory=LR_outputs,
memory_sequence_length=decoder_input_lengths)
else:
r = hp.outputs_per_step
decoder_input = mel_targets[:, r - 1::
r, :] # [N, T_out / r, hp.num_mels]
init_decoder_input = tf.expand_dims(
tf.tile([[0.0]], [batch_size, hp.num_mels]),
axis=1) # [N, 1, hp.num_mels]
decoder_input = tf.concat(
[init_decoder_input, decoder_input],
axis=1) # [N, T_out / r + 1, hp.num_mels]
decoder_input = decoder_input[:, :
-1, :] # [N, T_out / r, hp.num_mels]
decoder_input_lengths = tf.cast(
output_lengths / r, tf.int32)
decoder_outputs, attention_x, attention_h = Decoder.decode_from_inputs(
decoder_input,
decoder_input_lengths,
mode=is_training,
memory=LR_outputs,
memory_sequence_length=decoder_input_lengths)
else:
init_decoder_input = tf.expand_dims(
tf.tile([[0.0]], [batch_size, hp.num_mels]),
axis=1) # [N, 1, hp.num_mels]
decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search(
init_decoder_input,
maximum_iterations=tf.shape(LR_outputs)[1],
mode=is_training,
memory=LR_outputs,
memory_sequence_length=tf.expand_dims(
tf.shape(LR_outputs)[1], axis=0))

if is_training:
mel_outputs_ = tf.reshape(decoder_outputs,
[batch_size, -1, hp.num_mels])
else:
mel_outputs_ = tf.reshape(
decoder_outputs,
[batch_size, -1, hp.num_mels])[:, :ori_framenum, :]
mel_outputs = mel_outputs_

with tf.variable_scope('Postnet'):
Postnet_FSMN = FsmnEncoderV2(
filter_size=hp.postnet_filter_size,
fsmn_num_layers=hp.postnet_fsmn_num_layers,
dnn_num_layers=hp.postnet_dnn_num_layers,
num_memory_units=hp.postnet_num_memory_units,
ffn_inner_dim=hp.postnet_ffn_inner_dim,
dropout=hp.postnet_dropout,
shift=hp.postnet_shift,
position_encoder=None)
if is_training:
postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode(
mel_outputs,
sequence_length=output_lengths,
mode=is_training)
hidden_lstm_outputs, _ = tf.nn.dynamic_rnn(
LSTMBlockCell(hp.postnet_lstm_units),
postnet_fsmn_outputs,
sequence_length=output_lengths,
dtype=tf.float32)
else:
postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode(
mel_outputs,
sequence_length=[tf.shape(mel_outputs_)[1]],
mode=is_training)
hidden_lstm_outputs, _ = tf.nn.dynamic_rnn(
LSTMBlockCell(hp.postnet_lstm_units),
postnet_fsmn_outputs,
sequence_length=[tf.shape(mel_outputs_)[1]],
dtype=tf.float32)

mel_residual_outputs = tf.layers.dense(
hidden_lstm_outputs, units=hp.num_mels)
mel_outputs += mel_residual_outputs

self.inputs = inputs
self.inputs_speaker = inputs_speaker
self.inputs_emotion = inputs_emotion
self.input_lengths = input_lengths
self.durations = durations
self.output_lengths = output_lengths
self.mel_outputs_ = mel_outputs_
self.mel_outputs = mel_outputs
self.mel_targets = mel_targets
self.duration_outputs = duration_outputs
self.duration_outputs_ = duration_outputs_
self.duration_scales = duration_scales
self.pitch_contour_outputs = pitch_contour_outputs
self.pitch_contours = pitch_contours
self.pitch_scales = pitch_scales
self.energy_contour_outputs = energy_contour_outputs
self.energy_contours = energy_contours
self.energy_scales = energy_scales
self.uv_masks_ = uv_masks

self.embedded_inputs_emotion = embedded_inputs_emotion
self.embedding_fsmn_outputs = embedded_inputs
self.encoder_outputs = encoder_outputs
self.encoder_outputs_ = encoder_outputs_
self.LR_outputs = LR_outputs
self.postnet_fsmn_outputs = postnet_fsmn_outputs

self.pitch_embeddings = pitch_embeddings
self.energy_embeddings = energy_embeddings

self.attns = attns
self.attention_x = attention_x
self.attention_h = attention_h
self.X_band_width = X_band_width
self.H_band_width = H_band_width

def add_loss(self):
'''Adds loss to the model. Sets "loss" field. initialize must have been called.'''
with tf.variable_scope('loss') as _:
hp = self._hparams
mask = tf.sequence_mask(
self.output_lengths,
tf.shape(self.mel_targets)[1],
dtype=tf.float32)
valid_outputs = tf.reduce_sum(mask)

mask_input = tf.sequence_mask(
self.input_lengths,
tf.shape(self.durations)[1],
dtype=tf.float32)
valid_inputs = tf.reduce_sum(mask_input)

# mel loss
if self.uv_masks_ is not None:
valid_outputs_mask = tf.reduce_sum(
tf.expand_dims(mask, -1) * self.uv_masks_)
self.mel_loss_ = tf.reduce_sum(
tf.abs(self.mel_targets - self.mel_outputs_)
* tf.expand_dims(mask, -1) * self.uv_masks_) / (
valid_outputs_mask * hp.num_mels)
self.mel_loss = tf.reduce_sum(
tf.abs(self.mel_targets - self.mel_outputs)
* tf.expand_dims(mask, -1) * self.uv_masks_) / (
valid_outputs_mask * hp.num_mels)
else:
self.mel_loss_ = tf.reduce_sum(
tf.abs(self.mel_targets - self.mel_outputs_)
* tf.expand_dims(mask, -1)) / (
valid_outputs * hp.num_mels)
self.mel_loss = tf.reduce_sum(
tf.abs(self.mel_targets - self.mel_outputs)
* tf.expand_dims(mask, -1)) / (
valid_outputs * hp.num_mels)

# duration loss
self.duration_loss = tf.reduce_sum(
tf.abs(
tf.log(tf.cast(self.durations, tf.float32) + 1)
- self.duration_outputs) * mask_input) / valid_inputs

# pitch contour loss
self.pitch_contour_loss = tf.reduce_sum(
tf.abs(self.pitch_contours - self.pitch_contour_outputs)
* mask_input) / valid_inputs

# energy contour loss
self.energy_contour_loss = tf.reduce_sum(
tf.abs(self.energy_contours - self.energy_contour_outputs)
* mask_input) / valid_inputs

# final loss
self.loss = self.mel_loss_ + self.mel_loss + self.duration_loss \
+ self.pitch_contour_loss + self.energy_contour_loss

# guided attention loss
self.guided_attention_loss = tf.constant(0.0)
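            # Guided attention (in the spirit of Tachibana et al.'s DC-TTS loss)
            # penalizes attention mass far from the diagonal: the weight
            # 1 - exp(-(r_in - r_out)^2 / (2g^2)) grows with the distance between
            # the normalized input and output positions.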
if hp.guided_attention:
i0 = tf.constant(0)
loss0 = tf.constant(0.0)

def c(i, _):
                    return tf.less(i, tf.shape(self.mel_targets)[0])

def loop_body(i, loss):
decoder_input_lengths = tf.cast(
self.output_lengths / hp.outputs_per_step, tf.int32)
input_len = decoder_input_lengths[i]
output_len = decoder_input_lengths[i]
input_w = tf.expand_dims(
tf.range(tf.cast(input_len, dtype=tf.float32)),
axis=1) / tf.cast(
input_len, dtype=tf.float32) # [T_in, 1]
output_w = tf.expand_dims(
tf.range(tf.cast(output_len, dtype=tf.float32)),
axis=0) / tf.cast(
output_len, dtype=tf.float32) # [1, T_out]
guided_attention_w = 1.0 - tf.exp(
-(1 / hp.guided_attention_2g_squared)
* tf.square(input_w - output_w)) # [T_in, T_out]
guided_attention_w = tf.expand_dims(
guided_attention_w, axis=0) # [1, T_in, T_out]
# [hp.decoder_num_heads, T_in, T_out]
guided_attention_w = tf.tile(guided_attention_w,
[hp.decoder_num_heads, 1, 1])
loss_i = tf.constant(0.0)
for j in range(hp.decoder_num_layers):
loss_i += tf.reduce_mean(
self.attention_h[j][i, :, :input_len, :output_len]
* guided_attention_w)

return [tf.add(i, 1), tf.add(loss, loss_i)]

_, loss = tf.while_loop(
c,
loop_body,
loop_vars=[i0, loss0],
parallel_iterations=hp.batch_size)
self.guided_attention_loss = loss / hp.batch_size
self.loss += hp.guided_attention_loss_weight * self.guided_attention_loss

def add_optimizer(self, global_step):
'''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called.

Args:
global_step: int32 scalar Tensor representing current global step in training
'''
with tf.variable_scope('optimizer') as _:
hp = self._hparams
if hp.decay_learning_rate:
self.learning_rate = _learning_rate_decay(
hp.initial_learning_rate, global_step)
else:
self.learning_rate = tf.convert_to_tensor(
hp.initial_learning_rate)
optimizer = tf.train.AdamOptimizer(self.learning_rate,
hp.adam_beta1, hp.adam_beta2)
gradients, variables = zip(*optimizer.compute_gradients(self.loss))
self.gradients = gradients
clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)

# Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See:
# https://github.com/tensorflow/tensorflow/issues/1122
with tf.control_dependencies(
tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
self.optimize = optimizer.apply_gradients(
zip(clipped_gradients, variables), global_step=global_step)


def _learning_rate_decay(init_lr, global_step):
# Noam scheme from tensor2tensor:
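    # The learning rate rises roughly linearly for the first `warmup_steps` steps
    # and then decays proportionally to step**-0.5.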
warmup_steps = 4000.0
step = tf.cast(global_step + 1, dtype=tf.float32)
return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5,
step**-0.5)

+ 817
- 0
modelscope/models/audio/tts/am/models/self_attention_decoder.py View File

@@ -0,0 +1,817 @@
"""Define self-attention decoder."""

import sys

import tensorflow as tf

from . import compat, transformer
from .modules import decoder_prenet
from .position import SinusoidalPositionEncoder


class SelfAttentionDecoder():
"""Decoder using self-attention as described in
https://arxiv.org/abs/1706.03762.
"""

def __init__(self,
num_layers,
num_units=512,
num_heads=8,
ffn_inner_dim=2048,
dropout=0.1,
attention_dropout=0.1,
relu_dropout=0.1,
prenet_units=256,
dense_units=128,
num_mels=80,
outputs_per_step=3,
X_band_width=None,
H_band_width=None,
position_encoder=SinusoidalPositionEncoder(),
self_attention_type='scaled_dot'):
"""Initializes the parameters of the decoder.

Args:
num_layers: The number of layers.
num_units: The number of hidden units.
num_heads: The number of heads in the multi-head attention.
ffn_inner_dim: The number of units of the inner linear transformation
in the feed forward layer.
dropout: The probability to drop units from the outputs.
attention_dropout: The probability to drop units from the attention.
relu_dropout: The probability to drop units from the ReLU activation in
the feed forward layer.
position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to
apply on inputs or ``None``.
self_attention_type: Type of self attention, "scaled_dot" or "average" (case
insensitive).

Raises:
ValueError: if :obj:`self_attention_type` is invalid.
"""
super(SelfAttentionDecoder, self).__init__()
self.num_layers = num_layers
self.num_units = num_units
self.num_heads = num_heads
self.ffn_inner_dim = ffn_inner_dim
self.dropout = dropout
self.attention_dropout = attention_dropout
self.relu_dropout = relu_dropout
self.position_encoder = position_encoder
self.self_attention_type = self_attention_type.lower()
if self.self_attention_type not in ('scaled_dot', 'average'):
raise ValueError('invalid attention type %s'
% self.self_attention_type)
if self.self_attention_type == 'average':
tf.logging.warning(
'Support for average attention network is experimental '
'and may change in future versions.')
self.prenet_units = prenet_units
self.dense_units = dense_units
self.num_mels = num_mels
self.outputs_per_step = outputs_per_step
self.X_band_width = X_band_width
self.H_band_width = H_band_width
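        # X_band_width / H_band_width (in decoder steps) restrict the causal
        # self-attention and the memory attention to banded masks in
        # _self_attention_stack (via build_future_mask / build_history_mask);
        # None disables banding.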

@property
def output_size(self):
"""Returns the decoder output size."""
return self.num_units

@property
def support_alignment_history(self):
return True

@property
def support_multi_source(self):
return True

def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1):
cache = {}

for layer in range(self.num_layers):
proj_cache_shape = [
batch_size, self.num_heads, 0, self.num_units // self.num_heads
]
layer_cache = {}
layer_cache['memory'] = [{
'memory_keys':
tf.zeros(proj_cache_shape, dtype=dtype),
'memory_values':
tf.zeros(proj_cache_shape, dtype=dtype)
} for _ in range(num_sources)]
if self.self_attention_type == 'scaled_dot':
layer_cache['self_keys'] = tf.zeros(
proj_cache_shape, dtype=dtype)
layer_cache['self_values'] = tf.zeros(
proj_cache_shape, dtype=dtype)
elif self.self_attention_type == 'average':
layer_cache['prev_g'] = tf.zeros(
[batch_size, 1, self.num_units], dtype=dtype)
cache['layer_{}'.format(layer)] = layer_cache

return cache

def _init_attn(self, dtype=tf.float32):
attn = []
for layer in range(self.num_layers):
attn.append(tf.TensorArray(tf.float32, size=0, dynamic_size=True))
return attn

def _self_attention_stack(self,
inputs,
sequence_length=None,
mode=True,
cache=None,
memory=None,
memory_sequence_length=None,
step=None):

# [N, T_out, self.dense_units] or [N, 1, self.dense_units]
prenet_outputs = decoder_prenet(inputs, self.prenet_units,
self.dense_units, mode)
if step is None:
decoder_inputs = tf.concat(
[memory, prenet_outputs],
axis=-1) # [N, T_out, memory_size + self.dense_units]
else:
decoder_inputs = tf.concat(
[memory[:, step:step + 1, :], prenet_outputs],
axis=-1) # [N, 1, memory_size + self.dense_units]
decoder_inputs = tf.layers.dense(
decoder_inputs, units=self.dense_units)

inputs = decoder_inputs
inputs *= self.num_units**0.5
if self.position_encoder is not None:
inputs = self.position_encoder(
inputs, position=step + 1 if step is not None else None)

inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)

decoder_mask = None
memory_mask = None
# last_attention = None

X_band_width_tmp = -1
H_band_width_tmp = -1
if self.X_band_width is not None:
X_band_width_tmp = tf.cast(
tf.cond(
tf.less(tf.shape(memory)[1], self.X_band_width),
lambda: -1, lambda: self.X_band_width),
dtype=tf.int64)
if self.H_band_width is not None:
H_band_width_tmp = tf.cast(
tf.cond(
tf.less(tf.shape(memory)[1], self.H_band_width),
lambda: -1, lambda: self.H_band_width),
dtype=tf.int64)

if self.self_attention_type == 'scaled_dot':
if sequence_length is not None:
decoder_mask = transformer.build_future_mask(
sequence_length,
num_heads=self.num_heads,
maximum_length=tf.shape(inputs)[1],
band=X_band_width_tmp) # [N, 1, T_out, T_out]
elif self.self_attention_type == 'average':
if cache is None:
if sequence_length is None:
sequence_length = tf.fill([tf.shape(inputs)[0]],
tf.shape(inputs)[1])
decoder_mask = transformer.cumulative_average_mask(
sequence_length,
maximum_length=tf.shape(inputs)[1],
dtype=inputs.dtype)

if memory is not None and not tf.contrib.framework.nest.is_sequence(
memory):
memory = (memory, )
if memory_sequence_length is not None:
if not tf.contrib.framework.nest.is_sequence(
memory_sequence_length):
memory_sequence_length = (memory_sequence_length, )
if step is None:
memory_mask = [
transformer.build_history_mask(
length,
num_heads=self.num_heads,
maximum_length=tf.shape(m)[1],
band=H_band_width_tmp)
for m, length in zip(memory, memory_sequence_length)
]
else:
memory_mask = [
transformer.build_history_mask(
length,
num_heads=self.num_heads,
maximum_length=tf.shape(m)[1],
band=H_band_width_tmp)[:, :, step:step + 1, :]
for m, length in zip(memory, memory_sequence_length)
]

# last_attention = None
attns_x = []
attns_h = []
for layer in range(self.num_layers):
layer_name = 'layer_{}'.format(layer)
layer_cache = cache[layer_name] if cache is not None else None
with tf.variable_scope(layer_name):
if memory is not None:
for i, (mem, mask) in enumerate(zip(memory, memory_mask)):
memory_cache = None
if layer_cache is not None:
memory_cache = layer_cache['memory'][i]
scope_name = 'multi_head_{}'.format(i)
if i == 0:
scope_name = 'multi_head'
with tf.variable_scope(scope_name):
encoded, attn_x, attn_h = transformer.multi_head_attention_PNCA(
self.num_heads,
transformer.norm(inputs),
mem,
mode,
num_units=self.num_units,
mask=decoder_mask,
mask_h=mask,
cache=layer_cache,
cache_h=memory_cache,
dropout=self.attention_dropout,
return_attention=True,
layer_name=layer_name,
X_band_width=self.X_band_width)
attns_x.append(attn_x)
attns_h.append(attn_h)
context = transformer.drop_and_add(
inputs, encoded, mode, dropout=self.dropout)

with tf.variable_scope('ffn'):
transformed = transformer.feed_forward_ori(
transformer.norm(context),
self.ffn_inner_dim,
mode,
dropout=self.relu_dropout)
transformed = transformer.drop_and_add(
context, transformed, mode, dropout=self.dropout)

inputs = transformed

outputs = transformer.norm(inputs)
outputs = tf.layers.dense(
outputs, units=self.num_mels * self.outputs_per_step)
return outputs, attns_x, attns_h

def decode_from_inputs(self,
inputs,
sequence_length,
initial_state=None,
mode=True,
memory=None,
memory_sequence_length=None):
outputs, attention_x, attention_h = self._self_attention_stack(
inputs,
sequence_length=sequence_length,
mode=mode,
memory=memory,
memory_sequence_length=memory_sequence_length)
return outputs, attention_x, attention_h

def step_fn(self,
mode,
batch_size,
initial_state=None,
memory=None,
memory_sequence_length=None,
dtype=tf.float32):
if memory is None:
num_sources = 0
elif tf.contrib.framework.nest.is_sequence(memory):
num_sources = len(memory)
else:
num_sources = 1
cache = self._init_cache(
batch_size, dtype=dtype, num_sources=num_sources)
attention_x = self._init_attn(dtype=dtype)
attention_h = self._init_attn(dtype=dtype)

def _fn(step, inputs, cache):
outputs, attention_x, attention_h = self._self_attention_stack(
inputs,
mode=mode,
cache=cache,
memory=memory,
memory_sequence_length=memory_sequence_length,
step=step)
attention_x_tmp = []
for layer in range(len(attention_h)):
attention_x_tmp_l = tf.zeros_like(attention_h[layer])
if self.X_band_width is not None:
pred = tf.less(step, self.X_band_width + 1)
attention_x_tmp_l_1 = tf.cond(pred, # yapf:disable
lambda: attention_x_tmp_l[:, :, :, :step + 1] + attention_x[layer],
lambda: tf.concat([
attention_x_tmp_l[:, :, :,
:step - self.X_band_width],
attention_x_tmp_l[:, :, :,
step - self.X_band_width:step + 1]
+ attention_x[layer]],
axis=-1)) # yapf:disable
attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:]
attention_x_tmp.append(
tf.concat([attention_x_tmp_l_1, attention_x_tmp_l_2],
axis=-1))
else:
attention_x_tmp_l_1 = attention_x_tmp_l[:, :, :, :step + 1]
attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:]
attention_x_tmp.append(
tf.concat([
attention_x_tmp_l_1 + attention_x[layer],
attention_x_tmp_l_2
], axis=-1)) # yapf:disable
attention_x = attention_x_tmp
return outputs, cache, attention_x, attention_h

return _fn, cache, attention_x, attention_h

def dynamic_decode_and_search(self, init_decoder_input, maximum_iterations,
mode, memory, memory_sequence_length):
batch_size = tf.shape(init_decoder_input)[0]
step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn(
mode,
batch_size,
memory=memory,
memory_sequence_length=memory_sequence_length)

outputs, attention_x, attention_h, cache = self.dynamic_decode(
step_fn,
init_decoder_input,
init_cache=init_cache,
init_attn_x=init_attn_x,
init_attn_h=init_attn_h,
maximum_iterations=maximum_iterations,
batch_size=batch_size)
return outputs, attention_x, attention_h

def dynamic_decode_and_search_teacher_forcing(self, decoder_input,
maximum_iterations, mode,
memory,
memory_sequence_length):
batch_size = tf.shape(decoder_input)[0]
step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn(
mode,
batch_size,
memory=memory,
memory_sequence_length=memory_sequence_length)

outputs, attention_x, attention_h, cache = self.dynamic_decode_teacher_forcing(
step_fn,
decoder_input,
init_cache=init_cache,
init_attn_x=init_attn_x,
init_attn_h=init_attn_h,
maximum_iterations=maximum_iterations,
batch_size=batch_size)
return outputs, attention_x, attention_h

def dynamic_decode(self,
step_fn,
init_decoder_input,
init_cache=None,
init_attn_x=None,
init_attn_h=None,
maximum_iterations=None,
batch_size=None):

def _cond(step, cache, inputs, outputs, attention_x, attention_h): # pylint: disable=unused-argument
return tf.less(step, maximum_iterations)

def _body(step, cache, inputs, outputs, attention_x, attention_h):
# output: [1, 1, num_mels * r]
# attn: [1, 1, T_out]
output, cache, attn_x, attn_h = step_fn(
step, inputs, cache) # outputs, cache, attention, attns
for layer in range(len(attention_x)):
attention_x[layer] = attention_x[layer].write(
step, tf.cast(attn_x[layer], tf.float32))

for layer in range(len(attention_h)):
attention_h[layer] = attention_h[layer].write(
step, tf.cast(attn_h[layer], tf.float32))

outputs = outputs.write(step, tf.cast(output, tf.float32))
            return step + 1, cache, output[:, :, -self.num_mels:], \
                outputs, attention_x, attention_h

step = tf.constant(0, dtype=tf.int32)
outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True)

_, cache, _, outputs, attention_x, attention_h = tf.while_loop(
_cond,
_body,
loop_vars=(step, init_cache, init_decoder_input, outputs,
init_attn_x, init_attn_h),
shape_invariants=(step.shape,
compat.nest.map_structure(
self._get_shape_invariants, init_cache),
compat.nest.map_structure(
self._get_shape_invariants,
init_decoder_input), tf.TensorShape(None),
compat.nest.map_structure(
self._get_shape_invariants, init_attn_x),
compat.nest.map_structure(
self._get_shape_invariants, init_attn_h)),
parallel_iterations=1,
back_prop=False,
maximum_iterations=maximum_iterations)
# element of outputs: [N, 1, num_mels * r]
outputs_stack = outputs.stack() # [T_out, N, 1, num_mels * r]
outputs_stack = tf.transpose(
outputs_stack, perm=[2, 1, 0, 3]) # [1, N, T_out, num_mels * r]
outputs_stack = tf.squeeze(
outputs_stack, axis=0) # [N, T_out, num_mels * r]

attention_x_stack = []
for layer in range(len(attention_x)):
attention_x_stack_tmp = attention_x[layer].stack(
) # [T_out, N, H, 1, T_out]
attention_x_stack_tmp = tf.transpose(
attention_x_stack_tmp, perm=[3, 1, 2, 0,
4]) # [1, N, H, T_out, T_out]
attention_x_stack_tmp = tf.squeeze(
attention_x_stack_tmp, axis=0) # [N, H, T_out, T_out]
attention_x_stack.append(attention_x_stack_tmp)

attention_h_stack = []
for layer in range(len(attention_h)):
attention_h_stack_tmp = attention_h[layer].stack(
) # [T_out, N, H, 1, T_out]
attention_h_stack_tmp = tf.transpose(
attention_h_stack_tmp, perm=[3, 1, 2, 0,
4]) # [1, N, H, T_out, T_out]
attention_h_stack_tmp = tf.squeeze(
attention_h_stack_tmp, axis=0) # [N, H, T_out, T_out]
attention_h_stack.append(attention_h_stack_tmp)

return outputs_stack, attention_x_stack, attention_h_stack, cache

def dynamic_decode_teacher_forcing(self,
step_fn,
decoder_input,
init_cache=None,
init_attn_x=None,
init_attn_h=None,
maximum_iterations=None,
batch_size=None):

def _cond(step, cache, inputs, outputs, attention_x, attention_h): # pylint: disable=unused-argument
return tf.less(step, maximum_iterations)

def _body(step, cache, inputs, outputs, attention_x, attention_h):
# output: [1, 1, num_mels * r]
# attn: [1, 1, T_out]
output, cache, attn_x, attn_h = step_fn(
step, inputs[:, step:step + 1, :],
cache) # outputs, cache, attention, attns
for layer in range(len(attention_x)):
attention_x[layer] = attention_x[layer].write(
step, tf.cast(attn_x[layer], tf.float32))

for layer in range(len(attention_h)):
attention_h[layer] = attention_h[layer].write(
step, tf.cast(attn_h[layer], tf.float32))
outputs = outputs.write(step, tf.cast(output, tf.float32))
return step + 1, cache, inputs, outputs, attention_x, attention_h

step = tf.constant(0, dtype=tf.int32)
outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True)

_, cache, _, outputs, attention_x, attention_h = tf.while_loop(
_cond,
_body,
loop_vars=(step, init_cache, decoder_input, outputs, init_attn_x,
init_attn_h),
shape_invariants=(step.shape,
compat.nest.map_structure(
self._get_shape_invariants,
init_cache), decoder_input.shape,
tf.TensorShape(None),
compat.nest.map_structure(
self._get_shape_invariants, init_attn_x),
compat.nest.map_structure(
self._get_shape_invariants, init_attn_h)),
parallel_iterations=1,
back_prop=False,
maximum_iterations=maximum_iterations)
# element of outputs: [N, 1, num_mels * r]
outputs_stack = outputs.stack() # [T_out, N, 1, num_mels * r]
outputs_stack = tf.transpose(
outputs_stack, perm=[2, 1, 0, 3]) # [1, N, T_out, num_mels * r]
outputs_stack = tf.squeeze(
outputs_stack, axis=0) # [N, T_out, num_mels * r]

attention_x_stack = []
for layer in range(len(attention_x)):
attention_x_stack_tmp = attention_x[layer].stack(
) # [T_out, N, H, 1, T_out]
attention_x_stack_tmp = tf.transpose(
attention_x_stack_tmp, perm=[3, 1, 2, 0,
4]) # [1, N, H, T_out, T_out]
attention_x_stack_tmp = tf.squeeze(
attention_x_stack_tmp, axis=0) # [N, H, T_out, T_out]
attention_x_stack.append(attention_x_stack_tmp)

attention_h_stack = []
for layer in range(len(attention_h)):
attention_h_stack_tmp = attention_h[layer].stack(
) # [T_out, N, H, 1, T_out]
attention_h_stack_tmp = tf.transpose(
attention_h_stack_tmp, perm=[3, 1, 2, 0,
4]) # [1, N, H, T_out, T_out]
attention_h_stack_tmp = tf.squeeze(
attention_h_stack_tmp, axis=0) # [N, H, T_out, T_out]
attention_h_stack.append(attention_h_stack_tmp)

return outputs_stack, attention_x_stack, attention_h_stack, cache

def _get_shape_invariants(self, tensor):
"""Returns the shape of the tensor but sets middle dims to None."""
if isinstance(tensor, tf.TensorArray):
shape = None
else:
shape = tensor.shape.as_list()
for i in range(1, len(shape) - 1):
shape[i] = None
return tf.TensorShape(shape)
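
# A minimal, hypothetical usage sketch for SelfAttentionDecoder (names, shapes
# and hyperparameters below are illustrative only), mirroring how the RobuTrans
# model drives it with teacher forcing:
#
#   decoder = SelfAttentionDecoder(num_layers=6, num_units=128, num_heads=8,
#                                  ffn_inner_dim=1024, num_mels=80,
#                                  outputs_per_step=3, position_encoder=None)
#   outputs, attn_x, attn_h = decoder.decode_from_inputs(
#       shifted_mel_frames, frame_lengths, mode=True,
#       memory=length_regulated_memory, memory_sequence_length=frame_lengths)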


class SelfAttentionDecoderOri():
"""Decoder using self-attention as described in
https://arxiv.org/abs/1706.03762.
"""

def __init__(self,
num_layers,
num_units=512,
num_heads=8,
ffn_inner_dim=2048,
dropout=0.1,
attention_dropout=0.1,
relu_dropout=0.1,
position_encoder=SinusoidalPositionEncoder(),
self_attention_type='scaled_dot'):
"""Initializes the parameters of the decoder.

Args:
num_layers: The number of layers.
num_units: The number of hidden units.
num_heads: The number of heads in the multi-head attention.
ffn_inner_dim: The number of units of the inner linear transformation
in the feed forward layer.
dropout: The probability to drop units from the outputs.
attention_dropout: The probability to drop units from the attention.
relu_dropout: The probability to drop units from the ReLU activation in
the feed forward layer.
position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to
apply on inputs or ``None``.
self_attention_type: Type of self attention, "scaled_dot" or "average" (case
insensitive).

Raises:
ValueError: if :obj:`self_attention_type` is invalid.
"""
super(SelfAttentionDecoderOri, self).__init__()
self.num_layers = num_layers
self.num_units = num_units
self.num_heads = num_heads
self.ffn_inner_dim = ffn_inner_dim
self.dropout = dropout
self.attention_dropout = attention_dropout
self.relu_dropout = relu_dropout
self.position_encoder = position_encoder
self.self_attention_type = self_attention_type.lower()
if self.self_attention_type not in ('scaled_dot', 'average'):
raise ValueError('invalid attention type %s'
% self.self_attention_type)
if self.self_attention_type == 'average':
tf.logging.warning(
'Support for average attention network is experimental '
'and may change in future versions.')

@property
def output_size(self):
"""Returns the decoder output size."""
return self.num_units

@property
def support_alignment_history(self):
return True

@property
def support_multi_source(self):
return True

def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1):
cache = {}

for layer in range(self.num_layers):
proj_cache_shape = [
batch_size, self.num_heads, 0, self.num_units // self.num_heads
]
layer_cache = {}
layer_cache['memory'] = [{
'memory_keys':
tf.zeros(proj_cache_shape, dtype=dtype),
'memory_values':
tf.zeros(proj_cache_shape, dtype=dtype)
} for _ in range(num_sources)]
if self.self_attention_type == 'scaled_dot':
layer_cache['self_keys'] = tf.zeros(
proj_cache_shape, dtype=dtype)
layer_cache['self_values'] = tf.zeros(
proj_cache_shape, dtype=dtype)
elif self.self_attention_type == 'average':
layer_cache['prev_g'] = tf.zeros(
[batch_size, 1, self.num_units], dtype=dtype)
cache['layer_{}'.format(layer)] = layer_cache

return cache

def _self_attention_stack(self,
inputs,
sequence_length=None,
mode=True,
cache=None,
memory=None,
memory_sequence_length=None,
step=None):
inputs *= self.num_units**0.5
if self.position_encoder is not None:
inputs = self.position_encoder(
inputs, position=step + 1 if step is not None else None)

inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)

decoder_mask = None
memory_mask = None
last_attention = None

if self.self_attention_type == 'scaled_dot':
if sequence_length is not None:
decoder_mask = transformer.build_future_mask(
sequence_length,
num_heads=self.num_heads,
maximum_length=tf.shape(inputs)[1])
elif self.self_attention_type == 'average':
if cache is None:
if sequence_length is None:
sequence_length = tf.fill([tf.shape(inputs)[0]],
tf.shape(inputs)[1])
decoder_mask = transformer.cumulative_average_mask(
sequence_length,
maximum_length=tf.shape(inputs)[1],
dtype=inputs.dtype)

if memory is not None and not tf.contrib.framework.nest.is_sequence(
memory):
memory = (memory, )
if memory_sequence_length is not None:
if not tf.contrib.framework.nest.is_sequence(
memory_sequence_length):
memory_sequence_length = (memory_sequence_length, )
memory_mask = [
transformer.build_sequence_mask(
length,
num_heads=self.num_heads,
maximum_length=tf.shape(m)[1])
for m, length in zip(memory, memory_sequence_length)
]

for layer in range(self.num_layers):
layer_name = 'layer_{}'.format(layer)
layer_cache = cache[layer_name] if cache is not None else None
with tf.variable_scope(layer_name):
if self.self_attention_type == 'scaled_dot':
with tf.variable_scope('masked_multi_head'):
encoded = transformer.multi_head_attention(
self.num_heads,
transformer.norm(inputs),
None,
mode,
num_units=self.num_units,
mask=decoder_mask,
cache=layer_cache,
dropout=self.attention_dropout)
last_context = transformer.drop_and_add(
inputs, encoded, mode, dropout=self.dropout)
elif self.self_attention_type == 'average':
with tf.variable_scope('average_attention'):
# Cumulative average.
x = transformer.norm(inputs)
y = transformer.cumulative_average(
x,
decoder_mask if cache is None else step,
cache=layer_cache)
# FFN.
y = transformer.feed_forward(
y,
self.ffn_inner_dim,
mode,
dropout=self.relu_dropout)
# Gating layer.
z = tf.layers.dense(
tf.concat([x, y], -1), self.num_units * 2)
i, f = tf.split(z, 2, axis=-1)
y = tf.sigmoid(i) * x + tf.sigmoid(f) * y
last_context = transformer.drop_and_add(
inputs, y, mode, dropout=self.dropout)

if memory is not None:
for i, (mem, mask) in enumerate(zip(memory, memory_mask)):
memory_cache = layer_cache['memory'][i] if layer_cache is not None else None # yapf:disable
with tf.variable_scope('multi_head' if i
== 0 else 'multi_head_%d' % i): # yapf:disable
context, last_attention = transformer.multi_head_attention(
self.num_heads,
transformer.norm(last_context),
mem,
mode,
mask=mask,
cache=memory_cache,
dropout=self.attention_dropout,
return_attention=True)
last_context = transformer.drop_and_add(
last_context,
context,
mode,
dropout=self.dropout)
if i > 0: # Do not return attention in case of multi source.
last_attention = None

with tf.variable_scope('ffn'):
transformed = transformer.feed_forward_ori(
transformer.norm(last_context),
self.ffn_inner_dim,
mode,
dropout=self.relu_dropout)
transformed = transformer.drop_and_add(
last_context, transformed, mode, dropout=self.dropout)

inputs = transformed

if last_attention is not None:
# The first head of the last layer is returned.
first_head_attention = last_attention[:, 0]
else:
first_head_attention = None

outputs = transformer.norm(inputs)
return outputs, first_head_attention

def decode_from_inputs(self,
inputs,
sequence_length,
initial_state=None,
mode=True,
memory=None,
memory_sequence_length=None):
outputs, attention = self._self_attention_stack(
inputs,
sequence_length=sequence_length,
mode=mode,
memory=memory,
memory_sequence_length=memory_sequence_length)
return outputs, None, attention

def step_fn(self,
mode,
batch_size,
initial_state=None,
memory=None,
memory_sequence_length=None,
dtype=tf.float32):
if memory is None:
num_sources = 0
elif tf.contrib.framework.nest.is_sequence(memory):
num_sources = len(memory)
else:
num_sources = 1
cache = self._init_cache(
batch_size, dtype=dtype, num_sources=num_sources)

def _fn(step, inputs, cache, mode):
inputs = tf.expand_dims(inputs, 1)
outputs, attention = self._self_attention_stack(
inputs,
mode=mode,
cache=cache,
memory=memory,
memory_sequence_length=memory_sequence_length,
step=step)
outputs = tf.squeeze(outputs, axis=1)
if attention is not None:
attention = tf.squeeze(attention, axis=1)
return outputs, cache, attention

return _fn, cache

+ 182
- 0
modelscope/models/audio/tts/am/models/self_attention_encoder.py View File

@@ -0,0 +1,182 @@
"""Define the self-attention encoder."""

import tensorflow as tf

from . import transformer
from .position import SinusoidalPositionEncoder


class SelfAttentionEncoder():
"""Encoder using self-attention as described in
https://arxiv.org/abs/1706.03762.
"""

def __init__(self,
num_layers,
num_units=512,
num_heads=8,
ffn_inner_dim=2048,
dropout=0.1,
attention_dropout=0.1,
relu_dropout=0.1,
position_encoder=SinusoidalPositionEncoder()):
"""Initializes the parameters of the encoder.

Args:
num_layers: The number of layers.
num_units: The number of hidden units.
num_heads: The number of heads in the multi-head attention.
ffn_inner_dim: The number of units of the inner linear transformation
in the feed forward layer.
dropout: The probability to drop units from the outputs.
attention_dropout: The probability to drop units from the attention.
relu_dropout: The probability to drop units from the ReLU activation in
the feed forward layer.
position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
apply on inputs or ``None``.
"""
super(SelfAttentionEncoder, self).__init__()
self.num_layers = num_layers
self.num_units = num_units
self.num_heads = num_heads
self.ffn_inner_dim = ffn_inner_dim
self.dropout = dropout
self.attention_dropout = attention_dropout
self.relu_dropout = relu_dropout
self.position_encoder = position_encoder

def encode(self, inputs, sequence_length=None, mode=True):
inputs *= self.num_units**0.5
if self.position_encoder is not None:
inputs = self.position_encoder(inputs)

inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
mask = transformer.build_sequence_mask(
sequence_length,
num_heads=self.num_heads,
maximum_length=tf.shape(inputs)[1])

mask_FF = tf.squeeze(
transformer.build_sequence_mask(
sequence_length, maximum_length=tf.shape(inputs)[1]),
axis=1)

state = ()

attns = []
for layer in range(self.num_layers):
with tf.variable_scope('layer_{}'.format(layer)):
with tf.variable_scope('multi_head'):
context, attn = transformer.multi_head_attention(
self.num_heads,
transformer.norm(inputs),
None,
mode,
num_units=self.num_units,
mask=mask,
dropout=self.attention_dropout,
return_attention=True)
attns.append(attn)
context = transformer.drop_and_add(
inputs, context, mode, dropout=self.dropout)

with tf.variable_scope('ffn'):
transformed = transformer.feed_forward(
transformer.norm(context),
self.ffn_inner_dim,
mode,
dropout=self.relu_dropout,
mask=mask_FF)
transformed = transformer.drop_and_add(
context, transformed, mode, dropout=self.dropout)

inputs = transformed
state += (tf.reduce_mean(inputs, axis=1), )

outputs = transformer.norm(inputs)
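        # Returned tuple: normalized outputs, per-layer mean-pooled states,
        # the sequence lengths, and per-layer attention weights.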
return (outputs, state, sequence_length, attns)


class SelfAttentionEncoderOri():
"""Encoder using self-attention as described in
https://arxiv.org/abs/1706.03762.
"""

def __init__(self,
num_layers,
num_units=512,
num_heads=8,
ffn_inner_dim=2048,
dropout=0.1,
attention_dropout=0.1,
relu_dropout=0.1,
position_encoder=SinusoidalPositionEncoder()):
"""Initializes the parameters of the encoder.

Args:
num_layers: The number of layers.
num_units: The number of hidden units.
num_heads: The number of heads in the multi-head attention.
ffn_inner_dim: The number of units of the inner linear transformation
in the feed forward layer.
dropout: The probability to drop units from the outputs.
attention_dropout: The probability to drop units from the attention.
relu_dropout: The probability to drop units from the ReLU activation in
the feed forward layer.
position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
apply on inputs or ``None``.
"""
super(SelfAttentionEncoderOri, self).__init__()
self.num_layers = num_layers
self.num_units = num_units
self.num_heads = num_heads
self.ffn_inner_dim = ffn_inner_dim
self.dropout = dropout
self.attention_dropout = attention_dropout
self.relu_dropout = relu_dropout
self.position_encoder = position_encoder

def encode(self, inputs, sequence_length=None, mode=True):
inputs *= self.num_units**0.5
if self.position_encoder is not None:
inputs = self.position_encoder(inputs)

inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
mask = transformer.build_sequence_mask(
sequence_length,
num_heads=self.num_heads,
maximum_length=tf.shape(inputs)[1]) # [N, 1, 1, T_out]

state = ()

attns = []
for layer in range(self.num_layers):
with tf.variable_scope('layer_{}'.format(layer)):
with tf.variable_scope('multi_head'):
context, attn = transformer.multi_head_attention(
self.num_heads,
transformer.norm(inputs),
None,
mode,
num_units=self.num_units,
mask=mask,
dropout=self.attention_dropout,
return_attention=True)
attns.append(attn)
context = transformer.drop_and_add(
inputs, context, mode, dropout=self.dropout)

with tf.variable_scope('ffn'):
transformed = transformer.feed_forward_ori(
transformer.norm(context),
self.ffn_inner_dim,
mode,
dropout=self.relu_dropout)
transformed = transformer.drop_and_add(
context, transformed, mode, dropout=self.dropout)

inputs = transformed
state += (tf.reduce_mean(inputs, axis=1), )

outputs = transformer.norm(inputs)
return (outputs, state, sequence_length, attns)

+ 1157
- 0
modelscope/models/audio/tts/am/models/transformer.py
File diff suppressed because it is too large
View File


+ 255
- 0
modelscope/models/audio/tts/am/sambert_hifi_16k.py View File

@@ -0,0 +1,255 @@
import io
import os
from typing import Any, Dict, Optional, Union

import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MultiLabelBinarizer

from modelscope.models.base import Model
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks
from .models import create_model
from .text.symbols import load_symbols
from .text.symbols_dict import SymbolsDict

__all__ = ['SambertNetHifi16k']


def multi_label_symbol_to_sequence(my_classes, my_symbol):
one_hot = MultiLabelBinarizer(my_classes)
tokens = my_symbol.strip().split(' ')
sequences = []
for token in tokens:
sequences.append(tuple(token.split('&')))
# sequences.append(tuple(['~'])) # sequence length minus 1 to ignore EOS ~
return one_hot.fit_transform(sequences)
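
# e.g. multi_label_symbol_to_sequence(['a', 'b', 'c'], 'a&b c') would yield the
# multi-hot matrix [[1, 1, 0], [0, 0, 1]]: one row per space-separated token,
# one column per class.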


@MODELS.register_module(Tasks.text_to_speech, module_name=r'sambert_hifi_16k')
class SambertNetHifi16k(Model):

def __init__(self,
model_dir,
pitch_control_str='',
duration_control_str='',
energy_control_str='',
*args,
**kwargs):
tf.reset_default_graph()
local_ckpt_path = os.path.join(ModelFile.TF_CHECKPOINT_FOLDER, 'ckpt')
self._ckpt_path = os.path.join(model_dir, local_ckpt_path)
self._dict_path = os.path.join(model_dir, 'dicts')
self._hparams = tf.contrib.training.HParams(**kwargs)
values = self._hparams.values()
hp = [' {}:{}'.format(name, values[name]) for name in sorted(values)]
print('Hyperparameters:\n' + '\n'.join(hp))
super().__init__(self._ckpt_path, *args, **kwargs)
model_name = 'robutrans'
self._lfeat_type_list = self._hparams.lfeat_type_list.strip().split(
',')
sy, tone, syllable_flag, word_segment, emo_category, speaker = load_symbols(
self._dict_path)
self._sy = sy
self._tone = tone
self._syllable_flag = syllable_flag
self._word_segment = word_segment
self._emo_category = emo_category
self._speaker = speaker
self._inputs_dim = dict()
for lfeat_type in self._lfeat_type_list:
if lfeat_type == 'sy':
self._inputs_dim[lfeat_type] = len(sy)
elif lfeat_type == 'tone':
self._inputs_dim[lfeat_type] = len(tone)
elif lfeat_type == 'syllable_flag':
self._inputs_dim[lfeat_type] = len(syllable_flag)
elif lfeat_type == 'word_segment':
self._inputs_dim[lfeat_type] = len(word_segment)
elif lfeat_type == 'emo_category':
self._inputs_dim[lfeat_type] = len(emo_category)
elif lfeat_type == 'speaker':
self._inputs_dim[lfeat_type] = len(speaker)

self._symbols_dict = SymbolsDict(sy, tone, syllable_flag, word_segment,
emo_category, speaker,
self._inputs_dim,
self._lfeat_type_list)
dim_inputs = sum(self._inputs_dim.values(
)) - self._inputs_dim['speaker'] - self._inputs_dim['emo_category']
inputs = tf.placeholder(tf.float32, [1, None, dim_inputs], 'inputs')
inputs_emotion = tf.placeholder(
tf.float32, [1, None, self._inputs_dim['emo_category']],
'inputs_emotion')
inputs_speaker = tf.placeholder(tf.float32,
[1, None, self._inputs_dim['speaker']],
'inputs_speaker')

input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
pitch_contours_scale = tf.placeholder(tf.float32, [1, None],
'pitch_contours_scale')
energy_contours_scale = tf.placeholder(tf.float32, [1, None],
'energy_contours_scale')
duration_scale = tf.placeholder(tf.float32, [1, None],
'duration_scale')

with tf.variable_scope('model') as _:
self._model = create_model(model_name, self._hparams)
self._model.initialize(
inputs,
inputs_emotion,
inputs_speaker,
input_lengths,
duration_scales=duration_scale,
pitch_scales=pitch_contours_scale,
energy_scales=energy_contours_scale)
self._mel_spec = self._model.mel_outputs[0]
self._duration_outputs = self._model.duration_outputs[0]
self._duration_outputs_ = self._model.duration_outputs_[0]
self._pitch_contour_outputs = self._model.pitch_contour_outputs[0]
self._energy_contour_outputs = self._model.energy_contour_outputs[
0]
self._embedded_inputs_emotion = self._model.embedded_inputs_emotion[
0]
self._embedding_fsmn_outputs = self._model.embedding_fsmn_outputs[
0]
self._encoder_outputs = self._model.encoder_outputs[0]
self._pitch_embeddings = self._model.pitch_embeddings[0]
self._energy_embeddings = self._model.energy_embeddings[0]
self._LR_outputs = self._model.LR_outputs[0]
self._postnet_fsmn_outputs = self._model.postnet_fsmn_outputs[0]
self._attention_h = self._model.attention_h
self._attention_x = self._model.attention_x

print('Loading checkpoint: %s' % self._ckpt_path)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
self._session = tf.Session(config=config)
self._session.run(tf.global_variables_initializer())

saver = tf.train.Saver()
saver.restore(self._session, self._ckpt_path)

duration_cfg_lst = []
if len(duration_control_str) != 0:
for item in duration_control_str.strip().split('|'):
percent, scale = item.lstrip('(').rstrip(')').split(',')
duration_cfg_lst.append((float(percent), float(scale)))

self._duration_cfg_lst = duration_cfg_lst

pitch_contours_cfg_lst = []
if len(pitch_control_str) != 0:
for item in pitch_control_str.strip().split('|'):
percent, scale = item.lstrip('(').rstrip(')').split(',')
pitch_contours_cfg_lst.append(
(float(percent), float(scale)))

self._pitch_contours_cfg_lst = pitch_contours_cfg_lst

energy_contours_cfg_lst = []
if len(energy_control_str) != 0:
for item in energy_control_str.strip().split('|'):
percent, scale = item.lstrip('(').rstrip(')').split(',')
energy_contours_cfg_lst.append(
(float(percent), float(scale)))

self._energy_contours_cfg_lst = energy_contours_cfg_lst

def forward(self, text):
cleaner_names = [x.strip() for x in self._hparams.cleaners.split(',')]
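        # Each whitespace-separated token carries one '$'-separated field per entry
        # in hp.lfeat_type_list, wrapped in braces, e.g.
        # '{sy$tone$syllable_flag$word_segment$emo_category$speaker}'.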

lfeat_symbol = text.strip().split(' ')
lfeat_symbol_separate = [''] * int(len(self._lfeat_type_list))
for this_lfeat_symbol in lfeat_symbol:
this_lfeat_symbol = this_lfeat_symbol.strip('{').strip('}').split(
'$')
if len(this_lfeat_symbol) != len(self._lfeat_type_list):
raise Exception(
'Length of this_lfeat_symbol in training data'
+ ' is not equal to the length of lfeat_type_list, '
+ str(len(this_lfeat_symbol)) + ' VS. '
+ str(len(self._lfeat_type_list)))
index = 0
while index < len(lfeat_symbol_separate):
lfeat_symbol_separate[index] = lfeat_symbol_separate[
index] + this_lfeat_symbol[index] + ' '
index = index + 1

index = 0
lfeat_type = self._lfeat_type_list[index]
sequence = self._symbols_dict.symbol_to_sequence(
lfeat_symbol_separate[index].strip(), lfeat_type, cleaner_names)
sequence_array = np.asarray(
sequence[:-1],
dtype=np.int32) # sequence length minus 1 to ignore EOS ~
inputs = np.eye(
self._inputs_dim[lfeat_type], dtype=np.float32)[sequence_array]
index = index + 1
while index < len(self._lfeat_type_list) - 2:
lfeat_type = self._lfeat_type_list[index]
sequence = self._symbols_dict.symbol_to_sequence(
lfeat_symbol_separate[index].strip(), lfeat_type,
cleaner_names)
sequence_array = np.asarray(
sequence[:-1],
dtype=np.int32) # sequence length minus 1 to ignore EOS ~
inputs_temp = np.eye(
self._inputs_dim[lfeat_type], dtype=np.float32)[sequence_array]
inputs = np.concatenate((inputs, inputs_temp), axis=1)
index = index + 1
seq = inputs

lfeat_type = 'emo_category'
inputs_emotion = multi_label_symbol_to_sequence(
self._emo_category, lfeat_symbol_separate[index].strip())
# inputs_emotion = inputs_emotion * 1.5
index = index + 1

lfeat_type = 'speaker'
inputs_speaker = multi_label_symbol_to_sequence(
self._speaker, lfeat_symbol_separate[index].strip())

duration_scale = np.ones((len(seq), ), dtype=np.float32)
start_idx = 0
for (percent, scale) in self._duration_cfg_lst:
duration_scale[start_idx:start_idx
+ int(percent * len(seq))] = scale
start_idx += int(percent * len(seq))

pitch_contours_scale = np.ones((len(seq), ), dtype=np.float32)
start_idx = 0
for (percent, scale) in self._pitch_contours_cfg_lst:
pitch_contours_scale[start_idx:start_idx
+ int(percent * len(seq))] = scale
start_idx += int(percent * len(seq))

energy_contours_scale = np.ones((len(seq), ), dtype=np.float32)
start_idx = 0
for (percent, scale) in self._energy_contours_cfg_lst:
energy_contours_scale[start_idx:start_idx
+ int(percent * len(seq))] = scale
start_idx += int(percent * len(seq))

feed_dict = {
self._model.inputs: [np.asarray(seq, dtype=np.float32)],
self._model.inputs_emotion:
[np.asarray(inputs_emotion, dtype=np.float32)],
self._model.inputs_speaker:
[np.asarray(inputs_speaker, dtype=np.float32)],
self._model.input_lengths:
np.asarray([len(seq)], dtype=np.int32),
self._model.duration_scales: [duration_scale],
self._model.pitch_scales: [pitch_contours_scale],
self._model.energy_scales: [energy_contours_scale]
}

result = self._session.run([
self._mel_spec, self._duration_outputs, self._duration_outputs_,
self._pitch_contour_outputs, self._embedded_inputs_emotion,
self._embedding_fsmn_outputs, self._encoder_outputs,
self._pitch_embeddings, self._LR_outputs,
self._postnet_fsmn_outputs, self._energy_contour_outputs,
self._energy_embeddings, self._attention_x, self._attention_h
], feed_dict=feed_dict) # yapf:disable
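        # Only the predicted mel-spectrogram (result[0]) is returned; the other
        # fetches expose intermediate tensors, presumably for inspection.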
return result[0]

+ 0
- 0
modelscope/models/audio/tts/am/text/__init__.py View File


+ 89
- 0
modelscope/models/audio/tts/am/text/cleaners.py View File

@@ -0,0 +1,89 @@
'''
Cleaners are transformations that run over the input text at both training and eval time.

Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
1. "english_cleaners" for English text
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
the symbols in symbols.py to match your data).
'''
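
# For illustration, english_cleaners('Dr. Smith paid $3.50') should produce
# something like 'doctor smith paid three dollars, fifty cents'.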

import re

from unidecode import unidecode

from .numbers import normalize_numbers

# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')

# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
for x in [
('mrs', 'misess'),
('mr', 'mister'),
('dr', 'doctor'),
('st', 'saint'),
('co', 'company'),
('jr', 'junior'),
('maj', 'major'),
('gen', 'general'),
('drs', 'doctors'),
('rev', 'reverend'),
('lt', 'lieutenant'),
('hon', 'honorable'),
('sgt', 'sergeant'),
('capt', 'captain'),
('esq', 'esquire'),
('ltd', 'limited'),
('col', 'colonel'),
('ft', 'fort'), ]] # yapf:disable


def expand_abbreviations(text):
for regex, replacement in _abbreviations:
text = re.sub(regex, replacement, text)
return text


def expand_numbers(text):
return normalize_numbers(text)


def lowercase(text):
return text.lower()


def collapse_whitespace(text):
return re.sub(_whitespace_re, ' ', text)


def convert_to_ascii(text):
return unidecode(text)


def basic_cleaners(text):
'''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
text = lowercase(text)
text = collapse_whitespace(text)
return text


def transliteration_cleaners(text):
'''Pipeline for non-English text that transliterates to ASCII.'''
text = convert_to_ascii(text)
text = lowercase(text)
text = collapse_whitespace(text)
return text


def english_cleaners(text):
'''Pipeline for English text, including number and abbreviation expansion.'''
text = convert_to_ascii(text)
text = lowercase(text)
text = expand_numbers(text)
text = expand_abbreviations(text)
text = collapse_whitespace(text)
return text

+ 64
- 0
modelscope/models/audio/tts/am/text/cmudict.py View File

@@ -0,0 +1,64 @@
import re

valid_symbols = [
'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0',
'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0',
'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0',
'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG',
'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH',
'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W',
'Y', 'Z', 'ZH'
]

_valid_symbol_set = set(valid_symbols)


class CMUDict:
'''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''

def __init__(self, file_or_path, keep_ambiguous=True):
if isinstance(file_or_path, str):
with open(file_or_path, encoding='latin-1') as f:
entries = _parse_cmudict(f)
else:
entries = _parse_cmudict(file_or_path)
if not keep_ambiguous:
entries = {
word: pron
for word, pron in entries.items() if len(pron) == 1
}
self._entries = entries

def __len__(self):
return len(self._entries)

def lookup(self, word):
'''Returns list of ARPAbet pronunciations of the given word.'''
return self._entries.get(word.upper())
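
# e.g. CMUDict('cmudict-0.7b').lookup('hello') would return the ARPAbet
# pronunciation(s) listed for HELLO, such as ['HH AH0 L OW1'], or None for
# out-of-vocabulary words (the path above is illustrative).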


_alt_re = re.compile(r'\([0-9]+\)')


def _parse_cmudict(file):
cmudict = {}
for line in file:
if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
parts = line.split(' ')
word = re.sub(_alt_re, '', parts[0])
pronunciation = _get_pronunciation(parts[1])
if pronunciation:
if word in cmudict:
cmudict[word].append(pronunciation)
else:
cmudict[word] = [pronunciation]
return cmudict


def _get_pronunciation(s):
parts = s.strip().split(' ')
for part in parts:
if part not in _valid_symbol_set:
return None
return ' '.join(parts)

+ 70
- 0
modelscope/models/audio/tts/am/text/numbers.py View File

@@ -0,0 +1,70 @@
import re

import inflect

_inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
_number_re = re.compile(r'[0-9]+')


def _remove_commas(m):
return m.group(1).replace(',', '')


def _expand_decimal_point(m):
return m.group(1).replace('.', ' point ')


def _expand_dollars(m):
match = m.group(1)
parts = match.split('.')
if len(parts) > 2:
return match + ' dollars' # Unexpected format
dollars = int(parts[0]) if parts[0] else 0
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
if dollars and cents:
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
cent_unit = 'cent' if cents == 1 else 'cents'
return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
elif dollars:
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
return '%s %s' % (dollars, dollar_unit)
elif cents:
cent_unit = 'cent' if cents == 1 else 'cents'
return '%s %s' % (cents, cent_unit)
else:
return 'zero dollars'


def _expand_ordinal(m):
return _inflect.number_to_words(m.group(0))


def _expand_number(m):
num = int(m.group(0))
if num > 1000 and num < 3000:
if num == 2000:
return 'two thousand'
elif num > 2000 and num < 2010:
return 'two thousand ' + _inflect.number_to_words(num % 100)
elif num % 100 == 0:
return _inflect.number_to_words(num // 100) + ' hundred'
else:
return _inflect.number_to_words(
num, andword='', zero='oh', group=2).replace(', ', ' ')
else:
return _inflect.number_to_words(num, andword='')


def normalize_numbers(text):
text = re.sub(_comma_number_re, _remove_commas, text)
text = re.sub(_pounds_re, r'\1 pounds', text)
text = re.sub(_dollars_re, _expand_dollars, text)
text = re.sub(_decimal_number_re, _expand_decimal_point, text)
text = re.sub(_ordinal_re, _expand_ordinal, text)
text = re.sub(_number_re, _expand_number, text)
return text
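
# For illustration, normalize_numbers('It cost $2.50 on the 3rd') should produce
# something like 'It cost two dollars, fifty cents on the third'.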

+ 95
- 0
modelscope/models/audio/tts/am/text/symbols.py View File

@@ -0,0 +1,95 @@
'''
Defines the set of symbols used in text input to the model.

The default is a set of ASCII characters that works well for English or text that has been run
through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
'''
import codecs
import os

_pad = '_'
_eos = '~'
_mask = '@[MASK]'


def load_symbols(dict_path):
_characters = ''
_ch_symbols = []
sy_dict_name = 'sy_dict.txt'
sy_dict_path = os.path.join(dict_path, sy_dict_name)
f = codecs.open(sy_dict_path, 'r')
for line in f:
line = line.strip('\r\n')
_ch_symbols.append(line)

_arpabet = ['@' + s for s in _ch_symbols]

# Export all symbols:
sy = list(_characters) + _arpabet + [_pad, _eos, _mask]

_characters = ''

_ch_tones = []
tone_dict_name = 'tone_dict.txt'
tone_dict_path = os.path.join(dict_path, tone_dict_name)
f = codecs.open(tone_dict_path, 'r')
for line in f:
line = line.strip('\r\n')
_ch_tones.append(line)

# Export all tones:
tone = list(_characters) + _ch_tones + [_pad, _eos, _mask]

_characters = ''

_ch_syllable_flags = []
syllable_flag_name = 'syllable_flag_dict.txt'
syllable_flag_path = os.path.join(dict_path, syllable_flag_name)
f = codecs.open(syllable_flag_path, 'r')
for line in f:
line = line.strip('\r\n')
_ch_syllable_flags.append(line)

# Export all syllable_flags:
syllable_flag = list(_characters) + _ch_syllable_flags + [
_pad, _eos, _mask
]

_characters = ''

_ch_word_segments = []
word_segment_name = 'word_segment_dict.txt'
word_segment_path = os.path.join(dict_path, word_segment_name)
f = codecs.open(word_segment_path, 'r')
for line in f:
line = line.strip('\r\n')
_ch_word_segments.append(line)

# Export all word_segments:
word_segment = list(_characters) + _ch_word_segments + [_pad, _eos, _mask]

_characters = ''

_ch_emo_types = []
emo_category_name = 'emo_category_dict.txt'
emo_category_path = os.path.join(dict_path, emo_category_name)
f = codecs.open(emo_category_path, 'r')
for line in f:
line = line.strip('\r\n')
_ch_emo_types.append(line)

emo_category = list(_characters) + _ch_emo_types + [_pad, _eos, _mask]

_characters = ''

_ch_speakers = []
speaker_name = 'speaker_dict.txt'
speaker_path = os.path.join(dict_path, speaker_name)
f = codecs.open(speaker_path, 'r')
for line in f:
line = line.strip('\r\n')
_ch_speakers.append(line)

# Export all speakers:
speaker = list(_characters) + _ch_speakers + [_pad, _eos, _mask]
return sy, tone, syllable_flag, word_segment, emo_category, speaker

+ 200
- 0
modelscope/models/audio/tts/am/text/symbols_dict.py View File

@@ -0,0 +1,200 @@
import re
import sys

from .cleaners import (basic_cleaners, english_cleaners,
transliteration_cleaners)


class SymbolsDict:

def __init__(self, sy, tone, syllable_flag, word_segment, emo_category,
speaker, inputs_dim, lfeat_type_list):
self._inputs_dim = inputs_dim
self._lfeat_type_list = lfeat_type_list
self._sy_to_id = {s: i for i, s in enumerate(sy)}
self._id_to_sy = {i: s for i, s in enumerate(sy)}
self._tone_to_id = {s: i for i, s in enumerate(tone)}
self._id_to_tone = {i: s for i, s in enumerate(tone)}
self._syllable_flag_to_id = {s: i for i, s in enumerate(syllable_flag)}
self._id_to_syllable_flag = {i: s for i, s in enumerate(syllable_flag)}
self._word_segment_to_id = {s: i for i, s in enumerate(word_segment)}
self._id_to_word_segment = {i: s for i, s in enumerate(word_segment)}
self._emo_category_to_id = {s: i for i, s in enumerate(emo_category)}
self._id_to_emo_category = {i: s for i, s in enumerate(emo_category)}
self._speaker_to_id = {s: i for i, s in enumerate(speaker)}
self._id_to_speaker = {i: s for i, s in enumerate(speaker)}
print('_sy_to_id: ')
print(self._sy_to_id)
print('_tone_to_id: ')
print(self._tone_to_id)
print('_syllable_flag_to_id: ')
print(self._syllable_flag_to_id)
print('_word_segment_to_id: ')
print(self._word_segment_to_id)
print('_emo_category_to_id: ')
print(self._emo_category_to_id)
print('_speaker_to_id: ')
print(self._speaker_to_id)
self._curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
self._cleaners = {
basic_cleaners.__name__: basic_cleaners,
transliteration_cleaners.__name__: transliteration_cleaners,
english_cleaners.__name__: english_cleaners
}

def _clean_text(self, text, cleaner_names):
for name in cleaner_names:
cleaner = self._cleaners.get(name)
if not cleaner:
raise Exception('Unknown cleaner: %s' % name)
text = cleaner(text)
return text

def _sy_to_sequence(self, sy):
return [self._sy_to_id[s] for s in sy if self._should_keep_sy(s)]

def _arpabet_to_sequence(self, text):
return self._sy_to_sequence(['@' + s for s in text.split()])

def _should_keep_sy(self, s):
return s in self._sy_to_id and s != '_' and s != '~'

def symbol_to_sequence(self, this_lfeat_symbol, lfeat_type, cleaner_names):
sequence = []
if lfeat_type == 'sy':
this_lfeat_symbol = this_lfeat_symbol.strip().split(' ')
this_lfeat_symbol_format = ''
index = 0
while index < len(this_lfeat_symbol):
this_lfeat_symbol_format = this_lfeat_symbol_format + '{' + this_lfeat_symbol[
index] + '}' + ' '
index = index + 1
sequence = self.text_to_sequence(this_lfeat_symbol_format,
cleaner_names)
elif lfeat_type == 'tone':
sequence = self.tone_to_sequence(this_lfeat_symbol)
elif lfeat_type == 'syllable_flag':
sequence = self.syllable_flag_to_sequence(this_lfeat_symbol)
elif lfeat_type == 'word_segment':
sequence = self.word_segment_to_sequence(this_lfeat_symbol)
elif lfeat_type == 'emo_category':
sequence = self.emo_category_to_sequence(this_lfeat_symbol)
elif lfeat_type == 'speaker':
sequence = self.speaker_to_sequence(this_lfeat_symbol)
else:
raise Exception('Unknown lfeat type: %s' % lfeat_type)

return sequence

def text_to_sequence(self, text, cleaner_names):
'''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.

The text can optionally have ARPAbet sequences enclosed in curly braces embedded
in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."

Args:
text: string to convert to a sequence
cleaner_names: names of the cleaner functions to run the text through

Returns:
List of integers corresponding to the symbols in the text
'''
sequence = []

# Check for curly braces and treat their contents as ARPAbet:
while len(text):
m = self._curly_re.match(text)
if not m:
sequence += self._sy_to_sequence(
self._clean_text(text, cleaner_names))
break
sequence += self._sy_to_sequence(
self._clean_text(m.group(1), cleaner_names))
sequence += self._arpabet_to_sequence(m.group(2))
text = m.group(3)

# Append EOS token
sequence.append(self._sy_to_id['~'])
return sequence

def tone_to_sequence(self, tone):
tones = tone.strip().split(' ')
sequence = []
for this_tone in tones:
sequence.append(self._tone_to_id[this_tone])
sequence.append(self._tone_to_id['~'])
return sequence

def syllable_flag_to_sequence(self, syllable_flag):
syllable_flags = syllable_flag.strip().split(' ')
sequence = []
for this_syllable_flag in syllable_flags:
sequence.append(self._syllable_flag_to_id[this_syllable_flag])
sequence.append(self._syllable_flag_to_id['~'])
return sequence

def word_segment_to_sequence(self, word_segment):
word_segments = word_segment.strip().split(' ')
sequence = []
for this_word_segment in word_segments:
sequence.append(self._word_segment_to_id[this_word_segment])
sequence.append(self._word_segment_to_id['~'])
return sequence

def emo_category_to_sequence(self, emo_type):
emo_categories = emo_type.strip().split(' ')
sequence = []
for this_category in emo_categories:
sequence.append(self._emo_category_to_id[this_category])
sequence.append(self._emo_category_to_id['~'])
return sequence

def speaker_to_sequence(self, speaker):
speakers = speaker.strip().split(' ')
sequence = []
for this_speaker in speakers:
sequence.append(self._speaker_to_id[this_speaker])
sequence.append(self._speaker_to_id['~'])
return sequence

def sequence_to_symbol(self, sequence):
result = ''
pre_lfeat_dim = 0
for lfeat_type in self._lfeat_type_list:
current_one_hot_sequence = sequence[:, pre_lfeat_dim:pre_lfeat_dim
+ self._inputs_dim[lfeat_type]]
current_sequence = current_one_hot_sequence.argmax(1)
length = current_sequence.shape[0]

index = 0
while index < length:
this_sequence = current_sequence[index]
s = ''
if lfeat_type == 'sy':
s = self._id_to_sy[this_sequence]
if len(s) > 1 and s[0] == '@':
s = s[1:]
elif lfeat_type == 'tone':
s = self._id_to_tone[this_sequence]
elif lfeat_type == 'syllable_flag':
s = self._id_to_syllable_flag[this_sequence]
elif lfeat_type == 'word_segment':
s = self._id_to_word_segment[this_sequence]
elif lfeat_type == 'emo_category':
s = self._id_to_emo_category[this_sequence]
elif lfeat_type == 'speaker':
s = self._id_to_speaker[this_sequence]
else:
raise Exception('Unknown lfeat type: %s' % lfeat_type)

if index == 0:
result = result + lfeat_type + ': '

result = result + '{' + s + '}'

if index == length - 1:
result = result + '; '

index = index + 1
pre_lfeat_dim = pre_lfeat_dim + self._inputs_dim[lfeat_type]
return result
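
A tiny sketch of the curly-brace ARPAbet handling documented in text_to_sequence; the symbol inventories here are made-up stand-ins for the dictionaries normally produced by load_symbols:

from modelscope.models.audio.tts.am.text.symbols_dict import SymbolsDict

# Hypothetical, minimal inventories; real ones come from load_symbols(dict_path).
sy = ['@HH', '@AH0', '@L', '@OW1', '_', '~']
aux = ['_', '~']
sd = SymbolsDict(sy, aux, aux, aux, aux, aux, {'sy': len(sy)}, ['sy'])

# Content inside {...} is looked up as ARPAbet; an EOS id ('~') is appended.
print(sd.text_to_sequence('{HH AH0 L OW1}', ['basic_cleaners']))
# -> [0, 1, 2, 3, 5] with the made-up inventory above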

+ 1
- 0
modelscope/models/audio/tts/frontend/__init__.py View File

@@ -0,0 +1 @@
from .generic_text_to_speech_frontend import * # noqa F403

+ 39
- 0
modelscope/models/audio/tts/frontend/generic_text_to_speech_frontend.py View File

@@ -0,0 +1,39 @@
import os
import zipfile
from typing import Any, Dict, List

from modelscope.models.base import Model
from modelscope.models.builder import MODELS
from modelscope.utils.audio.tts_exceptions import (
TtsFrontendInitializeFailedException,
TtsFrontendLanguageTypeInvalidException)
from modelscope.utils.constant import Tasks

__all__ = ['GenericTtsFrontend']


@MODELS.register_module(
Tasks.text_to_speech, module_name=r'generic_tts_frontend')
class GenericTtsFrontend(Model):

def __init__(self, model_dir='.', lang_type='pinyin', *args, **kwargs):
super().__init__(model_dir, *args, **kwargs)
import ttsfrd

frontend = ttsfrd.TtsFrontendEngine()
zip_file = os.path.join(model_dir, 'resource.zip')
self._res_path = os.path.join(model_dir, 'resource')
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
zip_ref.extractall(model_dir)
if not frontend.initialize(self._res_path):
raise TtsFrontendInitializeFailedException(
'resource invalid: {}'.format(self._res_path))
if not frontend.set_lang_type(lang_type):
raise TtsFrontendLanguageTypeInvalidException(
'language type invalid: {}, valid types are pinyin and chenmix'.
format(lang_type))
self._frontend = frontend

def forward(self, data: str) -> Dict[str, List]:
result = self._frontend.gen_tacotron_symbols(data)
return {'texts': [s for s in result.splitlines() if s != '']}
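
A hedged usage sketch; the model directory is hypothetical and must contain the resource.zip expected by the ttsfrd package:

from modelscope.models.audio.tts.frontend import GenericTtsFrontend

frontend = GenericTtsFrontend(model_dir='/path/to/tts_frontend', lang_type='pinyin')
symbols = frontend.forward('你好，世界')   # any input sentence
print(symbols['texts'])                    # one Tacotron-symbol line per sentence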

+ 1
- 0
modelscope/models/audio/tts/vocoder/__init__.py View File

@@ -0,0 +1 @@
from .hifigan16k import * # noqa F403

+ 73
- 0
modelscope/models/audio/tts/vocoder/hifigan16k.py View File

@@ -0,0 +1,73 @@
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import argparse
import glob
import os
import time

import json
import numpy as np
import torch
from scipy.io.wavfile import write

from modelscope.models.base import Model
from modelscope.models.builder import MODELS
from modelscope.utils.audio.tts_exceptions import \
TtsVocoderMelspecShapeMismatchException
from modelscope.utils.constant import ModelFile, Tasks
from .models import Generator

__all__ = ['Hifigan16k', 'AttrDict']
MAX_WAV_VALUE = 32768.0


def load_checkpoint(filepath, device):
assert os.path.isfile(filepath)
print("Loading '{}'".format(filepath))
checkpoint_dict = torch.load(filepath, map_location=device)
print('Complete.')
return checkpoint_dict


class AttrDict(dict):

def __init__(self, *args, **kwargs):
super(AttrDict, self).__init__(*args, **kwargs)
self.__dict__ = self


@MODELS.register_module(Tasks.text_to_speech, module_name=r'hifigan16k')
class Hifigan16k(Model):

def __init__(self, model_dir, *args, **kwargs):
self._ckpt_path = os.path.join(model_dir,
ModelFile.TORCH_MODEL_BIN_FILE)
self._config = AttrDict(**kwargs)

super().__init__(self._ckpt_path, *args, **kwargs)
if torch.cuda.is_available():
torch.manual_seed(self._config.seed)
self._device = torch.device('cuda')
else:
self._device = torch.device('cpu')
self._generator = Generator(self._config).to(self._device)
state_dict_g = load_checkpoint(self._ckpt_path, self._device)
self._generator.load_state_dict(state_dict_g['generator'])
self._generator.eval()
self._generator.remove_weight_norm()

def forward(self, melspec):
dim0 = list(melspec.shape)[-1]
if dim0 != 80:
raise TtsVocoderMelspecShapeMismatchException(
'input melspec shape mismatch: last dim must be 80 but got {}'.format(dim0))
with torch.no_grad():
x = melspec.T
x = torch.FloatTensor(x).to(self._device)
if len(x.shape) == 2:
x = x.unsqueeze(0)
y_g_hat = self._generator(x)
audio = y_g_hat.squeeze()
audio = audio * MAX_WAV_VALUE
audio = audio.cpu().numpy().astype('int16')
return audio
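
A hedged construction sketch; the model directory is hypothetical and the generator hyper-parameters below are placeholders that must match the ones the checkpoint was trained with:

import numpy as np
from modelscope.models.audio.tts.vocoder import Hifigan16k

vocoder = Hifigan16k(
    '/path/to/hifigan16k_model',             # hypothetical model directory
    seed=1234,
    resblock='1',                            # placeholder generator config values
    upsample_rates=[8, 8, 2, 2],
    upsample_kernel_sizes=[16, 16, 4, 4],
    upsample_initial_channel=512,
    resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]])
mel = np.random.randn(120, 80).astype('float32')  # (frames, 80) mel-spectrogram
pcm = vocoder.forward(mel)                        # 1-D int16 waveform at 16 kHz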

+ 1
- 0
modelscope/models/audio/tts/vocoder/models/__init__.py View File

@@ -0,0 +1 @@
from .models import Generator

+ 516
- 0
modelscope/models/audio/tts/vocoder/models/models.py View File

@@ -0,0 +1,516 @@
from distutils.version import LooseVersion

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm

from .utils import get_padding, init_weights

is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion('1.7')


def stft(x, fft_size, hop_size, win_length, window):
"""Perform STFT and convert to magnitude spectrogram.

Args:
x (Tensor): Input signal tensor (B, T).
fft_size (int): FFT size.
hop_size (int): Hop size.
win_length (int): Window length.
window (str): Window function type.

Returns:
Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).

"""
if is_pytorch_17plus:
x_stft = torch.stft(
x, fft_size, hop_size, win_length, window, return_complex=False)
else:
x_stft = torch.stft(x, fft_size, hop_size, win_length, window)
real = x_stft[..., 0]
imag = x_stft[..., 1]

# NOTE(kan-bayashi): clamp is needed to avoid nan or inf
return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1)


LRELU_SLOPE = 0.1


def get_padding_casual(kernel_size, dilation=1):
return int(kernel_size * dilation - dilation)


class Conv1dCasual(torch.nn.Module):

def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
padding_mode='zeros'):
super(Conv1dCasual, self).__init__()
self.pad = padding
self.conv1d = weight_norm(
Conv1d(
in_channels,
out_channels,
kernel_size,
stride,
padding=0,
dilation=dilation,
groups=groups,
bias=bias,
padding_mode=padding_mode))
self.conv1d.apply(init_weights)

def forward(self, x): # bdt
# F.pad takes padding amounts starting from the last dimension, so this left-pads the time axis for a causal convolution.
x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), 'constant')
x = self.conv1d(x)
return x

def remove_weight_norm(self):
remove_weight_norm(self.conv1d)


class ConvTranspose1dCausal(torch.nn.Module):
"""CausalConvTranspose1d module with customized initialization."""

def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
padding=0):
"""Initialize CausalConvTranspose1d module."""
super(ConvTranspose1dCausal, self).__init__()
self.deconv = weight_norm(
ConvTranspose1d(in_channels, out_channels, kernel_size, stride))
self.stride = stride
self.deconv.apply(init_weights)
self.pad = kernel_size - stride

def forward(self, x):
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, in_channels, T_in).
Returns:
Tensor: Output tensor (B, out_channels, T_out).
"""
# x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), "constant")
return self.deconv(x)[:, :, :-self.pad]

def remove_weight_norm(self):
remove_weight_norm(self.deconv)


class ResBlock1(torch.nn.Module):

def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
super(ResBlock1, self).__init__()
self.h = h
self.convs1 = nn.ModuleList([
Conv1dCasual(
channels,
channels,
kernel_size,
1,
dilation=dilation[i],
padding=get_padding_casual(kernel_size, dilation[i]))
for i in range(len(dilation))
])

self.convs2 = nn.ModuleList([
Conv1dCasual(
channels,
channels,
kernel_size,
1,
dilation=1,
padding=get_padding_casual(kernel_size, 1))
for i in range(len(dilation))
])

def forward(self, x):
for c1, c2 in zip(self.convs1, self.convs2):
xt = F.leaky_relu(x, LRELU_SLOPE)
xt = c1(xt)
xt = F.leaky_relu(xt, LRELU_SLOPE)
xt = c2(xt)
x = xt + x
return x

def remove_weight_norm(self):
for layer in self.convs1:
layer.remove_weight_norm()
for layer in self.convs2:
layer.remove_weight_norm()


class Generator(torch.nn.Module):

def __init__(self, h):
super(Generator, self).__init__()
self.h = h
self.num_kernels = len(h.resblock_kernel_sizes)
self.num_upsamples = len(h.upsample_rates)
print('num_kernels={}, num_upsamples={}'.format(
self.num_kernels, self.num_upsamples))
self.conv_pre = Conv1dCasual(
80, h.upsample_initial_channel, 7, 1, padding=7 - 1)
resblock = ResBlock1 if h.resblock == '1' else ResBlock2  # note: only ResBlock1 is defined in this file

self.ups = nn.ModuleList()
self.repeat_ups = nn.ModuleList()
for i, (u, k) in enumerate(
zip(h.upsample_rates, h.upsample_kernel_sizes)):
upsample = nn.Sequential(
nn.Upsample(mode='nearest', scale_factor=u),
nn.LeakyReLU(LRELU_SLOPE),
Conv1dCasual(
h.upsample_initial_channel // (2**i),
h.upsample_initial_channel // (2**(i + 1)),
kernel_size=7,
stride=1,
padding=7 - 1))
self.repeat_ups.append(upsample)
self.ups.append(
ConvTranspose1dCausal(
h.upsample_initial_channel // (2**i),
h.upsample_initial_channel // (2**(i + 1)),
k,
u,
padding=(k - u) // 2))

self.resblocks = nn.ModuleList()
for i in range(len(self.ups)):
ch = h.upsample_initial_channel // (2**(i + 1))
for j, (k, d) in enumerate(
zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
self.resblocks.append(resblock(h, ch, k, d))

self.conv_post = Conv1dCasual(ch, 1, 7, 1, padding=7 - 1)

def forward(self, x):
x = self.conv_pre(x)
for i in range(self.num_upsamples):
x = torch.sin(x) + x
# transconv
x1 = F.leaky_relu(x, LRELU_SLOPE)
x1 = self.ups[i](x1)
# repeat
x2 = self.repeat_ups[i](x)
x = x1 + x2
xs = None
for j in range(self.num_kernels):
if xs is None:
xs = self.resblocks[i * self.num_kernels + j](x)
else:
xs += self.resblocks[i * self.num_kernels + j](x)
x = xs / self.num_kernels
x = F.leaky_relu(x)
x = self.conv_post(x)
x = torch.tanh(x)
return x

def remove_weight_norm(self):
print('Removing weight norm...')
for layer in self.ups:
layer.remove_weight_norm()
for layer in self.repeat_ups:
layer[-1].remove_weight_norm()
for layer in self.resblocks:
layer.remove_weight_norm()
self.conv_pre.remove_weight_norm()
self.conv_post.remove_weight_norm()


class DiscriminatorP(torch.nn.Module):

def __init__(self,
period,
kernel_size=5,
stride=3,
use_spectral_norm=False):
super(DiscriminatorP, self).__init__()
self.period = period
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
self.convs = nn.ModuleList([
norm_f(
Conv2d(
1,
32, (kernel_size, 1), (stride, 1),
padding=(get_padding(5, 1), 0))),
norm_f(
Conv2d(
32,
128, (kernel_size, 1), (stride, 1),
padding=(get_padding(5, 1), 0))),
norm_f(
Conv2d(
128,
512, (kernel_size, 1), (stride, 1),
padding=(get_padding(5, 1), 0))),
norm_f(
Conv2d(
512,
1024, (kernel_size, 1), (stride, 1),
padding=(get_padding(5, 1), 0))),
norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
])
self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

def forward(self, x):
fmap = []

# 1d to 2d
b, c, t = x.shape
if t % self.period != 0: # pad first
n_pad = self.period - (t % self.period)
x = F.pad(x, (0, n_pad), 'reflect')
t = t + n_pad
x = x.view(b, c, t // self.period, self.period)

for layer in self.convs:
x = layer(x)
x = F.leaky_relu(x, LRELU_SLOPE)
fmap.append(x)
x = self.conv_post(x)
fmap.append(x)
x = torch.flatten(x, 1, -1)

return x, fmap


class MultiPeriodDiscriminator(torch.nn.Module):

def __init__(self):
super(MultiPeriodDiscriminator, self).__init__()
self.discriminators = nn.ModuleList([
DiscriminatorP(2),
DiscriminatorP(3),
DiscriminatorP(5),
DiscriminatorP(7),
DiscriminatorP(11),
])

def forward(self, y, y_hat):
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for i, d in enumerate(self.discriminators):
y_d_r, fmap_r = d(y)
y_d_g, fmap_g = d(y_hat)
y_d_rs.append(y_d_r)
fmap_rs.append(fmap_r)
y_d_gs.append(y_d_g)
fmap_gs.append(fmap_g)

return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class DiscriminatorS(torch.nn.Module):

def __init__(self, use_spectral_norm=False):
super(DiscriminatorS, self).__init__()
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
self.convs = nn.ModuleList([
norm_f(Conv1d(1, 128, 15, 1, padding=7)),
norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
])
self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

def forward(self, x):
fmap = []
for layer in self.convs:
x = layer(x)
x = F.leaky_relu(x, LRELU_SLOPE)
fmap.append(x)
x = self.conv_post(x)
fmap.append(x)
x = torch.flatten(x, 1, -1)

return x, fmap


class MultiScaleDiscriminator(torch.nn.Module):

def __init__(self):
super(MultiScaleDiscriminator, self).__init__()
self.discriminators = nn.ModuleList([
DiscriminatorS(use_spectral_norm=True),
DiscriminatorS(),
DiscriminatorS(),
])
from pytorch_wavelets import DWT1DForward
self.meanpools = nn.ModuleList(
[DWT1DForward(wave='db3', J=1),
DWT1DForward(wave='db3', J=1)])
self.convs = nn.ModuleList([
weight_norm(Conv1d(2, 1, 15, 1, padding=7)),
weight_norm(Conv1d(2, 1, 15, 1, padding=7))
])

def forward(self, y, y_hat):
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for i, d in enumerate(self.discriminators):
if i != 0:
yl, yh = self.meanpools[i - 1](y)
y = torch.cat([yl, yh[0]], dim=1)
y = self.convs[i - 1](y)
y = F.leaky_relu(y, LRELU_SLOPE)

yl_hat, yh_hat = self.meanpools[i - 1](y_hat)
y_hat = torch.cat([yl_hat, yh_hat[0]], dim=1)
y_hat = self.convs[i - 1](y_hat)
y_hat = F.leaky_relu(y_hat, LRELU_SLOPE)

y_d_r, fmap_r = d(y)
y_d_g, fmap_g = d(y_hat)
y_d_rs.append(y_d_r)
fmap_rs.append(fmap_r)
y_d_gs.append(y_d_g)
fmap_gs.append(fmap_g)

return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class DiscriminatorSTFT(torch.nn.Module):

def __init__(self,
kernel_size=11,
stride=2,
use_spectral_norm=False,
fft_size=1024,
shift_size=120,
win_length=600,
window='hann_window'):
super(DiscriminatorSTFT, self).__init__()
self.fft_size = fft_size
self.shift_size = shift_size
self.win_length = win_length
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
self.convs = nn.ModuleList([
norm_f(
Conv2d(
fft_size // 2 + 1,
32, (15, 1), (1, 1),
padding=(get_padding(15, 1), 0))),
norm_f(
Conv2d(
32,
32, (kernel_size, 1), (stride, 1),
padding=(get_padding(9, 1), 0))),
norm_f(
Conv2d(
32,
32, (kernel_size, 1), (stride, 1),
padding=(get_padding(9, 1), 0))),
norm_f(
Conv2d(
32,
32, (kernel_size, 1), (stride, 1),
padding=(get_padding(9, 1), 0))),
norm_f(Conv2d(32, 32, (5, 1), (1, 1), padding=(2, 0))),
])
self.conv_post = norm_f(Conv2d(32, 1, (3, 1), (1, 1), padding=(1, 0)))
self.register_buffer('window', getattr(torch, window)(win_length))

def forward(self, wav):
wav = torch.squeeze(wav, 1)
x_mag = stft(wav, self.fft_size, self.shift_size, self.win_length,
self.window)
x = torch.transpose(x_mag, 2, 1).unsqueeze(-1)
fmap = []
for layer in self.convs:
x = layer(x)
x = F.leaky_relu(x, LRELU_SLOPE)
fmap.append(x)
x = self.conv_post(x)
fmap.append(x)
x = x.squeeze(-1)

return x, fmap


class MultiSTFTDiscriminator(torch.nn.Module):

def __init__(
self,
fft_sizes=[1024, 2048, 512],
hop_sizes=[120, 240, 50],
win_lengths=[600, 1200, 240],
window='hann_window',
):
super(MultiSTFTDiscriminator, self).__init__()
self.discriminators = nn.ModuleList()
for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths):
self.discriminators += [
DiscriminatorSTFT(fft_size=fs, shift_size=ss, win_length=wl)
]

def forward(self, y, y_hat):
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for i, d in enumerate(self.discriminators):
y_d_r, fmap_r = d(y)
y_d_g, fmap_g = d(y_hat)
y_d_rs.append(y_d_r)
fmap_rs.append(fmap_r)
y_d_gs.append(y_d_g)
fmap_gs.append(fmap_g)

return y_d_rs, y_d_gs, fmap_rs, fmap_gs


def feature_loss(fmap_r, fmap_g):
loss = 0
for dr, dg in zip(fmap_r, fmap_g):
for rl, gl in zip(dr, dg):
loss += torch.mean(torch.abs(rl - gl))

return loss * 2


def discriminator_loss(disc_real_outputs, disc_generated_outputs):
loss = 0
r_losses = []
g_losses = []
for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
r_loss = torch.mean((1 - dr)**2)
g_loss = torch.mean(dg**2)
loss += (r_loss + g_loss)
r_losses.append(r_loss.item())
g_losses.append(g_loss.item())

return loss, r_losses, g_losses


def generator_loss(disc_outputs):
loss = 0
gen_losses = []
for dg in disc_outputs:
temp_loss = torch.mean((1 - dg)**2)
gen_losses.append(temp_loss)
loss += temp_loss

return loss, gen_losses
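
For intuition, the helpers above implement an LSGAN-style objective plus a feature-matching term; a tiny sketch on dummy discriminator scores:

import torch
from modelscope.models.audio.tts.vocoder.models.models import (
    discriminator_loss, generator_loss)

# Dummy score tensors, one per discriminator.
real_scores = [torch.ones(2, 5), torch.full((2, 5), 0.9)]
fake_scores = [torch.zeros(2, 5), torch.full((2, 5), 0.1)]

# (1 - real)^2 pushes real scores towards 1, fake^2 pushes fake scores towards 0.
d_loss, r_losses, g_losses = discriminator_loss(real_scores, fake_scores)
print(d_loss.item(), r_losses, g_losses)

# The generator is rewarded when the fake scores approach 1.
adv_loss, per_disc = generator_loss(fake_scores)
print(adv_loss.item())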

+ 59
- 0
modelscope/models/audio/tts/vocoder/models/utils.py View File

@@ -0,0 +1,59 @@
import glob
import os

import matplotlib
import matplotlib.pylab as plt
import torch
from torch.nn.utils import weight_norm

matplotlib.use('Agg')


def plot_spectrogram(spectrogram):
fig, ax = plt.subplots(figsize=(10, 2))
im = ax.imshow(
spectrogram, aspect='auto', origin='lower', interpolation='none')
plt.colorbar(im, ax=ax)

fig.canvas.draw()
plt.close()

return fig


def init_weights(m, mean=0.0, std=0.01):
classname = m.__class__.__name__
if classname.find('Conv') != -1:
m.weight.data.normal_(mean, std)


def apply_weight_norm(m):
classname = m.__class__.__name__
if classname.find('Conv') != -1:
weight_norm(m)


def get_padding(kernel_size, dilation=1):
return int((kernel_size * dilation - dilation) / 2)


def load_checkpoint(filepath, device):
assert os.path.isfile(filepath)
print("Loading '{}'".format(filepath))
checkpoint_dict = torch.load(filepath, map_location=device)
print('Complete.')
return checkpoint_dict


def save_checkpoint(filepath, obj):
print('Saving checkpoint to {}'.format(filepath))
torch.save(obj, filepath)
print('Complete.')


def scan_checkpoint(cp_dir, prefix):
pattern = os.path.join(cp_dir, prefix + '????????')
cp_list = glob.glob(pattern)
if len(cp_list) == 0:
return None
return sorted(cp_list)[-1]

+ 2
- 0
modelscope/models/base.py View File

@@ -62,4 +62,6 @@ class Model(ABC):
if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'):
model_cfg.type = model_cfg.model_type
model_cfg.model_dir = local_model_dir
for k, v in kwargs.items():
setattr(model_cfg, k, v)
return build_model(model_cfg, task_name)

+ 1
- 0
modelscope/models/multi_model/__init__.py View File

@@ -0,0 +1 @@
from .image_captioning_model import OfaForImageCaptioning

+ 80
- 0
modelscope/models/multi_model/image_captioning_model.py View File

@@ -0,0 +1,80 @@
import os.path as osp
from typing import Any, Dict

from PIL import Image

from modelscope.utils.constant import ModelFile, Tasks
from ..base import Model
from ..builder import MODELS

__all__ = ['OfaForImageCaptioning']


@MODELS.register_module(
Tasks.image_captioning, module_name=r'ofa-image-captioning')
class OfaForImageCaptioning(Model):

def __init__(self, model_dir, *args, **kwargs):
super().__init__(model_dir=model_dir, *args, **kwargs)
ckpt_name = ModelFile.TORCH_MODEL_FILE
local_model = osp.join(model_dir, ckpt_name)
bpe_dir = model_dir
# turn on cuda if GPU is available
from fairseq import checkpoint_utils, tasks, utils
from ofa.tasks.mm_tasks import CaptionTask
from ofa.utils.eval_utils import eval_caption
self.eval_caption = eval_caption

tasks.register_task('caption', CaptionTask)
use_cuda = kwargs['use_cuda'] if 'use_cuda' in kwargs else False
use_fp16 = kwargs[
'use_fp16'] if 'use_fp16' in kwargs and use_cuda else False
overrides = {
'bpe_dir': bpe_dir,
'eval_cider': False,
'beam': 5,
'max_len_b': 16,
'no_repeat_ngram_size': 3,
'seed': 7
}
models, cfg, task = checkpoint_utils.load_model_ensemble_and_task(
utils.split_paths(local_model), arg_overrides=overrides)

# Move models to GPU
for model in models:
model.eval()
if use_cuda:
model.cuda()
if use_fp16:
model.half()
model.prepare_for_inference_(cfg)
self.models = models
# Initialize generator
self.generator = task.build_generator(models, cfg.generation)

# Initialize transform
from torchvision import transforms
mean = [0.5, 0.5, 0.5]
std = [0.5, 0.5, 0.5]

self.patch_resize_transform = transforms.Compose([
lambda image: image.convert('RGB'),
transforms.Resize(
(cfg.task.patch_image_size, cfg.task.patch_image_size),
interpolation=Image.BICUBIC),
transforms.ToTensor(),
transforms.Normalize(mean=mean, std=std),
])
self.task = task

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
results, _ = self.eval_caption(self.task, self.generator, self.models,
input)
return {
'image_id': results[0]['image_id'],
'caption': results[0]['caption']
}

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
# TODO: no extra post-processing for now; return the results unchanged.
return inputs

+ 4
- 3
modelscope/models/nlp/__init__.py View File

@@ -1,4 +1,5 @@
from .masked_language_model import * # noqa F403
from .sentence_similarity_model import * # noqa F403
from .sequence_classification_model import * # noqa F403
from .text_generation_model import * # noqa F403
from .bert_for_sequence_classification import * # noqa F403
from .palm_for_text_generation import * # noqa F403
from .sbert_for_sentence_similarity import * # noqa F403
from .sbert_for_token_classification import * # noqa F403

modelscope/models/nlp/sequence_classification_model.py → modelscope/models/nlp/bert_for_sequence_classification.py View File


+ 43
- 0
modelscope/models/nlp/palm_for_text_generation.py View File

@@ -0,0 +1,43 @@
from typing import Dict

from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['PalmForTextGeneration']


@MODELS.register_module(Tasks.text_generation, module_name=r'palm2.0')
class PalmForTextGeneration(Model):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the text generation model from the `model_dir` path.

Args:
model_dir (str): the model path.
model_cls (Optional[Any], optional): model loader, if None, use the
default loader to load model weights, by default None.
"""
super().__init__(model_dir, *args, **kwargs)
self.model_dir = model_dir

from sofa.models.palm_v2 import PalmForConditionalGeneration, Translator
model = PalmForConditionalGeneration.from_pretrained(model_dir)
self.tokenizer = model.tokenizer
self.generator = Translator(model)

def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
"""return the result by the model

Args:
input (Dict[str, Tensor]): the preprocessed data

Returns:
Dict[str, Tensor]: results
Example:
{
'predictions': Tensor([[1377, 4959, 2785, 6392, ...]]), # token ids, to be decoded by the tokenizer
}
"""

return self.generator(**input)

modelscope/models/nlp/sentence_similarity_model.py → modelscope/models/nlp/sbert_for_sentence_similarity.py View File


+ 56
- 0
modelscope/models/nlp/sbert_for_token_classification.py View File

@@ -0,0 +1,56 @@
from typing import Any, Dict, Union

import numpy as np
import torch
from sofa import SbertConfig, SbertForTokenClassification

from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['StructBertForTokenClassification']


@MODELS.register_module(
Tasks.word_segmentation,
module_name=r'structbert-chinese-word-segmentation')
class StructBertForTokenClassification(Model):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the word segmentation model from the `model_dir` path.

Args:
model_dir (str): the model path.
model_cls (Optional[Any], optional): model loader, if None, use the
default loader to load model weights, by default None.
"""
super().__init__(model_dir, *args, **kwargs)
self.model_dir = model_dir
self.model = SbertForTokenClassification.from_pretrained(
self.model_dir)
self.config = SbertConfig.from_pretrained(self.model_dir)

def forward(self, input: Dict[str,
Any]) -> Dict[str, Union[str, np.ndarray]]:
"""return the result by the model

Args:
input (Dict[str, Any]): the preprocessed data

Returns:
Dict[str, Union[str,np.ndarray]]: results
Example:
{
'predictions': array([1, 4]), # predicted label id for each token
'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32), # raw model outputs
'text': '今天',
}
"""
input_ids = torch.tensor(input['input_ids']).unsqueeze(0)
output = self.model(input_ids)
logits = output.logits
pred = torch.argmax(logits[0], dim=-1)
pred = pred.numpy()

rst = {'predictions': pred, 'logits': logits, 'text': input['text']}
return rst

+ 0
- 52
modelscope/models/nlp/text_generation_model.py View File

@@ -1,52 +0,0 @@
from typing import Any, Dict

from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['PalmForTextGenerationModel']


@MODELS.register_module(Tasks.text_generation, module_name=r'palm')
class PalmForTextGenerationModel(Model):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the text generation model from the `model_dir` path.

Args:
model_dir (str): the model path.
model_cls (Optional[Any], optional): model loader, if None, use the
default loader to load model weights, by default None.
"""
from sofa import PalmTokenizer

super().__init__(model_dir, *args, **kwargs)
self.model_dir = model_dir

from sofa.models.palm import PalmForConditionalGeneration, TextGenerator
tokenizer = kwargs.pop('tokenizer',
PalmTokenizer.from_pretrained(model_dir))
model = PalmForConditionalGeneration.from_pretrained(model_dir)
self.generator = TextGenerator(model, tokenizer)

def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
"""return the result by the model

Args:
input (Dict[str, Any]): the preprocessed data

Returns:
Dict[str, np.ndarray]: results
Example:
{
'predictions': array([1]), # lable 0-negative 1-positive
'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32),
'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value
}
"""

encoder_inputs = [
input['input_ids'], input['token_type_ids'],
input['attention_mask']
]
return self.generator(encoder_inputs)

+ 1
- 1
modelscope/pipelines/__init__.py View File

@@ -1,4 +1,4 @@
from .audio import * # noqa F403
from .audio import LinearAECPipeline
from .base import Pipeline
from .builder import pipeline
from .cv import * # noqa F403


+ 2
- 0
modelscope/pipelines/audio/__init__.py View File

@@ -0,0 +1,2 @@
from .linear_aec_pipeline import LinearAECPipeline
from .text_to_speech_pipeline import * # noqa F403

+ 160
- 0
modelscope/pipelines/audio/linear_aec_pipeline.py View File

@@ -0,0 +1,160 @@
import importlib
import os
from typing import Any, Dict

import numpy as np
import scipy.io.wavfile as wav
import torch
import yaml

from modelscope.preprocessors.audio import LinearAECAndFbank
from modelscope.utils.constant import ModelFile, Tasks
from ..base import Pipeline
from ..builder import PIPELINES

FEATURE_MVN = 'feature.DEY.mvn.txt'

CONFIG_YAML = 'dey_mini.yaml'


def initialize_config(module_cfg):
r"""According to config items, load specific module dynamically with params.
1. Load the module corresponding to the "module" param.
2. Call function (or instantiate class) corresponding to the "main" param.
3. Send the param (in "args") into the function (or class) when calling ( or instantiating).

Args:
module_cfg (dict): config items, eg:
{
"module": "models.model",
"main": "Model",
"args": {...}
}

Returns:
the module loaded.
"""
module = importlib.import_module(module_cfg['module'])
return getattr(module, module_cfg['main'])(**module_cfg['args'])


@PIPELINES.register_module(
Tasks.speech_signal_process, module_name=r'speech_dfsmn_aec_psm_16k')
class LinearAECPipeline(Pipeline):
r"""AEC Inference Pipeline only support 16000 sample rate.

When invoke the class with pipeline.__call__(), you should provide two params:
Dict[str, Any]
the path of wav files,eg:{
"nearend_mic": "/your/data/near_end_mic_audio.wav",
"farend_speech": "/your/data/far_end_speech_audio.wav"}
output_path (str, optional): "/your/output/audio_after_aec.wav"
the file path to write generate audio.
"""

def __init__(self, model):
r"""
Args:
model: model id on modelscope hub.
"""
super().__init__(model=model)
self.use_cuda = torch.cuda.is_available()
with open(
os.path.join(self.model, CONFIG_YAML), encoding='utf-8') as f:
self.config = yaml.full_load(f.read())
self.config['io']['mvn'] = os.path.join(self.model, FEATURE_MVN)
self._init_model()
self.preprocessor = LinearAECAndFbank(self.config['io'])

n_fft = self.config['loss']['args']['n_fft']
hop_length = self.config['loss']['args']['hop_length']
winlen = n_fft
window = torch.hamming_window(winlen, periodic=False)

def stft(x):
return torch.stft(
x,
n_fft,
hop_length,
winlen,
center=False,
window=window.to(x.device),
return_complex=False)

def istft(x, slen):
return torch.istft(
x,
n_fft,
hop_length,
winlen,
window=window.to(x.device),
center=False,
length=slen)

self.stft = stft
self.istft = istft

def _init_model(self):
checkpoint = torch.load(
os.path.join(self.model, ModelFile.TORCH_MODEL_BIN_FILE),
map_location='cpu')
self.model = initialize_config(self.config['nnet'])
if self.use_cuda:
self.model = self.model.cuda()
self.model.load_state_dict(checkpoint)

def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
r"""The AEC process.

Args:
inputs: dict={'feature': Tensor, 'base': Tensor}
'feature': the fbank feature of the input audio.
'base': the base audio to which the mask is applied.

Returns:
dict:
{
'output_pcm': generated audio array
}
"""
output_data = self._process(inputs['feature'], inputs['base'])
return {'output_pcm': output_data}

def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
r"""The post process. Will save audio to file, if the output_path is given.

Args:
inputs: dict:
{
'output_pcm': generated audio array
}
kwargs: accepts 'output_path', the file path to write the generated audio

Returns:
dict:
{
'output_pcm': generated audio array
}
"""
if 'output_path' in kwargs.keys():
wav.write(kwargs['output_path'], self.preprocessor.SAMPLE_RATE,
inputs['output_pcm'].astype(np.int16))
inputs['output_pcm'] = inputs['output_pcm'] / 32768.0
return inputs

def _process(self, fbanks, mixture):
if self.use_cuda:
fbanks = fbanks.cuda()
mixture = mixture.cuda()
if self.model.vad:
with torch.no_grad():
masks, vad = self.model(fbanks.unsqueeze(0))
masks = masks.permute([2, 1, 0])
else:
with torch.no_grad():
masks = self.model(fbanks.unsqueeze(0))
masks = masks.permute([2, 1, 0])
spectrum = self.stft(mixture)
masked_spec = spectrum * masks
masked_sig = self.istft(masked_spec, len(mixture)).cpu().numpy()
return masked_sig
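
A hedged usage sketch following the class docstring; the model argument and wav paths are placeholders, and the base Pipeline is assumed to chain preprocess, forward and postprocess on __call__:

from modelscope.pipelines.audio import LinearAECPipeline

aec = LinearAECPipeline(model='/path/or/hub/id/of/aec/model')   # hypothetical
result = aec({'nearend_mic': '/your/data/near_end_mic_audio.wav',
              'farend_speech': '/your/data/far_end_speech_audio.wav'},
             output_path='/your/output/audio_after_aec.wav')
print(result['output_pcm'].shape)   # float waveform scaled to [-1, 1]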

+ 46
- 0
modelscope/pipelines/audio/text_to_speech_pipeline.py View File

@@ -0,0 +1,46 @@
import time
from typing import Any, Dict, List

import numpy as np

from modelscope.models import Model
from modelscope.models.audio.tts.am import SambertNetHifi16k
from modelscope.models.audio.tts.vocoder import Hifigan16k
from modelscope.pipelines.base import Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import TextToTacotronSymbols, build_preprocessor
from modelscope.utils.constant import Fields, Tasks

__all__ = ['TextToSpeechSambertHifigan16kPipeline']


@PIPELINES.register_module(
Tasks.text_to_speech, module_name=r'tts-sambert-hifigan-16k')
class TextToSpeechSambertHifigan16kPipeline(Pipeline):

def __init__(self,
config_file: str = None,
model: List[Model] = None,
preprocessor: TextToTacotronSymbols = None,
**kwargs):
super().__init__(
config_file=config_file,
model=model,
preprocessor=preprocessor,
**kwargs)
assert len(model) == 2, 'two models are required: [am, vocoder]'
self._am = model[0]
self._vocoder = model[1]
self._preprocessor = preprocessor

def forward(self, inputs: Dict[str, Any]) -> Dict[str, np.ndarray]:
texts = inputs['texts']
audio_total = np.empty((0), dtype='int16')
for line in texts:
line = line.strip().split('\t')
audio = self._vocoder.forward(self._am.forward(line[1]))
audio_total = np.append(audio_total, audio, axis=0)
return {'output': audio_total}

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs
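
A hedged sketch of the forward contract; the pipeline instance and the tab-separated symbol lines are assumptions, only the split-on-tab and concatenation behaviour comes from the code above:

import numpy as np

# Assuming `tts` is a TextToSpeechSambertHifigan16kPipeline built with
# model=[SambertNetHifi16k(...), Hifigan16k(...)] and a TextToTacotronSymbols
# preprocessor. Each line is '<sentence-id>\t<symbol sequence>' (hypothetical).
lines = {'texts': ['0\t{sym1 sym2 ...}', '1\t{sym3 sym4 ...}']}
audio = tts.forward(lines)['output']
print(audio.dtype, audio.shape)     # int16, 1-D waveform at 16 kHz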

+ 9
- 4
modelscope/pipelines/builder.py View File

@@ -13,18 +13,23 @@ PIPELINES = Registry('pipelines')

DEFAULT_MODEL_FOR_PIPELINE = {
# TaskName: (pipeline_module_name, model_repo)
Tasks.word_segmentation:
('structbert-chinese-word-segmentation',
'damo/nlp_structbert_word-segmentation_chinese-base'),
Tasks.sentence_similarity:
('sbert-base-chinese-sentence-similarity',
'damo/nlp_structbert_sentence-similarity_chinese-base'),
Tasks.image_matting: ('image-matting', 'damo/cv_unet_image-matting_damo'),
Tasks.image_matting: ('image-matting', 'damo/cv_unet_image-matting'),
Tasks.text_classification:
('bert-sentiment-analysis', 'damo/bert-base-sst2'),
Tasks.text_generation: ('palm', 'damo/nlp_palm_text-generation_chinese'),
Tasks.image_captioning: ('ofa', None),
Tasks.text_generation: ('palm2.0',
'damo/nlp_palm2.0_text-generation_chinese-base'),
Tasks.image_captioning: ('ofa', 'damo/ofa_image-caption_coco_large_en'),
Tasks.image_generation:
('person-image-cartoon',
'damo/cv_unet_person-image-cartoon_compound-models'),
Tasks.fill_mask: ('sbert', 'damo/nlp_structbert_fill-mask_chinese-large'),
Tasks.ocr_detection: ('ocr-detection',
'damo/cv_resnet18_ocr-detection-line-level_damo'),
Tasks.fill_mask: ('veco', 'damo/nlp_veco_fill-mask_large')
}
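
A sketch of how this default table is meant to be consumed, assuming the builder's pipeline() falls back to it when no model is given:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# With no explicit model, the builder is expected to pick the registered
# (pipeline_module_name, model_repo) pair for the task from the table above.
word_seg = pipeline(Tasks.word_segmentation)
print(word_seg('今天天气不错'))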



+ 1
- 0
modelscope/pipelines/cv/__init__.py View File

@@ -1,2 +1,3 @@
from .image_cartoon_pipeline import ImageCartoonPipeline
from .image_matting_pipeline import ImageMattingPipeline
from .ocr_detection_pipeline import OCRDetectionPipeline

+ 167
- 0
modelscope/pipelines/cv/ocr_detection_pipeline.py View File

@@ -0,0 +1,167 @@
import math
import os
import os.path as osp
import sys
from typing import Any, Dict, List, Tuple, Union

import cv2
import numpy as np
import PIL
import tensorflow as tf
import tf_slim as slim

from modelscope.pipelines.base import Input
from modelscope.preprocessors import load_image
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger
from ..base import Pipeline
from ..builder import PIPELINES
from .ocr_utils import model_resnet_mutex_v4_linewithchar, ops, utils

if tf.__version__ >= '2.0':
tf = tf.compat.v1
tf.compat.v1.disable_eager_execution()

logger = get_logger()

# constant
RBOX_DIM = 5
OFFSET_DIM = 6
WORD_POLYGON_DIM = 8
OFFSET_VARIANCE = [0.1, 0.1, 0.1, 0.1, 0.1, 0.1]

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_float('node_threshold', 0.4,
'Confidence threshold for nodes')
tf.app.flags.DEFINE_float('link_threshold', 0.6,
'Confidence threshold for links')


@PIPELINES.register_module(
Tasks.ocr_detection, module_name=Tasks.ocr_detection)
class OCRDetectionPipeline(Pipeline):

def __init__(self, model: str):
super().__init__(model=model)
model_path = osp.join(
osp.join(self.model, ModelFile.TF_CHECKPOINT_FOLDER),
'checkpoint-80000')

config = tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True
self._session = tf.Session(config=config)
global_step = tf.get_variable(
'global_step', [],
initializer=tf.constant_initializer(0),
dtype=tf.int64,
trainable=False)
variable_averages = tf.train.ExponentialMovingAverage(
0.997, global_step)
self.input_images = tf.placeholder(
tf.float32, shape=[1, 1024, 1024, 3], name='input_images')
self.output = {}

# detector
detector = model_resnet_mutex_v4_linewithchar.SegLinkDetector()
all_maps = detector.build_model(self.input_images, is_training=False)

# decode local predictions
all_nodes, all_links, all_reg = [], [], []
for i, maps in enumerate(all_maps):
cls_maps, lnk_maps, reg_maps = maps[0], maps[1], maps[2]
reg_maps = tf.multiply(reg_maps, OFFSET_VARIANCE)

cls_prob = tf.nn.softmax(tf.reshape(cls_maps, [-1, 2]))

lnk_prob_pos = tf.nn.softmax(tf.reshape(lnk_maps, [-1, 4])[:, :2])
lnk_prob_mut = tf.nn.softmax(tf.reshape(lnk_maps, [-1, 4])[:, 2:])
lnk_prob = tf.concat([lnk_prob_pos, lnk_prob_mut], axis=1)

all_nodes.append(cls_prob)
all_links.append(lnk_prob)
all_reg.append(reg_maps)

# decode segments and links
image_size = tf.shape(self.input_images)[1:3]
segments, group_indices, segment_counts, _ = ops.decode_segments_links_python(
image_size,
all_nodes,
all_links,
all_reg,
anchor_sizes=list(detector.anchor_sizes))

# combine segments
combined_rboxes, combined_counts = ops.combine_segments_python(
segments, group_indices, segment_counts)
self.output['combined_rboxes'] = combined_rboxes
self.output['combined_counts'] = combined_counts

with self._session.as_default() as sess:
logger.info(f'loading model from {model_path}')
# load model
model_loader = tf.train.Saver(
variable_averages.variables_to_restore())
model_loader.restore(sess, model_path)

def preprocess(self, input: Input) -> Dict[str, Any]:
if isinstance(input, str):
img = np.array(load_image(input))
elif isinstance(input, PIL.Image.Image):
img = np.array(input.convert('RGB'))
elif isinstance(input, np.ndarray):
if len(input.shape) == 2:
input = cv2.cvtColor(input, cv2.COLOR_GRAY2BGR)
img = input[:, :, ::-1] # in rgb order
else:
raise TypeError(f'input should be either str, PIL.Image,'
f' np.array, but got {type(input)}')
h, w, c = img.shape
img_pad = np.zeros((max(h, w), max(h, w), 3), dtype=np.float32)
img_pad[:h, :w, :] = img

resize_size = 1024
img_pad_resize = cv2.resize(img_pad, (resize_size, resize_size))
img_pad_resize = cv2.cvtColor(img_pad_resize, cv2.COLOR_RGB2BGR)
img_pad_resize = img_pad_resize - np.array([123.68, 116.78, 103.94],
dtype=np.float32)

resize_size = tf.stack([resize_size, resize_size])
orig_size = tf.stack([max(h, w), max(h, w)])
self.output['orig_size'] = orig_size
self.output['resize_size'] = resize_size

result = {'img': np.expand_dims(img_pad_resize, axis=0)}
return result

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
with self._session.as_default():
feed_dict = {self.input_images: input['img']}
sess_outputs = self._session.run(self.output, feed_dict=feed_dict)
return sess_outputs

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
rboxes = inputs['combined_rboxes'][0]
count = inputs['combined_counts'][0]
rboxes = rboxes[:count, :]

# convert rboxes to polygons and find its coordinates on the original image
orig_h, orig_w = inputs['orig_size']
resize_h, resize_w = inputs['resize_size']
polygons = utils.rboxes_to_polygons(rboxes)
scale_y = float(orig_h) / float(resize_h)
scale_x = float(orig_w) / float(resize_w)

# confine polygons inside image
polygons[:, ::2] = np.maximum(
0, np.minimum(polygons[:, ::2] * scale_x, orig_w - 1))
polygons[:, 1::2] = np.maximum(
0, np.minimum(polygons[:, 1::2] * scale_y, orig_h - 1))
polygons = np.round(polygons).astype(np.int32)

# nms
dt_n9 = [o + [utils.cal_width(o)] for o in polygons.tolist()]
dt_nms = utils.nms_python(dt_n9)
dt_polygons = np.array([o[:8] for o in dt_nms])

result = {'det_polygons': dt_polygons}
return result
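
A hedged usage sketch; the model directory and image path are hypothetical, and __call__ is assumed to chain preprocess, forward and postprocess:

from modelscope.pipelines.cv import OCRDetectionPipeline

ocr = OCRDetectionPipeline(model='/path/to/ocr_detection_model')  # hypothetical
result = ocr('/path/to/some_document_image.jpg')   # str, PIL.Image or np.ndarray
print(result['det_polygons'])   # (N, 8) polygon corners on the original image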

+ 0
- 0
modelscope/pipelines/cv/ocr_utils/__init__.py View File


+ 158
- 0
modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py View File

@@ -0,0 +1,158 @@
import tensorflow as tf
import tf_slim as slim

from . import ops, resnet18_v1, resnet_utils

if tf.__version__ >= '2.0':
tf = tf.compat.v1

# constants
OFFSET_DIM = 6

N_LOCAL_LINKS = 8
N_CROSS_LINKS = 4
N_SEG_CLASSES = 2
N_LNK_CLASSES = 4

POS_LABEL = 1
NEG_LABEL = 0


class SegLinkDetector():

def __init__(self):
self.anchor_sizes = [6., 11.84210526, 23.68421053, 45., 90., 150.]

def _detection_classifier(self,
maps,
ksize,
weight_decay,
cross_links=False,
scope=None):

with tf.variable_scope(scope):
seg_depth = N_SEG_CLASSES
if cross_links:
lnk_depth = N_LNK_CLASSES * (N_LOCAL_LINKS + N_CROSS_LINKS)
else:
lnk_depth = N_LNK_CLASSES * N_LOCAL_LINKS
reg_depth = OFFSET_DIM
map_depth = maps.get_shape()[3]
inter_maps, inter_relu = ops.conv2d(
maps, map_depth, 256, 1, 1, 'SAME', scope='conv_inter')

dir_maps, dir_relu = ops.conv2d(
inter_relu, 256, 2, ksize, 1, 'SAME', scope='conv_dir')
cen_maps, cen_relu = ops.conv2d(
inter_relu, 256, 2, ksize, 1, 'SAME', scope='conv_cen')
pol_maps, pol_relu = ops.conv2d(
inter_relu, 256, 8, ksize, 1, 'SAME', scope='conv_pol')
concat_relu = tf.concat([dir_relu, cen_relu, pol_relu], axis=-1)
_, lnk_embedding = ops.conv_relu(
concat_relu, 12, 256, 1, 1, scope='lnk_embedding')
lnk_maps, lnk_relu = ops.conv2d(
inter_relu + lnk_embedding,
256,
lnk_depth,
ksize,
1,
'SAME',
scope='conv_lnk')

char_seg_maps, char_seg_relu = ops.conv2d(
inter_relu,
256,
seg_depth,
ksize,
1,
'SAME',
scope='conv_char_cls')
char_reg_maps, char_reg_relu = ops.conv2d(
inter_relu,
256,
reg_depth,
ksize,
1,
'SAME',
scope='conv_char_reg')
concat_char_relu = tf.concat([char_seg_relu, char_reg_relu],
axis=-1)
_, char_embedding = ops.conv_relu(
concat_char_relu, 8, 256, 1, 1, scope='conv_char_embedding')
seg_maps, seg_relu = ops.conv2d(
inter_relu + char_embedding,
256,
seg_depth,
ksize,
1,
'SAME',
scope='conv_cls')
reg_maps, reg_relu = ops.conv2d(
inter_relu + char_embedding,
256,
reg_depth,
ksize,
1,
'SAME',
scope='conv_reg')

return seg_relu, lnk_relu, reg_relu

def _build_cnn(self, images, weight_decay, is_training):
with slim.arg_scope(
resnet18_v1.resnet_arg_scope(weight_decay=weight_decay)):
logits, end_points = resnet18_v1.resnet_v1_18(
images, is_training=is_training, scope='resnet_v1_18')

outputs = {
'conv3_3': end_points['pool1'],
'conv4_3': end_points['pool2'],
'fc7': end_points['pool3'],
'conv8_2': end_points['pool4'],
'conv9_2': end_points['pool5'],
'conv10_2': end_points['pool6'],
}
return outputs

def build_model(self, images, is_training=True, scope=None):

weight_decay = 5e-4 # FLAGS.weight_decay
cnn_outputs = self._build_cnn(images, weight_decay, is_training)
det_0 = self._detection_classifier(
cnn_outputs['conv3_3'],
3,
weight_decay,
cross_links=False,
scope='dete_0')
det_1 = self._detection_classifier(
cnn_outputs['conv4_3'],
3,
weight_decay,
cross_links=True,
scope='dete_1')
det_2 = self._detection_classifier(
cnn_outputs['fc7'],
3,
weight_decay,
cross_links=True,
scope='dete_2')
det_3 = self._detection_classifier(
cnn_outputs['conv8_2'],
3,
weight_decay,
cross_links=True,
scope='dete_3')
det_4 = self._detection_classifier(
cnn_outputs['conv9_2'],
3,
weight_decay,
cross_links=True,
scope='dete_4')
det_5 = self._detection_classifier(
cnn_outputs['conv10_2'],
3,
weight_decay,
cross_links=True,
scope='dete_5')
outputs = [det_0, det_1, det_2, det_3, det_4, det_5]
return outputs

+ 1098
- 0
modelscope/pipelines/cv/ocr_utils/ops.py
File diff suppressed because it is too large
View File


+ 432
- 0
modelscope/pipelines/cv/ocr_utils/resnet18_v1.py View File

@@ -0,0 +1,432 @@
"""Contains definitions for the original form of Residual Networks.
The 'v1' residual networks (ResNets) implemented in this module were proposed
by:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
Deep Residual Learning for Image Recognition. arXiv:1512.03385
Other variants were introduced in:
[2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
Identity Mappings in Deep Residual Networks. arXiv: 1603.05027
The networks defined in this module utilize the bottleneck building block of
[1] with projection shortcuts only for increasing depths. They employ batch
normalization *after* every weight layer. This is the architecture used by
MSRA in the Imagenet and MSCOCO 2016 competition models ResNet-101 and
ResNet-152. See [2; Fig. 1a] for a comparison between the current 'v1'
architecture and the alternative 'v2' architecture of [2] which uses batch
normalization *before* every weight layer in the so-called full pre-activation
units.
Typical use:
from tensorflow.contrib.slim.nets import resnet_v1
ResNet-101 for image classification into 1000 classes:
# inputs has shape [batch, 224, 224, 3]
with slim.arg_scope(resnet_v1.resnet_arg_scope()):
net, end_points = resnet_v1.resnet_v1_101(inputs, 1000, is_training=False)
ResNet-101 for semantic segmentation into 21 classes:
# inputs has shape [batch, 513, 513, 3]
with slim.arg_scope(resnet_v1.resnet_arg_scope()):
net, end_points = resnet_v1.resnet_v1_101(inputs,
21,
is_training=False,
global_pool=False,
output_stride=16)
"""
import tensorflow as tf
import tf_slim as slim

from . import resnet_utils

if tf.__version__ >= '2.0':
tf = tf.compat.v1

resnet_arg_scope = resnet_utils.resnet_arg_scope


@slim.add_arg_scope
def basicblock(inputs,
depth,
depth_bottleneck,
stride,
rate=1,
outputs_collections=None,
scope=None):
"""Bottleneck residual unit variant with BN after convolutions.
This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for
its definition. Note that we use here the bottleneck variant which has an
extra bottleneck layer.
When putting together two consecutive ResNet blocks that use this unit, one
should use stride = 2 in the last unit of the first block.
Args:
inputs: A tensor of size [batch, height, width, channels].
depth: The depth of the ResNet unit output.
depth_bottleneck: The depth of the bottleneck layers.
stride: The ResNet unit's stride. Determines the amount of downsampling of
the units output compared to its input.
rate: An integer, rate for atrous convolution.
outputs_collections: Collection to add the ResNet unit output.
scope: Optional variable_scope.
Returns:
The ResNet unit's output.
"""
with tf.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc:
depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4)
if depth == depth_in:
shortcut = resnet_utils.subsample(inputs, stride, 'shortcut')
else:
shortcut = slim.conv2d(
inputs,
depth, [1, 1],
stride=stride,
activation_fn=None,
scope='shortcut')

residual = resnet_utils.conv2d_same(
inputs, depth, 3, stride, rate=rate, scope='conv1')
residual = resnet_utils.conv2d_same(
residual, depth, 3, 1, rate=rate, scope='conv2')

output = tf.nn.relu(residual + shortcut)

return slim.utils.collect_named_outputs(outputs_collections,
sc.original_name_scope, output)


@slim.add_arg_scope
def bottleneck(inputs,
depth,
depth_bottleneck,
stride,
rate=1,
outputs_collections=None,
scope=None):
"""Bottleneck residual unit variant with BN after convolutions.
This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for
its definition. Note that we use here the bottleneck variant which has an
extra bottleneck layer.
When putting together two consecutive ResNet blocks that use this unit, one
should use stride = 2 in the last unit of the first block.
Args:
inputs: A tensor of size [batch, height, width, channels].
depth: The depth of the ResNet unit output.
depth_bottleneck: The depth of the bottleneck layers.
stride: The ResNet unit's stride. Determines the amount of downsampling of
the units output compared to its input.
rate: An integer, rate for atrous convolution.
outputs_collections: Collection to add the ResNet unit output.
scope: Optional variable_scope.
Returns:
The ResNet unit's output.
"""
with tf.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc:
depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4)
if depth == depth_in:
shortcut = resnet_utils.subsample(inputs, stride, 'shortcut')
else:
shortcut = slim.conv2d(
inputs,
depth, [1, 1],
stride=stride,
activation_fn=None,
scope='shortcut')

residual = slim.conv2d(
inputs, depth_bottleneck, [1, 1], stride=1, scope='conv1')
residual = resnet_utils.conv2d_same(
residual, depth_bottleneck, 3, stride, rate=rate, scope='conv2')
residual = slim.conv2d(
residual,
depth, [1, 1],
stride=1,
activation_fn=None,
scope='conv3')

output = tf.nn.relu(shortcut + residual)

return slim.utils.collect_named_outputs(outputs_collections,
sc.original_name_scope, output)


def resnet_v1(inputs,
blocks,
num_classes=None,
is_training=True,
global_pool=True,
output_stride=None,
include_root_block=True,
spatial_squeeze=True,
reuse=None,
scope=None):
"""Generator for v1 ResNet models.
This function generates a family of ResNet v1 models. See the resnet_v1_*()
methods for specific model instantiations, obtained by selecting different
block instantiations that produce ResNets of various depths.
Training for image classification on Imagenet is usually done with [224, 224]
inputs, resulting in [7, 7] feature maps at the output of the last ResNet
block for the ResNets defined in [1] that have nominal stride equal to 32.
However, for dense prediction tasks we advise that one uses inputs with
spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In
this case the feature maps at the ResNet output will have spatial shape
[(height - 1) / output_stride + 1, (width - 1) / output_stride + 1]
and corners exactly aligned with the input image corners, which greatly
facilitates alignment of the features to the image. Using as input [225, 225]
images results in [8, 8] feature maps at the output of the last ResNet block.
For dense prediction tasks, the ResNet needs to run in fully-convolutional
(FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all
have nominal stride equal to 32 and a good choice in FCN mode is to use
output_stride=16 in order to increase the density of the computed features at
small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915.
Args:
inputs: A tensor of size [batch, height_in, width_in, channels].
blocks: A list of length equal to the number of ResNet blocks. Each element
is a resnet_utils.Block object describing the units in the block.
num_classes: Number of predicted classes for classification tasks. If None
we return the features before the logit layer.
is_training: whether the model is in training mode.
global_pool: If True, we perform global average pooling before computing the
logits. Set to True for image classification, False for dense prediction.
output_stride: If None, then the output will be computed at the nominal
network stride. If output_stride is not None, it specifies the requested
ratio of input to output spatial resolution.
include_root_block: If True, include the initial convolution followed by
max-pooling; if False, exclude it.
spatial_squeeze: If True, logits has shape [B, C]; if False, logits has shape
[B, 1, 1, C], where B is batch_size and C is the number of classes.
reuse: whether or not the network and its variables should be reused. To be
able to reuse 'scope' must be given.
scope: Optional variable_scope.
Returns:
net: A rank-4 tensor of size [batch, height_out, width_out, channels_out].
If global_pool is False, then height_out and width_out are reduced by a
factor of output_stride compared to the respective height_in and width_in,
else both height_out and width_out equal one. If num_classes is None, then
net is the output of the last ResNet block, potentially after global
average pooling. If num_classes is not None, net contains the pre-softmax
activations.
end_points: A dictionary from components of the network to the corresponding
activation.
Raises:
ValueError: If the target output_stride is not valid.
"""
with tf.variable_scope(scope, 'resnet_v1', [inputs], reuse=reuse) as sc:
end_points_collection = sc.name + '_end_points'
with slim.arg_scope(
[slim.conv2d, bottleneck, resnet_utils.stack_blocks_dense],
outputs_collections=end_points_collection):
with slim.arg_scope([slim.batch_norm], is_training=is_training):
net = inputs
if include_root_block:
if output_stride is not None:
if output_stride % 4 != 0:
raise ValueError(
'The output_stride needs to be a multiple of 4.'
)
output_stride /= 4
net = resnet_utils.conv2d_same(
net, 64, 7, stride=2, scope='conv1')
net = tf.pad(net, [[0, 0], [1, 1], [1, 1], [0, 0]])
net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1')

net = slim.utils.collect_named_outputs(
end_points_collection, 'pool2', net)

net = resnet_utils.stack_blocks_dense(net, blocks,
output_stride)

end_points = slim.utils.convert_collection_to_dict(
end_points_collection)

end_points['pool1'] = end_points['resnet_v1_18/block2/unit_2']
end_points['pool2'] = end_points['resnet_v1_18/block3/unit_2']
end_points['pool3'] = end_points['resnet_v1_18/block4/unit_2']
end_points['pool4'] = end_points['resnet_v1_18/block5/unit_2']
end_points['pool5'] = end_points['resnet_v1_18/block6/unit_2']
end_points['pool6'] = net

return net, end_points


resnet_v1.default_image_size = 224


def resnet_v1_18(inputs,
num_classes=None,
is_training=True,
global_pool=True,
output_stride=None,
spatial_squeeze=True,
reuse=None,
scope='resnet_v1_18'):
"""ResNet-18 model of [1]. See resnet_v1() for arg and return description."""
blocks = [
resnet_utils.Block('block1', basicblock,
[(64, 64, 1)] + [(64, 64, 1)]),
resnet_utils.Block('block2', basicblock,
[(128, 128, 1)] + [(128, 128, 1)]),
resnet_utils.Block('block3', basicblock,
[(256, 256, 2)] + [(256, 256, 1)]),
resnet_utils.Block('block4', basicblock,
[(512, 512, 2)] + [(512, 512, 1)]),
resnet_utils.Block('block5', basicblock,
[(256, 256, 2)] + [(256, 256, 1)]),
resnet_utils.Block('block6', basicblock,
[(256, 256, 2)] + [(256, 256, 1)]),
resnet_utils.Block('block7', basicblock,
[(256, 256, 2)] + [(256, 256, 1)]),
]
return resnet_v1(
inputs,
blocks,
num_classes,
is_training,
global_pool=global_pool,
output_stride=output_stride,
include_root_block=True,
spatial_squeeze=spatial_squeeze,
reuse=reuse,
scope=scope)


resnet_v1_18.default_image_size = resnet_v1.default_image_size


def resnet_v1_50(inputs,
num_classes=None,
is_training=True,
global_pool=True,
output_stride=None,
spatial_squeeze=True,
reuse=None,
scope='resnet_v1_50'):
"""ResNet-50 model of [1]. See resnet_v1() for arg and return description."""
blocks = [
resnet_utils.Block('block1', bottleneck,
[(256, 64, 1)] * 2 + [(256, 64, 2)]),
resnet_utils.Block('block2', bottleneck,
[(512, 128, 1)] * 3 + [(512, 128, 2)]),
resnet_utils.Block('block3', bottleneck,
[(1024, 256, 1)] * 5 + [(1024, 256, 2)]),
resnet_utils.Block('block4', bottleneck,
[(2048, 512, 1)] * 3 + [(2048, 512, 2)]),
resnet_utils.Block('block5', bottleneck,
[(1024, 256, 1)] * 2 + [(1024, 256, 2)]),
resnet_utils.Block('block6', bottleneck, [(1024, 256, 1)] * 2),
]
return resnet_v1(
inputs,
blocks,
num_classes,
is_training,
global_pool=global_pool,
output_stride=output_stride,
include_root_block=True,
spatial_squeeze=spatial_squeeze,
reuse=reuse,
scope=scope)


resnet_v1_50.default_image_size = resnet_v1.default_image_size


def resnet_v1_101(inputs,
num_classes=None,
is_training=True,
global_pool=True,
output_stride=None,
spatial_squeeze=True,
reuse=None,
scope='resnet_v1_101'):
"""ResNet-101 model of [1]. See resnet_v1() for arg and return description."""
blocks = [
resnet_utils.Block('block1', bottleneck,
[(256, 64, 1)] * 2 + [(256, 64, 2)]),
resnet_utils.Block('block2', bottleneck,
[(512, 128, 1)] * 3 + [(512, 128, 2)]),
resnet_utils.Block('block3', bottleneck,
[(1024, 256, 1)] * 22 + [(1024, 256, 2)]),
resnet_utils.Block('block4', bottleneck, [(2048, 512, 1)] * 3)
]
return resnet_v1(
inputs,
blocks,
num_classes,
is_training,
global_pool=global_pool,
output_stride=output_stride,
include_root_block=True,
spatial_squeeze=spatial_squeeze,
reuse=reuse,
scope=scope)


resnet_v1_101.default_image_size = resnet_v1.default_image_size


def resnet_v1_152(inputs,
num_classes=None,
is_training=True,
global_pool=True,
output_stride=None,
spatial_squeeze=True,
reuse=None,
scope='resnet_v1_152'):
"""ResNet-152 model of [1]. See resnet_v1() for arg and return description."""
blocks = [
resnet_utils.Block('block1', bottleneck,
[(256, 64, 1)] * 2 + [(256, 64, 2)]),
resnet_utils.Block('block2', bottleneck,
[(512, 128, 1)] * 7 + [(512, 128, 2)]),
resnet_utils.Block('block3', bottleneck,
[(1024, 256, 1)] * 35 + [(1024, 256, 2)]),
resnet_utils.Block('block4', bottleneck, [(2048, 512, 1)] * 3)
]
return resnet_v1(
inputs,
blocks,
num_classes,
is_training,
global_pool=global_pool,
output_stride=output_stride,
include_root_block=True,
spatial_squeeze=spatial_squeeze,
reuse=reuse,
scope=scope)


resnet_v1_152.default_image_size = resnet_v1.default_image_size


def resnet_v1_200(inputs,
num_classes=None,
is_training=True,
global_pool=True,
output_stride=None,
spatial_squeeze=True,
reuse=None,
scope='resnet_v1_200'):
"""ResNet-200 model of [2]. See resnet_v1() for arg and return description."""
blocks = [
resnet_utils.Block('block1', bottleneck,
[(256, 64, 1)] * 2 + [(256, 64, 2)]),
resnet_utils.Block('block2', bottleneck,
[(512, 128, 1)] * 23 + [(512, 128, 2)]),
resnet_utils.Block('block3', bottleneck,
[(1024, 256, 1)] * 35 + [(1024, 256, 2)]),
resnet_utils.Block('block4', bottleneck, [(2048, 512, 1)] * 3)
]
return resnet_v1(
inputs,
blocks,
num_classes,
is_training,
global_pool=global_pool,
output_stride=output_stride,
include_root_block=True,
spatial_squeeze=spatial_squeeze,
reuse=reuse,
scope=scope)


resnet_v1_200.default_image_size = resnet_v1.default_image_size

if __name__ == '__main__':
    # resnet_v1() wires its end_points for the resnet_v1_18 scope, so the demo
    # builds the 18-layer variant.
    images = tf.placeholder(
        tf.float32, shape=(None, 224, 224, 3), name='input')
    with slim.arg_scope(resnet_arg_scope()):
        net, end_points = resnet_v1_18(images)
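    # A minimal sketch of fully-convolutional use for dense prediction, as
    # described in the resnet_v1() docstring; the 513x513 input size and
    # output_stride=16 are illustrative choices, not values from this repo.
    fcn_images = tf.placeholder(
        tf.float32, shape=(None, 513, 513, 3), name='fcn_input')
    with slim.arg_scope(resnet_arg_scope()):
        fcn_net, fcn_end_points = resnet_v1_18(
            fcn_images,
            global_pool=False,
            output_stride=16,
            reuse=tf.AUTO_REUSE)
    # fcn_net keeps a spatial grid at 1/16 resolution instead of a pooled 1x1 map.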

+231 -0 modelscope/pipelines/cv/ocr_utils/resnet_utils.py

@@ -0,0 +1,231 @@
"""Contains building blocks for various versions of Residual Networks.
Residual networks (ResNets) were proposed in:
Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
Deep Residual Learning for Image Recognition. arXiv:1512.03385, 2015
More variants were introduced in:
Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
Identity Mappings in Deep Residual Networks. arXiv: 1603.05027, 2016
We can obtain different ResNet variants by changing the network depth, width,
and form of residual unit. This module implements the infrastructure for
building them. Concrete ResNet units and full ResNet networks are implemented in
the accompanying resnet_v1.py and resnet_v2.py modules.
Compared to https://github.com/KaimingHe/deep-residual-networks, in the current
implementation we subsample the output activations in the last residual unit of
each block, instead of subsampling the input activations in the first residual
unit of each block. The two implementations give identical results but our
implementation is more memory efficient.
"""

import collections

import tensorflow as tf
import tf_slim as slim

if tf.__version__ >= '2.0':
tf = tf.compat.v1


class Block(collections.namedtuple('Block', ['scope', 'unit_fn', 'args'])):
"""A named tuple describing a ResNet block.
Its parts are:
scope: The scope of the `Block`.
unit_fn: The ResNet unit function which takes as input a `Tensor` and
returns another `Tensor` with the output of the ResNet unit.
args: A list of length equal to the number of units in the `Block`. The list
contains one (depth, depth_bottleneck, stride) tuple for each unit in the
block to serve as argument to unit_fn.
"""


def subsample(inputs, factor, scope=None):
"""Subsamples the input along the spatial dimensions.
Args:
inputs: A `Tensor` of size [batch, height_in, width_in, channels].
factor: The subsampling factor.
scope: Optional variable_scope.
Returns:
output: A `Tensor` of size [batch, height_out, width_out, channels] with the
input, either intact (if factor == 1) or subsampled (if factor > 1).
"""
if factor == 1:
return inputs
else:
return slim.max_pool2d(inputs, [1, 1], stride=factor, scope=scope)


def conv2d_same(inputs, num_outputs, kernel_size, stride, rate=1, scope=None):
"""Strided 2-D convolution with 'SAME' padding.
When stride > 1, then we do explicit zero-padding, followed by conv2d with
'VALID' padding.
Note that
net = conv2d_same(inputs, num_outputs, 3, stride=stride)
is equivalent to
net = slim.conv2d(inputs, num_outputs, 3, stride=1, padding='SAME')
net = subsample(net, factor=stride)
whereas
net = slim.conv2d(inputs, num_outputs, 3, stride=stride, padding='SAME')
is different when the input's height or width is even, which is why we add the
current function. For more details, see ResnetUtilsTest.testConv2DSameEven().
Args:
inputs: A 4-D tensor of size [batch, height_in, width_in, channels].
num_outputs: An integer, the number of output filters.
kernel_size: An int with the kernel_size of the filters.
stride: An integer, the output stride.
rate: An integer, rate for atrous convolution.
scope: Scope.
Returns:
output: A 4-D tensor of size [batch, height_out, width_out, channels] with
the convolution output.
"""
if stride == 1:
return slim.conv2d(
inputs,
num_outputs,
kernel_size,
stride=1,
rate=rate,
padding='SAME',
scope=scope)
else:
kernel_size_effective = kernel_size + (kernel_size - 1) * (rate - 1)
pad_total = kernel_size_effective - 1
pad_beg = pad_total // 2
pad_end = pad_total - pad_beg
inputs = tf.pad(
inputs, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]])
return slim.conv2d(
inputs,
num_outputs,
kernel_size,
stride=stride,
rate=rate,
padding='VALID',
scope=scope)
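# Worked example of the explicit padding above (illustrative numbers):
# kernel_size=3, rate=2 -> kernel_size_effective = 3 + 2 * 1 = 5,
# pad_total = 4, pad_beg = 2, pad_end = 2, i.e. two rows/columns of zeros
# on each side before the strided 'VALID' convolution.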


@slim.add_arg_scope
def stack_blocks_dense(net,
blocks,
output_stride=None,
outputs_collections=None):
"""Stacks ResNet `Blocks` and controls output feature density.
First, this function creates scopes for the ResNet in the form of
'block_name/unit_1', 'block_name/unit_2', etc.
Second, this function allows the user to explicitly control the ResNet
output_stride, which is the ratio of the input to output spatial resolution.
This is useful for dense prediction tasks such as semantic segmentation or
object detection.
Most ResNets consist of 4 ResNet blocks and subsample the activations by a
factor of 2 when transitioning between consecutive ResNet blocks. This results
in a nominal ResNet output_stride equal to 8. If we set the output_stride to
half the nominal network stride (e.g., output_stride=4), then we compute
responses twice.
Control of the output feature density is implemented by atrous convolution.
Args:
net: A `Tensor` of size [batch, height, width, channels].
blocks: A list of length equal to the number of ResNet `Blocks`. Each
element is a ResNet `Block` object describing the units in the `Block`.
output_stride: If `None`, then the output will be computed at the nominal
network stride. If output_stride is not `None`, it specifies the requested
ratio of input to output spatial resolution, which needs to be equal to
the product of unit strides from the start up to some level of the ResNet.
For example, if the ResNet employs units with strides 1, 2, 1, 3, 4, 1,
then valid values for the output_stride are 1, 2, 6, 24 or None (which
is equivalent to output_stride=24).
outputs_collections: Collection to add the ResNet block outputs.
Returns:
net: Output tensor with stride equal to the specified output_stride.
Raises:
ValueError: If the target output_stride is not valid.
"""
# The current_stride variable keeps track of the effective stride of the
# activations. This allows us to invoke atrous convolution whenever applying
# the next residual unit would result in the activations having stride larger
# than the target output_stride.
current_stride = 1

# The atrous convolution rate parameter.
rate = 1

for block in blocks:
with tf.variable_scope(block.scope, 'block', [net]):
for i, unit in enumerate(block.args):
if output_stride is not None and current_stride > output_stride:
raise ValueError(
'The target output_stride cannot be reached.')

with tf.variable_scope(
'unit_%d' % (i + 1), values=[net]) as sc:
unit_depth, unit_depth_bottleneck, unit_stride = unit
# If we have reached the target output_stride, then we need to employ
# atrous convolution with stride=1 and multiply the atrous rate by the
# current unit's stride for use in subsequent layers.
if output_stride is not None and current_stride == output_stride:
net = block.unit_fn(
net,
depth=unit_depth,
depth_bottleneck=unit_depth_bottleneck,
stride=1,
rate=rate)
rate *= unit_stride

else:
net = block.unit_fn(
net,
depth=unit_depth,
depth_bottleneck=unit_depth_bottleneck,
stride=unit_stride,
rate=1)
current_stride *= unit_stride
net = slim.utils.collect_named_outputs(
outputs_collections, sc.name, net)

if output_stride is not None and current_stride != output_stride:
raise ValueError('The target output_stride cannot be reached.')

return net
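# Illustrative trace of the bookkeeping above, assuming blocks whose unit
# strides multiply to 8 and a requested output_stride of 4:
#   start: current_stride = 1, rate = 1
#   first stride-2 unit:  current_stride -> 2 (below the target, run normally)
#   second stride-2 unit: current_stride -> 4 == output_stride
#   every later stride-2 unit runs with stride=1 and rate *= 2 (atrous)
# so spatial resolution is frozen at 1/4 while the receptive field keeps
# growing through dilation.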


def resnet_arg_scope(weight_decay=0.0001,
batch_norm_decay=0.997,
batch_norm_epsilon=1e-5,
batch_norm_scale=True):
"""Defines the default ResNet arg scope.
TODO(gpapan): The batch-normalization related default values above are
appropriate for use in conjunction with the reference ResNet models
released at https://github.com/KaimingHe/deep-residual-networks. When
training ResNets from scratch, they might need to be tuned.
Args:
weight_decay: The weight decay to use for regularizing the model.
batch_norm_decay: The moving average decay when estimating layer activation
statistics in batch normalization.
batch_norm_epsilon: Small constant to prevent division by zero when
normalizing activations by their variance in batch normalization.
batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the
activations in the batch normalization layer.
Returns:
An `arg_scope` to use for the resnet models.
"""
batch_norm_params = {
'decay': batch_norm_decay,
'epsilon': batch_norm_epsilon,
'scale': batch_norm_scale,
'updates_collections': tf.GraphKeys.UPDATE_OPS,
}

with slim.arg_scope(
[slim.conv2d],
weights_regularizer=slim.l2_regularizer(weight_decay),
weights_initializer=slim.variance_scaling_initializer(),
activation_fn=tf.nn.relu,
normalizer_fn=slim.batch_norm,
normalizer_params=batch_norm_params):
with slim.arg_scope([slim.batch_norm], **batch_norm_params):
# The following uses padding='VALID' for pool1, matching the accompanying
# code of 'Deep Residual Learning for Image Recognition'. The tf-slim
# reference implementation uses padding='SAME' instead (as does
# https://github.com/facebook/fb.resnet.torch), which makes feature
# alignment easier for dense prediction tasks; switch back by setting
# slim.arg_scope([slim.max_pool2d], padding='SAME').
with slim.arg_scope([slim.max_pool2d], padding='VALID') as arg_sc:
return arg_sc

+108 -0 modelscope/pipelines/cv/ocr_utils/utils.py

@@ -0,0 +1,108 @@
import cv2
import numpy as np


def rboxes_to_polygons(rboxes):
"""
Convert rboxes to polygons
ARGS
`rboxes`: [n, 5]
RETURN
`polygons`: [n, 8]
"""

theta = rboxes[:, 4:5]
cxcy = rboxes[:, :2]
half_w = rboxes[:, 2:3] / 2.
half_h = rboxes[:, 3:4] / 2.
v1 = np.hstack([np.cos(theta) * half_w, np.sin(theta) * half_w])
v2 = np.hstack([-np.sin(theta) * half_h, np.cos(theta) * half_h])
p1 = cxcy - v1 - v2
p2 = cxcy + v1 - v2
p3 = cxcy + v1 + v2
p4 = cxcy - v1 + v2
polygons = np.hstack([p1, p2, p3, p4])
return polygons
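# Quick sanity check (illustrative numbers): an axis-aligned rbox
# (cx, cy, w, h, theta) = (10, 10, 4, 2, 0) gives v1 = (2, 0), v2 = (0, 1),
# so the corners come out as (8, 9), (12, 9), (12, 11), (8, 11),
# i.e. the expected 4x2 rectangle centred at (10, 10).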


def cal_width(box):
pd1 = point_dist(box[0], box[1], box[2], box[3])
pd2 = point_dist(box[4], box[5], box[6], box[7])
return (pd1 + pd2) / 2


def point_dist(x1, y1, x2, y2):
return np.sqrt((x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1))


def draw_polygons(img, polygons):
for p in polygons.tolist():
p = [int(o) for o in p]
cv2.line(img, (p[0], p[1]), (p[2], p[3]), (0, 255, 0), 1)
cv2.line(img, (p[2], p[3]), (p[4], p[5]), (0, 255, 0), 1)
cv2.line(img, (p[4], p[5]), (p[6], p[7]), (0, 255, 0), 1)
cv2.line(img, (p[6], p[7]), (p[0], p[1]), (0, 255, 0), 1)
return img


def nms_python(boxes):
boxes = sorted(boxes, key=lambda x: -x[8])
nms_flag = [True] * len(boxes)
for i, a in enumerate(boxes):
if not nms_flag[i]:
continue
else:
for j, b in enumerate(boxes):
if not j > i:
continue
if not nms_flag[j]:
continue
score_a = a[8]
score_b = b[8]
rbox_a = polygon2rbox(a[:8])
rbox_b = polygon2rbox(b[:8])
if point_in_rbox(rbox_a[:2], rbox_b) or point_in_rbox(
rbox_b[:2], rbox_a):
if score_a > score_b:
nms_flag[j] = False
boxes_nms = []
for i, box in enumerate(boxes):
if nms_flag[i]:
boxes_nms.append(box)
return boxes_nms


def point_in_rbox(c, rbox):
cx0, cy0 = c[0], c[1]
cx1, cy1 = rbox[0], rbox[1]
w, h = rbox[2], rbox[3]
theta = rbox[4]
dist_x = np.abs((cx1 - cx0) * np.cos(theta) + (cy1 - cy0) * np.sin(theta))
dist_y = np.abs(-(cx1 - cx0) * np.sin(theta) + (cy1 - cy0) * np.cos(theta))
return ((dist_x < w / 2.0) and (dist_y < h / 2.0))


def polygon2rbox(polygon):
x1, x2, x3, x4 = polygon[0], polygon[2], polygon[4], polygon[6]
y1, y2, y3, y4 = polygon[1], polygon[3], polygon[5], polygon[7]
c_x = (x1 + x2 + x3 + x4) / 4
c_y = (y1 + y2 + y3 + y4) / 4
w1 = point_dist(x1, y1, x2, y2)
w2 = point_dist(x3, y3, x4, y4)
h1 = point_line_dist(c_x, c_y, x1, y1, x2, y2)
h2 = point_line_dist(c_x, c_y, x3, y3, x4, y4)
h = h1 + h2
w = (w1 + w2) / 2
theta1 = np.arctan2(y2 - y1, x2 - x1)
theta2 = np.arctan2(y3 - y4, x3 - x4)
theta = (theta1 + theta2) / 2.0
return [c_x, c_y, w, h, theta]


def point_line_dist(px, py, x1, y1, x2, y2):
eps = 1e-6
dx = x2 - x1
dy = y2 - y1
div = np.sqrt(dx * dx + dy * dy) + eps
dist = np.abs(px * dy - py * dx + x2 * y1 - y2 * x1) / div
return dist

+1 -1 modelscope/pipelines/multi_modal/__init__.py

@@ -1 +1 @@
from .image_captioning import ImageCaptionPipeline
from .image_captioning_pipeline import ImageCaptionPipeline

+33 -0 modelscope/pipelines/multi_modal/image_captioning_pipeline.py

@@ -0,0 +1,33 @@
from typing import Any, Dict, Optional, Union

from modelscope.preprocessors import OfaImageCaptionPreprocessor, Preprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
from ..base import Model, Pipeline
from ..builder import PIPELINES

logger = get_logger()


@PIPELINES.register_module(Tasks.image_captioning, module_name='ofa')
class ImageCaptionPipeline(Pipeline):

def __init__(self,
model: Union[Model, str],
preprocessor: Optional[Preprocessor] = None,
**kwargs):
assert isinstance(model, (str, Model)), \
'model must be a model id str or a Model instance'
if isinstance(model, str):
pipe_model = Model.from_pretrained(model)
elif isinstance(model, Model):
pipe_model = model
else:
raise NotImplementedError
if preprocessor is None and pipe_model:
preprocessor = OfaImageCaptionPreprocessor(model_dir=model)
super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs
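# Usage sketch; 'damo/ofa-image-caption' is a placeholder model id and the
# image path is a test asset from this repo, both for illustration only:
#
#   pipe = ImageCaptionPipeline(model='damo/ofa-image-caption')
#   result = pipe('data/test/images/image_captioning.png')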

+1 -0 modelscope/pipelines/nlp/__init__.py

@@ -2,3 +2,4 @@ from .fill_mask_pipeline import * # noqa F403
from .sentence_similarity_pipeline import * # noqa F403
from .sequence_classification_pipeline import * # noqa F403
from .text_generation_pipeline import * # noqa F403
from .word_segmentation_pipeline import * # noqa F403

+0 -3 modelscope/pipelines/nlp/sentence_similarity_pipeline.py

@@ -1,8 +1,5 @@
import os
import uuid
from typing import Any, Dict, Union

import json
import numpy as np

from modelscope.models.nlp import SbertForSentenceSimilarity


+0 -3 modelscope/pipelines/nlp/sequence_classification_pipeline.py

@@ -1,8 +1,5 @@
import os
import uuid
from typing import Any, Dict, Union

import json
import numpy as np

from modelscope.models.nlp import BertForSequenceClassification


+22 -19 modelscope/pipelines/nlp/text_generation_pipeline.py

@@ -1,7 +1,7 @@
from typing import Dict, Optional, Union

from modelscope.models import Model
from modelscope.models.nlp import PalmForTextGenerationModel
from modelscope.models.nlp import PalmForTextGeneration
from modelscope.preprocessors import TextGenerationPreprocessor
from modelscope.utils.constant import Tasks
from ..base import Pipeline, Tensor
@@ -10,11 +10,11 @@ from ..builder import PIPELINES
__all__ = ['TextGenerationPipeline']


@PIPELINES.register_module(Tasks.text_generation, module_name=r'palm')
@PIPELINES.register_module(Tasks.text_generation, module_name=r'palm2.0')
class TextGenerationPipeline(Pipeline):

def __init__(self,
model: Union[PalmForTextGenerationModel, str],
model: Union[PalmForTextGeneration, str],
preprocessor: Optional[TextGenerationPreprocessor] = None,
**kwargs):
"""use `model` and `preprocessor` to create a nlp text classification pipeline for prediction
@@ -23,16 +23,16 @@ class TextGenerationPipeline(Pipeline):
model (PalmForTextGeneration): a model instance
preprocessor (TextGenerationPreprocessor): a preprocessor instance
"""
sc_model = model if isinstance(
model,
PalmForTextGenerationModel) else Model.from_pretrained(model)
model = model if isinstance(
model, PalmForTextGeneration) else Model.from_pretrained(model)
if preprocessor is None:
preprocessor = TextGenerationPreprocessor(
sc_model.model_dir,
model.model_dir,
model.tokenizer,
first_sequence='sentence',
second_sequence=None)
super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
self.tokenizer = preprocessor.tokenizer
super().__init__(model=model, preprocessor=preprocessor, **kwargs)
self.tokenizer = model.tokenizer

def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, str]:
"""process the prediction results
@@ -43,17 +43,20 @@ class TextGenerationPipeline(Pipeline):
Returns:
Dict[str, str]: the prediction results
"""
replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''),
('[unused1]', ''), (r' +', ' '), ('[SEP]', ''),
('[unused2]', ''), ('[CLS]', ''), ('[UNK]', ''))
replace_tokens_roberta = ((r' +', ' '), ('<mask>', '<q>'), ('<pad>',
''),
('<s>', ''), ('</s>', ''), ('<unk>', ' '))

vocab_size = len(self.tokenizer.vocab)
pred_list = inputs['predictions']
pred_ids = pred_list[0][0].cpu().numpy().tolist()
for j in range(len(pred_ids)):
if pred_ids[j] >= vocab_size:
pred_ids[j] = 100
pred = self.tokenizer.convert_ids_to_tokens(pred_ids)
pred_string = ''.join(pred).replace(
'##',
'').split('[SEP]')[0].replace('[CLS]',
'').replace('[SEP]',
'').replace('[UNK]', '')
pred_string = self.tokenizer.decode(pred_ids)
for _old, _new in replace_tokens_bert:
pred_string = pred_string.replace(_old, _new)
pred_string = pred_string.strip()
for _old, _new in replace_tokens_roberta:
pred_string = pred_string.replace(_old, _new)
pred_string = pred_string.strip()
return {'text': pred_string}

+69 -0 modelscope/pipelines/nlp/word_segmentation_pipeline.py

@@ -0,0 +1,69 @@
from typing import Any, Dict, Optional, Union

from modelscope.models import Model
from modelscope.models.nlp import StructBertForTokenClassification
from modelscope.preprocessors import TokenClassifcationPreprocessor
from modelscope.utils.constant import Tasks
from ..base import Pipeline, Tensor
from ..builder import PIPELINES

__all__ = ['WordSegmentationPipeline']


@PIPELINES.register_module(
Tasks.word_segmentation,
module_name=r'structbert-chinese-word-segmentation')
class WordSegmentationPipeline(Pipeline):

def __init__(self,
model: Union[StructBertForTokenClassification, str],
preprocessor: Optional[TokenClassifcationPreprocessor] = None,
**kwargs):
"""use `model` and `preprocessor` to create a nlp word segmentation pipeline for prediction

Args:
model (StructBertForTokenClassification): a model instance
preprocessor (TokenClassifcationPreprocessor): a preprocessor instance
"""
model = model if isinstance(
model,
StructBertForTokenClassification) else Model.from_pretrained(model)
if preprocessor is None:
preprocessor = TokenClassifcationPreprocessor(model.model_dir)
super().__init__(model=model, preprocessor=preprocessor, **kwargs)
self.tokenizer = preprocessor.tokenizer
self.config = model.config
self.id2label = self.config.id2label

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
"""process the prediction results

Args:
inputs (Dict[str, Any]): the model outputs, containing 'text' and 'predictions'

Returns:
Dict[str, str]: the prediction results
"""

pred_list = inputs['predictions']
labels = []
for pre in pred_list:
labels.append(self.id2label[pre])
labels = labels[1:-1]
chunks = []
chunk = ''
assert len(inputs['text']) == len(labels)
for token, label in zip(inputs['text'], labels):
if label[0] == 'B' or label[0] == 'I':
chunk += token
else:
chunk += token
chunks.append(chunk)
chunk = ''
if chunk:
chunks.append(chunk)
seg_result = ' '.join(chunks)
rst = {
'output': seg_result,
}
return rst
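# Worked example of the chunking above (labels are illustrative): for
# inputs['text'] = '今天天气不错' with labels ['B', 'E', 'B', 'E', 'B', 'E']
# (after dropping the [CLS]/[SEP] positions), tokens accumulate on 'B'/'I'
# and the chunk is flushed on any other tag, so the result is
# {'output': '今天 天气 不错'}.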

+26 -0 modelscope/pipelines/outputs.py

@@ -54,6 +54,13 @@ TASK_OUTPUTS = {
# }
Tasks.pose_estimation: ['poses', 'boxes'],

# ocr detection result for single sample
# {
# "det_polygons": np.array with shape [num_text, 8], each box is
# [x1, y1, x2, y2, x3, y3, x4, y4]
# }
Tasks.ocr_detection: ['det_polygons'],

# ============ nlp tasks ===================

# text classification result for single sample
@@ -75,8 +82,27 @@ TASK_OUTPUTS = {
# }
Tasks.fill_mask: ['text'],

# word segmentation result for single sample
# {
# "output": "今天 天气 不错 , 适合 出去 游玩"
# }
Tasks.word_segmentation: ['output'],

# sentence similarity result for single sample
# {
# "labels": "1",
# "scores": 0.9
# }
Tasks.sentence_similarity: ['scores', 'labels'],

# ============ audio tasks ===================

# audio processed for single file in PCM format
# {
# "output_pcm": np.array with shape(samples,) and dtype float32
# }
Tasks.speech_signal_process: ['output_pcm'],

# ============ multi-modal tasks ===================

# image caption result for single sample


+3 -0 modelscope/preprocessors/__init__.py

@@ -1,7 +1,10 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from .audio import LinearAECAndFbank
from .base import Preprocessor
from .builder import PREPROCESSORS, build_preprocessor
from .common import Compose
from .image import LoadImage, load_image
from .multi_model import OfaImageCaptionPreprocessor
from .nlp import * # noqa F403
from .text_to_speech import * # noqa F403

+231 -0 modelscope/preprocessors/audio.py

@@ -0,0 +1,231 @@
import ctypes
import os
from typing import Any, Dict

import numpy as np
import scipy.io.wavfile as wav
import torch
from numpy.ctypeslib import ndpointer

from modelscope.utils.constant import Fields
from .builder import PREPROCESSORS


def load_wav(path):
samp_rate, data = wav.read(path)
return np.float32(data), samp_rate


def load_library(libaec):
libaec_in_cwd = os.path.join('.', libaec)
if os.path.exists(libaec_in_cwd):
libaec = libaec_in_cwd
mitaec = ctypes.cdll.LoadLibrary(libaec)
fe_process = mitaec.fe_process_inst
fe_process.argtypes = [
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'), ctypes.c_int,
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS')
]
return fe_process


def do_linear_aec(fe_process, mic, ref, int16range=True):
mic = np.float32(mic)
ref = np.float32(ref)
if len(mic) > len(ref):
mic = mic[:len(ref)]
out_mic = np.zeros_like(mic)
out_linear = np.zeros_like(mic)
out_echo = np.zeros_like(mic)
out_ref = np.zeros_like(mic)
if int16range:
mic /= 32768
ref /= 32768
fe_process(mic, ref, len(mic), out_mic, out_linear, out_echo)
# out_ref not in use here
if int16range:
out_mic *= 32768
out_linear *= 32768
out_echo *= 32768
return out_mic, out_ref, out_linear, out_echo


def load_kaldi_feature_transform(filename):
fp = open(filename, 'r')
all_str = fp.read()
pos1 = all_str.find('AddShift')
pos2 = all_str.find('[', pos1)
pos3 = all_str.find(']', pos2)
mean = np.fromstring(all_str[pos2 + 1:pos3], dtype=np.float32, sep=' ')
pos1 = all_str.find('Rescale')
pos2 = all_str.find('[', pos1)
pos3 = all_str.find(']', pos2)
scale = np.fromstring(all_str[pos2 + 1:pos3], dtype=np.float32, sep=' ')
fp.close()
return mean, scale


class Feature:
r"""Extract feat from one utterance.
"""

def __init__(self,
fbank_config,
feat_type='spec',
mvn_file=None,
cuda=False):
r"""

Args:
fbank_config (dict): kaldi fbank parameters, e.g. frame_length,
frame_shift and sample_frequency
feat_type (str):
raw: do nothing
fbank: use kaldi.fbank
spec: Real/Imag
logpow: log(1+|x|^2)
mvn_file (str): the path of data file for mean variance normalization
cuda (bool): if True, move the window and mvn tensors to GPU
"""
self.fbank_config = fbank_config
self.feat_type = feat_type
self.n_fft = fbank_config['frame_length'] * fbank_config[
'sample_frequency'] // 1000
self.hop_length = fbank_config['frame_shift'] * fbank_config[
'sample_frequency'] // 1000
self.window = torch.hamming_window(self.n_fft, periodic=False)

self.mvn = False
if mvn_file is not None and os.path.exists(mvn_file):
print(f'loading mvn file: {mvn_file}')
shift, scale = load_kaldi_feature_transform(mvn_file)
self.shift = torch.from_numpy(shift)
self.scale = torch.from_numpy(scale)
self.mvn = True
if cuda:
self.window = self.window.cuda()
if self.mvn:
self.shift = self.shift.cuda()
self.scale = self.scale.cuda()

def compute(self, utt):
r"""

Args:
utt: in [-32768, 32767] range

Returns:
[..., T, F]
"""
if self.feat_type == 'raw':
return utt
elif self.feat_type == 'fbank':
# have to use a local import until the modelscope framework supports lazy loading
import torchaudio.compliance.kaldi as kaldi
if len(utt.shape) == 1:
utt = utt.unsqueeze(0)
feat = kaldi.fbank(utt, **self.fbank_config)
elif self.feat_type == 'spec':
spec = torch.stft(
utt / 32768,
self.n_fft,
self.hop_length,
self.n_fft,
self.window,
center=False,
return_complex=True)
feat = torch.cat([spec.real, spec.imag], dim=-2).permute(-1, -2)
elif self.feat_type == 'logpow':
spec = torch.stft(
utt,
self.n_fft,
self.hop_length,
self.n_fft,
self.window,
center=False,
return_complex=True)
abspow = torch.abs(spec)**2
feat = torch.log(1 + abspow).permute(-1, -2)
return feat

def normalize(self, feat):
if self.mvn:
feat = feat + self.shift
feat = feat * self.scale
return feat
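# Illustrative arithmetic for the STFT sizes above: with a hypothetical
# fbank_config of frame_length=25 (ms), frame_shift=10 (ms) and
# sample_frequency=16000, n_fft = 25 * 16000 // 1000 = 400 samples and
# hop_length = 10 * 16000 // 1000 = 160 samples per frame.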


@PREPROCESSORS.register_module(Fields.audio)
class LinearAECAndFbank:
SAMPLE_RATE = 16000

def __init__(self, io_config):
self.trunc_length = 7200 * self.SAMPLE_RATE
self.linear_aec_delay = io_config['linear_aec_delay']
self.feature = Feature(io_config['fbank_config'],
io_config['feat_type'], io_config['mvn'])
self.mitaec = load_library(io_config['mitaec_library'])
self.mask_on_mic = io_config['mask_on'] == 'nearend_mic'

def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
""" linear filtering the near end mic and far end audio, then extract the feature
:param data: dict with two keys and correspond audios: "nearend_mic" and "farend_speech"
:return: dict with two keys and Tensor values: "base" linear filtered audio,and "feature"
"""
# read files
nearend_mic, fs = load_wav(data['nearend_mic'])
assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
farend_speech, fs = load_wav(data['farend_speech'])
assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
if 'nearend_speech' in data:
nearend_speech, fs = load_wav(data['nearend_speech'])
assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
else:
nearend_speech = np.zeros_like(nearend_mic)

out_mic, out_ref, out_linear, out_echo = do_linear_aec(
self.mitaec, nearend_mic, farend_speech)
# fix 20ms linear aec delay by delaying the target speech
extra_zeros = np.zeros([int(self.linear_aec_delay * fs)])
nearend_speech = np.concatenate([extra_zeros, nearend_speech])
# truncate files to the same length
flen = min(
len(out_mic), len(out_ref), len(out_linear), len(out_echo),
len(nearend_speech))
fstart = 0
flen = min(flen, self.trunc_length)
nearend_mic, out_ref, out_linear, out_echo, nearend_speech = (
out_mic[fstart:flen], out_ref[fstart:flen],
out_linear[fstart:flen], out_echo[fstart:flen],
nearend_speech[fstart:flen])

# extract features (frames, [mic, linear, ref, aes?])
feat = torch.FloatTensor()

nearend_mic = torch.from_numpy(np.float32(nearend_mic))
fbank_nearend_mic = self.feature.compute(nearend_mic)
feat = torch.cat([feat, fbank_nearend_mic], dim=1)

out_linear = torch.from_numpy(np.float32(out_linear))
fbank_out_linear = self.feature.compute(out_linear)
feat = torch.cat([feat, fbank_out_linear], dim=1)

out_echo = torch.from_numpy(np.float32(out_echo))
fbank_out_echo = self.feature.compute(out_echo)
feat = torch.cat([feat, fbank_out_echo], dim=1)

# feature transform
feat = self.feature.normalize(feat)

# prepare target
if nearend_speech is not None:
nearend_speech = torch.from_numpy(np.float32(nearend_speech))

if self.mask_on_mic:
base = nearend_mic
else:
base = out_linear
out_data = {'base': base, 'target': nearend_speech, 'feature': feat}
return out_data
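# Usage sketch; every io_config value below is an illustrative assumption,
# not the shipped configuration:
#
#   io_config = {
#       'linear_aec_delay': 0.02,
#       'fbank_config': {'frame_length': 25, 'frame_shift': 10,
#                        'sample_frequency': 16000, 'num_mel_bins': 80},
#       'feat_type': 'fbank',
#       'mvn': None,
#       'mitaec_library': 'libmitaec_pyio.so',   # hypothetical library name
#       'mask_on': 'nearend_mic',
#   }
#   preprocessor = LinearAECAndFbank(io_config)
#   out = preprocessor({'nearend_mic': 'mic.wav', 'farend_speech': 'ref.wav'})
#   feature, base = out['feature'], out['base']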

modelscope/pipelines/multi_modal/image_captioning.py → modelscope/preprocessors/multi_model.py

@@ -1,32 +1,50 @@
from typing import Any, Dict
# Copyright (c) Alibaba, Inc. and its affiliates.
import os.path as osp
from typing import Any, Dict, Union

import numpy as np
import torch
from maas_hub.snapshot_download import snapshot_download
from PIL import Image

from modelscope.pipelines.base import Input
from modelscope.preprocessors import load_image
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
from ..base import Pipeline
from ..builder import PIPELINES
from modelscope.utils.constant import Fields, ModelFile
from modelscope.utils.hub import get_model_cache_dir
from modelscope.utils.type_assert import type_assert
from .base import Preprocessor
from .builder import PREPROCESSORS
from .image import load_image

logger = get_logger()
__all__ = [
'OfaImageCaptionPreprocessor',
]


@PIPELINES.register_module(Tasks.image_captioning, module_name='ofa')
class ImageCaptionPipeline(Pipeline):
# TODO: refine using modelhub
def __init__(self, model: str, bpe_dir: str):
super().__init__()
# turn on cuda if GPU is available
@PREPROCESSORS.register_module(
Fields.multi_modal, module_name=r'ofa-image-caption')
class OfaImageCaptionPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path

Args:
model_dir (str): model path
"""
super().__init__(*args, **kwargs)

if osp.exists(model_dir):
local_model_dir = model_dir
else:
cache_path = get_model_cache_dir(model_dir)
local_model_dir = cache_path if osp.exists(
cache_path) else snapshot_download(model_dir)
local_model = osp.join(local_model_dir, ModelFile.TORCH_MODEL_FILE)
bpe_dir = local_model_dir

from fairseq import checkpoint_utils, tasks, utils
from ofa.tasks.mm_tasks import CaptionTask

tasks.register_task('caption', CaptionTask)
use_cuda = False
# use fp16 only when GPU is available
use_fp16 = False

overrides = {
'bpe_dir': bpe_dir,
'eval_cider': False,
@@ -35,21 +53,9 @@ class ImageCaptionPipeline(Pipeline):
'no_repeat_ngram_size': 3,
'seed': 7
}
models, cfg, task = checkpoint_utils.load_model_ensemble_and_task(
utils.split_paths(model), arg_overrides=overrides)

# Move models to GPU
for model in models:
model.eval()
if use_cuda:
model.cuda()
if use_fp16:
model.half()
model.prepare_for_inference_(cfg)
self.models = models
# Initialize generator
self.generator = task.build_generator(models, cfg.generation)

model, cfg, task = checkpoint_utils.load_model_ensemble_and_task(
utils.split_paths(local_model), arg_overrides=overrides)
del model
# Initialize transform
from torchvision import transforms
mean = [0.5, 0.5, 0.5]
@@ -69,7 +75,8 @@ class ImageCaptionPipeline(Pipeline):
self.eos_item = torch.LongTensor([task.src_dict.eos()])
self.pad_idx = task.src_dict.pad()

def preprocess(self, input: Input) -> Dict[str, Any]:
@type_assert(object, (str, tuple))
def __call__(self, data: Union[str, tuple]) -> Dict[str, Any]:

def encode_text(text, length=None, append_bos=False, append_eos=False):
s = self.task.tgt_dict.encode_line(
@@ -88,7 +95,7 @@ class ImageCaptionPipeline(Pipeline):
patch_image = self.patch_resize_transform(input).unsqueeze(0)
else:
patch_image = self.patch_resize_transform(
load_image(input)).unsqueeze(0)
load_image(data)).unsqueeze(0)
patch_mask = torch.tensor([True])
text = 'what does the image describe?'
src_text = encode_text(
@@ -105,17 +112,3 @@ class ImageCaptionPipeline(Pipeline):
}
}
return sample

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
from ofa.utils.eval_utils import eval_caption

results, _ = eval_caption(self.task, self.generator, self.models,
input)
return {
'image_id': results[0]['image_id'],
'caption': results[0]['caption']
}

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
# What should we do here ?
return inputs

+78 -22 modelscope/preprocessors/nlp.py

@@ -12,7 +12,8 @@ from .builder import PREPROCESSORS

__all__ = [
'Tokenize', 'SequenceClassificationPreprocessor',
'TextGenerationPreprocessor', 'FillMaskPreprocessor'
'TextGenerationPreprocessor', 'TokenClassifcationPreprocessor',
'FillMaskPreprocessor'
]


@@ -53,12 +54,12 @@ class SequenceClassificationPreprocessor(Preprocessor):
self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
print(f'this is the tokenizer {self.tokenizer}')

@type_assert(object, (str, tuple))
def __call__(self, data: Union[str, tuple]) -> Dict[str, Any]:
@type_assert(object, (str, tuple, Dict))
def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]:
"""process the raw input data

Args:
data (str or tuple):
data (str or tuple, Dict):
sentence1 (str): a sentence
Example:
'you are so handsome.'
@@ -70,22 +71,31 @@ class SequenceClassificationPreprocessor(Preprocessor):
sentence2 (str): a sentence
Example:
'you are so beautiful.'
or
{field1: field_value1, field2: field_value2}
field1 (str): field name, default 'first_sequence'
field_value1 (str): a sentence
Example:
'you are so handsome.'

field2 (str): field name, default 'second_sequence'
field_value2 (str): a sentence
Example:
'you are so beautiful.'

Returns:
Dict[str, Any]: the preprocessed data
"""

if not isinstance(data, tuple):
data = (
data,
None,
)

sentence1, sentence2 = data
new_data = {
self.first_sequence: sentence1,
self.second_sequence: sentence2
}
if isinstance(data, str):
new_data = {self.first_sequence: data}
elif isinstance(data, tuple):
sentence1, sentence2 = data
new_data = {
self.first_sequence: sentence1,
self.second_sequence: sentence2
}
else:
new_data = data

# preprocess the data for the model input

@@ -115,17 +125,15 @@ class SequenceClassificationPreprocessor(Preprocessor):
return rst


@PREPROCESSORS.register_module(Fields.nlp, module_name=r'palm')
@PREPROCESSORS.register_module(Fields.nlp, module_name=r'palm2.0')
class TextGenerationPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
def __init__(self, model_dir: str, tokenizer, *args, **kwargs):
"""preprocess the data using the vocab.txt from the `model_dir` path

Args:
model_dir (str): model path
"""
from sofa import PalmTokenizer

super().__init__(*args, **kwargs)

self.model_dir: str = model_dir
@@ -134,7 +142,7 @@ class TextGenerationPreprocessor(Preprocessor):
self.second_sequence: str = kwargs.pop('second_sequence',
'second_sequence')
self.sequence_length: int = kwargs.pop('sequence_length', 128)
self.tokenizer = PalmTokenizer.from_pretrained(model_dir)
self.tokenizer = tokenizer

@type_assert(object, str)
def __call__(self, data: str) -> Dict[str, Any]:
@@ -153,7 +161,7 @@ class TextGenerationPreprocessor(Preprocessor):
new_data = {self.first_sequence: data}
# preprocess the data for the model input

rst = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []}
rst = {'input_ids': [], 'attention_mask': []}

max_seq_length = self.sequence_length

@@ -225,3 +233,51 @@ class FillMaskPreprocessor(Preprocessor):
rst['token_type_ids'].append(feature['token_type_ids'])

return {k: torch.tensor(v) for k, v in rst.items()}


@PREPROCESSORS.register_module(
Fields.nlp, module_name=r'bert-token-classification')
class TokenClassifcationPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path

Args:
model_dir (str): model path
"""

super().__init__(*args, **kwargs)

from sofa import SbertTokenizer
self.model_dir: str = model_dir
self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir)

@type_assert(object, str)
def __call__(self, data: str) -> Dict[str, Any]:
"""process the raw input data

Args:
data (str): a sentence
Example:
'you are so handsome.'

Returns:
Dict[str, Any]: the preprocessed data
"""
# preprocess the data for the model input

text = data.replace(' ', '').strip()
tokens = []
for token in text:
token = self.tokenizer.tokenize(token)
tokens.extend(token)
input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
input_ids = self.tokenizer.build_inputs_with_special_tokens(input_ids)
attention_mask = [1] * len(input_ids)
token_type_ids = [0] * len(input_ids)
return {
'text': text,
'input_ids': input_ids,
'attention_mask': attention_mask,
'token_type_ids': token_type_ids
}
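# Illustrative output of the preprocessor above: for data = '今天 天气 不错',
# spaces are stripped, each character is tokenized separately, and the result
# holds the cleaned 'text', 'input_ids' wrapped with the special tokens
# ([CLS] ... [SEP]), plus all-ones 'attention_mask' and all-zeros
# 'token_type_ids' of the same length (here 6 + 2 = 8).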

+51 -0 modelscope/preprocessors/text_to_speech.py

@@ -0,0 +1,51 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import io
from typing import Any, Dict, Union

from modelscope.fileio import File
from modelscope.models.audio.tts.frontend import GenericTtsFrontend
from modelscope.models.base import Model
from modelscope.utils.audio.tts_exceptions import * # noqa F403
from modelscope.utils.constant import Fields
from .base import Preprocessor
from .builder import PREPROCESSORS

__all__ = ['TextToTacotronSymbols', 'text_to_tacotron_symbols']


@PREPROCESSORS.register_module(
Fields.audio, module_name=r'text_to_tacotron_symbols')
class TextToTacotronSymbols(Preprocessor):
"""extract tacotron symbols from text.

Args:
model_name (str): name or path of the TTS frontend model resource
lang_type (str): language type, valid values are "pinyin" and "chenmix"
"""

def __init__(self, model_name, lang_type='pinyin'):
self._frontend_model = Model.from_pretrained(
model_name, lang_type=lang_type)
assert self._frontend_model is not None, 'load model from pretrained failed'

def __call__(self, data: str) -> Dict[str, Any]:
"""Call functions to load text and get tacotron symbols.

Args:
data (str): utf-8 encoded text
Returns:
symbols (list[str]): texts in tacotron symbols format.
"""
return self._frontend_model.forward(data)


def text_to_tacotron_symbols(text='', path='./', lang='pinyin'):
""" simple interface to transform text to tacotron symbols

Args:
text (str): input text
path (str): resource path
lang (str): language type from one of "pinyin" and "chenmix"
"""
transform = TextToTacotronSymbols(path, lang)
return transform(text)

+22 -0 modelscope/pydatasets/config.py

@@ -0,0 +1,22 @@
import os
from pathlib import Path

# Cache location
DEFAULT_CACHE_HOME = '~/.cache'
CACHE_HOME = os.getenv('CACHE_HOME', DEFAULT_CACHE_HOME)
DEFAULT_MS_CACHE_HOME = os.path.join(CACHE_HOME, 'modelscope/hub')
MS_CACHE_HOME = os.path.expanduser(
os.getenv('MS_CACHE_HOME', DEFAULT_MS_CACHE_HOME))

DEFAULT_MS_DATASETS_CACHE = os.path.join(MS_CACHE_HOME, 'datasets')
MS_DATASETS_CACHE = Path(
os.getenv('MS_DATASETS_CACHE', DEFAULT_MS_DATASETS_CACHE))

DOWNLOADED_DATASETS_DIR = 'downloads'
DEFAULT_DOWNLOADED_DATASETS_PATH = os.path.join(MS_DATASETS_CACHE,
DOWNLOADED_DATASETS_DIR)
DOWNLOADED_DATASETS_PATH = Path(
os.getenv('DOWNLOADED_DATASETS_PATH', DEFAULT_DOWNLOADED_DATASETS_PATH))

MS_HUB_ENDPOINT = os.environ.get('MS_HUB_ENDPOINT',
'http://101.201.119.157:31752')

+323 -58 modelscope/pydatasets/py_dataset.py

@@ -1,64 +1,81 @@
from typing import (Any, Callable, Dict, List, Mapping, Optional, Sequence,
Union)
import os
from typing import (Any, Callable, Dict, Iterable, List, Mapping, Optional,
Sequence, Union)

from datasets import Dataset, load_dataset
import numpy as np
from datasets import Dataset
from datasets import load_dataset as hf_load_dataset
from datasets.config import TF_AVAILABLE, TORCH_AVAILABLE
from datasets.packaged_modules import _PACKAGED_DATASETS_MODULES
from datasets.utils.file_utils import (is_relative_path,
relative_to_absolute_path)

from modelscope.pydatasets.config import MS_DATASETS_CACHE
from modelscope.pydatasets.utils.ms_api import MsApi
from modelscope.utils.constant import Hubs
from modelscope.utils.logger import get_logger

logger = get_logger()


def format_list(para) -> List:
if para is None:
para = []
elif isinstance(para, str):
para = [para]
elif len(set(para)) < len(para):
raise ValueError(f'List columns contains duplicates: {para}')
return para


class PyDataset:
_hf_ds = None # holds the underlying HuggingFace Dataset
"""A PyDataset backed by hugging face Dataset."""

def __init__(self, hf_ds: Dataset):
def __init__(self, hf_ds: Dataset, target: Optional[str] = None):
self._hf_ds = hf_ds
self.target = None
self.target = target

def __iter__(self):
if isinstance(self._hf_ds, Dataset):
for item in self._hf_ds:
if self.target is not None:
yield item[self.target]
else:
yield item
else:
for ds in self._hf_ds.values():
for item in ds:
if self.target is not None:
yield item[self.target]
else:
yield item
for item in self._hf_ds:
if self.target is not None:
yield item[self.target]
else:
yield item

def __getitem__(self, key):
return self._hf_ds[key]

@classmethod
def from_hf_dataset(cls,
hf_ds: Dataset,
target: str = None) -> 'PyDataset':
dataset = cls(hf_ds)
dataset.target = target
return dataset
target: str = None) -> Union[dict, 'PyDataset']:
if isinstance(hf_ds, Dataset):
return cls(hf_ds, target)
if len(hf_ds.keys()) == 1:
return cls(next(iter(hf_ds.values())), target)
return {k: cls(v, target) for k, v in hf_ds.items()}

@staticmethod
def load(path: Union[str, list],
target: Optional[str] = None,
version: Optional[str] = None,
name: Optional[str] = None,
split: Optional[str] = None,
data_dir: Optional[str] = None,
data_files: Optional[Union[str, Sequence[str],
Mapping[str,
Union[str,
Sequence[str]]]]] = None,
hub: Optional[Hubs] = None) -> 'PyDataset':
def load(
dataset_name: Union[str, list],
target: Optional[str] = None,
version: Optional[str] = None,
hub: Optional[Hubs] = Hubs.modelscope,
subset_name: Optional[str] = None,
split: Optional[str] = None,
data_dir: Optional[str] = None,
data_files: Optional[Union[str, Sequence[str],
Mapping[str, Union[str,
Sequence[str]]]]] = None
) -> Union[dict, 'PyDataset']:
"""Load a PyDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
Args:

path (str): Path or name of the dataset.
dataset_name (str): Path or name of the dataset.
target (str, optional): Name of the column to output.
version (str, optional): Version of the dataset script to load:
name (str, optional): Defining the subset_name of the dataset.
subset_name (str, optional): Defining the subset_name of the dataset.
data_dir (str, optional): Defining the data_dir of the dataset configuration.
data_files (str or Sequence or Mapping, optional): Path(s) to source data file(s).
split (str, optional): Which split of the data to load.
@@ -67,53 +84,302 @@ class PyDataset:
Returns:
PyDataset (obj:`PyDataset`): PyDataset object for a certain dataset.
"""
if Hubs.modelscope == hub:
# TODO: parse data meta information from modelscope hub
# and possibly download data files to local (and update path)
print('getting data from modelscope hub')
if isinstance(path, str):
dataset = load_dataset(
path,
name=name,
if hub == Hubs.huggingface:
dataset = hf_load_dataset(
dataset_name,
name=subset_name,
revision=version,
split=split,
data_dir=data_dir,
data_files=data_files)
elif isinstance(path, list):
return PyDataset.from_hf_dataset(dataset, target=target)
else:
return PyDataset._load_ms_dataset(
dataset_name,
target=target,
subset_name=subset_name,
version=version,
split=split,
data_dir=data_dir,
data_files=data_files)

@staticmethod
def _load_ms_dataset(
dataset_name: Union[str, list],
target: Optional[str] = None,
version: Optional[str] = None,
subset_name: Optional[str] = None,
split: Optional[str] = None,
data_dir: Optional[str] = None,
data_files: Optional[Union[str, Sequence[str],
Mapping[str, Union[str,
Sequence[str]]]]] = None
) -> Union[dict, 'PyDataset']:
if isinstance(dataset_name, str):
use_hf = False
if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \
(os.path.isfile(dataset_name) and dataset_name.endswith('.py')):
use_hf = True
elif is_relative_path(dataset_name):
ms_api = MsApi()
dataset_scripts = ms_api.fetch_dataset_scripts(
dataset_name, version)
if 'py' in dataset_scripts: # dataset copied from hf datasets
dataset_name = dataset_scripts['py'][0]
use_hf = True
else:
raise FileNotFoundError(
f"Couldn't find a dataset script at {relative_to_absolute_path(dataset_name)} "
f'or any data file in the same directory.')

if use_hf:
dataset = hf_load_dataset(
dataset_name,
name=subset_name,
revision=version,
split=split,
data_dir=data_dir,
data_files=data_files,
cache_dir=MS_DATASETS_CACHE)
else:
# TODO load from ms datahub
raise NotImplementedError(
f'Dataset {dataset_name} load from modelscope datahub to be implemented in '
f'the future')
elif isinstance(dataset_name, list):
if target is None:
target = 'target'
dataset = Dataset.from_dict({target: [p] for p in path})
dataset = Dataset.from_dict({target: dataset_name})
else:
raise TypeError('path must be a str or a list, but got'
f' {type(path)}')
f' {type(dataset_name)}')
return PyDataset.from_hf_dataset(dataset, target=target)
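# Usage sketch for the loaders above (dataset names are illustrative):
#
#   # name-based loading, resolved through the ModelScope hub by default
#   ds = PyDataset.load('squad', split='train')
#   # wrapping a plain list of files into a single-column dataset
#   imgs = PyDataset.load(['a.jpg', 'b.jpg'], target='image')
#   for path in imgs:   # __iter__ yields the 'image' column directly
#       print(path)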

def to_torch_dataset_with_processors(
self,
preprocessors: Union[Callable, List[Callable]],
columns: Union[str, List[str]] = None,
):
preprocessor_list = preprocessors if isinstance(
preprocessors, list) else [preprocessors]

columns = format_list(columns)

columns = [
key for key in self._hf_ds.features.keys() if key in columns
]
sample = next(iter(self._hf_ds))

sample_res = {k: np.array(sample[k]) for k in columns}
for processor in preprocessor_list:
sample_res.update(
{k: np.array(v)
for k, v in processor(sample).items()})

def is_numpy_number(value):
return np.issubdtype(value.dtype, np.integer) or np.issubdtype(
value.dtype, np.floating)

retained_columns = []
for k in sample_res.keys():
if not is_numpy_number(sample_res[k]):
logger.warning(
f'Data of column {k} is non-numeric, will be removed')
continue
retained_columns.append(k)

import torch

class MsIterableDataset(torch.utils.data.IterableDataset):

def __init__(self, dataset: Iterable):
super(MsIterableDataset).__init__()
self.dataset = dataset

def __iter__(self):
for item_dict in self.dataset:
res = {
k: np.array(item_dict[k])
for k in columns if k in retained_columns
}
for preprocessor in preprocessor_list:
res.update({
k: np.array(v)
for k, v in preprocessor(item_dict).items()
if k in retained_columns
})
yield res

return MsIterableDataset(self._hf_ds)

def to_torch_dataset(
self,
columns: Union[str, List[str]] = None,
output_all_columns: bool = False,
preprocessors: Union[Callable, List[Callable]] = None,
**format_kwargs,
):
self._hf_ds.reset_format()
self._hf_ds.set_format(
type='torch',
columns=columns,
output_all_columns=output_all_columns,
format_kwargs=format_kwargs)
return self._hf_ds
"""Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to
torch.utils.data.DataLoader.

Args:
preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process
every sample of the dataset. The output type of processors is dict, and each numeric field of the dict
will be used as a field of torch.utils.data.Dataset.
columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only). If the
preprocessor is None, the arg columns must have at least one column. If the `preprocessors` is not None,
the output fields of processors will also be added.
format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`.

Returns:
:class:`torch.utils.data.Dataset`

"""
if not TORCH_AVAILABLE:
raise ImportError(
'The function to_torch_dataset requires pytorch to be installed'
)
if preprocessors is not None:
return self.to_torch_dataset_with_processors(preprocessors)
else:
self._hf_ds.reset_format()
self._hf_ds.set_format(
type='torch', columns=columns, format_kwargs=format_kwargs)
return self._hf_ds
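# Usage sketch for to_torch_dataset; the preprocessor object and column names
# below are illustrative assumptions:
#
#   import torch
#   torch_ds = ds.to_torch_dataset(preprocessors=my_preprocessor)
#   loader = torch.utils.data.DataLoader(torch_ds, batch_size=16)
#   # without preprocessors, numeric columns must be named explicitly:
#   torch_ds = ds.to_torch_dataset(columns=['input_ids', 'attention_mask'])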

def to_tf_dataset_with_processors(
self,
batch_size: int,
shuffle: bool,
preprocessors: Union[Callable, List[Callable]],
drop_remainder: bool = None,
prefetch: bool = True,
label_cols: Union[str, List[str]] = None,
columns: Union[str, List[str]] = None,
):
preprocessor_list = preprocessors if isinstance(
preprocessors, list) else [preprocessors]

label_cols = format_list(label_cols)
columns = format_list(columns)
cols_to_retain = list(set(label_cols + columns))
retained_columns = [
key for key in self._hf_ds.features.keys() if key in cols_to_retain
]
import tensorflow as tf
tf_dataset = tf.data.Dataset.from_tensor_slices(
np.arange(len(self._hf_ds), dtype=np.int64))
if shuffle:
tf_dataset = tf_dataset.shuffle(buffer_size=len(self._hf_ds))

def func(i, return_dict=False):
i = int(i)
res = {k: np.array(self._hf_ds[i][k]) for k in retained_columns}
for preprocessor in preprocessor_list:
# TODO preprocessor output may have the same key
res.update({
k: np.array(v)
for k, v in preprocessor(self._hf_ds[i]).items()
})
if return_dict:
return res
return tuple(list(res.values()))

sample_res = func(0, True)

@tf.function(input_signature=[tf.TensorSpec(None, tf.int64)])
def fetch_function(i):
output = tf.numpy_function(
func,
inp=[i],
Tout=[
tf.dtypes.as_dtype(val.dtype)
for val in sample_res.values()
],
)
return {key: output[i] for i, key in enumerate(sample_res)}

tf_dataset = tf_dataset.map(
fetch_function, num_parallel_calls=tf.data.AUTOTUNE)
if label_cols:

def split_features_and_labels(input_batch):
labels = {
key: tensor
for key, tensor in input_batch.items() if key in label_cols
}
if len(input_batch) == 1:
input_batch = next(iter(input_batch.values()))
if len(labels) == 1:
labels = next(iter(labels.values()))
return input_batch, labels

tf_dataset = tf_dataset.map(split_features_and_labels)

elif len(columns) == 1:
tf_dataset = tf_dataset.map(lambda x: next(iter(x.values())))
if batch_size > 1:
tf_dataset = tf_dataset.batch(
batch_size, drop_remainder=drop_remainder)

if prefetch:
tf_dataset = tf_dataset.prefetch(tf.data.experimental.AUTOTUNE)
return tf_dataset

def to_tf_dataset(
self,
columns: Union[str, List[str]],
batch_size: int,
shuffle: bool,
collate_fn: Callable,
preprocessors: Union[Callable, List[Callable]] = None,
columns: Union[str, List[str]] = None,
collate_fn: Callable = None,
drop_remainder: bool = None,
collate_fn_args: Dict[str, Any] = None,
label_cols: Union[str, List[str]] = None,
dummy_labels: bool = False,
prefetch: bool = True,
):
"""Create a tf.data.Dataset from the MS Dataset. This tf.data.Dataset can be passed to tf methods like
model.fit() or model.predict().

Args:
batch_size (int): Number of samples in a single batch.
shuffle (bool): Shuffle the dataset order.
preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process
every sample of the dataset. The output type of the preprocessors is dict, and each field of the dict
will be used as a field of the tf.data.Dataset. If `preprocessors` is None, the `collate_fn`
shouldn't be None.
columns (str or List[str], default None): Dataset column(s) to be loaded. If `preprocessors` is None,
`columns` must contain at least one column. If `preprocessors` is not None, the output fields of the
preprocessors will also be added.
collate_fn (Callable, default None): A callable object used to collect lists of samples into a batch. If
`preprocessors` is None, the `collate_fn` shouldn't be None.
drop_remainder (bool, default None): Drop the last incomplete batch when loading.
collate_fn_args (Dict, optional): A `dict` of arguments to be passed to the `collate_fn`.
label_cols (str or List[str], default None): Dataset column(s) to load as labels.
prefetch (bool, default True): Prefetch data.

Returns:
:class:`tf.data.Dataset`

"""
if not TF_AVAILABLE:
raise ImportError(
'The function to_tf_dataset requires TensorFlow to be installed.'
)
if preprocessors is not None:
return self.to_tf_dataset_with_processors(
batch_size,
shuffle,
preprocessors,
drop_remainder=drop_remainder,
prefetch=prefetch,
label_cols=label_cols,
columns=columns)

if collate_fn is None:
logger.error(
'The `preprocessors` and the `collate_fn` should not both be None.'
)
return None
self._hf_ds.reset_format()
return self._hf_ds.to_tf_dataset(
columns,
batch_size,
shuffle,
collate_fn,
drop_remainder=drop_remainder,
collate_fn_args=collate_fn_args,
label_cols=label_cols,
dummy_labels=dummy_labels,
prefetch=prefetch)
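# Usage sketch (illustrative, not part of this file; the column names and
# the collate_fn below are hypothetical):
#
#   import numpy as np
#
#   def stack_batch(samples):
#       # hypothetical collate_fn: stack already-aligned numeric samples
#       return {k: np.stack([s[k] for s in samples]) for k in samples[0]}
#
#   tf_ds = ds.to_tf_dataset(
#       batch_size=8,
#       shuffle=True,
#       columns=['input_ids'],
#       label_cols=['label'],
#       collate_fn=stack_batch)
#   model.fit(tf_ds)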

def to_hf_dataset(self) -> Dataset:


+ 0
- 0
modelscope/pydatasets/utils/__init__.py View File


+ 66
- 0
modelscope/pydatasets/utils/ms_api.py View File

@@ -0,0 +1,66 @@
import os
from collections import defaultdict
from typing import Optional

import requests

from modelscope.pydatasets.config import (DOWNLOADED_DATASETS_PATH,
MS_HUB_ENDPOINT)
from modelscope.utils.logger import get_logger

logger = get_logger()


class MsApi:

def __init__(self, endpoint=MS_HUB_ENDPOINT):
self.endpoint = endpoint

def list_datasets(self):
path = f'{self.endpoint}/api/v1/datasets'
headers = None
params = {}
r = requests.get(path, params=params, headers=headers)
r.raise_for_status()
dataset_list = r.json()['Data']
return [x['Name'] for x in dataset_list]

def fetch_dataset_scripts(self,
dataset_name: str,
version: Optional[str] = 'master',
force_download=False):
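# Look up the dataset id by name, list the files of the requested
# revision, and cache every *.py loading script under
# DOWNLOADED_DATASETS_PATH; returns the local paths grouped by suffix.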
datahub_url = f'{self.endpoint}/api/v1/datasets?Query={dataset_name}'
r = requests.get(datahub_url)
r.raise_for_status()
dataset_list = r.json()['Data']
if len(dataset_list) == 0:
return None
dataset_id = dataset_list[0]['Id']
version = version or 'master'
datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={version}'
r = requests.get(datahub_url)
r.raise_for_status()
file_list = r.json()['Data']['Files']
cache_dir = os.path.join(DOWNLOADED_DATASETS_PATH, dataset_name,
version)
os.makedirs(cache_dir, exist_ok=True)
local_paths = defaultdict(list)
for file_info in file_list:
file_path = file_info['Path']
if file_path.endswith('.py'):
datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/files?' \
f'Revision={version}&Path={file_path}'
r = requests.get(datahub_url)
r.raise_for_status()
content = r.json()['Data']['Content']
local_path = os.path.join(cache_dir, file_path)
if os.path.exists(local_path) and not force_download:
logger.warning(
f"Reusing dataset {dataset_name}'s python file ({local_path})"
)
local_paths['py'].append(local_path)
continue
with open(local_path, 'w') as f:
f.writelines(content)
local_paths['py'].append(local_path)
return local_paths
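# Usage sketch (illustrative, not part of this file; the dataset name is
# hypothetical and a reachable MS hub endpoint is assumed):
#
#   api = MsApi()
#   names = api.list_datasets()
#   paths = api.fetch_dataset_scripts('some_dataset', version='master')
#   # paths['py'] holds the local paths of the cached loading scripts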

+ 0
- 0
modelscope/utils/audio/__init__.py View File


+ 42
- 0
modelscope/utils/audio/tts_exceptions.py View File

@@ -0,0 +1,42 @@
"""
Define TTS exceptions
"""


class TtsException(Exception):
"""
TTS exception class.
"""
pass


class TtsFrontendException(TtsException):
"""
TTS frontend module level exceptions.
"""
pass


class TtsFrontendInitializeFailedException(TtsFrontendException):
"""
If the TTS frontend resource is invalid or does not exist, this exception will be raised.
"""
pass


class TtsFrontendLanguageTypeInvalidException(TtsFrontendException):
"""
If the language type is invalid, this exception will be raised.
"""


class TtsVocoderException(TtsException):
"""
Vocoder exception
"""


class TtsVocoderMelspecShapeMismatchException(TtsVocoderException):
"""
If the shape of the vocoder's input melspec does not match the expected shape, this exception will be raised.
"""

+ 2
- 0
modelscope/utils/constant.py View File

@@ -28,8 +28,10 @@ class Tasks(object):
image_editing = 'image-editing'
image_generation = 'image-generation'
image_matting = 'image-matting'
ocr_detection = 'ocr-detection'

# nlp tasks
word_segmentation = 'word-segmentation'
sentiment_analysis = 'sentiment-analysis'
sentence_similarity = 'sentence-similarity'
text_classification = 'text-classification'


+ 0
- 1
modelscope/utils/registry.py View File

@@ -67,7 +67,6 @@ class Registry(object):
if module_name in self._modules[group_key]:
raise KeyError(f'{module_name} is already registered in '
f'{self._name}[{group_key}]')

self._modules[group_key][module_name] = module_cls
module_cls.group_key = group_key



+ 15
- 0
modelscope/utils/test_utils.py View File

@@ -2,6 +2,9 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
import unittest

from datasets.config import TF_AVAILABLE, TORCH_AVAILABLE

TEST_LEVEL = 2
TEST_LEVEL_STR = 'TEST_LEVEL'
@@ -15,6 +18,18 @@ def test_level():
return TEST_LEVEL


def require_tf(test_case):
if not TF_AVAILABLE:
test_case = unittest.skip('test requires TensorFlow')(test_case)
return test_case


def require_torch(test_case):
if not TORCH_AVAILABLE:
test_case = unittest.skip('test requires PyTorch')(test_case)
return test_case
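# Usage sketch (illustrative): decorate a test case so it is skipped when
# the corresponding framework is missing, e.g.:
#
#   @require_torch
#   class MyModelTest(unittest.TestCase):
#       def test_forward(self):
#           ...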


def set_test_level(level: int):
global TEST_LEVEL
TEST_LEVEL = level

+ 1
- 0
requirements.txt View File

@@ -2,4 +2,5 @@
-r requirements/pipeline.txt
-r requirements/multi-modal.txt
-r requirements/nlp.txt
-r requirements/audio.txt
-r requirements/cv.txt

+ 26
- 0
requirements/audio.txt View File

@@ -0,0 +1,26 @@
#tts
h5py==2.10.0
#https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp36-cp36m-linux_x86_64.whl
https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp37-cp37m-linux_x86_64.whl
https://swap.oss-cn-hangzhou.aliyuncs.com/Jiaqi%2Fmaas%2Ftts%2Frequirements%2Fpytorch_wavelets-1.3.0-py3-none-any.whl?Expires=1685688388&OSSAccessKeyId=LTAI4Ffebq4d9jTVDwiSbY4L&Signature=jcQbg5EZ%2Bdys3%2F4BRn3srrKLdIg%3D
#https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp38-cp38-linux_x86_64.whl
#https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp39-cp39-linux_x86_64.whl
inflect
keras==2.2.4
librosa
lxml
matplotlib
nara_wpe
numpy==1.18.*
protobuf==3.20.*
ptflops
PyWavelets>=1.0.0
scikit-learn==0.23.2
sox
tensorboard
tensorflow==1.15.*
torch==1.10.*
torchaudio
torchvision
tqdm
unidecode

+ 1
- 0
requirements/cv.txt View File

@@ -1 +1,2 @@
easydict
tf_slim

+ 2
- 1
requirements/runtime.txt View File

@@ -1,12 +1,13 @@
addict
datasets
easydict
https://mindscope.oss-cn-hangzhou.aliyuncs.com/sdklib/maas_hub-0.2.2.dev0-py3-none-any.whl
https://mindscope.oss-cn-hangzhou.aliyuncs.com/sdklib/maas_hub-0.2.4.dev0-py3-none-any.whl
numpy
opencv-python-headless
Pillow>=6.2.0
pyyaml
requests
scipy
tokenizers<=0.10.3
transformers<=4.16.2
yapf

+ 2
- 1
setup.cfg View File

@@ -11,6 +11,7 @@ default_section = THIRDPARTY
BASED_ON_STYLE = pep8
BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true
SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true
SPLIT_BEFORE_ARITHMETIC_OPERATOR = true

[codespell]
skip = *.ipynb
@@ -20,5 +21,5 @@ ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids
[flake8]
select = B,C,E,F,P,T4,W,B9
max-line-length = 120
ignore = F401,F821
ignore = F401,F821,W503
exclude = docs/src,*.pyi,.git

+ 1
- 2
tests/pipelines/test_base.py View File

@@ -80,8 +80,7 @@ class CustomPipelineTest(unittest.TestCase):
pipe2 = pipeline(dummy_task)
self.assertTrue(type(pipe) is type(pipe2))

img_url = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.' \
'aliyuncs.com/data/test/images/image1.jpg'
img_url = 'data/test/images/image1.jpg'
output = pipe(img_url)
self.assertEqual(output['filename'], img_url)
self.assertEqual(output['output_png'].shape, (318, 512, 3))


Some files were not shown because too many files changed in this diff
