
merge with master

master
智丞 3 years ago
parent
commit
05ac2b15d1
47 changed files with 6402 additions and 2 deletions
  1. +5 -0 .gitignore
  2. +12 -0 docs/source/faq.md
  3. +4 -0 docs/source/quick_start.md
  4. +2 -0 modelscope/models/__init__.py
  5. +0 -0 modelscope/models/audio/tts/__init__.py
  6. +1 -0 modelscope/models/audio/tts/am/__init__.py
  7. +8 -0 modelscope/models/audio/tts/am/models/__init__.py
  8. +82 -0 modelscope/models/audio/tts/am/models/compat.py
  9. +273 -0 modelscope/models/audio/tts/am/models/fsmn.py
  10. +178 -0 modelscope/models/audio/tts/am/models/fsmn_encoder.py
  11. +160 -0 modelscope/models/audio/tts/am/models/helpers.py
  12. +461 -0 modelscope/models/audio/tts/am/models/modules.py
  13. +174 -0 modelscope/models/audio/tts/am/models/position.py
  14. +155 -0 modelscope/models/audio/tts/am/models/reducer.py
  15. +240 -0 modelscope/models/audio/tts/am/models/rnn_wrappers.py
  16. +760 -0 modelscope/models/audio/tts/am/models/robutrans.py
  17. +817 -0 modelscope/models/audio/tts/am/models/self_attention_decoder.py
  18. +182 -0 modelscope/models/audio/tts/am/models/self_attention_encoder.py
  19. +1157 -0 modelscope/models/audio/tts/am/models/transformer.py
  20. +255 -0 modelscope/models/audio/tts/am/sambert_hifi_16k.py
  21. +0 -0 modelscope/models/audio/tts/am/text/__init__.py
  22. +89 -0 modelscope/models/audio/tts/am/text/cleaners.py
  23. +64 -0 modelscope/models/audio/tts/am/text/cmudict.py
  24. +70 -0 modelscope/models/audio/tts/am/text/numbers.py
  25. +95 -0 modelscope/models/audio/tts/am/text/symbols.py
  26. +200 -0 modelscope/models/audio/tts/am/text/symbols_dict.py
  27. +1 -0 modelscope/models/audio/tts/frontend/__init__.py
  28. +39 -0 modelscope/models/audio/tts/frontend/generic_text_to_speech_frontend.py
  29. +1 -0 modelscope/models/audio/tts/vocoder/__init__.py
  30. +73 -0 modelscope/models/audio/tts/vocoder/hifigan16k.py
  31. +1 -0 modelscope/models/audio/tts/vocoder/models/__init__.py
  32. +516 -0 modelscope/models/audio/tts/vocoder/models/models.py
  33. +59 -0 modelscope/models/audio/tts/vocoder/models/utils.py
  34. +2 -0 modelscope/models/base.py
  35. +1 -0 modelscope/pipelines/audio/__init__.py
  36. +46 -0 modelscope/pipelines/audio/text_to_speech_pipeline.py
  37. +1 -0 modelscope/preprocessors/__init__.py
  38. +2 -1 modelscope/preprocessors/audio.py
  39. +53 -0 modelscope/preprocessors/text_to_speech.py
  40. +0 -0 modelscope/utils/audio/__init__.py
  41. +42 -0 modelscope/utils/audio/tts_exceptions.py
  42. +0 -1 modelscope/utils/registry.py
  43. +1 -0 requirements.txt
  44. +26 -0 requirements/audio.txt
  45. +60 -0 tests/pipelines/test_text_to_speech.py
  46. +28 -0 tests/preprocessors/test_text_to_speech.py
  47. +6 -0 tests/run.py

+ 5 - 0 .gitignore

@@ -24,6 +24,7 @@ wheels/
.installed.cfg
*.egg
/package
/temp
MANIFEST

# PyInstaller
@@ -123,3 +124,7 @@ replace.sh

# Pytorch
*.pth


# audio
*.wav

+ 12 - 0 docs/source/faq.md

@@ -29,3 +29,15 @@ reference: [https://huggingface.co/docs/tokenizers/installation#installation-fro
> ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.

Because of version incompatibilities between dependencies, version conflicts may occur; in most cases this does not affect normal operation.

### 3. Version errors when installing PyTorch

> ERROR: Ignored the following versions that require a different python version: 1.1.0 Requires-Python >=3.8; 1.1.0rc1 Requires-Python >=3.8; 1.1.1 Requires-Python >=3.8
> ERROR: Could not find a version that satisfies the requirement torch==1.8.1+cu111 (from versions: 1.0.0, 1.0.1, 1.0.1.post2, 1.1.0, 1.2.0, 1.3.0, 1.3.1, 1.4.0, 1.5.0, 1.5.1, 1.6.0, 1.7.0, 1.7.1, 1.8.0, 1.8.1, 1.9.0, 1.9.1, 1.10.0, 1.10.1, 1.10.2, 1.11.0)
> ERROR: No matching distribution found for torch==1.8.1+cu111

Use the following command to install:

```shell
pip install -f https://download.pytorch.org/whl/torch_stable.html -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt
```

+ 4 - 0 docs/source/quick_start.md

@@ -25,6 +25,10 @@ ModelScope Library currently supports the TensorFlow and PyTorch deep learning frameworks
* [Pytorch安装指导](https://pytorch.org/get-started/locally/)
* [Tensorflow安装指导](https://www.tensorflow.org/install/pip)

Some third-party dependencies require numpy to be installed in advance:
```
pip install numpy
```

## ModelScope library 安装



+ 2 - 0 modelscope/models/__init__.py

@@ -1,5 +1,7 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from .audio.tts.am import SambertNetHifi16k
from .audio.tts.vocoder import Hifigan16k
from .base import Model
from .builder import MODELS, build_model
from .nlp import BertForSequenceClassification, SbertForSentenceSimilarity

+ 0 - 0 modelscope/models/audio/tts/__init__.py


+ 1 - 0 modelscope/models/audio/tts/am/__init__.py

@@ -0,0 +1 @@
from .sambert_hifi_16k import * # noqa F403

+ 8 - 0 modelscope/models/audio/tts/am/models/__init__.py

@@ -0,0 +1,8 @@
from .robutrans import RobuTrans


def create_model(name, hparams):
if name == 'robutrans':
return RobuTrans(hparams)
else:
raise Exception('Unknown model: ' + name)
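A minimal sketch (not part of the commit) of how this factory might be called; the `hparams` argument is the hyperparameter object used elsewhere in this commit and is not defined here.

```python
from modelscope.models.audio.tts.am.models import create_model

try:
    create_model('tacotron', None)   # any name other than 'robutrans' raises
except Exception as err:
    print(err)                       # Unknown model: tacotron

# create_model('robutrans', hparams) returns a RobuTrans instance, where
# `hparams` is the hyperparameter object defined elsewhere in this commit.
```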

+ 82 - 0 modelscope/models/audio/tts/am/models/compat.py

@@ -0,0 +1,82 @@
"""Functions for compatibility with different TensorFlow versions."""

import tensorflow as tf


def is_tf2():
"""Returns ``True`` if running TensorFlow 2.0."""
return tf.__version__.startswith('2')


def tf_supports(symbol):
"""Returns ``True`` if TensorFlow defines :obj:`symbol`."""
return _string_to_tf_symbol(symbol) is not None


def tf_any(*symbols):
"""Returns the first supported symbol."""
for symbol in symbols:
module = _string_to_tf_symbol(symbol)
if module is not None:
return module
return None


def tf_compat(v2=None, v1=None): # pylint: disable=invalid-name
"""Returns the compatible symbol based on the current TensorFlow version.

Args:
v2: The candidate v2 symbol name.
v1: The candidate v1 symbol name.

Returns:
A TensorFlow symbol.

Raises:
ValueError: if no symbol can be found.
"""
candidates = []
if v2 is not None:
candidates.append(v2)
if v1 is not None:
candidates.append(v1)
candidates.append('compat.v1.%s' % v1)
symbol = tf_any(*candidates)
if symbol is None:
raise ValueError('Failure to resolve the TensorFlow symbol')
return symbol


def name_from_variable_scope(name=''):
"""Creates a name prefixed by the current variable scope."""
var_scope = tf_compat(v1='get_variable_scope')().name
compat_name = ''
if name:
compat_name = '%s/' % name
if var_scope:
compat_name = '%s/%s' % (var_scope, compat_name)
return compat_name


def reuse():
"""Returns ``True`` if the current variable scope is marked for reuse."""
return tf_compat(v1='get_variable_scope')().reuse


def _string_to_tf_symbol(symbol):
modules = symbol.split('.')
namespace = tf
for module in modules:
namespace = getattr(namespace, module, None)
if namespace is None:
return None
return namespace


# pylint: disable=invalid-name
gfile_copy = tf_compat(v2='io.gfile.copy', v1='gfile.Copy')
gfile_exists = tf_compat(v2='io.gfile.exists', v1='gfile.Exists')
gfile_open = tf_compat(v2='io.gfile.GFile', v1='gfile.GFile')
is_tensor = tf_compat(v2='is_tensor', v1='contrib.framework.is_tensor')
logging = tf_compat(v1='logging')
nest = tf_compat(v2='nest', v1='contrib.framework.nest')
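A minimal usage sketch (not part of the commit) showing how the helpers above resolve symbols across TensorFlow versions; it assumes a standard TensorFlow installation.

```python
from modelscope.models.audio.tts.am.models import compat

# Pick whichever symbol the installed TensorFlow provides.
gfile_exists = compat.tf_compat(v2='io.gfile.exists', v1='gfile.Exists')
print(gfile_exists('/tmp'))          # True if the path exists

# Probe for optional symbols without raising.
if compat.tf_supports('contrib'):
    print('TF 1.x with contrib is available')
```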

+ 273 - 0 modelscope/models/audio/tts/am/models/fsmn.py

@@ -0,0 +1,273 @@
import tensorflow as tf


def build_sequence_mask(sequence_length,
maximum_length=None,
dtype=tf.float32):
"""Builds the dot product mask.

Args:
sequence_length: The sequence length.
maximum_length: Optional size of the returned time dimension. Otherwise
it is the maximum of :obj:`sequence_length`.
dtype: The type of the mask tensor.

Returns:
A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape
``[batch_size, max_length]``.
"""
mask = tf.sequence_mask(
sequence_length, maxlen=maximum_length, dtype=dtype)

return mask


def norm(inputs):
"""Layer normalizes :obj:`inputs`."""
return tf.contrib.layers.layer_norm(inputs, begin_norm_axis=-1)


def pad_in_time(x, padding_shape):
"""Helper function to pad a tensor in the time dimension and retain the static depth dimension.

Args:
x: [Batch, Time, Frequency]
padding_shape: [left, right] padding sizes (constant value 0) applied along the time dimension

Returns:
padded x
"""

depth = x.get_shape().as_list()[-1]
x = tf.pad(x, [[0, 0], padding_shape, [0, 0]])
x.set_shape((None, None, depth))

return x


def pad_in_time_right(x, padding_length):
"""Helper function to pad a tensor in the time dimension and retain the static depth dimension.

Args:
x: [Batch, Time, Frequency]
padding_length: padding size (constant value 0) appended after the time dimension

Returns:
padded x
"""
depth = x.get_shape().as_list()[-1]
x = tf.pad(x, [[0, 0], [0, padding_length], [0, 0]])
x.set_shape((None, None, depth))

return x


def feed_forward(x, ffn_dim, memory_units, mode, dropout=0.0):
"""Implements the Transformer's "Feed Forward" layer.

.. math::

ffn(x) = max(0, x*W_1 + b_1)*W_2

Args:
x: The input.
ffn_dim: The number of units of the nonlinear transformation.
memory_units: the number of units of linear transformation
mode: A ``tf.estimator.ModeKeys`` mode.
dropout: The probability to drop units from the inner transformation.

Returns:
The transformed input.
"""
inner = tf.layers.conv1d(x, ffn_dim, 1, activation=tf.nn.relu)
inner = tf.layers.dropout(
inner, rate=dropout, training=mode == tf.estimator.ModeKeys.TRAIN)
outer = tf.layers.conv1d(inner, memory_units, 1, use_bias=False)

return outer


def drop_and_add(inputs, outputs, mode, dropout=0.0):
"""Drops units in the outputs and adds the previous values.

Args:
inputs: The input of the previous layer.
outputs: The output of the previous layer.
mode: A ``tf.estimator.ModeKeys`` mode.
dropout: The probability to drop units in :obj:`outputs`.

Returns:
The residual and normalized output.
"""
outputs = tf.layers.dropout(outputs, rate=dropout, training=mode)

input_dim = inputs.get_shape().as_list()[-1]
output_dim = outputs.get_shape().as_list()[-1]

if input_dim == output_dim:
outputs += inputs

return outputs


def MemoryBlock(
inputs,
filter_size,
mode,
mask=None,
dropout=0.0,
):
"""
Define the bidirectional memory block in FSMN

Args:
inputs: The output of the previous layer. [Batch, Time, Frequency]
filter_size: memory block filter size
mode: Training or Evaluation
mask: A ``tf.Tensor`` applied to the memory block output
dropout: dropout rate

Returns:
output: 3-D tensor ([Batch, Time, Frequency])
"""
static_shape = inputs.get_shape().as_list()
depth = static_shape[-1]
inputs = tf.expand_dims(inputs, axis=1) # [Batch, 1, Time, Frequency]
depthwise_filter = tf.get_variable(
'depth_conv_w',
shape=[1, filter_size, depth, 1],
initializer=tf.glorot_uniform_initializer(),
dtype=tf.float32)
memory = tf.nn.depthwise_conv2d(
input=inputs,
filter=depthwise_filter,
strides=[1, 1, 1, 1],
padding='SAME',
rate=[1, 1],
data_format='NHWC')
memory = memory + inputs
output = tf.layers.dropout(memory, rate=dropout, training=mode)
output = tf.reshape(
output,
[tf.shape(output)[0], tf.shape(output)[2], depth])
if mask is not None:
output = output * tf.expand_dims(mask, -1)

return output


def MemoryBlockV2(
inputs,
filter_size,
mode,
shift=0,
mask=None,
dropout=0.0,
):
"""
Define the bidirectional memory block in FSMN

Args:
inputs: The output of the previous layer. [Batch, Time, Frequency]
filter_size: memory block filter size
mode: Training or Evaluation
shift: left padding, to control delay
mask: A ``tf.Tensor`` applied to the memory block output
dropout: dropout rate

Returns:
output: 3-D tensor ([Batch, Time, Frequency])
"""
if mask is not None:
inputs = inputs * tf.expand_dims(mask, -1)

static_shape = inputs.get_shape().as_list()
depth = static_shape[-1]
# padding
left_padding = int(round((filter_size - 1) / 2))
right_padding = int((filter_size - 1) / 2)
if shift > 0:
left_padding = left_padding + shift
right_padding = right_padding - shift
pad_inputs = pad_in_time(inputs, [left_padding, right_padding])
pad_inputs = tf.expand_dims(
pad_inputs, axis=1) # [Batch, 1, Time, Frequency]
depthwise_filter = tf.get_variable(
'depth_conv_w',
shape=[1, filter_size, depth, 1],
initializer=tf.glorot_uniform_initializer(),
dtype=tf.float32)
memory = tf.nn.depthwise_conv2d(
input=pad_inputs,
filter=depthwise_filter,
strides=[1, 1, 1, 1],
padding='VALID',
rate=[1, 1],
data_format='NHWC')
memory = tf.reshape(
memory,
[tf.shape(memory)[0], tf.shape(memory)[2], depth])
memory = memory + inputs
output = tf.layers.dropout(memory, rate=dropout, training=mode)
if mask is not None:
output = output * tf.expand_dims(mask, -1)

return output


def UniMemoryBlock(
inputs,
filter_size,
mode,
cache=None,
mask=None,
dropout=0.0,
):
"""
Define the unidirectional memory block in FSMN

Args:
inputs: The output of the previous layer. [Batch, Time, Frequency]
filter_size: memory block filter size
cache: cached queries for streaming inference
mode: Training or Evaluation
mask: A ``tf.Tensor`` applied to the memory block output
dropout: dropout factor

Returns:
output: 3-D tensor ([Batch, Time, Frequency])
"""
if cache is not None:
static_shape = cache['queries'].get_shape().as_list()
depth = static_shape[-1]
queries = tf.slice(cache['queries'], [0, 1, 0], [
tf.shape(cache['queries'])[0],
tf.shape(cache['queries'])[1] - 1, depth
])
queries = tf.concat([queries, inputs], axis=1)
cache['queries'] = queries
else:
padding_length = filter_size - 1
queries = pad_in_time(inputs, [padding_length, 0])

queries = tf.expand_dims(queries, axis=1) # [Batch, 1, Time, Frequency]
static_shape = queries.get_shape().as_list()
depth = static_shape[-1]
depthwise_filter = tf.get_variable(
'depth_conv_w',
shape=[1, filter_size, depth, 1],
initializer=tf.glorot_uniform_initializer(),
dtype=tf.float32)
memory = tf.nn.depthwise_conv2d(
input=queries,
filter=depthwise_filter,
strides=[1, 1, 1, 1],
padding='VALID',
rate=[1, 1],
data_format='NHWC')
memory = tf.reshape(
memory,
[tf.shape(memory)[0], tf.shape(memory)[2], depth])
memory = memory + inputs
output = tf.layers.dropout(memory, rate=dropout, training=mode)
if mask is not None:
output = output * tf.expand_dims(mask, -1)

return output
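The left/right padding arithmetic in MemoryBlockV2 above determines how much future context (and therefore delay) the memory block sees; a small worked example with assumed values:

```python
# Assumed values for illustration only.
filter_size, shift = 11, 2
left_padding = round((filter_size - 1) / 2) + shift      # 7 frames of past context
right_padding = (filter_size - 1) // 2 - shift           # 3 frames of future context (lower delay)
assert left_padding + right_padding == filter_size - 1   # VALID conv then keeps the time length
```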

+ 178 - 0 modelscope/models/audio/tts/am/models/fsmn_encoder.py

@@ -0,0 +1,178 @@
import tensorflow as tf

from . import fsmn


class FsmnEncoder():
"""Encoder using Fsmn
"""

def __init__(self,
filter_size,
fsmn_num_layers,
dnn_num_layers,
num_memory_units=512,
ffn_inner_dim=2048,
dropout=0.0,
position_encoder=None):
"""Initializes the parameters of the encoder.

Args:
filter_size: the total order of memory block
fsmn_num_layers: The number of fsmn layers.
dnn_num_layers: The number of dnn layers
num_memory_units: The number of memory units.
ffn_inner_dim: The number of units of the inner linear transformation
in the feed forward layer.
dropout: The probability to drop units from the outputs.
position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
apply on inputs or ``None``.
"""
super(FsmnEncoder, self).__init__()
self.filter_size = filter_size
self.fsmn_num_layers = fsmn_num_layers
self.dnn_num_layers = dnn_num_layers
self.num_memory_units = num_memory_units
self.ffn_inner_dim = ffn_inner_dim
self.dropout = dropout
self.position_encoder = position_encoder

def encode(self, inputs, sequence_length=None, mode=True):
if self.position_encoder is not None:
inputs = self.position_encoder(inputs)

inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)

mask = fsmn.build_sequence_mask(
sequence_length, maximum_length=tf.shape(inputs)[1])

state = ()

for layer in range(self.fsmn_num_layers):
with tf.variable_scope('fsmn_layer_{}'.format(layer)):
with tf.variable_scope('ffn'):
context = fsmn.feed_forward(
inputs,
self.ffn_inner_dim,
self.num_memory_units,
mode,
dropout=self.dropout)

with tf.variable_scope('memory'):
memory = fsmn.MemoryBlock(
context,
self.filter_size,
mode,
mask=mask,
dropout=self.dropout)

memory = fsmn.drop_and_add(
inputs, memory, mode, dropout=self.dropout)

inputs = memory
state += (tf.reduce_mean(inputs, axis=1), )

for layer in range(self.dnn_num_layers):
with tf.variable_scope('dnn_layer_{}'.format(layer)):
transformed = fsmn.feed_forward(
inputs,
self.ffn_inner_dim,
self.num_memory_units,
mode,
dropout=self.dropout)

inputs = transformed
state += (tf.reduce_mean(inputs, axis=1), )

outputs = inputs
return (outputs, state, sequence_length)


class FsmnEncoderV2():
"""Encoder using Fsmn
"""

def __init__(self,
filter_size,
fsmn_num_layers,
dnn_num_layers,
num_memory_units=512,
ffn_inner_dim=2048,
dropout=0.0,
shift=0,
position_encoder=None):
"""Initializes the parameters of the encoder.

Args:
filter_size: the total order of memory block
fsmn_num_layers: The number of fsmn layers.
dnn_num_layers: The number of dnn layers
num_memory_units: The number of memory units.
ffn_inner_dim: The number of units of the inner linear transformation
in the feed forward layer.
dropout: The probability to drop units from the outputs.
shift: left padding, to control delay
position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
apply on inputs or ``None``.
"""
super(FsmnEncoderV2, self).__init__()
self.filter_size = filter_size
self.fsmn_num_layers = fsmn_num_layers
self.dnn_num_layers = dnn_num_layers
self.num_memory_units = num_memory_units
self.ffn_inner_dim = ffn_inner_dim
self.dropout = dropout
self.shift = shift
if not isinstance(shift, list):
self.shift = [shift for _ in range(self.fsmn_num_layers)]
self.position_encoder = position_encoder

def encode(self, inputs, sequence_length=None, mode=True):
if self.position_encoder is not None:
inputs = self.position_encoder(inputs)

inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)

mask = fsmn.build_sequence_mask(
sequence_length, maximum_length=tf.shape(inputs)[1])

state = ()
for layer in range(self.fsmn_num_layers):
with tf.variable_scope('fsmn_layer_{}'.format(layer)):
with tf.variable_scope('ffn'):
context = fsmn.feed_forward(
inputs,
self.ffn_inner_dim,
self.num_memory_units,
mode,
dropout=self.dropout)

with tf.variable_scope('memory'):
memory = fsmn.MemoryBlockV2(
context,
self.filter_size,
mode,
shift=self.shift[layer],
mask=mask,
dropout=self.dropout)

memory = fsmn.drop_and_add(
inputs, memory, mode, dropout=self.dropout)

inputs = memory
state += (tf.reduce_mean(inputs, axis=1), )

for layer in range(self.dnn_num_layers):
with tf.variable_scope('dnn_layer_{}'.format(layer)):
transformed = fsmn.feed_forward(
inputs,
self.ffn_inner_dim,
self.num_memory_units,
mode,
dropout=self.dropout)

inputs = transformed
state += (tf.reduce_mean(inputs, axis=1), )

outputs = inputs
return (outputs, state, sequence_length)
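A minimal graph-mode sketch of driving the FsmnEncoderV2 defined above, assuming a TensorFlow 1.x environment; the hyperparameter values are illustrative, not taken from the commit.

```python
import tensorflow as tf
from modelscope.models.audio.tts.am.models.fsmn_encoder import FsmnEncoderV2

inputs = tf.placeholder(tf.float32, [None, None, 512])   # [Batch, Time, Frequency]
lengths = tf.placeholder(tf.int32, [None])
encoder = FsmnEncoderV2(filter_size=11, fsmn_num_layers=4, dnn_num_layers=2,
                        num_memory_units=512, ffn_inner_dim=1024,
                        dropout=0.1, shift=1)
outputs, states, _ = encoder.encode(inputs, sequence_length=lengths, mode=False)
```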

+ 160 - 0 modelscope/models/audio/tts/am/models/helpers.py

@@ -0,0 +1,160 @@
import numpy as np
import tensorflow as tf
from tensorflow.contrib.seq2seq import Helper


class VarTestHelper(Helper):

def __init__(self, batch_size, inputs, dim):
with tf.name_scope('VarTestHelper'):
self._batch_size = batch_size
self._inputs = inputs
self._dim = dim

num_steps = tf.shape(self._inputs)[1]
self._lengths = tf.tile([num_steps], [self._batch_size])

self._inputs = tf.roll(inputs, shift=-1, axis=1)
self._init_inputs = inputs[:, 0, :]

@property
def batch_size(self):
return self._batch_size

@property
def sample_ids_shape(self):
return tf.TensorShape([])

@property
def sample_ids_dtype(self):
return np.int32

def initialize(self, name=None):
return (tf.tile([False], [self._batch_size]),
_go_frames(self._batch_size, self._dim, self._init_inputs))

def sample(self, time, outputs, state, name=None):
return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them

def next_inputs(self, time, outputs, state, sample_ids, name=None):
with tf.name_scope('VarTestHelper'):
finished = (time + 1 >= self._lengths)
next_inputs = tf.concat([outputs, self._inputs[:, time, :]],
axis=-1)
return (finished, next_inputs, state)


class VarTrainingHelper(Helper):

def __init__(self, targets, inputs, dim):
with tf.name_scope('VarTrainingHelper'):
self._targets = targets # [N, T_in, 1]
self._batch_size = tf.shape(inputs)[0] # N
self._inputs = inputs
self._dim = dim

num_steps = tf.shape(self._targets)[1]
self._lengths = tf.tile([num_steps], [self._batch_size])

self._inputs = tf.roll(inputs, shift=-1, axis=1)
self._init_inputs = inputs[:, 0, :]

@property
def batch_size(self):
return self._batch_size

@property
def sample_ids_shape(self):
return tf.TensorShape([])

@property
def sample_ids_dtype(self):
return np.int32

def initialize(self, name=None):
return (tf.tile([False], [self._batch_size]),
_go_frames(self._batch_size, self._dim, self._init_inputs))

def sample(self, time, outputs, state, name=None):
return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them

def next_inputs(self, time, outputs, state, sample_ids, name=None):
with tf.name_scope(name or 'VarTrainingHelper'):
finished = (time + 1 >= self._lengths)
next_inputs = tf.concat(
[self._targets[:, time, :], self._inputs[:, time, :]], axis=-1)
return (finished, next_inputs, state)


class VarTrainingSSHelper(Helper):

def __init__(self, targets, inputs, dim, global_step, schedule_begin,
alpha, decay_steps):
with tf.name_scope('VarTrainingSSHelper'):
self._targets = targets # [N, T_in, 1]
self._batch_size = tf.shape(inputs)[0] # N
self._inputs = inputs
self._dim = dim

num_steps = tf.shape(self._targets)[1]
self._lengths = tf.tile([num_steps], [self._batch_size])

self._inputs = tf.roll(inputs, shift=-1, axis=1)
self._init_inputs = inputs[:, 0, :]

# for schedule sampling
self._global_step = global_step
self._schedule_begin = schedule_begin
self._alpha = alpha
self._decay_steps = decay_steps

@property
def batch_size(self):
return self._batch_size

@property
def sample_ids_shape(self):
return tf.TensorShape([])

@property
def sample_ids_dtype(self):
return np.int32

def initialize(self, name=None):
self._ratio = _tf_decay(self._global_step, self._schedule_begin,
self._alpha, self._decay_steps)
return (tf.tile([False], [self._batch_size]),
_go_frames(self._batch_size, self._dim, self._init_inputs))

def sample(self, time, outputs, state, name=None):
return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them

def next_inputs(self, time, outputs, state, sample_ids, name=None):
with tf.name_scope(name or 'VarTrainingHelper'):
finished = (time + 1 >= self._lengths)
next_inputs_tmp = tf.cond(
tf.less(
tf.random_uniform([], minval=0, maxval=1,
dtype=tf.float32), self._ratio),
lambda: self._targets[:, time, :], lambda: outputs)
next_inputs = tf.concat(
[next_inputs_tmp, self._inputs[:, time, :]], axis=-1)
return (finished, next_inputs, state)


def _go_frames(batch_size, dim, init_inputs):
'''Returns all-zero <GO> frames for a given batch size and output dimension'''
return tf.concat([tf.tile([[0.0]], [batch_size, dim]), init_inputs],
axis=-1)


def _tf_decay(global_step, schedule_begin, alpha, decay_steps):
tfr = tf.train.exponential_decay(
1.0,
global_step=global_step - schedule_begin,
decay_steps=decay_steps,
decay_rate=alpha,
name='tfr_decay')
final_tfr = tf.cond(
tf.less(global_step, schedule_begin), lambda: 1.0, lambda: tfr)
return final_tfr
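The `_tf_decay` schedule above reduces the teacher-forcing ratio exponentially once training passes `schedule_begin`; a plain-Python sketch of the same curve, with values assumed for illustration:

```python
# Mirrors tf.train.exponential_decay as called in _tf_decay (non-staircase).
alpha, decay_steps, schedule_begin = 0.97, 1000, 20000

def teacher_forcing_ratio(global_step):
    if global_step < schedule_begin:
        return 1.0
    return alpha ** ((global_step - schedule_begin) / decay_steps)

print(teacher_forcing_ratio(10000))   # 1.0 (before the schedule starts)
print(teacher_forcing_ratio(30000))   # ~0.74 (gradually samples more from the model)
```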

+ 461 - 0 modelscope/models/audio/tts/am/models/modules.py

@@ -0,0 +1,461 @@
import tensorflow as tf
from tensorflow.contrib.cudnn_rnn import CudnnLSTM
from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
from tensorflow.contrib.rnn import LSTMBlockCell


def encoder_prenet(inputs,
n_conv_layers,
filters,
kernel_size,
dense_units,
is_training,
mask=None,
scope='encoder_prenet'):
x = inputs
with tf.variable_scope(scope):
for i in range(n_conv_layers):
x = conv1d(
x,
filters,
kernel_size,
is_training,
activation=tf.nn.relu,
dropout=True,
mask=mask,
scope='conv1d_{}'.format(i))
x = tf.layers.dense(
x, units=dense_units, activation=None, name='dense')
return x


def decoder_prenet(inputs,
prenet_units,
dense_units,
is_training,
scope='decoder_prenet'):
x = inputs
with tf.variable_scope(scope):
for i, units in enumerate(prenet_units):
x = tf.layers.dense(
x,
units=units,
activation=tf.nn.relu,
name='dense_{}'.format(i))
x = tf.layers.dropout(
x, rate=0.5, training=is_training, name='dropout_{}'.format(i))
x = tf.layers.dense(
x, units=dense_units, activation=None, name='dense')
return x


def encoder(inputs,
input_lengths,
n_conv_layers,
filters,
kernel_size,
lstm_units,
is_training,
embedded_inputs_speaker,
mask=None,
scope='encoder'):
with tf.variable_scope(scope):
x = conv_and_lstm(
inputs,
input_lengths,
n_conv_layers,
filters,
kernel_size,
lstm_units,
is_training,
embedded_inputs_speaker,
mask=mask)
return x


def prenet(inputs, prenet_units, is_training, scope='prenet'):
x = inputs
with tf.variable_scope(scope):
for i, units in enumerate(prenet_units):
x = tf.layers.dense(
x,
units=units,
activation=tf.nn.relu,
name='dense_{}'.format(i))
x = tf.layers.dropout(
x, rate=0.5, training=is_training, name='dropout_{}'.format(i))
return x


def postnet_residual_ulstm(inputs,
n_conv_layers,
filters,
kernel_size,
lstm_units,
output_units,
is_training,
scope='postnet_residual_ulstm'):
with tf.variable_scope(scope):
x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size,
lstm_units, is_training)
x = conv1d(
x,
output_units,
kernel_size,
is_training,
activation=None,
dropout=False,
scope='conv1d_{}'.format(n_conv_layers - 1))
return x


def postnet_residual_lstm(inputs,
n_conv_layers,
filters,
kernel_size,
lstm_units,
output_units,
is_training,
scope='postnet_residual_lstm'):
with tf.variable_scope(scope):
x = conv_and_lstm(inputs, None, n_conv_layers, filters, kernel_size,
lstm_units, is_training)
x = conv1d(
x,
output_units,
kernel_size,
is_training,
activation=None,
dropout=False,
scope='conv1d_{}'.format(n_conv_layers - 1))
return x


def postnet_linear_ulstm(inputs,
n_conv_layers,
filters,
kernel_size,
lstm_units,
output_units,
is_training,
scope='postnet_linear'):
with tf.variable_scope(scope):
x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size,
lstm_units, is_training)
x = tf.layers.dense(x, units=output_units)
return x


def postnet_linear_lstm(inputs,
n_conv_layers,
filters,
kernel_size,
lstm_units,
output_units,
output_lengths,
is_training,
embedded_inputs_speaker2,
mask=None,
scope='postnet_linear'):
with tf.variable_scope(scope):
x = conv_and_lstm_dec(
inputs,
output_lengths,
n_conv_layers,
filters,
kernel_size,
lstm_units,
is_training,
embedded_inputs_speaker2,
mask=mask)
x = tf.layers.dense(x, units=output_units)
return x


def postnet_linear(inputs,
n_conv_layers,
filters,
kernel_size,
lstm_units,
output_units,
output_lengths,
is_training,
embedded_inputs_speaker2,
mask=None,
scope='postnet_linear'):
with tf.variable_scope(scope):
x = conv_dec(
inputs,
output_lengths,
n_conv_layers,
filters,
kernel_size,
lstm_units,
is_training,
embedded_inputs_speaker2,
mask=mask)
return x


def conv_and_lstm(inputs,
sequence_lengths,
n_conv_layers,
filters,
kernel_size,
lstm_units,
is_training,
embedded_inputs_speaker,
mask=None,
scope='conv_and_lstm'):
x = inputs
with tf.variable_scope(scope):
for i in range(n_conv_layers):
x = conv1d(
x,
filters,
kernel_size,
is_training,
activation=tf.nn.relu,
dropout=True,
mask=mask,
scope='conv1d_{}'.format(i))

x = tf.concat([x, embedded_inputs_speaker], axis=2)

outputs, states = tf.nn.bidirectional_dynamic_rnn(
LSTMBlockCell(lstm_units),
LSTMBlockCell(lstm_units),
x,
sequence_length=sequence_lengths,
dtype=tf.float32)
x = tf.concat(outputs, axis=-1)

return x


def conv_and_lstm_dec(inputs,
sequence_lengths,
n_conv_layers,
filters,
kernel_size,
lstm_units,
is_training,
embedded_inputs_speaker2,
mask=None,
scope='conv_and_lstm'):
x = inputs
with tf.variable_scope(scope):
for i in range(n_conv_layers):
x = conv1d(
x,
filters,
kernel_size,
is_training,
activation=tf.nn.relu,
dropout=True,
mask=mask,
scope='conv1d_{}'.format(i))

x = tf.concat([x, embedded_inputs_speaker2], axis=2)

outputs, states = tf.nn.bidirectional_dynamic_rnn(
LSTMBlockCell(lstm_units),
LSTMBlockCell(lstm_units),
x,
sequence_length=sequence_lengths,
dtype=tf.float32)
x = tf.concat(outputs, axis=-1)
return x


def conv_dec(inputs,
sequence_lengths,
n_conv_layers,
filters,
kernel_size,
lstm_units,
is_training,
embedded_inputs_speaker2,
mask=None,
scope='conv_and_lstm'):
x = inputs
with tf.variable_scope(scope):
for i in range(n_conv_layers):
x = conv1d(
x,
filters,
kernel_size,
is_training,
activation=tf.nn.relu,
dropout=True,
mask=mask,
scope='conv1d_{}'.format(i))
x = tf.concat([x, embedded_inputs_speaker2], axis=2)
return x


def conv_and_ulstm(inputs,
sequence_lengths,
n_conv_layers,
filters,
kernel_size,
lstm_units,
is_training,
scope='conv_and_ulstm'):
x = inputs
with tf.variable_scope(scope):
for i in range(n_conv_layers):
x = conv1d(
x,
filters,
kernel_size,
is_training,
activation=tf.nn.relu,
dropout=True,
scope='conv1d_{}'.format(i))

outputs, states = tf.nn.dynamic_rnn(
LSTMBlockCell(lstm_units),
x,
sequence_length=sequence_lengths,
dtype=tf.float32)

return outputs


def conv1d(inputs,
filters,
kernel_size,
is_training,
activation=None,
dropout=False,
mask=None,
scope='conv1d'):
with tf.variable_scope(scope):
if mask is not None:
inputs = inputs * tf.expand_dims(mask, -1)
x = tf.layers.conv1d(
inputs, filters=filters, kernel_size=kernel_size, padding='same')
if mask is not None:
x = x * tf.expand_dims(mask, -1)

x = tf.layers.batch_normalization(x, training=is_training)
if activation is not None:
x = activation(x)
if dropout:
x = tf.layers.dropout(x, rate=0.5, training=is_training)
return x


def conv1d_dp(inputs,
filters,
kernel_size,
is_training,
activation=None,
dropout=False,
dropoutrate=0.5,
mask=None,
scope='conv1d'):
with tf.variable_scope(scope):
if mask is not None:
inputs = inputs * tf.expand_dims(mask, -1)
x = tf.layers.conv1d(
inputs, filters=filters, kernel_size=kernel_size, padding='same')
if mask is not None:
x = x * tf.expand_dims(mask, -1)

x = tf.contrib.layers.layer_norm(x)
if activation is not None:
x = activation(x)
if dropout:
x = tf.layers.dropout(x, rate=dropoutrate, training=is_training)
return x


def duration_predictor(inputs,
n_conv_layers,
filters,
kernel_size,
lstm_units,
input_lengths,
is_training,
embedded_inputs_speaker,
mask=None,
scope='duration_predictor'):
with tf.variable_scope(scope):
x = inputs
for i in range(n_conv_layers):
x = conv1d_dp(
x,
filters,
kernel_size,
is_training,
activation=tf.nn.relu,
dropout=True,
dropoutrate=0.1,
mask=mask,
scope='conv1d_{}'.format(i))

x = tf.concat([x, embedded_inputs_speaker], axis=2)

outputs, states = tf.nn.bidirectional_dynamic_rnn(
LSTMBlockCell(lstm_units),
LSTMBlockCell(lstm_units),
x,
sequence_length=input_lengths,
dtype=tf.float32)
x = tf.concat(outputs, axis=-1)

x = tf.layers.dense(x, units=1)
x = tf.nn.relu(x)
return x


def duration_predictor2(inputs,
n_conv_layers,
filters,
kernel_size,
input_lengths,
is_training,
mask=None,
scope='duration_predictor'):
with tf.variable_scope(scope):
x = inputs
for i in range(n_conv_layers):
x = conv1d_dp(
x,
filters,
kernel_size,
is_training,
activation=tf.nn.relu,
dropout=True,
dropoutrate=0.1,
mask=mask,
scope='conv1d_{}'.format(i))

x = tf.layers.dense(x, units=1)
x = tf.nn.relu(x)
return x


def conv_prenet(inputs,
n_conv_layers,
filters,
kernel_size,
is_training,
mask=None,
scope='conv_prenet'):
x = inputs
with tf.variable_scope(scope):
for i in range(n_conv_layers):
x = conv1d(
x,
filters,
kernel_size,
is_training,
activation=tf.nn.relu,
dropout=True,
mask=mask,
scope='conv1d_{}'.format(i))

return x
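A small sketch of the masked `conv1d` block above applied to a padded batch; it assumes a TensorFlow 1.x environment with `tf.contrib` available, and the shapes are illustrative.

```python
import tensorflow as tf
from modelscope.models.audio.tts.am.models.modules import conv1d

x = tf.random_normal([2, 50, 80])                               # [N, T, F]
mask = tf.sequence_mask([50, 30], maxlen=50, dtype=tf.float32)  # zero out padded frames
y = conv1d(x, filters=256, kernel_size=5, is_training=True,
           activation=tf.nn.relu, dropout=True, mask=mask, scope='demo_conv1d')
```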

+ 174 - 0 modelscope/models/audio/tts/am/models/position.py

@@ -0,0 +1,174 @@
"""Define position encoder classes."""

import abc
import math

import tensorflow as tf

from .reducer import SumReducer


class PositionEncoder(tf.keras.layers.Layer):
"""Base class for position encoders."""

def __init__(self, reducer=None, **kwargs):
"""Initializes the position encoder.
Args:
reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position
encodings. Defaults to :class:`opennmt.layers.SumReducer`.
**kwargs: Additional layer keyword arguments.
"""
super(PositionEncoder, self).__init__(**kwargs)
if reducer is None:
reducer = SumReducer(dtype=kwargs.get('dtype'))
self.reducer = reducer

def call(self, inputs, position=None): # pylint: disable=arguments-differ
"""Add position encodings to :obj:`inputs`.
Args:
inputs: The inputs to encode.
position: The single position to encode, to use when this layer is called
step by step.
Returns:
A ``tf.Tensor`` whose shape depends on the configured ``reducer``.
"""
batch_size = tf.shape(inputs)[0]
timesteps = tf.shape(inputs)[1]
input_dim = inputs.shape[-1].value
positions = tf.range(timesteps) + 1 if position is None else [position]
position_encoding = self._encode([positions], input_dim)
position_encoding = tf.tile(position_encoding, [batch_size, 1, 1])
return self.reducer([inputs, position_encoding])

@abc.abstractmethod
def _encode(self, positions, depth):
"""Creates position encodings.
Args:
positions: The positions to encode of shape :math:`[B, ...]`.
depth: The encoding depth :math:`D`.
Returns:
A ``tf.Tensor`` of shape :math:`[B, ..., D]`.
"""
raise NotImplementedError()


class PositionEmbedder(PositionEncoder):
"""Encodes position with a lookup table."""

def __init__(self, maximum_position=128, reducer=None, **kwargs):
"""Initializes the position encoder.
Args:
maximum_position: The maximum position to embed. Positions greater
than this value will be set to :obj:`maximum_position`.
reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position
encodings. Defaults to :class:`opennmt.layers.SumReducer`.
**kwargs: Additional layer keyword arguments.
"""
super(PositionEmbedder, self).__init__(reducer=reducer, **kwargs)
self.maximum_position = maximum_position
self.embedding = None

def build(self, input_shape):
shape = [self.maximum_position + 1, input_shape[-1]]
self.embedding = self.add_weight('position_embedding', shape)
super(PositionEmbedder, self).build(input_shape)

def _encode(self, positions, depth):
positions = tf.minimum(positions, self.maximum_position)
return tf.nn.embedding_lookup(self.embedding, positions)


class SinusoidalPositionEncoder(PositionEncoder):
"""Encodes positions with sine waves as described in
https://arxiv.org/abs/1706.03762.
"""

def _encode(self, positions, depth):
if depth % 2 != 0:
raise ValueError(
'SinusoidalPositionEncoder expects the depth to be divisible '
'by 2 but got %d' % depth)

batch_size = tf.shape(positions)[0]
positions = tf.cast(positions, tf.float32)

log_timescale_increment = math.log(10000) / (depth / 2 - 1)
inv_timescales = tf.exp(
tf.range(depth / 2, dtype=tf.float32) * -log_timescale_increment)
inv_timescales = tf.reshape(
tf.tile(inv_timescales, [batch_size]), [batch_size, depth // 2])
scaled_time = tf.expand_dims(positions, -1) * tf.expand_dims(
inv_timescales, 1)
encoding = tf.concat(
[tf.sin(scaled_time), tf.cos(scaled_time)], axis=2)
return tf.cast(encoding, self.dtype)


class SinusodalPositionalEncoding(tf.keras.layers.Layer):

def __init__(self, name='SinusodalPositionalEncoding'):
super(SinusodalPositionalEncoding, self).__init__(name=name)

@staticmethod
def positional_encoding(len, dim, step=1.):
"""
:param len: int scalar
:param dim: int scalar
:param step:
:return: position embedding
"""
pos_mat = tf.tile(
tf.expand_dims(
tf.range(0, tf.cast(len, dtype=tf.float32), dtype=tf.float32)
* step,
axis=-1), [1, dim])
dim_mat = tf.tile(
tf.expand_dims(
tf.range(0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32),
axis=0), [len, 1])
dim_mat_int = tf.cast(dim_mat, dtype=tf.int32)
pos_encoding = tf.where( # [time, dims]
tf.math.equal(tf.math.mod(dim_mat_int, 2), 0),
x=tf.math.sin(
pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))),
y=tf.math.cos(pos_mat
/ tf.pow(10000.,
(dim_mat - 1) / tf.cast(dim, tf.float32))))
return pos_encoding


class BatchSinusodalPositionalEncoding(tf.keras.layers.Layer):

def __init__(self, name='BatchSinusodalPositionalEncoding'):
super(BatchSinusodalPositionalEncoding, self).__init__(name=name)

@staticmethod
def positional_encoding(batch_size, len, dim, pos_mat, step=1.):
"""
:param len: int scalar
:param dim: int scalar
:param step:
:param pos_mat: [B, len] = [len, 1] * dim
:return: position embedding
"""
pos_mat = tf.tile(
tf.expand_dims(tf.cast(pos_mat, dtype=tf.float32) * step, axis=-1),
[1, 1, dim]) # [B, len, dim]

dim_mat = tf.tile(
tf.expand_dims(
tf.expand_dims(
tf.range(
0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32),
axis=0),
axis=0), [batch_size, len, 1]) # [B, len, dim]

dim_mat_int = tf.cast(dim_mat, dtype=tf.int32)
pos_encoding = tf.where( # [B, time, dims]
tf.math.equal(tf.mod(dim_mat_int, 2), 0),
x=tf.math.sin(
pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))),
y=tf.math.cos(pos_mat
/ tf.pow(10000.,
(dim_mat - 1) / tf.cast(dim, tf.float32))))
return pos_encoding
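A short sketch of building a sinusoidal position table with the static helper above (TensorFlow 1.x assumed; sizes are illustrative):

```python
import tensorflow as tf
from modelscope.models.audio.tts.am.models.position import SinusodalPositionalEncoding

# [100, 256] table: even dimensions use sin, odd dimensions use cos,
# matching the tf.where branch above.
table = SinusodalPositionalEncoding.positional_encoding(100, 256)
```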

+ 155 - 0 modelscope/models/audio/tts/am/models/reducer.py

@@ -0,0 +1,155 @@
"""Define reducers: objects that merge inputs."""

import abc
import functools

import tensorflow as tf


def pad_in_time(x, padding_length):
"""Helper function to pad a tensor in the time dimension and retain the static depth dimension."""
return tf.pad(x, [[0, 0], [0, padding_length], [0, 0]])


def align_in_time(x, length):
"""Aligns the time dimension of :obj:`x` with :obj:`length`."""
time_dim = tf.shape(x)[1]
return tf.cond(
tf.less(time_dim, length),
true_fn=lambda: pad_in_time(x, length - time_dim),
false_fn=lambda: x[:, :length])


def pad_with_identity(x,
sequence_length,
max_sequence_length,
identity_values=0,
maxlen=None):
"""Pads a tensor with identity values up to :obj:`max_sequence_length`.
Args:
x: A ``tf.Tensor`` of shape ``[batch_size, time, depth]``.
sequence_length: The true sequence length of :obj:`x`.
max_sequence_length: The sequence length up to which the tensor must contain
:obj:`identity_values`.
identity_values: The identity value.
maxlen: Size of the output time dimension. Default is the maximum value in
:obj:`max_sequence_length`.
Returns:
A ``tf.Tensor`` of shape ``[batch_size, maxlen, depth]``.
"""
if maxlen is None:
maxlen = tf.reduce_max(max_sequence_length)

mask = tf.sequence_mask(sequence_length, maxlen=maxlen, dtype=x.dtype)
mask = tf.expand_dims(mask, axis=-1)
mask_combined = tf.sequence_mask(
max_sequence_length, maxlen=maxlen, dtype=x.dtype)
mask_combined = tf.expand_dims(mask_combined, axis=-1)

identity_mask = mask_combined * (1.0 - mask)

x = pad_in_time(x, maxlen - tf.shape(x)[1])
x = x * mask + (identity_mask * identity_values)

return x


def pad_n_with_identity(inputs, sequence_lengths, identity_values=0):
"""Pads each input tensors with identity values up to
``max(sequence_lengths)`` for each batch.
Args:
inputs: A list of ``tf.Tensor``.
sequence_lengths: A list of sequence length.
identity_values: The identity value.
Returns:
A tuple ``(padded, max_sequence_length)``: a list of ``tf.Tensor`` where each
tensor is padded with identity values, and the combined sequence length.
"""
max_sequence_length = tf.reduce_max(sequence_lengths, axis=0)
maxlen = tf.reduce_max([tf.shape(x)[1] for x in inputs])
padded = [
pad_with_identity(
x,
length,
max_sequence_length,
identity_values=identity_values,
maxlen=maxlen) for x, length in zip(inputs, sequence_lengths)
]
return padded, max_sequence_length


class Reducer(tf.keras.layers.Layer):
"""Base class for reducers."""

def zip_and_reduce(self, x, y):
"""Zips the :obj:`x` with :obj:`y` structures together and reduces all
elements. If the structures are nested, they will be flattened first.
Args:
x: The first structure.
y: The second structure.
Returns:
The same structure as :obj:`x` and :obj:`y` where each element from
:obj:`x` is reduced with the corresponding element from :obj:`y`.
Raises:
ValueError: if the two structures are not the same.
"""
tf.nest.assert_same_structure(x, y)
x_flat = tf.nest.flatten(x)
y_flat = tf.nest.flatten(y)
reduced = list(map(self, zip(x_flat, y_flat)))
return tf.nest.pack_sequence_as(x, reduced)

def call(self, inputs, sequence_length=None): # pylint: disable=arguments-differ
"""Reduces all input elements.
Args:
inputs: A list of ``tf.Tensor``.
sequence_length: The length of each input, if reducing sequences.
Returns:
If :obj:`sequence_length` is set, a tuple
``(reduced_input, reduced_length)``, otherwise a reduced ``tf.Tensor``
only.
"""
if sequence_length is None:
return self.reduce(inputs)
else:
return self.reduce_sequence(
inputs, sequence_lengths=sequence_length)

@abc.abstractmethod
def reduce(self, inputs):
"""See :meth:`opennmt.layers.Reducer.__call__`."""
raise NotImplementedError()

@abc.abstractmethod
def reduce_sequence(self, inputs, sequence_lengths):
"""See :meth:`opennmt.layers.Reducer.__call__`."""
raise NotImplementedError()


class SumReducer(Reducer):
"""A reducer that sums the inputs."""

def reduce(self, inputs):
if len(inputs) == 1:
return inputs[0]
if len(inputs) == 2:
return inputs[0] + inputs[1]
return tf.add_n(inputs)

def reduce_sequence(self, inputs, sequence_lengths):
padded, combined_length = pad_n_with_identity(
inputs, sequence_lengths, identity_values=0)
return self.reduce(padded), combined_length


class MultiplyReducer(Reducer):
"""A reducer that multiplies the inputs."""

def reduce(self, inputs):
return functools.reduce(lambda a, x: a * x, inputs)

def reduce_sequence(self, inputs, sequence_lengths):
padded, combined_length = pad_n_with_identity(
inputs, sequence_lengths, identity_values=1)
return self.reduce(padded), combined_length
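A minimal sketch of the SumReducer defined above merging two feature tensors (shapes assumed):

```python
import tensorflow as tf
from modelscope.models.audio.tts.am.models.reducer import SumReducer

a = tf.ones([2, 5, 8])
b = tf.ones([2, 5, 8])
summed = SumReducer()([a, b])   # element-wise sum, shape [2, 5, 8]
```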

+ 240 - 0 modelscope/models/audio/tts/am/models/rnn_wrappers.py

@@ -0,0 +1,240 @@
import numpy as np
import tensorflow as tf
from tensorflow.contrib.rnn import RNNCell
from tensorflow.contrib.seq2seq import AttentionWrapperState
from tensorflow.python.ops import rnn_cell_impl

from .modules import prenet


class VarPredictorCell(RNNCell):
'''Wrapper wrapper knock knock.'''

def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
super(VarPredictorCell, self).__init__()
self._var_predictor_cell = var_predictor_cell
self._is_training = is_training
self._dim = dim
self._prenet_units = prenet_units

@property
def state_size(self):
return tuple([self.output_size, self._var_predictor_cell.state_size])

@property
def output_size(self):
return self._dim

def zero_state(self, batch_size, dtype):
return tuple([
rnn_cell_impl._zero_state_tensors(self.output_size, batch_size,
dtype),
self._var_predictor_cell.zero_state(batch_size, dtype)
])

def call(self, inputs, state):
'''Run the Tacotron2 super decoder cell.'''
super_cell_out, decoder_state = state

# split
prenet_input = inputs[:, 0:self._dim]
encoder_output = inputs[:, self._dim:]

# prenet and concat
prenet_output = prenet(
prenet_input,
self._prenet_units,
self._is_training,
scope='var_prenet')
decoder_input = tf.concat([prenet_output, encoder_output], axis=-1)

# decoder LSTM/GRU
new_super_cell_out, new_decoder_state = self._var_predictor_cell(
decoder_input, decoder_state)

# projection
new_super_cell_out = tf.layers.dense(
new_super_cell_out, units=self._dim)

new_states = tuple([new_super_cell_out, new_decoder_state])

return new_super_cell_out, new_states


class DurPredictorCell(RNNCell):
'''Wrapper wrapper knock knock.'''

def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
super(DurPredictorCell, self).__init__()
self._var_predictor_cell = var_predictor_cell
self._is_training = is_training
self._dim = dim
self._prenet_units = prenet_units

@property
def state_size(self):
return tuple([self.output_size, self._var_predictor_cell.state_size])

@property
def output_size(self):
return self._dim

def zero_state(self, batch_size, dtype):
return tuple([
rnn_cell_impl._zero_state_tensors(self.output_size, batch_size,
dtype),
self._var_predictor_cell.zero_state(batch_size, dtype)
])

def call(self, inputs, state):
'''Run the Tacotron2 super decoder cell.'''
super_cell_out, decoder_state = state

# split
prenet_input = inputs[:, 0:self._dim]
encoder_output = inputs[:, self._dim:]

# prenet and concat
prenet_output = prenet(
prenet_input,
self._prenet_units,
self._is_training,
scope='dur_prenet')
decoder_input = tf.concat([prenet_output, encoder_output], axis=-1)

# decoder LSTM/GRU
new_super_cell_out, new_decoder_state = self._var_predictor_cell(
decoder_input, decoder_state)

# projection
new_super_cell_out = tf.layers.dense(
new_super_cell_out, units=self._dim)
new_super_cell_out = tf.nn.relu(new_super_cell_out)
# new_super_cell_out = tf.log(tf.cast(tf.round(tf.exp(new_super_cell_out) - 1), tf.float32) + 1)

new_states = tuple([new_super_cell_out, new_decoder_state])

return new_super_cell_out, new_states


class DurPredictorCECell(RNNCell):
'''Wrapper wrapper knock knock.'''

def __init__(self, var_predictor_cell, is_training, dim, prenet_units,
max_dur, dur_embedding_dim):
super(DurPredictorCECell, self).__init__()
self._var_predictor_cell = var_predictor_cell
self._is_training = is_training
self._dim = dim
self._prenet_units = prenet_units
self._max_dur = max_dur
self._dur_embedding_dim = dur_embedding_dim

@property
def state_size(self):
return tuple([self.output_size, self._var_predictor_cell.state_size])

@property
def output_size(self):
return self._max_dur

def zero_state(self, batch_size, dtype):
return tuple([
rnn_cell_impl._zero_state_tensors(self.output_size, batch_size,
dtype),
self._var_predictor_cell.zero_state(batch_size, dtype)
])

def call(self, inputs, state):
'''Run the Tacotron2 super decoder cell.'''
super_cell_out, decoder_state = state

# split
prenet_input = tf.squeeze(
tf.cast(inputs[:, 0:self._dim], tf.int32), axis=-1) # [N]
prenet_input = tf.one_hot(
prenet_input, self._max_dur, on_value=1.0, off_value=0.0,
axis=-1) # [N, 120]
prenet_input = tf.layers.dense(
prenet_input, units=self._dur_embedding_dim)
encoder_output = inputs[:, self._dim:]

# prenet and concat
prenet_output = prenet(
prenet_input,
self._prenet_units,
self._is_training,
scope='dur_prenet')
decoder_input = tf.concat([prenet_output, encoder_output], axis=-1)

# decoder LSTM/GRU
new_super_cell_out, new_decoder_state = self._var_predictor_cell(
decoder_input, decoder_state)

# projection
new_super_cell_out = tf.layers.dense(
new_super_cell_out, units=self._max_dur) # [N, 120]
new_super_cell_out = tf.nn.softmax(new_super_cell_out) # [N, 120]

new_states = tuple([new_super_cell_out, new_decoder_state])

return new_super_cell_out, new_states


class VarPredictorCell2(RNNCell):
'''Wrapper wrapper knock knock.'''

def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
super(VarPredictorCell2, self).__init__()
self._var_predictor_cell = var_predictor_cell
self._is_training = is_training
self._dim = dim
self._prenet_units = prenet_units

@property
def state_size(self):
return tuple([self.output_size, self._var_predictor_cell.state_size])

@property
def output_size(self):
return self._dim

def zero_state(self, batch_size, dtype):
return tuple([
rnn_cell_impl._zero_state_tensors(self.output_size, batch_size,
dtype),
self._var_predictor_cell.zero_state(batch_size, dtype)
])

def call(self, inputs, state):
'''Run the Tacotron2 super decoder cell.'''
super_cell_out, decoder_state = state

# split
prenet_input = inputs[:, 0:self._dim]
encoder_output = inputs[:, self._dim:]

# prenet and concat
prenet_output = prenet(
prenet_input,
self._prenet_units,
self._is_training,
scope='var_prenet')
decoder_input = tf.concat([prenet_output, encoder_output], axis=-1)

# decoder LSTM/GRU
new_super_cell_out, new_decoder_state = self._var_predictor_cell(
decoder_input, decoder_state)

# projection
new_super_cell_out = tf.layers.dense(
new_super_cell_out, units=self._dim)

# split and relu
new_super_cell_out = tf.concat([
tf.nn.relu(new_super_cell_out[:, 0:1]), new_super_cell_out[:, 1:]
], axis=-1) # yapf:disable

new_states = tuple([new_super_cell_out, new_decoder_state])

return new_super_cell_out, new_states
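A construction sketch for the DurPredictorCell above (TensorFlow 1.x with tf.contrib assumed; unit sizes are illustrative), mirroring how robutrans.py wires it later in this commit:

```python
import tensorflow as tf
from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell
from modelscope.models.audio.tts.am.models.rnn_wrappers import DurPredictorCell

base_cell = MultiRNNCell([LSTMBlockCell(128), LSTMBlockCell(128)], state_is_tuple=True)
dur_cell = DurPredictorCell(base_cell, is_training=False, dim=1, prenet_units=[256, 256])
init_state = dur_cell.zero_state(batch_size=4, dtype=tf.float32)
```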

+ 760 - 0 modelscope/models/audio/tts/am/models/robutrans.py

@@ -0,0 +1,760 @@
import tensorflow as tf
from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell
from tensorflow.contrib.seq2seq import BasicDecoder
from tensorflow.python.ops.ragged.ragged_util import repeat

from .fsmn_encoder import FsmnEncoderV2
from .helpers import VarTestHelper, VarTrainingHelper
from .modules import conv_prenet, decoder_prenet, encoder_prenet
from .position import (BatchSinusodalPositionalEncoding,
SinusodalPositionalEncoding)
from .rnn_wrappers import DurPredictorCell, VarPredictorCell
from .self_attention_decoder import SelfAttentionDecoder
from .self_attention_encoder import SelfAttentionEncoder


class RobuTrans():

def __init__(self, hparams):
self._hparams = hparams

def initialize(self,
inputs,
inputs_emotion,
inputs_speaker,
input_lengths,
output_lengths=None,
mel_targets=None,
durations=None,
pitch_contours=None,
uv_masks=None,
pitch_scales=None,
duration_scales=None,
energy_contours=None,
energy_scales=None):
'''Initializes the model for inference.

Sets "mel_outputs", "linear_outputs", "stop_token_outputs", and "alignments" fields.

Args:
inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
steps in the input time series, and values are character IDs
input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
of each sequence in inputs.
output_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
of each sequence in outputs.
mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
of steps in the output time series, M is num_mels, and values are entries in the mel
spectrogram. Only needed for training.
'''
with tf.variable_scope('inference') as _:
is_training = mel_targets is not None
batch_size = tf.shape(inputs)[0]
hp = self._hparams

input_mask = None
if input_lengths is not None and is_training:
input_mask = tf.sequence_mask(
input_lengths, tf.shape(inputs)[1], dtype=tf.float32)

if input_mask is not None:
inputs = inputs * tf.expand_dims(input_mask, -1)

# speaker embedding
embedded_inputs_speaker = tf.layers.dense(
inputs_speaker,
32,
activation=None,
use_bias=False,
kernel_initializer=tf.truncated_normal_initializer(stddev=0.5))

# emotion embedding
embedded_inputs_emotion = tf.layers.dense(
inputs_emotion,
32,
activation=None,
use_bias=False,
kernel_initializer=tf.truncated_normal_initializer(stddev=0.5))

# symbol embedding
with tf.variable_scope('Embedding'):
embedded_inputs = tf.layers.dense(
inputs,
hp.embedding_dim,
activation=None,
use_bias=False,
kernel_initializer=tf.truncated_normal_initializer(
stddev=0.5))

# Encoder
with tf.variable_scope('Encoder'):
Encoder = SelfAttentionEncoder(
num_layers=hp.encoder_num_layers,
num_units=hp.encoder_num_units,
num_heads=hp.encoder_num_heads,
ffn_inner_dim=hp.encoder_ffn_inner_dim,
dropout=hp.encoder_dropout,
attention_dropout=hp.encoder_attention_dropout,
relu_dropout=hp.encoder_relu_dropout)
encoder_outputs, state_mo, sequence_length_mo, attns = Encoder.encode(
embedded_inputs,
sequence_length=input_lengths,
mode=is_training)
encoder_outputs = tf.layers.dense(
encoder_outputs,
hp.encoder_projection_units,
activation=None,
use_bias=False,
kernel_initializer=tf.truncated_normal_initializer(
stddev=0.5))

# pitch and energy
var_inputs = tf.concat([
encoder_outputs, embedded_inputs_speaker,
embedded_inputs_emotion
], 2)
if input_mask is not None:
var_inputs = var_inputs * tf.expand_dims(input_mask, -1)

with tf.variable_scope('Pitch_Predictor'):
Pitch_Predictor_FSMN = FsmnEncoderV2(
filter_size=hp.predictor_filter_size,
fsmn_num_layers=hp.predictor_fsmn_num_layers,
dnn_num_layers=hp.predictor_dnn_num_layers,
num_memory_units=hp.predictor_num_memory_units,
ffn_inner_dim=hp.predictor_ffn_inner_dim,
dropout=hp.predictor_dropout,
shift=hp.predictor_shift,
position_encoder=None)
pitch_contour_outputs, _, _ = Pitch_Predictor_FSMN.encode(
tf.concat([
encoder_outputs, embedded_inputs_speaker,
embedded_inputs_emotion
], 2),
sequence_length=input_lengths,
mode=is_training)
pitch_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
LSTMBlockCell(hp.predictor_lstm_units),
LSTMBlockCell(hp.predictor_lstm_units),
pitch_contour_outputs,
sequence_length=input_lengths,
dtype=tf.float32)
pitch_contour_outputs = tf.concat(
pitch_contour_outputs, axis=-1)
pitch_contour_outputs = tf.layers.dense(
pitch_contour_outputs, units=1) # [N, T_in, 1]
pitch_contour_outputs = tf.squeeze(
pitch_contour_outputs, axis=2) # [N, T_in]

with tf.variable_scope('Energy_Predictor'):
Energy_Predictor_FSMN = FsmnEncoderV2(
filter_size=hp.predictor_filter_size,
fsmn_num_layers=hp.predictor_fsmn_num_layers,
dnn_num_layers=hp.predictor_dnn_num_layers,
num_memory_units=hp.predictor_num_memory_units,
ffn_inner_dim=hp.predictor_ffn_inner_dim,
dropout=hp.predictor_dropout,
shift=hp.predictor_shift,
position_encoder=None)
energy_contour_outputs, _, _ = Energy_Predictor_FSMN.encode(
tf.concat([
encoder_outputs, embedded_inputs_speaker,
embedded_inputs_emotion
], 2),
sequence_length=input_lengths,
mode=is_training)
energy_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
LSTMBlockCell(hp.predictor_lstm_units),
LSTMBlockCell(hp.predictor_lstm_units),
energy_contour_outputs,
sequence_length=input_lengths,
dtype=tf.float32)
energy_contour_outputs = tf.concat(
energy_contour_outputs, axis=-1)
energy_contour_outputs = tf.layers.dense(
energy_contour_outputs, units=1) # [N, T_in, 1]
energy_contour_outputs = tf.squeeze(
energy_contour_outputs, axis=2) # [N, T_in]

if is_training:
pitch_embeddings = tf.expand_dims(
pitch_contours, axis=2) # [N, T_in, 1]
pitch_embeddings = tf.layers.conv1d(
pitch_embeddings,
filters=hp.encoder_projection_units,
kernel_size=9,
padding='same',
name='pitch_embeddings') # [N, T_in, 32]

energy_embeddings = tf.expand_dims(
energy_contours, axis=2) # [N, T_in, 1]
energy_embeddings = tf.layers.conv1d(
energy_embeddings,
filters=hp.encoder_projection_units,
kernel_size=9,
padding='same',
name='energy_embeddings') # [N, T_in, 32]
else:
pitch_contour_outputs *= pitch_scales
pitch_embeddings = tf.expand_dims(
pitch_contour_outputs, axis=2) # [N, T_in, 1]
pitch_embeddings = tf.layers.conv1d(
pitch_embeddings,
filters=hp.encoder_projection_units,
kernel_size=9,
padding='same',
name='pitch_embeddings') # [N, T_in, 32]

energy_contour_outputs *= energy_scales
energy_embeddings = tf.expand_dims(
energy_contour_outputs, axis=2) # [N, T_in, 1]
energy_embeddings = tf.layers.conv1d(
energy_embeddings,
filters=hp.encoder_projection_units,
kernel_size=9,
padding='same',
name='energy_embeddings') # [N, T_in, 32]

encoder_outputs_ = encoder_outputs + pitch_embeddings + energy_embeddings

# duration
dur_inputs = tf.concat([
encoder_outputs_, embedded_inputs_speaker,
embedded_inputs_emotion
], 2)
if input_mask is not None:
dur_inputs = dur_inputs * tf.expand_dims(input_mask, -1)
with tf.variable_scope('Duration_Predictor'):
duration_predictor_cell = MultiRNNCell([
LSTMBlockCell(hp.predictor_lstm_units),
LSTMBlockCell(hp.predictor_lstm_units)
], state_is_tuple=True) # yapf:disable
duration_output_cell = DurPredictorCell(
duration_predictor_cell, is_training, 1,
hp.predictor_prenet_units)
duration_predictor_init_state = duration_output_cell.zero_state(
batch_size=batch_size, dtype=tf.float32)
if is_training:
duration_helper = VarTrainingHelper(
tf.expand_dims(
tf.log(tf.cast(durations, tf.float32) + 1),
axis=2), dur_inputs, 1)
else:
duration_helper = VarTestHelper(batch_size, dur_inputs, 1)
(
duration_outputs, _
), final_duration_predictor_state, _ = tf.contrib.seq2seq.dynamic_decode(
BasicDecoder(duration_output_cell, duration_helper,
duration_predictor_init_state),
maximum_iterations=1000)
duration_outputs = tf.squeeze(
duration_outputs, axis=2) # [N, T_in]
if input_mask is not None:
duration_outputs = duration_outputs * input_mask
duration_outputs_ = tf.exp(duration_outputs) - 1

# Length Regulator
with tf.variable_scope('Length_Regulator'):
if is_training:
i = tf.constant(1)
# position embedding
j = tf.constant(1)
dur_len = tf.shape(durations)[-1]
embedded_position_i = tf.range(1, durations[0, 0] + 1)

def condition_pos(j, e):
return tf.less(j, dur_len)

def loop_body_pos(j, embedded_position_i):
embedded_position_i = tf.concat([
embedded_position_i,
tf.range(1, durations[0, j] + 1)
], axis=0) # yapf:disable
return [j + 1, embedded_position_i]

j, embedded_position_i = tf.while_loop(
condition_pos,
loop_body_pos, [j, embedded_position_i],
shape_invariants=[
j.get_shape(),
tf.TensorShape([None])
])
embedded_position = tf.reshape(embedded_position_i,
(1, -1))

# others
LR_outputs = repeat(
encoder_outputs_[0:1, :, :], durations[0, :], axis=1)
embedded_outputs_speaker = repeat(
embedded_inputs_speaker[0:1, :, :],
durations[0, :],
axis=1)
embedded_outputs_emotion = repeat(
embedded_inputs_emotion[0:1, :, :],
durations[0, :],
axis=1)

def condition(i, pos, layer, s, e):
return tf.less(i, tf.shape(mel_targets)[0])

def loop_body(i, embedded_position, LR_outputs,
embedded_outputs_speaker,
embedded_outputs_emotion):
# position embedding
jj = tf.constant(1)
embedded_position_i = tf.range(1, durations[i, 0] + 1)

def condition_pos_i(j, e):
return tf.less(j, dur_len)

def loop_body_pos_i(j, embedded_position_i):
embedded_position_i = tf.concat([
embedded_position_i,
tf.range(1, durations[i, j] + 1)
], axis=0) # yapf:disable
return [j + 1, embedded_position_i]

jj, embedded_position_i = tf.while_loop(
condition_pos_i,
loop_body_pos_i, [jj, embedded_position_i],
shape_invariants=[
jj.get_shape(),
tf.TensorShape([None])
])
embedded_position = tf.concat([
embedded_position,
tf.reshape(embedded_position_i, (1, -1))
], 0)

# others
LR_outputs = tf.concat([
LR_outputs,
repeat(
encoder_outputs_[i:i + 1, :, :],
durations[i, :],
axis=1)
], 0)
embedded_outputs_speaker = tf.concat([
embedded_outputs_speaker,
repeat(
embedded_inputs_speaker[i:i + 1, :, :],
durations[i, :],
axis=1)
], 0)
embedded_outputs_emotion = tf.concat([
embedded_outputs_emotion,
repeat(
embedded_inputs_emotion[i:i + 1, :, :],
durations[i, :],
axis=1)
], 0)
return [
i + 1, embedded_position, LR_outputs,
embedded_outputs_speaker, embedded_outputs_emotion
]

                    (i, embedded_position, LR_outputs,
                     embedded_outputs_speaker,
                     embedded_outputs_emotion) = tf.while_loop(
condition,
loop_body, [
i, embedded_position, LR_outputs,
embedded_outputs_speaker, embedded_outputs_emotion
],
shape_invariants=[
i.get_shape(),
tf.TensorShape([None, None]),
tf.TensorShape([None, None, None]),
tf.TensorShape([None, None, None]),
tf.TensorShape([None, None, None])
],
parallel_iterations=hp.batch_size)

ori_framenum = tf.shape(mel_targets)[1]
else:
# position
j = tf.constant(1)
dur_len = tf.shape(duration_outputs_)[-1]
embedded_position_i = tf.range(
1,
tf.cast(tf.round(duration_outputs_)[0, 0], tf.int32)
+ 1)

def condition_pos(j, e):
return tf.less(j, dur_len)

def loop_body_pos(j, embedded_position_i):
embedded_position_i = tf.concat([
embedded_position_i,
tf.range(
1,
tf.cast(
tf.round(duration_outputs_)[0, j],
tf.int32) + 1)
], axis=0) # yapf:disable
return [j + 1, embedded_position_i]

j, embedded_position_i = tf.while_loop(
condition_pos,
loop_body_pos, [j, embedded_position_i],
shape_invariants=[
j.get_shape(),
tf.TensorShape([None])
])
embedded_position = tf.reshape(embedded_position_i,
(1, -1))
# others
duration_outputs_ *= duration_scales
LR_outputs = repeat(
encoder_outputs_[0:1, :, :],
tf.cast(tf.round(duration_outputs_)[0, :], tf.int32),
axis=1)
embedded_outputs_speaker = repeat(
embedded_inputs_speaker[0:1, :, :],
tf.cast(tf.round(duration_outputs_)[0, :], tf.int32),
axis=1)
embedded_outputs_emotion = repeat(
embedded_inputs_emotion[0:1, :, :],
tf.cast(tf.round(duration_outputs_)[0, :], tf.int32),
axis=1)
ori_framenum = tf.shape(LR_outputs)[1]

left = hp.outputs_per_step - tf.mod(
ori_framenum, hp.outputs_per_step)
LR_outputs = tf.cond(
tf.equal(left,
hp.outputs_per_step), lambda: LR_outputs,
lambda: tf.pad(LR_outputs, [[0, 0], [0, left], [0, 0]],
'CONSTANT'))
embedded_outputs_speaker = tf.cond(
tf.equal(left, hp.outputs_per_step),
lambda: embedded_outputs_speaker, lambda: tf.pad(
embedded_outputs_speaker, [[0, 0], [0, left],
[0, 0]], 'CONSTANT'))
embedded_outputs_emotion = tf.cond(
tf.equal(left, hp.outputs_per_step),
lambda: embedded_outputs_emotion, lambda: tf.pad(
embedded_outputs_emotion, [[0, 0], [0, left],
[0, 0]], 'CONSTANT'))
embedded_position = tf.cond(
tf.equal(left, hp.outputs_per_step),
lambda: embedded_position,
lambda: tf.pad(embedded_position, [[0, 0], [0, left]],
'CONSTANT'))

# Pos_Embedding
with tf.variable_scope('Position_Embedding'):
Pos_Embedding = BatchSinusodalPositionalEncoding()
position_embeddings = Pos_Embedding.positional_encoding(
batch_size,
tf.shape(LR_outputs)[1], hp.encoder_projection_units,
embedded_position)
LR_outputs += position_embeddings

# multi-frame
LR_outputs = tf.reshape(LR_outputs, [
batch_size, -1,
hp.outputs_per_step * hp.encoder_projection_units
])
embedded_outputs_speaker = tf.reshape(
embedded_outputs_speaker,
[batch_size, -1, hp.outputs_per_step * 32])[:, :, :32]
embedded_outputs_emotion = tf.reshape(
embedded_outputs_emotion,
[batch_size, -1, hp.outputs_per_step * 32])[:, :, :32]
# [N, T_out, D_LR_outputs] (D_LR_outputs = hp.outputs_per_step * hp.encoder_projection_units + 64)
LR_outputs = tf.concat([
LR_outputs, embedded_outputs_speaker, embedded_outputs_emotion
], -1)

# auto bandwidth
if is_training:
durations_mask = tf.cast(durations,
tf.float32) * input_mask # [N, T_in]
else:
durations_mask = duration_outputs_
X_band_width = tf.cast(
tf.round(tf.reduce_max(durations_mask) / hp.outputs_per_step),
tf.int32)
H_band_width = X_band_width

with tf.variable_scope('Decoder'):
Decoder = SelfAttentionDecoder(
num_layers=hp.decoder_num_layers,
num_units=hp.decoder_num_units,
num_heads=hp.decoder_num_heads,
ffn_inner_dim=hp.decoder_ffn_inner_dim,
dropout=hp.decoder_dropout,
attention_dropout=hp.decoder_attention_dropout,
relu_dropout=hp.decoder_relu_dropout,
prenet_units=hp.prenet_units,
dense_units=hp.prenet_proj_units,
num_mels=hp.num_mels,
outputs_per_step=hp.outputs_per_step,
X_band_width=X_band_width,
H_band_width=H_band_width,
position_encoder=None)
if is_training:
if hp.free_run:
r = hp.outputs_per_step
init_decoder_input = tf.expand_dims(
tf.tile([[0.0]], [batch_size, hp.num_mels]),
axis=1) # [N, 1, hp.num_mels]
decoder_input_lengths = tf.cast(
output_lengths / r, tf.int32)
decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search(
init_decoder_input,
maximum_iterations=tf.shape(LR_outputs)[1],
mode=is_training,
memory=LR_outputs,
memory_sequence_length=decoder_input_lengths)
else:
r = hp.outputs_per_step
                    decoder_input = mel_targets[:, r - 1::r, :]  # [N, T_out / r, hp.num_mels]
init_decoder_input = tf.expand_dims(
tf.tile([[0.0]], [batch_size, hp.num_mels]),
axis=1) # [N, 1, hp.num_mels]
decoder_input = tf.concat(
[init_decoder_input, decoder_input],
axis=1) # [N, T_out / r + 1, hp.num_mels]
                    decoder_input = decoder_input[:, :-1, :]  # [N, T_out / r, hp.num_mels]
decoder_input_lengths = tf.cast(
output_lengths / r, tf.int32)
decoder_outputs, attention_x, attention_h = Decoder.decode_from_inputs(
decoder_input,
decoder_input_lengths,
mode=is_training,
memory=LR_outputs,
memory_sequence_length=decoder_input_lengths)
else:
init_decoder_input = tf.expand_dims(
tf.tile([[0.0]], [batch_size, hp.num_mels]),
axis=1) # [N, 1, hp.num_mels]
decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search(
init_decoder_input,
maximum_iterations=tf.shape(LR_outputs)[1],
mode=is_training,
memory=LR_outputs,
memory_sequence_length=tf.expand_dims(
tf.shape(LR_outputs)[1], axis=0))

if is_training:
mel_outputs_ = tf.reshape(decoder_outputs,
[batch_size, -1, hp.num_mels])
else:
mel_outputs_ = tf.reshape(
decoder_outputs,
[batch_size, -1, hp.num_mels])[:, :ori_framenum, :]
mel_outputs = mel_outputs_

with tf.variable_scope('Postnet'):
Postnet_FSMN = FsmnEncoderV2(
filter_size=hp.postnet_filter_size,
fsmn_num_layers=hp.postnet_fsmn_num_layers,
dnn_num_layers=hp.postnet_dnn_num_layers,
num_memory_units=hp.postnet_num_memory_units,
ffn_inner_dim=hp.postnet_ffn_inner_dim,
dropout=hp.postnet_dropout,
shift=hp.postnet_shift,
position_encoder=None)
if is_training:
postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode(
mel_outputs,
sequence_length=output_lengths,
mode=is_training)
hidden_lstm_outputs, _ = tf.nn.dynamic_rnn(
LSTMBlockCell(hp.postnet_lstm_units),
postnet_fsmn_outputs,
sequence_length=output_lengths,
dtype=tf.float32)
else:
postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode(
mel_outputs,
sequence_length=[tf.shape(mel_outputs_)[1]],
mode=is_training)
hidden_lstm_outputs, _ = tf.nn.dynamic_rnn(
LSTMBlockCell(hp.postnet_lstm_units),
postnet_fsmn_outputs,
sequence_length=[tf.shape(mel_outputs_)[1]],
dtype=tf.float32)

mel_residual_outputs = tf.layers.dense(
hidden_lstm_outputs, units=hp.num_mels)
mel_outputs += mel_residual_outputs

self.inputs = inputs
self.inputs_speaker = inputs_speaker
self.inputs_emotion = inputs_emotion
self.input_lengths = input_lengths
self.durations = durations
self.output_lengths = output_lengths
self.mel_outputs_ = mel_outputs_
self.mel_outputs = mel_outputs
self.mel_targets = mel_targets
self.duration_outputs = duration_outputs
self.duration_outputs_ = duration_outputs_
self.duration_scales = duration_scales
self.pitch_contour_outputs = pitch_contour_outputs
self.pitch_contours = pitch_contours
self.pitch_scales = pitch_scales
self.energy_contour_outputs = energy_contour_outputs
self.energy_contours = energy_contours
self.energy_scales = energy_scales
self.uv_masks_ = uv_masks

self.embedded_inputs_emotion = embedded_inputs_emotion
self.embedding_fsmn_outputs = embedded_inputs
self.encoder_outputs = encoder_outputs
self.encoder_outputs_ = encoder_outputs_
self.LR_outputs = LR_outputs
self.postnet_fsmn_outputs = postnet_fsmn_outputs

self.pitch_embeddings = pitch_embeddings
self.energy_embeddings = energy_embeddings

self.attns = attns
self.attention_x = attention_x
self.attention_h = attention_h
self.X_band_width = X_band_width
self.H_band_width = H_band_width

def add_loss(self):
'''Adds loss to the model. Sets "loss" field. initialize must have been called.'''
with tf.variable_scope('loss') as _:
hp = self._hparams
mask = tf.sequence_mask(
self.output_lengths,
tf.shape(self.mel_targets)[1],
dtype=tf.float32)
valid_outputs = tf.reduce_sum(mask)

mask_input = tf.sequence_mask(
self.input_lengths,
tf.shape(self.durations)[1],
dtype=tf.float32)
valid_inputs = tf.reduce_sum(mask_input)

# mel loss
if self.uv_masks_ is not None:
valid_outputs_mask = tf.reduce_sum(
tf.expand_dims(mask, -1) * self.uv_masks_)
self.mel_loss_ = tf.reduce_sum(
tf.abs(self.mel_targets - self.mel_outputs_)
* tf.expand_dims(mask, -1) * self.uv_masks_) / (
valid_outputs_mask * hp.num_mels)
self.mel_loss = tf.reduce_sum(
tf.abs(self.mel_targets - self.mel_outputs)
* tf.expand_dims(mask, -1) * self.uv_masks_) / (
valid_outputs_mask * hp.num_mels)
else:
self.mel_loss_ = tf.reduce_sum(
tf.abs(self.mel_targets - self.mel_outputs_)
* tf.expand_dims(mask, -1)) / (
valid_outputs * hp.num_mels)
self.mel_loss = tf.reduce_sum(
tf.abs(self.mel_targets - self.mel_outputs)
* tf.expand_dims(mask, -1)) / (
valid_outputs * hp.num_mels)

# duration loss
self.duration_loss = tf.reduce_sum(
tf.abs(
tf.log(tf.cast(self.durations, tf.float32) + 1)
- self.duration_outputs) * mask_input) / valid_inputs

# pitch contour loss
self.pitch_contour_loss = tf.reduce_sum(
tf.abs(self.pitch_contours - self.pitch_contour_outputs)
* mask_input) / valid_inputs

# energy contour loss
self.energy_contour_loss = tf.reduce_sum(
tf.abs(self.energy_contours - self.energy_contour_outputs)
* mask_input) / valid_inputs

# final loss
self.loss = self.mel_loss_ + self.mel_loss + self.duration_loss \
+ self.pitch_contour_loss + self.energy_contour_loss

# guided attention loss
self.guided_attention_loss = tf.constant(0.0)
if hp.guided_attention:
i0 = tf.constant(0)
loss0 = tf.constant(0.0)

def c(i, _):
                    return tf.less(i, tf.shape(self.mel_targets)[0])

def loop_body(i, loss):
decoder_input_lengths = tf.cast(
self.output_lengths / hp.outputs_per_step, tf.int32)
input_len = decoder_input_lengths[i]
output_len = decoder_input_lengths[i]
input_w = tf.expand_dims(
tf.range(tf.cast(input_len, dtype=tf.float32)),
axis=1) / tf.cast(
input_len, dtype=tf.float32) # [T_in, 1]
output_w = tf.expand_dims(
tf.range(tf.cast(output_len, dtype=tf.float32)),
axis=0) / tf.cast(
output_len, dtype=tf.float32) # [1, T_out]
guided_attention_w = 1.0 - tf.exp(
-(1 / hp.guided_attention_2g_squared)
* tf.square(input_w - output_w)) # [T_in, T_out]
guided_attention_w = tf.expand_dims(
guided_attention_w, axis=0) # [1, T_in, T_out]
# [hp.decoder_num_heads, T_in, T_out]
guided_attention_w = tf.tile(guided_attention_w,
[hp.decoder_num_heads, 1, 1])
loss_i = tf.constant(0.0)
for j in range(hp.decoder_num_layers):
loss_i += tf.reduce_mean(
self.attention_h[j][i, :, :input_len, :output_len]
* guided_attention_w)

return [tf.add(i, 1), tf.add(loss, loss_i)]

_, loss = tf.while_loop(
c,
loop_body,
loop_vars=[i0, loss0],
parallel_iterations=hp.batch_size)
self.guided_attention_loss = loss / hp.batch_size
self.loss += hp.guided_attention_loss_weight * self.guided_attention_loss

def add_optimizer(self, global_step):
'''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called.

Args:
global_step: int32 scalar Tensor representing current global step in training
'''
with tf.variable_scope('optimizer') as _:
hp = self._hparams
if hp.decay_learning_rate:
self.learning_rate = _learning_rate_decay(
hp.initial_learning_rate, global_step)
else:
self.learning_rate = tf.convert_to_tensor(
hp.initial_learning_rate)
optimizer = tf.train.AdamOptimizer(self.learning_rate,
hp.adam_beta1, hp.adam_beta2)
gradients, variables = zip(*optimizer.compute_gradients(self.loss))
self.gradients = gradients
clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)

# Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See:
# https://github.com/tensorflow/tensorflow/issues/1122
with tf.control_dependencies(
tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
self.optimize = optimizer.apply_gradients(
zip(clipped_gradients, variables), global_step=global_step)


def _learning_rate_decay(init_lr, global_step):
# Noam scheme from tensor2tensor:
warmup_steps = 4000.0
step = tf.cast(global_step + 1, dtype=tf.float32)
return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5,
step**-0.5)
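
For intuition, here is a minimal pure-Python sketch of the Noam-style schedule implemented by _learning_rate_decay above (warmup_steps fixed at 4000); the base rate and the sample steps are arbitrary.

# Illustrative re-implementation of the schedule above, for inspection outside the graph.
def noam_lr(init_lr, global_step, warmup_steps=4000.0):
    step = float(global_step + 1)
    return init_lr * warmup_steps ** 0.5 * min(step * warmup_steps ** -1.5, step ** -0.5)

for s in (0, 1000, 4000, 40000):
    print(s, noam_lr(1e-3, s))   # ramps up roughly linearly during warmup, then decays as 1/sqrt(step)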

+ 817
- 0
modelscope/models/audio/tts/am/models/self_attention_decoder.py View File

@@ -0,0 +1,817 @@
"""Define self-attention decoder."""

import sys

import tensorflow as tf

from . import compat, transformer
from .modules import decoder_prenet
from .position import SinusoidalPositionEncoder


class SelfAttentionDecoder():
"""Decoder using self-attention as described in
https://arxiv.org/abs/1706.03762.
"""

def __init__(self,
num_layers,
num_units=512,
num_heads=8,
ffn_inner_dim=2048,
dropout=0.1,
attention_dropout=0.1,
relu_dropout=0.1,
prenet_units=256,
dense_units=128,
num_mels=80,
outputs_per_step=3,
X_band_width=None,
H_band_width=None,
position_encoder=SinusoidalPositionEncoder(),
self_attention_type='scaled_dot'):
"""Initializes the parameters of the decoder.

Args:
num_layers: The number of layers.
num_units: The number of hidden units.
num_heads: The number of heads in the multi-head attention.
ffn_inner_dim: The number of units of the inner linear transformation
in the feed forward layer.
dropout: The probability to drop units from the outputs.
attention_dropout: The probability to drop units from the attention.
relu_dropout: The probability to drop units from the ReLU activation in
the feed forward layer.
position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to
apply on inputs or ``None``.
self_attention_type: Type of self attention, "scaled_dot" or "average" (case
insensitive).

Raises:
ValueError: if :obj:`self_attention_type` is invalid.
"""
super(SelfAttentionDecoder, self).__init__()
self.num_layers = num_layers
self.num_units = num_units
self.num_heads = num_heads
self.ffn_inner_dim = ffn_inner_dim
self.dropout = dropout
self.attention_dropout = attention_dropout
self.relu_dropout = relu_dropout
self.position_encoder = position_encoder
self.self_attention_type = self_attention_type.lower()
if self.self_attention_type not in ('scaled_dot', 'average'):
raise ValueError('invalid attention type %s'
% self.self_attention_type)
if self.self_attention_type == 'average':
tf.logging.warning(
'Support for average attention network is experimental '
'and may change in future versions.')
self.prenet_units = prenet_units
self.dense_units = dense_units
self.num_mels = num_mels
self.outputs_per_step = outputs_per_step
self.X_band_width = X_band_width
self.H_band_width = H_band_width

@property
def output_size(self):
"""Returns the decoder output size."""
return self.num_units

@property
def support_alignment_history(self):
return True

@property
def support_multi_source(self):
return True

def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1):
cache = {}

for layer in range(self.num_layers):
proj_cache_shape = [
batch_size, self.num_heads, 0, self.num_units // self.num_heads
]
layer_cache = {}
layer_cache['memory'] = [{
'memory_keys':
tf.zeros(proj_cache_shape, dtype=dtype),
'memory_values':
tf.zeros(proj_cache_shape, dtype=dtype)
} for _ in range(num_sources)]
if self.self_attention_type == 'scaled_dot':
layer_cache['self_keys'] = tf.zeros(
proj_cache_shape, dtype=dtype)
layer_cache['self_values'] = tf.zeros(
proj_cache_shape, dtype=dtype)
elif self.self_attention_type == 'average':
layer_cache['prev_g'] = tf.zeros(
[batch_size, 1, self.num_units], dtype=dtype)
cache['layer_{}'.format(layer)] = layer_cache

return cache

def _init_attn(self, dtype=tf.float32):
attn = []
for layer in range(self.num_layers):
attn.append(tf.TensorArray(tf.float32, size=0, dynamic_size=True))
return attn

def _self_attention_stack(self,
inputs,
sequence_length=None,
mode=True,
cache=None,
memory=None,
memory_sequence_length=None,
step=None):

# [N, T_out, self.dense_units] or [N, 1, self.dense_units]
prenet_outputs = decoder_prenet(inputs, self.prenet_units,
self.dense_units, mode)
if step is None:
decoder_inputs = tf.concat(
[memory, prenet_outputs],
axis=-1) # [N, T_out, memory_size + self.dense_units]
else:
decoder_inputs = tf.concat(
[memory[:, step:step + 1, :], prenet_outputs],
axis=-1) # [N, 1, memory_size + self.dense_units]
decoder_inputs = tf.layers.dense(
decoder_inputs, units=self.dense_units)

inputs = decoder_inputs
inputs *= self.num_units**0.5
if self.position_encoder is not None:
inputs = self.position_encoder(
inputs, position=step + 1 if step is not None else None)

inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)

decoder_mask = None
memory_mask = None
# last_attention = None

X_band_width_tmp = -1
H_band_width_tmp = -1
if self.X_band_width is not None:
X_band_width_tmp = tf.cast(
tf.cond(
tf.less(tf.shape(memory)[1], self.X_band_width),
lambda: -1, lambda: self.X_band_width),
dtype=tf.int64)
if self.H_band_width is not None:
H_band_width_tmp = tf.cast(
tf.cond(
tf.less(tf.shape(memory)[1], self.H_band_width),
lambda: -1, lambda: self.H_band_width),
dtype=tf.int64)

if self.self_attention_type == 'scaled_dot':
if sequence_length is not None:
decoder_mask = transformer.build_future_mask(
sequence_length,
num_heads=self.num_heads,
maximum_length=tf.shape(inputs)[1],
band=X_band_width_tmp) # [N, 1, T_out, T_out]
elif self.self_attention_type == 'average':
if cache is None:
if sequence_length is None:
sequence_length = tf.fill([tf.shape(inputs)[0]],
tf.shape(inputs)[1])
decoder_mask = transformer.cumulative_average_mask(
sequence_length,
maximum_length=tf.shape(inputs)[1],
dtype=inputs.dtype)

if memory is not None and not tf.contrib.framework.nest.is_sequence(
memory):
memory = (memory, )
if memory_sequence_length is not None:
if not tf.contrib.framework.nest.is_sequence(
memory_sequence_length):
memory_sequence_length = (memory_sequence_length, )
if step is None:
memory_mask = [
transformer.build_history_mask(
length,
num_heads=self.num_heads,
maximum_length=tf.shape(m)[1],
band=H_band_width_tmp)
for m, length in zip(memory, memory_sequence_length)
]
else:
memory_mask = [
transformer.build_history_mask(
length,
num_heads=self.num_heads,
maximum_length=tf.shape(m)[1],
band=H_band_width_tmp)[:, :, step:step + 1, :]
for m, length in zip(memory, memory_sequence_length)
]

# last_attention = None
attns_x = []
attns_h = []
for layer in range(self.num_layers):
layer_name = 'layer_{}'.format(layer)
layer_cache = cache[layer_name] if cache is not None else None
with tf.variable_scope(layer_name):
if memory is not None:
for i, (mem, mask) in enumerate(zip(memory, memory_mask)):
memory_cache = None
if layer_cache is not None:
memory_cache = layer_cache['memory'][i]
scope_name = 'multi_head_{}'.format(i)
if i == 0:
scope_name = 'multi_head'
with tf.variable_scope(scope_name):
encoded, attn_x, attn_h = transformer.multi_head_attention_PNCA(
self.num_heads,
transformer.norm(inputs),
mem,
mode,
num_units=self.num_units,
mask=decoder_mask,
mask_h=mask,
cache=layer_cache,
cache_h=memory_cache,
dropout=self.attention_dropout,
return_attention=True,
layer_name=layer_name,
X_band_width=self.X_band_width)
attns_x.append(attn_x)
attns_h.append(attn_h)
context = transformer.drop_and_add(
inputs, encoded, mode, dropout=self.dropout)

with tf.variable_scope('ffn'):
transformed = transformer.feed_forward_ori(
transformer.norm(context),
self.ffn_inner_dim,
mode,
dropout=self.relu_dropout)
transformed = transformer.drop_and_add(
context, transformed, mode, dropout=self.dropout)

inputs = transformed

outputs = transformer.norm(inputs)
outputs = tf.layers.dense(
outputs, units=self.num_mels * self.outputs_per_step)
return outputs, attns_x, attns_h

def decode_from_inputs(self,
inputs,
sequence_length,
initial_state=None,
mode=True,
memory=None,
memory_sequence_length=None):
outputs, attention_x, attention_h = self._self_attention_stack(
inputs,
sequence_length=sequence_length,
mode=mode,
memory=memory,
memory_sequence_length=memory_sequence_length)
return outputs, attention_x, attention_h

def step_fn(self,
mode,
batch_size,
initial_state=None,
memory=None,
memory_sequence_length=None,
dtype=tf.float32):
if memory is None:
num_sources = 0
elif tf.contrib.framework.nest.is_sequence(memory):
num_sources = len(memory)
else:
num_sources = 1
cache = self._init_cache(
batch_size, dtype=dtype, num_sources=num_sources)
attention_x = self._init_attn(dtype=dtype)
attention_h = self._init_attn(dtype=dtype)

def _fn(step, inputs, cache):
outputs, attention_x, attention_h = self._self_attention_stack(
inputs,
mode=mode,
cache=cache,
memory=memory,
memory_sequence_length=memory_sequence_length,
step=step)
attention_x_tmp = []
for layer in range(len(attention_h)):
attention_x_tmp_l = tf.zeros_like(attention_h[layer])
if self.X_band_width is not None:
pred = tf.less(step, self.X_band_width + 1)
attention_x_tmp_l_1 = tf.cond(pred, # yapf:disable
lambda: attention_x_tmp_l[:, :, :, :step + 1] + attention_x[layer],
lambda: tf.concat([
attention_x_tmp_l[:, :, :,
:step - self.X_band_width],
attention_x_tmp_l[:, :, :,
step - self.X_band_width:step + 1]
+ attention_x[layer]],
axis=-1)) # yapf:disable
attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:]
attention_x_tmp.append(
tf.concat([attention_x_tmp_l_1, attention_x_tmp_l_2],
axis=-1))
else:
attention_x_tmp_l_1 = attention_x_tmp_l[:, :, :, :step + 1]
attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:]
attention_x_tmp.append(
tf.concat([
attention_x_tmp_l_1 + attention_x[layer],
attention_x_tmp_l_2
], axis=-1)) # yapf:disable
attention_x = attention_x_tmp
return outputs, cache, attention_x, attention_h

return _fn, cache, attention_x, attention_h

def dynamic_decode_and_search(self, init_decoder_input, maximum_iterations,
mode, memory, memory_sequence_length):
batch_size = tf.shape(init_decoder_input)[0]
step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn(
mode,
batch_size,
memory=memory,
memory_sequence_length=memory_sequence_length)

outputs, attention_x, attention_h, cache = self.dynamic_decode(
step_fn,
init_decoder_input,
init_cache=init_cache,
init_attn_x=init_attn_x,
init_attn_h=init_attn_h,
maximum_iterations=maximum_iterations,
batch_size=batch_size)
return outputs, attention_x, attention_h

def dynamic_decode_and_search_teacher_forcing(self, decoder_input,
maximum_iterations, mode,
memory,
memory_sequence_length):
batch_size = tf.shape(decoder_input)[0]
step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn(
mode,
batch_size,
memory=memory,
memory_sequence_length=memory_sequence_length)

outputs, attention_x, attention_h, cache = self.dynamic_decode_teacher_forcing(
step_fn,
decoder_input,
init_cache=init_cache,
init_attn_x=init_attn_x,
init_attn_h=init_attn_h,
maximum_iterations=maximum_iterations,
batch_size=batch_size)
return outputs, attention_x, attention_h

def dynamic_decode(self,
step_fn,
init_decoder_input,
init_cache=None,
init_attn_x=None,
init_attn_h=None,
maximum_iterations=None,
batch_size=None):

def _cond(step, cache, inputs, outputs, attention_x, attention_h): # pylint: disable=unused-argument
return tf.less(step, maximum_iterations)

def _body(step, cache, inputs, outputs, attention_x, attention_h):
# output: [1, 1, num_mels * r]
# attn: [1, 1, T_out]
output, cache, attn_x, attn_h = step_fn(
step, inputs, cache) # outputs, cache, attention, attns
for layer in range(len(attention_x)):
attention_x[layer] = attention_x[layer].write(
step, tf.cast(attn_x[layer], tf.float32))

for layer in range(len(attention_h)):
attention_h[layer] = attention_h[layer].write(
step, tf.cast(attn_h[layer], tf.float32))

outputs = outputs.write(step, tf.cast(output, tf.float32))
            return (step + 1, cache, output[:, :, -self.num_mels:], outputs,
                    attention_x, attention_h)

step = tf.constant(0, dtype=tf.int32)
outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True)

_, cache, _, outputs, attention_x, attention_h = tf.while_loop(
_cond,
_body,
loop_vars=(step, init_cache, init_decoder_input, outputs,
init_attn_x, init_attn_h),
shape_invariants=(step.shape,
compat.nest.map_structure(
self._get_shape_invariants, init_cache),
compat.nest.map_structure(
self._get_shape_invariants,
init_decoder_input), tf.TensorShape(None),
compat.nest.map_structure(
self._get_shape_invariants, init_attn_x),
compat.nest.map_structure(
self._get_shape_invariants, init_attn_h)),
parallel_iterations=1,
back_prop=False,
maximum_iterations=maximum_iterations)
# element of outputs: [N, 1, num_mels * r]
outputs_stack = outputs.stack() # [T_out, N, 1, num_mels * r]
outputs_stack = tf.transpose(
outputs_stack, perm=[2, 1, 0, 3]) # [1, N, T_out, num_mels * r]
outputs_stack = tf.squeeze(
outputs_stack, axis=0) # [N, T_out, num_mels * r]

attention_x_stack = []
for layer in range(len(attention_x)):
attention_x_stack_tmp = attention_x[layer].stack(
) # [T_out, N, H, 1, T_out]
attention_x_stack_tmp = tf.transpose(
attention_x_stack_tmp, perm=[3, 1, 2, 0,
4]) # [1, N, H, T_out, T_out]
attention_x_stack_tmp = tf.squeeze(
attention_x_stack_tmp, axis=0) # [N, H, T_out, T_out]
attention_x_stack.append(attention_x_stack_tmp)

attention_h_stack = []
for layer in range(len(attention_h)):
attention_h_stack_tmp = attention_h[layer].stack(
) # [T_out, N, H, 1, T_out]
attention_h_stack_tmp = tf.transpose(
attention_h_stack_tmp, perm=[3, 1, 2, 0,
4]) # [1, N, H, T_out, T_out]
attention_h_stack_tmp = tf.squeeze(
attention_h_stack_tmp, axis=0) # [N, H, T_out, T_out]
attention_h_stack.append(attention_h_stack_tmp)

return outputs_stack, attention_x_stack, attention_h_stack, cache

def dynamic_decode_teacher_forcing(self,
step_fn,
decoder_input,
init_cache=None,
init_attn_x=None,
init_attn_h=None,
maximum_iterations=None,
batch_size=None):

def _cond(step, cache, inputs, outputs, attention_x, attention_h): # pylint: disable=unused-argument
return tf.less(step, maximum_iterations)

def _body(step, cache, inputs, outputs, attention_x, attention_h):
# output: [1, 1, num_mels * r]
# attn: [1, 1, T_out]
output, cache, attn_x, attn_h = step_fn(
step, inputs[:, step:step + 1, :],
cache) # outputs, cache, attention, attns
for layer in range(len(attention_x)):
attention_x[layer] = attention_x[layer].write(
step, tf.cast(attn_x[layer], tf.float32))

for layer in range(len(attention_h)):
attention_h[layer] = attention_h[layer].write(
step, tf.cast(attn_h[layer], tf.float32))
outputs = outputs.write(step, tf.cast(output, tf.float32))
return step + 1, cache, inputs, outputs, attention_x, attention_h

step = tf.constant(0, dtype=tf.int32)
outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True)

_, cache, _, outputs, attention_x, attention_h = tf.while_loop(
_cond,
_body,
loop_vars=(step, init_cache, decoder_input, outputs, init_attn_x,
init_attn_h),
shape_invariants=(step.shape,
compat.nest.map_structure(
self._get_shape_invariants,
init_cache), decoder_input.shape,
tf.TensorShape(None),
compat.nest.map_structure(
self._get_shape_invariants, init_attn_x),
compat.nest.map_structure(
self._get_shape_invariants, init_attn_h)),
parallel_iterations=1,
back_prop=False,
maximum_iterations=maximum_iterations)
# element of outputs: [N, 1, num_mels * r]
outputs_stack = outputs.stack() # [T_out, N, 1, num_mels * r]
outputs_stack = tf.transpose(
outputs_stack, perm=[2, 1, 0, 3]) # [1, N, T_out, num_mels * r]
outputs_stack = tf.squeeze(
outputs_stack, axis=0) # [N, T_out, num_mels * r]

attention_x_stack = []
for layer in range(len(attention_x)):
attention_x_stack_tmp = attention_x[layer].stack(
) # [T_out, N, H, 1, T_out]
attention_x_stack_tmp = tf.transpose(
attention_x_stack_tmp, perm=[3, 1, 2, 0,
4]) # [1, N, H, T_out, T_out]
attention_x_stack_tmp = tf.squeeze(
attention_x_stack_tmp, axis=0) # [N, H, T_out, T_out]
attention_x_stack.append(attention_x_stack_tmp)

attention_h_stack = []
for layer in range(len(attention_h)):
attention_h_stack_tmp = attention_h[layer].stack(
) # [T_out, N, H, 1, T_out]
attention_h_stack_tmp = tf.transpose(
attention_h_stack_tmp, perm=[3, 1, 2, 0,
4]) # [1, N, H, T_out, T_out]
attention_h_stack_tmp = tf.squeeze(
attention_h_stack_tmp, axis=0) # [N, H, T_out, T_out]
attention_h_stack.append(attention_h_stack_tmp)

return outputs_stack, attention_x_stack, attention_h_stack, cache

def _get_shape_invariants(self, tensor):
"""Returns the shape of the tensor but sets middle dims to None."""
if isinstance(tensor, tf.TensorArray):
shape = None
else:
shape = tensor.shape.as_list()
            for i in range(1, len(shape) - 1):
                shape[i] = None
return tf.TensorShape(shape)


class SelfAttentionDecoderOri():
"""Decoder using self-attention as described in
https://arxiv.org/abs/1706.03762.
"""

def __init__(self,
num_layers,
num_units=512,
num_heads=8,
ffn_inner_dim=2048,
dropout=0.1,
attention_dropout=0.1,
relu_dropout=0.1,
position_encoder=SinusoidalPositionEncoder(),
self_attention_type='scaled_dot'):
"""Initializes the parameters of the decoder.

Args:
num_layers: The number of layers.
num_units: The number of hidden units.
num_heads: The number of heads in the multi-head attention.
ffn_inner_dim: The number of units of the inner linear transformation
in the feed forward layer.
dropout: The probability to drop units from the outputs.
attention_dropout: The probability to drop units from the attention.
relu_dropout: The probability to drop units from the ReLU activation in
the feed forward layer.
position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to
apply on inputs or ``None``.
self_attention_type: Type of self attention, "scaled_dot" or "average" (case
insensitive).

Raises:
ValueError: if :obj:`self_attention_type` is invalid.
"""
super(SelfAttentionDecoderOri, self).__init__()
self.num_layers = num_layers
self.num_units = num_units
self.num_heads = num_heads
self.ffn_inner_dim = ffn_inner_dim
self.dropout = dropout
self.attention_dropout = attention_dropout
self.relu_dropout = relu_dropout
self.position_encoder = position_encoder
self.self_attention_type = self_attention_type.lower()
if self.self_attention_type not in ('scaled_dot', 'average'):
raise ValueError('invalid attention type %s'
% self.self_attention_type)
if self.self_attention_type == 'average':
tf.logging.warning(
'Support for average attention network is experimental '
'and may change in future versions.')

@property
def output_size(self):
"""Returns the decoder output size."""
return self.num_units

@property
def support_alignment_history(self):
return True

@property
def support_multi_source(self):
return True

def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1):
cache = {}

for layer in range(self.num_layers):
proj_cache_shape = [
batch_size, self.num_heads, 0, self.num_units // self.num_heads
]
layer_cache = {}
layer_cache['memory'] = [{
'memory_keys':
tf.zeros(proj_cache_shape, dtype=dtype),
'memory_values':
tf.zeros(proj_cache_shape, dtype=dtype)
} for _ in range(num_sources)]
if self.self_attention_type == 'scaled_dot':
layer_cache['self_keys'] = tf.zeros(
proj_cache_shape, dtype=dtype)
layer_cache['self_values'] = tf.zeros(
proj_cache_shape, dtype=dtype)
elif self.self_attention_type == 'average':
layer_cache['prev_g'] = tf.zeros(
[batch_size, 1, self.num_units], dtype=dtype)
cache['layer_{}'.format(layer)] = layer_cache

return cache

def _self_attention_stack(self,
inputs,
sequence_length=None,
mode=True,
cache=None,
memory=None,
memory_sequence_length=None,
step=None):
inputs *= self.num_units**0.5
if self.position_encoder is not None:
inputs = self.position_encoder(
inputs, position=step + 1 if step is not None else None)

inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)

decoder_mask = None
memory_mask = None
last_attention = None

if self.self_attention_type == 'scaled_dot':
if sequence_length is not None:
decoder_mask = transformer.build_future_mask(
sequence_length,
num_heads=self.num_heads,
maximum_length=tf.shape(inputs)[1])
elif self.self_attention_type == 'average':
if cache is None:
if sequence_length is None:
sequence_length = tf.fill([tf.shape(inputs)[0]],
tf.shape(inputs)[1])
decoder_mask = transformer.cumulative_average_mask(
sequence_length,
maximum_length=tf.shape(inputs)[1],
dtype=inputs.dtype)

if memory is not None and not tf.contrib.framework.nest.is_sequence(
memory):
memory = (memory, )
if memory_sequence_length is not None:
if not tf.contrib.framework.nest.is_sequence(
memory_sequence_length):
memory_sequence_length = (memory_sequence_length, )
memory_mask = [
transformer.build_sequence_mask(
length,
num_heads=self.num_heads,
maximum_length=tf.shape(m)[1])
for m, length in zip(memory, memory_sequence_length)
]

for layer in range(self.num_layers):
layer_name = 'layer_{}'.format(layer)
layer_cache = cache[layer_name] if cache is not None else None
with tf.variable_scope(layer_name):
if self.self_attention_type == 'scaled_dot':
with tf.variable_scope('masked_multi_head'):
encoded = transformer.multi_head_attention(
self.num_heads,
transformer.norm(inputs),
None,
mode,
num_units=self.num_units,
mask=decoder_mask,
cache=layer_cache,
dropout=self.attention_dropout)
last_context = transformer.drop_and_add(
inputs, encoded, mode, dropout=self.dropout)
elif self.self_attention_type == 'average':
with tf.variable_scope('average_attention'):
# Cumulative average.
x = transformer.norm(inputs)
y = transformer.cumulative_average(
x,
decoder_mask if cache is None else step,
cache=layer_cache)
# FFN.
y = transformer.feed_forward(
y,
self.ffn_inner_dim,
mode,
dropout=self.relu_dropout)
# Gating layer.
z = tf.layers.dense(
tf.concat([x, y], -1), self.num_units * 2)
i, f = tf.split(z, 2, axis=-1)
y = tf.sigmoid(i) * x + tf.sigmoid(f) * y
last_context = transformer.drop_and_add(
inputs, y, mode, dropout=self.dropout)

if memory is not None:
for i, (mem, mask) in enumerate(zip(memory, memory_mask)):
memory_cache = layer_cache['memory'][i] if layer_cache is not None else None # yapf:disable
                        with tf.variable_scope('multi_head' if i == 0 else 'multi_head_%d' % i):  # yapf:disable
context, last_attention = transformer.multi_head_attention(
self.num_heads,
transformer.norm(last_context),
mem,
mode,
mask=mask,
cache=memory_cache,
dropout=self.attention_dropout,
return_attention=True)
last_context = transformer.drop_and_add(
last_context,
context,
mode,
dropout=self.dropout)
if i > 0: # Do not return attention in case of multi source.
last_attention = None

with tf.variable_scope('ffn'):
transformed = transformer.feed_forward_ori(
transformer.norm(last_context),
self.ffn_inner_dim,
mode,
dropout=self.relu_dropout)
transformed = transformer.drop_and_add(
last_context, transformed, mode, dropout=self.dropout)

inputs = transformed

if last_attention is not None:
# The first head of the last layer is returned.
first_head_attention = last_attention[:, 0]
else:
first_head_attention = None

outputs = transformer.norm(inputs)
return outputs, first_head_attention

def decode_from_inputs(self,
inputs,
sequence_length,
initial_state=None,
mode=True,
memory=None,
memory_sequence_length=None):
outputs, attention = self._self_attention_stack(
inputs,
sequence_length=sequence_length,
mode=mode,
memory=memory,
memory_sequence_length=memory_sequence_length)
return outputs, None, attention

def step_fn(self,
mode,
batch_size,
initial_state=None,
memory=None,
memory_sequence_length=None,
dtype=tf.float32):
if memory is None:
num_sources = 0
elif tf.contrib.framework.nest.is_sequence(memory):
num_sources = len(memory)
else:
num_sources = 1
cache = self._init_cache(
batch_size, dtype=dtype, num_sources=num_sources)

def _fn(step, inputs, cache, mode):
inputs = tf.expand_dims(inputs, 1)
outputs, attention = self._self_attention_stack(
inputs,
mode=mode,
cache=cache,
memory=memory,
memory_sequence_length=memory_sequence_length,
step=step)
outputs = tf.squeeze(outputs, axis=1)
if attention is not None:
attention = tf.squeeze(attention, axis=1)
return outputs, cache, attention

return _fn, cache
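
As a rough orientation, the banded PNCA decoder above (SelfAttentionDecoder) is driven the way robutrans.py uses it at inference: build a length-regulated memory, feed an all-zero go frame, and unroll with dynamic_decode_and_search. A hedged TF1 graph-construction sketch follows; the hyperparameter values, the band widths and the memory depth of 480 are assumptions, not values fixed by this file.

# Sketch only: wiring SelfAttentionDecoder for inference; shapes and hyperparameters are assumed.
import tensorflow as tf
from modelscope.models.audio.tts.am.models.self_attention_decoder import SelfAttentionDecoder

decoder = SelfAttentionDecoder(
    num_layers=6, num_units=128, num_heads=8, ffn_inner_dim=1024,
    prenet_units=256, dense_units=128, num_mels=80, outputs_per_step=3,
    X_band_width=25, H_band_width=25, position_encoder=None)

memory = tf.placeholder(tf.float32, [1, None, 480], 'memory')    # length-regulated encoder output
go_frame = tf.zeros([tf.shape(memory)[0], 1, 80])                # all-zero initial mel frame
mel_chunks, attn_x, attn_h = decoder.dynamic_decode_and_search(
    go_frame,
    maximum_iterations=tf.shape(memory)[1],
    mode=False,
    memory=memory,
    memory_sequence_length=tf.expand_dims(tf.shape(memory)[1], axis=0))
# mel_chunks: [N, T_out, num_mels * outputs_per_step], one multi-frame chunk per decoder step.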

+ 182
- 0
modelscope/models/audio/tts/am/models/self_attention_encoder.py View File

@@ -0,0 +1,182 @@
"""Define the self-attention encoder."""

import tensorflow as tf

from . import transformer
from .position import SinusoidalPositionEncoder


class SelfAttentionEncoder():
"""Encoder using self-attention as described in
https://arxiv.org/abs/1706.03762.
"""

def __init__(self,
num_layers,
num_units=512,
num_heads=8,
ffn_inner_dim=2048,
dropout=0.1,
attention_dropout=0.1,
relu_dropout=0.1,
position_encoder=SinusoidalPositionEncoder()):
"""Initializes the parameters of the encoder.

Args:
num_layers: The number of layers.
num_units: The number of hidden units.
num_heads: The number of heads in the multi-head attention.
ffn_inner_dim: The number of units of the inner linear transformation
in the feed forward layer.
dropout: The probability to drop units from the outputs.
attention_dropout: The probability to drop units from the attention.
relu_dropout: The probability to drop units from the ReLU activation in
the feed forward layer.
position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
apply on inputs or ``None``.
"""
super(SelfAttentionEncoder, self).__init__()
self.num_layers = num_layers
self.num_units = num_units
self.num_heads = num_heads
self.ffn_inner_dim = ffn_inner_dim
self.dropout = dropout
self.attention_dropout = attention_dropout
self.relu_dropout = relu_dropout
self.position_encoder = position_encoder

def encode(self, inputs, sequence_length=None, mode=True):
inputs *= self.num_units**0.5
if self.position_encoder is not None:
inputs = self.position_encoder(inputs)

inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
mask = transformer.build_sequence_mask(
sequence_length,
num_heads=self.num_heads,
maximum_length=tf.shape(inputs)[1])

mask_FF = tf.squeeze(
transformer.build_sequence_mask(
sequence_length, maximum_length=tf.shape(inputs)[1]),
axis=1)

state = ()

attns = []
for layer in range(self.num_layers):
with tf.variable_scope('layer_{}'.format(layer)):
with tf.variable_scope('multi_head'):
context, attn = transformer.multi_head_attention(
self.num_heads,
transformer.norm(inputs),
None,
mode,
num_units=self.num_units,
mask=mask,
dropout=self.attention_dropout,
return_attention=True)
attns.append(attn)
context = transformer.drop_and_add(
inputs, context, mode, dropout=self.dropout)

with tf.variable_scope('ffn'):
transformed = transformer.feed_forward(
transformer.norm(context),
self.ffn_inner_dim,
mode,
dropout=self.relu_dropout,
mask=mask_FF)
transformed = transformer.drop_and_add(
context, transformed, mode, dropout=self.dropout)

inputs = transformed
state += (tf.reduce_mean(inputs, axis=1), )

outputs = transformer.norm(inputs)
return (outputs, state, sequence_length, attns)


class SelfAttentionEncoderOri():
"""Encoder using self-attention as described in
https://arxiv.org/abs/1706.03762.
"""

def __init__(self,
num_layers,
num_units=512,
num_heads=8,
ffn_inner_dim=2048,
dropout=0.1,
attention_dropout=0.1,
relu_dropout=0.1,
position_encoder=SinusoidalPositionEncoder()):
"""Initializes the parameters of the encoder.

Args:
num_layers: The number of layers.
num_units: The number of hidden units.
num_heads: The number of heads in the multi-head attention.
ffn_inner_dim: The number of units of the inner linear transformation
in the feed forward layer.
dropout: The probability to drop units from the outputs.
attention_dropout: The probability to drop units from the attention.
relu_dropout: The probability to drop units from the ReLU activation in
the feed forward layer.
position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to
apply on inputs or ``None``.
"""
super(SelfAttentionEncoderOri, self).__init__()
self.num_layers = num_layers
self.num_units = num_units
self.num_heads = num_heads
self.ffn_inner_dim = ffn_inner_dim
self.dropout = dropout
self.attention_dropout = attention_dropout
self.relu_dropout = relu_dropout
self.position_encoder = position_encoder

def encode(self, inputs, sequence_length=None, mode=True):
inputs *= self.num_units**0.5
if self.position_encoder is not None:
inputs = self.position_encoder(inputs)

inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode)
mask = transformer.build_sequence_mask(
sequence_length,
num_heads=self.num_heads,
maximum_length=tf.shape(inputs)[1]) # [N, 1, 1, T_out]

state = ()

attns = []
for layer in range(self.num_layers):
with tf.variable_scope('layer_{}'.format(layer)):
with tf.variable_scope('multi_head'):
context, attn = transformer.multi_head_attention(
self.num_heads,
transformer.norm(inputs),
None,
mode,
num_units=self.num_units,
mask=mask,
dropout=self.attention_dropout,
return_attention=True)
attns.append(attn)
context = transformer.drop_and_add(
inputs, context, mode, dropout=self.dropout)

with tf.variable_scope('ffn'):
transformed = transformer.feed_forward_ori(
transformer.norm(context),
self.ffn_inner_dim,
mode,
dropout=self.relu_dropout)
transformed = transformer.drop_and_add(
context, transformed, mode, dropout=self.dropout)

inputs = transformed
state += (tf.reduce_mean(inputs, axis=1), )

outputs = transformer.norm(inputs)
return (outputs, state, sequence_length, attns)
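
For completeness, a hedged sketch of calling the encoder above on a padded batch (TF1 graph mode); the layer sizes are illustrative and the input depth is assumed to match num_units.

# Sketch only: encoding a padded batch with SelfAttentionEncoder; values below are assumed.
import tensorflow as tf
from modelscope.models.audio.tts.am.models.self_attention_encoder import SelfAttentionEncoder

encoder = SelfAttentionEncoder(
    num_layers=4, num_units=256, num_heads=8, ffn_inner_dim=1024)

x = tf.placeholder(tf.float32, [None, None, 256], 'encoder_inputs')   # [N, T_in, num_units]
lengths = tf.placeholder(tf.int32, [None], 'input_lengths')
outputs, state, out_lengths, attns = encoder.encode(x, sequence_length=lengths, mode=False)
# outputs: [N, T_in, num_units]; attns holds one self-attention map per layer.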

+ 1157
- 0
modelscope/models/audio/tts/am/models/transformer.py
File diff suppressed because it is too large
View File


+ 255
- 0
modelscope/models/audio/tts/am/sambert_hifi_16k.py View File

@@ -0,0 +1,255 @@
import io
import os
from typing import Any, Dict, Optional, Union

import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MultiLabelBinarizer

from modelscope.models.base import Model
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks
from .models import create_model
from .text.symbols import load_symbols
from .text.symbols_dict import SymbolsDict

__all__ = ['SambertNetHifi16k']


def multi_label_symbol_to_sequence(my_classes, my_symbol):
    one_hot = MultiLabelBinarizer(classes=my_classes)
tokens = my_symbol.strip().split(' ')
sequences = []
for token in tokens:
sequences.append(tuple(token.split('&')))
# sequences.append(tuple(['~'])) # sequence length minus 1 to ignore EOS ~
return one_hot.fit_transform(sequences)


@MODELS.register_module(Tasks.text_to_speech, module_name=r'sambert_hifi_16k')
class SambertNetHifi16k(Model):

def __init__(self,
model_dir,
pitch_control_str='',
duration_control_str='',
energy_control_str='',
*args,
**kwargs):
tf.reset_default_graph()
local_ckpt_path = os.path.join(ModelFile.TF_CHECKPOINT_FOLDER, 'ckpt')
self._ckpt_path = os.path.join(model_dir, local_ckpt_path)
self._dict_path = os.path.join(model_dir, 'dicts')
self._hparams = tf.contrib.training.HParams(**kwargs)
values = self._hparams.values()
hp = [' {}:{}'.format(name, values[name]) for name in sorted(values)]
print('Hyperparameters:\n' + '\n'.join(hp))
super().__init__(self._ckpt_path, *args, **kwargs)
model_name = 'robutrans'
self._lfeat_type_list = self._hparams.lfeat_type_list.strip().split(
',')
sy, tone, syllable_flag, word_segment, emo_category, speaker = load_symbols(
self._dict_path)
self._sy = sy
self._tone = tone
self._syllable_flag = syllable_flag
self._word_segment = word_segment
self._emo_category = emo_category
self._speaker = speaker
self._inputs_dim = dict()
for lfeat_type in self._lfeat_type_list:
if lfeat_type == 'sy':
self._inputs_dim[lfeat_type] = len(sy)
elif lfeat_type == 'tone':
self._inputs_dim[lfeat_type] = len(tone)
elif lfeat_type == 'syllable_flag':
self._inputs_dim[lfeat_type] = len(syllable_flag)
elif lfeat_type == 'word_segment':
self._inputs_dim[lfeat_type] = len(word_segment)
elif lfeat_type == 'emo_category':
self._inputs_dim[lfeat_type] = len(emo_category)
elif lfeat_type == 'speaker':
self._inputs_dim[lfeat_type] = len(speaker)

self._symbols_dict = SymbolsDict(sy, tone, syllable_flag, word_segment,
emo_category, speaker,
self._inputs_dim,
self._lfeat_type_list)
dim_inputs = sum(self._inputs_dim.values(
)) - self._inputs_dim['speaker'] - self._inputs_dim['emo_category']
inputs = tf.placeholder(tf.float32, [1, None, dim_inputs], 'inputs')
inputs_emotion = tf.placeholder(
tf.float32, [1, None, self._inputs_dim['emo_category']],
'inputs_emotion')
inputs_speaker = tf.placeholder(tf.float32,
[1, None, self._inputs_dim['speaker']],
'inputs_speaker')

input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
pitch_contours_scale = tf.placeholder(tf.float32, [1, None],
'pitch_contours_scale')
energy_contours_scale = tf.placeholder(tf.float32, [1, None],
'energy_contours_scale')
duration_scale = tf.placeholder(tf.float32, [1, None],
'duration_scale')

with tf.variable_scope('model') as _:
self._model = create_model(model_name, self._hparams)
self._model.initialize(
inputs,
inputs_emotion,
inputs_speaker,
input_lengths,
duration_scales=duration_scale,
pitch_scales=pitch_contours_scale,
energy_scales=energy_contours_scale)
self._mel_spec = self._model.mel_outputs[0]
self._duration_outputs = self._model.duration_outputs[0]
self._duration_outputs_ = self._model.duration_outputs_[0]
self._pitch_contour_outputs = self._model.pitch_contour_outputs[0]
self._energy_contour_outputs = self._model.energy_contour_outputs[
0]
self._embedded_inputs_emotion = self._model.embedded_inputs_emotion[
0]
self._embedding_fsmn_outputs = self._model.embedding_fsmn_outputs[
0]
self._encoder_outputs = self._model.encoder_outputs[0]
self._pitch_embeddings = self._model.pitch_embeddings[0]
self._energy_embeddings = self._model.energy_embeddings[0]
self._LR_outputs = self._model.LR_outputs[0]
self._postnet_fsmn_outputs = self._model.postnet_fsmn_outputs[0]
self._attention_h = self._model.attention_h
self._attention_x = self._model.attention_x

print('Loading checkpoint: %s' % self._ckpt_path)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
self._session = tf.Session(config=config)
self._session.run(tf.global_variables_initializer())

saver = tf.train.Saver()
saver.restore(self._session, self._ckpt_path)

duration_cfg_lst = []
if len(duration_control_str) != 0:
for item in duration_control_str.strip().split('|'):
percent, scale = item.lstrip('(').rstrip(')').split(',')
duration_cfg_lst.append((float(percent), float(scale)))

self._duration_cfg_lst = duration_cfg_lst

pitch_contours_cfg_lst = []
if len(pitch_control_str) != 0:
for item in pitch_control_str.strip().split('|'):
percent, scale = item.lstrip('(').rstrip(')').split(',')
pitch_contours_cfg_lst.append(
(float(percent), float(scale)))

self._pitch_contours_cfg_lst = pitch_contours_cfg_lst

energy_contours_cfg_lst = []
if len(energy_control_str) != 0:
for item in energy_control_str.strip().split('|'):
percent, scale = item.lstrip('(').rstrip(')').split(',')
energy_contours_cfg_lst.append(
(float(percent), float(scale)))

self._energy_contours_cfg_lst = energy_contours_cfg_lst

def forward(self, text):
cleaner_names = [x.strip() for x in self._hparams.cleaners.split(',')]

lfeat_symbol = text.strip().split(' ')
lfeat_symbol_separate = [''] * int(len(self._lfeat_type_list))
for this_lfeat_symbol in lfeat_symbol:
this_lfeat_symbol = this_lfeat_symbol.strip('{').strip('}').split(
'$')
if len(this_lfeat_symbol) != len(self._lfeat_type_list):
raise Exception(
'Length of this_lfeat_symbol in training data'
+ ' is not equal to the length of lfeat_type_list, '
+ str(len(this_lfeat_symbol)) + ' VS. '
+ str(len(self._lfeat_type_list)))
index = 0
while index < len(lfeat_symbol_separate):
                lfeat_symbol_separate[index] += this_lfeat_symbol[index] + ' '
index = index + 1

index = 0
lfeat_type = self._lfeat_type_list[index]
sequence = self._symbols_dict.symbol_to_sequence(
lfeat_symbol_separate[index].strip(), lfeat_type, cleaner_names)
sequence_array = np.asarray(
sequence[:-1],
dtype=np.int32) # sequence length minus 1 to ignore EOS ~
inputs = np.eye(
self._inputs_dim[lfeat_type], dtype=np.float32)[sequence_array]
index = index + 1
while index < len(self._lfeat_type_list) - 2:
lfeat_type = self._lfeat_type_list[index]
sequence = self._symbols_dict.symbol_to_sequence(
lfeat_symbol_separate[index].strip(), lfeat_type,
cleaner_names)
sequence_array = np.asarray(
sequence[:-1],
dtype=np.int32) # sequence length minus 1 to ignore EOS ~
inputs_temp = np.eye(
self._inputs_dim[lfeat_type], dtype=np.float32)[sequence_array]
inputs = np.concatenate((inputs, inputs_temp), axis=1)
index = index + 1
seq = inputs

lfeat_type = 'emo_category'
inputs_emotion = multi_label_symbol_to_sequence(
self._emo_category, lfeat_symbol_separate[index].strip())
# inputs_emotion = inputs_emotion * 1.5
index = index + 1

lfeat_type = 'speaker'
inputs_speaker = multi_label_symbol_to_sequence(
self._speaker, lfeat_symbol_separate[index].strip())

duration_scale = np.ones((len(seq), ), dtype=np.float32)
start_idx = 0
for (percent, scale) in self._duration_cfg_lst:
duration_scale[start_idx:start_idx
+ int(percent * len(seq))] = scale
start_idx += int(percent * len(seq))

pitch_contours_scale = np.ones((len(seq), ), dtype=np.float32)
start_idx = 0
for (percent, scale) in self._pitch_contours_cfg_lst:
pitch_contours_scale[start_idx:start_idx
+ int(percent * len(seq))] = scale
start_idx += int(percent * len(seq))

energy_contours_scale = np.ones((len(seq), ), dtype=np.float32)
start_idx = 0
for (percent, scale) in self._energy_contours_cfg_lst:
energy_contours_scale[start_idx:start_idx
+ int(percent * len(seq))] = scale
start_idx += int(percent * len(seq))

feed_dict = {
self._model.inputs: [np.asarray(seq, dtype=np.float32)],
self._model.inputs_emotion:
[np.asarray(inputs_emotion, dtype=np.float32)],
self._model.inputs_speaker:
[np.asarray(inputs_speaker, dtype=np.float32)],
self._model.input_lengths:
np.asarray([len(seq)], dtype=np.int32),
self._model.duration_scales: [duration_scale],
self._model.pitch_scales: [pitch_contours_scale],
self._model.energy_scales: [energy_contours_scale]
}

result = self._session.run([
self._mel_spec, self._duration_outputs, self._duration_outputs_,
self._pitch_contour_outputs, self._embedded_inputs_emotion,
self._embedding_fsmn_outputs, self._encoder_outputs,
self._pitch_embeddings, self._LR_outputs,
self._postnet_fsmn_outputs, self._energy_contour_outputs,
self._energy_embeddings, self._attention_x, self._attention_h
], feed_dict=feed_dict) # yapf:disable
return result[0]
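
The call pattern for the class above is roughly as follows; everything named here (the model directory, the hyperparameters, the helper that loads them, and the input token string) is a placeholder, since the real values come from the downloaded ModelScope model and its text frontend.

# Hypothetical usage sketch; names marked as placeholders are not part of this repository.
model_dir = '/path/to/downloaded/sambert_hifi_16k'            # placeholder path
am_hparams = load_am_hparams(model_dir)                       # placeholder helper returning a dict of hparams
am = SambertNetHifi16k(model_dir, **am_hparams)
# The frontend emits space-separated tokens shaped like {sy$tone$syllable_flag$word_segment$emo$speaker}.
mel = am.forward(symbol_sequence_from_frontend)               # placeholder input; returns a mel-spectrogram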

+ 0
- 0
modelscope/models/audio/tts/am/text/__init__.py View File


+ 89
- 0
modelscope/models/audio/tts/am/text/cleaners.py View File

@@ -0,0 +1,89 @@
'''
Cleaners are transformations that run over the input text at both training and eval time.

Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
1. "english_cleaners" for English text
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
the symbols in symbols.py to match your data).
'''

import re

from unidecode import unidecode

from .numbers import normalize_numbers

# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')

# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
for x in [
('mrs', 'misess'),
('mr', 'mister'),
('dr', 'doctor'),
('st', 'saint'),
('co', 'company'),
('jr', 'junior'),
('maj', 'major'),
('gen', 'general'),
('drs', 'doctors'),
('rev', 'reverend'),
('lt', 'lieutenant'),
('hon', 'honorable'),
('sgt', 'sergeant'),
('capt', 'captain'),
('esq', 'esquire'),
('ltd', 'limited'),
('col', 'colonel'),
('ft', 'fort'), ]] # yapf:disable


def expand_abbreviations(text):
for regex, replacement in _abbreviations:
text = re.sub(regex, replacement, text)
return text


def expand_numbers(text):
return normalize_numbers(text)


def lowercase(text):
return text.lower()


def collapse_whitespace(text):
return re.sub(_whitespace_re, ' ', text)


def convert_to_ascii(text):
return unidecode(text)


def basic_cleaners(text):
'''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
text = lowercase(text)
text = collapse_whitespace(text)
return text


def transliteration_cleaners(text):
'''Pipeline for non-English text that transliterates to ASCII.'''
text = convert_to_ascii(text)
text = lowercase(text)
text = collapse_whitespace(text)
return text


def english_cleaners(text):
'''Pipeline for English text, including number and abbreviation expansion.'''
text = convert_to_ascii(text)
text = lowercase(text)
text = expand_numbers(text)
text = expand_abbreviations(text)
text = collapse_whitespace(text)
return text
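
A quick check of the English pipeline above; the expected output in the comment is approximate and follows only from the rules defined in this file and in numbers.py.

from modelscope.models.audio.tts.am.text.cleaners import english_cleaners

print(english_cleaners('Dr. Smith paid $3.50.'))
# -> roughly: 'doctor smith paid three dollars, fifty cents.'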

+ 64
- 0
modelscope/models/audio/tts/am/text/cmudict.py View File

@@ -0,0 +1,64 @@
import re

valid_symbols = [
'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0',
'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0',
'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0',
'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG',
'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH',
'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W',
'Y', 'Z', 'ZH'
]

_valid_symbol_set = set(valid_symbols)


class CMUDict:
'''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''

def __init__(self, file_or_path, keep_ambiguous=True):
if isinstance(file_or_path, str):
with open(file_or_path, encoding='latin-1') as f:
entries = _parse_cmudict(f)
else:
entries = _parse_cmudict(file_or_path)
if not keep_ambiguous:
entries = {
word: pron
for word, pron in entries.items() if len(pron) == 1
}
self._entries = entries

def __len__(self):
return len(self._entries)

def lookup(self, word):
'''Returns list of ARPAbet pronunciations of the given word.'''
return self._entries.get(word.upper())


_alt_re = re.compile(r'\([0-9]+\)')


def _parse_cmudict(file):
cmudict = {}
for line in file:
if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
parts = line.split(' ')
word = re.sub(_alt_re, '', parts[0])
pronunciation = _get_pronunciation(parts[1])
if pronunciation:
if word in cmudict:
cmudict[word].append(pronunciation)
else:
cmudict[word] = [pronunciation]
return cmudict


def _get_pronunciation(s):
parts = s.strip().split(' ')
for part in parts:
if part not in _valid_symbol_set:
return None
return ' '.join(parts)
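
Typical use of the wrapper above; the dictionary path is a placeholder for a locally downloaded copy of cmudict.

from modelscope.models.audio.tts.am.text.cmudict import CMUDict

cmu = CMUDict('/path/to/cmudict-0.7b', keep_ambiguous=False)   # placeholder path
print(len(cmu))                 # number of unambiguous entries
print(cmu.lookup('hello'))      # list of ARPAbet pronunciations, or None if absent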

+ 70
- 0
modelscope/models/audio/tts/am/text/numbers.py View File

@@ -0,0 +1,70 @@
import re

import inflect

_inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
_number_re = re.compile(r'[0-9]+')


def _remove_commas(m):
return m.group(1).replace(',', '')


def _expand_decimal_point(m):
return m.group(1).replace('.', ' point ')


def _expand_dollars(m):
match = m.group(1)
parts = match.split('.')
if len(parts) > 2:
return match + ' dollars' # Unexpected format
dollars = int(parts[0]) if parts[0] else 0
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
if dollars and cents:
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
cent_unit = 'cent' if cents == 1 else 'cents'
return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
elif dollars:
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
return '%s %s' % (dollars, dollar_unit)
elif cents:
cent_unit = 'cent' if cents == 1 else 'cents'
return '%s %s' % (cents, cent_unit)
else:
return 'zero dollars'


def _expand_ordinal(m):
return _inflect.number_to_words(m.group(0))


def _expand_number(m):
num = int(m.group(0))
if num > 1000 and num < 3000:
if num == 2000:
return 'two thousand'
elif num > 2000 and num < 2010:
return 'two thousand ' + _inflect.number_to_words(num % 100)
elif num % 100 == 0:
return _inflect.number_to_words(num // 100) + ' hundred'
else:
return _inflect.number_to_words(
num, andword='', zero='oh', group=2).replace(', ', ' ')
else:
return _inflect.number_to_words(num, andword='')


def normalize_numbers(text):
text = re.sub(_comma_number_re, _remove_commas, text)
text = re.sub(_pounds_re, r'\1 pounds', text)
text = re.sub(_dollars_re, _expand_dollars, text)
text = re.sub(_decimal_number_re, _expand_decimal_point, text)
text = re.sub(_ordinal_re, _expand_ordinal, text)
text = re.sub(_number_re, _expand_number, text)
return text
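
A few illustrative transformations of normalize_numbers (the exact wording depends on the inflect package):

# illustrative only
from modelscope.models.audio.tts.am.text.numbers import normalize_numbers

print(normalize_numbers('$2.50'))    # -> 'two dollars, fifty cents'
print(normalize_numbers('in 1999'))  # -> 'in nineteen ninety-nine'
print(normalize_numbers('the 3rd'))  # -> 'the third'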

+ 95
- 0
modelscope/models/audio/tts/am/text/symbols.py View File

@@ -0,0 +1,95 @@
'''
Defines the set of symbols used in text input to the model.

The default is a set of ASCII characters that works well for English or text that has been run
through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
'''
import codecs
import os

_pad = '_'
_eos = '~'
_mask = '@[MASK]'


def load_symbols(dict_path):
_characters = ''
_ch_symbols = []
sy_dict_name = 'sy_dict.txt'
sy_dict_path = os.path.join(dict_path, sy_dict_name)
f = codecs.open(sy_dict_path, 'r')
for line in f:
line = line.strip('\r\n')
_ch_symbols.append(line)

_arpabet = ['@' + s for s in _ch_symbols]

# Export all symbols:
sy = list(_characters) + _arpabet + [_pad, _eos, _mask]

_characters = ''

_ch_tones = []
tone_dict_name = 'tone_dict.txt'
tone_dict_path = os.path.join(dict_path, tone_dict_name)
f = codecs.open(tone_dict_path, 'r')
for line in f:
line = line.strip('\r\n')
_ch_tones.append(line)

# Export all tones:
tone = list(_characters) + _ch_tones + [_pad, _eos, _mask]

_characters = ''

_ch_syllable_flags = []
syllable_flag_name = 'syllable_flag_dict.txt'
syllable_flag_path = os.path.join(dict_path, syllable_flag_name)
f = codecs.open(syllable_flag_path, 'r')
for line in f:
line = line.strip('\r\n')
_ch_syllable_flags.append(line)

# Export all syllable_flags:
syllable_flag = list(_characters) + _ch_syllable_flags + [
_pad, _eos, _mask
]

_characters = ''

_ch_word_segments = []
word_segment_name = 'word_segment_dict.txt'
word_segment_path = os.path.join(dict_path, word_segment_name)
f = codecs.open(word_segment_path, 'r')
for line in f:
line = line.strip('\r\n')
_ch_word_segments.append(line)

# Export all word_segments:
word_segment = list(_characters) + _ch_word_segments + [_pad, _eos, _mask]

_characters = ''

_ch_emo_types = []
emo_category_name = 'emo_category_dict.txt'
emo_category_path = os.path.join(dict_path, emo_category_name)
f = codecs.open(emo_category_path, 'r')
for line in f:
line = line.strip('\r\n')
_ch_emo_types.append(line)

emo_category = list(_characters) + _ch_emo_types + [_pad, _eos, _mask]

_characters = ''

_ch_speakers = []
speaker_name = 'speaker_dict.txt'
speaker_path = os.path.join(dict_path, speaker_name)
f = codecs.open(speaker_path, 'r')
for line in f:
line = line.strip('\r\n')
_ch_speakers.append(line)

# Export all speakers:
speaker = list(_characters) + _ch_speakers + [_pad, _eos, _mask]
return sy, tone, syllable_flag, word_segment, emo_category, speaker
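
A hedged sketch of how load_symbols is expected to be called; '/path/to/dicts' is a hypothetical directory containing the six *_dict.txt files read above:

# illustrative only; the dictionary directory is hypothetical
from modelscope.models.audio.tts.am.text.symbols import load_symbols

sy, tone, syllable_flag, word_segment, emo_category, speaker = load_symbols('/path/to/dicts')
# every returned list ends with the pad, EOS and mask symbols
print(sy[-3:])   # ['_', '~', '@[MASK]']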

+ 200
- 0
modelscope/models/audio/tts/am/text/symbols_dict.py View File

@@ -0,0 +1,200 @@
import re
import sys

from .cleaners import (basic_cleaners, english_cleaners,
transliteration_cleaners)


class SymbolsDict:

def __init__(self, sy, tone, syllable_flag, word_segment, emo_category,
speaker, inputs_dim, lfeat_type_list):
self._inputs_dim = inputs_dim
self._lfeat_type_list = lfeat_type_list
self._sy_to_id = {s: i for i, s in enumerate(sy)}
self._id_to_sy = {i: s for i, s in enumerate(sy)}
self._tone_to_id = {s: i for i, s in enumerate(tone)}
self._id_to_tone = {i: s for i, s in enumerate(tone)}
self._syllable_flag_to_id = {s: i for i, s in enumerate(syllable_flag)}
self._id_to_syllable_flag = {i: s for i, s in enumerate(syllable_flag)}
self._word_segment_to_id = {s: i for i, s in enumerate(word_segment)}
self._id_to_word_segment = {i: s for i, s in enumerate(word_segment)}
self._emo_category_to_id = {s: i for i, s in enumerate(emo_category)}
self._id_to_emo_category = {i: s for i, s in enumerate(emo_category)}
self._speaker_to_id = {s: i for i, s in enumerate(speaker)}
self._id_to_speaker = {i: s for i, s in enumerate(speaker)}
print('_sy_to_id: ')
print(self._sy_to_id)
print('_tone_to_id: ')
print(self._tone_to_id)
print('_syllable_flag_to_id: ')
print(self._syllable_flag_to_id)
print('_word_segment_to_id: ')
print(self._word_segment_to_id)
print('_emo_category_to_id: ')
print(self._emo_category_to_id)
print('_speaker_to_id: ')
print(self._speaker_to_id)
self._curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
self._cleaners = {
basic_cleaners.__name__: basic_cleaners,
transliteration_cleaners.__name__: transliteration_cleaners,
english_cleaners.__name__: english_cleaners
}

def _clean_text(self, text, cleaner_names):
for name in cleaner_names:
cleaner = self._cleaners.get(name)
if not cleaner:
raise Exception('Unknown cleaner: %s' % name)
text = cleaner(text)
return text

def _sy_to_sequence(self, sy):
return [self._sy_to_id[s] for s in sy if self._should_keep_sy(s)]

def _arpabet_to_sequence(self, text):
return self._sy_to_sequence(['@' + s for s in text.split()])

def _should_keep_sy(self, s):
return s in self._sy_to_id and s != '_' and s != '~'

def symbol_to_sequence(self, this_lfeat_symbol, lfeat_type, cleaner_names):
sequence = []
if lfeat_type == 'sy':
this_lfeat_symbol = this_lfeat_symbol.strip().split(' ')
this_lfeat_symbol_format = ''
index = 0
while index < len(this_lfeat_symbol):
this_lfeat_symbol_format = this_lfeat_symbol_format + '{' + this_lfeat_symbol[
index] + '}' + ' '
index = index + 1
sequence = self.text_to_sequence(this_lfeat_symbol_format,
cleaner_names)
elif lfeat_type == 'tone':
sequence = self.tone_to_sequence(this_lfeat_symbol)
elif lfeat_type == 'syllable_flag':
sequence = self.syllable_flag_to_sequence(this_lfeat_symbol)
elif lfeat_type == 'word_segment':
sequence = self.word_segment_to_sequence(this_lfeat_symbol)
elif lfeat_type == 'emo_category':
sequence = self.emo_category_to_sequence(this_lfeat_symbol)
elif lfeat_type == 'speaker':
sequence = self.speaker_to_sequence(this_lfeat_symbol)
else:
raise Exception('Unknown lfeat type: %s' % lfeat_type)

return sequence

def text_to_sequence(self, text, cleaner_names):
'''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.

The text can optionally have ARPAbet sequences enclosed in curly braces embedded
in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."

Args:
text: string to convert to a sequence
cleaner_names: names of the cleaner functions to run the text through

Returns:
List of integers corresponding to the symbols in the text
'''
sequence = []

# Check for curly braces and treat their contents as ARPAbet:
while len(text):
m = self._curly_re.match(text)
if not m:
sequence += self._sy_to_sequence(
self._clean_text(text, cleaner_names))
break
sequence += self._sy_to_sequence(
self._clean_text(m.group(1), cleaner_names))
sequence += self._arpabet_to_sequence(m.group(2))
text = m.group(3)

# Append EOS token
sequence.append(self._sy_to_id['~'])
return sequence
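# Illustrative only (not part of the diff): with the ARPAbet convention described
# in the docstring above, a call such as
#   symbols_dict.text_to_sequence('Turn left on {HH AW1 S S T AH0 N} Street.', ['english_cleaners'])
# cleans the plain-text spans, maps the braced span through _arpabet_to_sequence,
# and appends the id of the EOS symbol '~' to the returned list of ids.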

def tone_to_sequence(self, tone):
tones = tone.strip().split(' ')
sequence = []
for this_tone in tones:
sequence.append(self._tone_to_id[this_tone])
sequence.append(self._tone_to_id['~'])
return sequence

def syllable_flag_to_sequence(self, syllable_flag):
syllable_flags = syllable_flag.strip().split(' ')
sequence = []
for this_syllable_flag in syllable_flags:
sequence.append(self._syllable_flag_to_id[this_syllable_flag])
sequence.append(self._syllable_flag_to_id['~'])
return sequence

def word_segment_to_sequence(self, word_segment):
word_segments = word_segment.strip().split(' ')
sequence = []
for this_word_segment in word_segments:
sequence.append(self._word_segment_to_id[this_word_segment])
sequence.append(self._word_segment_to_id['~'])
return sequence

def emo_category_to_sequence(self, emo_type):
emo_categories = emo_type.strip().split(' ')
sequence = []
for this_category in emo_categories:
sequence.append(self._emo_category_to_id[this_category])
sequence.append(self._emo_category_to_id['~'])
return sequence

def speaker_to_sequence(self, speaker):
speakers = speaker.strip().split(' ')
sequence = []
for this_speaker in speakers:
sequence.append(self._speaker_to_id[this_speaker])
sequence.append(self._speaker_to_id['~'])
return sequence

def sequence_to_symbol(self, sequence):
result = ''
pre_lfeat_dim = 0
for lfeat_type in self._lfeat_type_list:
current_one_hot_sequence = sequence[:, pre_lfeat_dim:pre_lfeat_dim
+ self._inputs_dim[lfeat_type]]
current_sequence = current_one_hot_sequence.argmax(1)
length = current_sequence.shape[0]

index = 0
while index < length:
this_sequence = current_sequence[index]
s = ''
if lfeat_type == 'sy':
s = self._id_to_sy[this_sequence]
if len(s) > 1 and s[0] == '@':
s = s[1:]
elif lfeat_type == 'tone':
s = self._id_to_tone[this_sequence]
elif lfeat_type == 'syllable_flag':
s = self._id_to_syllable_flag[this_sequence]
elif lfeat_type == 'word_segment':
s = self._id_to_word_segment[this_sequence]
elif lfeat_type == 'emo_category':
s = self._id_to_emo_category[this_sequence]
elif lfeat_type == 'speaker':
s = self._id_to_speaker[this_sequence]
else:
raise Exception('Unknown lfeat type: %s' % lfeat_type)

if index == 0:
result = result + lfeat_type + ': '

result = result + '{' + s + '}'

if index == length - 1:
result = result + '; '

index = index + 1
pre_lfeat_dim = pre_lfeat_dim + self._inputs_dim[lfeat_type]
return result

+ 1
- 0
modelscope/models/audio/tts/frontend/__init__.py View File

@@ -0,0 +1 @@
from .generic_text_to_speech_frontend import * # noqa F403

+ 39
- 0
modelscope/models/audio/tts/frontend/generic_text_to_speech_frontend.py View File

@@ -0,0 +1,39 @@
import os
import zipfile
from typing import Any, Dict, List

import ttsfrd

from modelscope.models.base import Model
from modelscope.models.builder import MODELS
from modelscope.utils.audio.tts_exceptions import (
TtsFrontendInitializeFailedException,
TtsFrontendLanguageTypeInvalidException)
from modelscope.utils.constant import Tasks

__all__ = ['GenericTtsFrontend']


@MODELS.register_module(
Tasks.text_to_speech, module_name=r'generic_tts_frontend')
class GenericTtsFrontend(Model):

def __init__(self, model_dir='.', lang_type='pinyin', *args, **kwargs):
super().__init__(model_dir, *args, **kwargs)
frontend = ttsfrd.TtsFrontendEngine()
zip_file = os.path.join(model_dir, 'resource.zip')
self._res_path = os.path.join(model_dir, 'resource')
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
zip_ref.extractall(model_dir)
if not frontend.initialize(self._res_path):
raise TtsFrontendInitializeFailedException(
'resource invalid: {}'.format(self._res_path))
if not frontend.set_lang_type(lang_type):
raise TtsFrontendLanguageTypeInvalidException(
'language type invalid: {}, valid types are pinyin and chenmix'.
format(lang_type))
self._frontend = frontend

def forward(self, data: str) -> Dict[str, List]:
result = self._frontend.gen_tacotron_symbols(data)
return {'texts': [s for s in result.splitlines() if s != '']}

+ 1
- 0
modelscope/models/audio/tts/vocoder/__init__.py View File

@@ -0,0 +1 @@
from .hifigan16k import * # noqa F403

+ 73
- 0
modelscope/models/audio/tts/vocoder/hifigan16k.py View File

@@ -0,0 +1,73 @@
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import argparse
import glob
import os
import time

import json
import numpy as np
import torch
from scipy.io.wavfile import write

from modelscope.models.base import Model
from modelscope.models.builder import MODELS
from modelscope.utils.audio.tts_exceptions import \
TtsVocoderMelspecShapeMismatchException
from modelscope.utils.constant import ModelFile, Tasks
from .models import Generator

__all__ = ['Hifigan16k', 'AttrDict']
MAX_WAV_VALUE = 32768.0


def load_checkpoint(filepath, device):
assert os.path.isfile(filepath)
print("Loading '{}'".format(filepath))
checkpoint_dict = torch.load(filepath, map_location=device)
print('Complete.')
return checkpoint_dict


class AttrDict(dict):

def __init__(self, *args, **kwargs):
super(AttrDict, self).__init__(*args, **kwargs)
self.__dict__ = self


@MODELS.register_module(Tasks.text_to_speech, module_name=r'hifigan16k')
class Hifigan16k(Model):

def __init__(self, model_dir, *args, **kwargs):
self._ckpt_path = os.path.join(model_dir,
ModelFile.TORCH_MODEL_BIN_FILE)
self._config = AttrDict(**kwargs)

super().__init__(self._ckpt_path, *args, **kwargs)
if torch.cuda.is_available():
torch.manual_seed(self._config.seed)
self._device = torch.device('cuda')
else:
self._device = torch.device('cpu')
self._generator = Generator(self._config).to(self._device)
state_dict_g = load_checkpoint(self._ckpt_path, self._device)
self._generator.load_state_dict(state_dict_g['generator'])
self._generator.eval()
self._generator.remove_weight_norm()

def forward(self, melspec):
dim0 = list(melspec.shape)[-1]
if dim0 != 80:
raise TtsVocoderMelspecShapeMismatchException(
'input melspec last dim must be 80 but got {}'.format(dim0))
with torch.no_grad():
x = melspec.T
x = torch.FloatTensor(x).to(self._device)
if len(x.shape) == 2:
x = x.unsqueeze(0)
y_g_hat = self._generator(x)
audio = y_g_hat.squeeze()
audio = audio * MAX_WAV_VALUE
audio = audio.cpu().numpy().astype('int16')
return audio
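
A hedged usage sketch of the vocoder wrapper above, mirroring tests/pipelines/test_text_to_speech.py; the random mel-spectrogram here only demonstrates the expected (frames, 80) shape, so the resulting audio is noise:

# illustrative only; the model id is the one used in the tests below
import numpy as np
from modelscope.models import Model

voc = Model.from_pretrained('damo/speech_hifigan16k_tts_zhitian_emo')
mel = np.random.randn(200, 80).astype(np.float32)   # last dim must be 80 mel bins
audio = voc.forward(mel)                            # int16 waveform at 16 kHz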

+ 1
- 0
modelscope/models/audio/tts/vocoder/models/__init__.py View File

@@ -0,0 +1 @@
from .models import Generator

+ 516
- 0
modelscope/models/audio/tts/vocoder/models/models.py View File

@@ -0,0 +1,516 @@
from distutils.version import LooseVersion

import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorch_wavelets import DWT1DForward
from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm

from .utils import get_padding, init_weights

is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion('1.7')


def stft(x, fft_size, hop_size, win_length, window):
"""Perform STFT and convert to magnitude spectrogram.

Args:
x (Tensor): Input signal tensor (B, T).
fft_size (int): FFT size.
hop_size (int): Hop size.
win_length (int): Window length.
window (str): Window function type.

Returns:
Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).

"""
if is_pytorch_17plus:
x_stft = torch.stft(
x, fft_size, hop_size, win_length, window, return_complex=False)
else:
x_stft = torch.stft(x, fft_size, hop_size, win_length, window)
real = x_stft[..., 0]
imag = x_stft[..., 1]

# NOTE(kan-bayashi): clamp is needed to avoid nan or inf
return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1)
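# Illustrative shape check (not part of the diff), assuming the torch==1.10 pin in
# requirements/audio.txt:
#   >>> x = torch.randn(2, 16000)
#   >>> window = torch.hann_window(600)
#   >>> stft(x, 1024, 120, 600, window).shape
#   torch.Size([2, 134, 513])   # (B, #frames, fft_size // 2 + 1)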


LRELU_SLOPE = 0.1


def get_padding_casual(kernel_size, dilation=1):
return int(kernel_size * dilation - dilation)


class Conv1dCasual(torch.nn.Module):

def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
padding_mode='zeros'):
super(Conv1dCasual, self).__init__()
self.pad = padding
self.conv1d = weight_norm(
Conv1d(
in_channels,
out_channels,
kernel_size,
stride,
padding=0,
dilation=dilation,
groups=groups,
bias=bias,
padding_mode=padding_mode))
self.conv1d.apply(init_weights)

def forward(self, x): # bdt
# F.pad specifies padding starting from the last dimension, so this left-pads the time axis (causal padding).
x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), 'constant')
x = self.conv1d(x)
return x

def remove_weight_norm(self):
remove_weight_norm(self.conv1d)


class ConvTranspose1dCausal(torch.nn.Module):
"""CausalConvTranspose1d module with customized initialization."""

def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
padding=0):
"""Initialize CausalConvTranspose1d module."""
super(ConvTranspose1dCausal, self).__init__()
self.deconv = weight_norm(
ConvTranspose1d(in_channels, out_channels, kernel_size, stride))
self.stride = stride
self.deconv.apply(init_weights)
self.pad = kernel_size - stride

def forward(self, x):
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, in_channels, T_in).
Returns:
Tensor: Output tensor (B, out_channels, T_out).
"""
# x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), "constant")
return self.deconv(x)[:, :, :-self.pad]

def remove_weight_norm(self):
remove_weight_norm(self.deconv)


class ResBlock1(torch.nn.Module):

def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
super(ResBlock1, self).__init__()
self.h = h
self.convs1 = nn.ModuleList([
Conv1dCasual(
channels,
channels,
kernel_size,
1,
dilation=dilation[i],
padding=get_padding_casual(kernel_size, dilation[i]))
for i in range(len(dilation))
])

self.convs2 = nn.ModuleList([
Conv1dCasual(
channels,
channels,
kernel_size,
1,
dilation=1,
padding=get_padding_casual(kernel_size, 1))
for i in range(len(dilation))
])

def forward(self, x):
for c1, c2 in zip(self.convs1, self.convs2):
xt = F.leaky_relu(x, LRELU_SLOPE)
xt = c1(xt)
xt = F.leaky_relu(xt, LRELU_SLOPE)
xt = c2(xt)
x = xt + x
return x

def remove_weight_norm(self):
for layer in self.convs1:
layer.remove_weight_norm()
for layer in self.convs2:
layer.remove_weight_norm()


class Generator(torch.nn.Module):

def __init__(self, h):
super(Generator, self).__init__()
self.h = h
self.num_kernels = len(h.resblock_kernel_sizes)
self.num_upsamples = len(h.upsample_rates)
print('num_kernels={}, num_upsamples={}'.format(
self.num_kernels, self.num_upsamples))
self.conv_pre = Conv1dCasual(
80, h.upsample_initial_channel, 7, 1, padding=7 - 1)
resblock = ResBlock1 if h.resblock == '1' else ResBlock2  # NOTE: only ResBlock1 is defined in this file

self.ups = nn.ModuleList()
self.repeat_ups = nn.ModuleList()
for i, (u, k) in enumerate(
zip(h.upsample_rates, h.upsample_kernel_sizes)):
upsample = nn.Sequential(
nn.Upsample(mode='nearest', scale_factor=u),
nn.LeakyReLU(LRELU_SLOPE),
Conv1dCasual(
h.upsample_initial_channel // (2**i),
h.upsample_initial_channel // (2**(i + 1)),
kernel_size=7,
stride=1,
padding=7 - 1))
self.repeat_ups.append(upsample)
self.ups.append(
ConvTranspose1dCausal(
h.upsample_initial_channel // (2**i),
h.upsample_initial_channel // (2**(i + 1)),
k,
u,
padding=(k - u) // 2))

self.resblocks = nn.ModuleList()
for i in range(len(self.ups)):
ch = h.upsample_initial_channel // (2**(i + 1))
for j, (k, d) in enumerate(
zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
self.resblocks.append(resblock(h, ch, k, d))

self.conv_post = Conv1dCasual(ch, 1, 7, 1, padding=7 - 1)

def forward(self, x):
x = self.conv_pre(x)
for i in range(self.num_upsamples):
x = torch.sin(x) + x
# transconv
x1 = F.leaky_relu(x, LRELU_SLOPE)
x1 = self.ups[i](x1)
# repeat
x2 = self.repeat_ups[i](x)
x = x1 + x2
xs = None
for j in range(self.num_kernels):
if xs is None:
xs = self.resblocks[i * self.num_kernels + j](x)
else:
xs += self.resblocks[i * self.num_kernels + j](x)
x = xs / self.num_kernels
x = F.leaky_relu(x)
x = self.conv_post(x)
x = torch.tanh(x)
return x

def remove_weight_norm(self):
print('Removing weight norm...')
for layer in self.ups:
layer.remove_weight_norm()
for layer in self.repeat_ups:
layer[-1].remove_weight_norm()
for layer in self.resblocks:
layer.remove_weight_norm()
self.conv_pre.remove_weight_norm()
self.conv_post.remove_weight_norm()


class DiscriminatorP(torch.nn.Module):

def __init__(self,
period,
kernel_size=5,
stride=3,
use_spectral_norm=False):
super(DiscriminatorP, self).__init__()
self.period = period
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
self.convs = nn.ModuleList([
norm_f(
Conv2d(
1,
32, (kernel_size, 1), (stride, 1),
padding=(get_padding(5, 1), 0))),
norm_f(
Conv2d(
32,
128, (kernel_size, 1), (stride, 1),
padding=(get_padding(5, 1), 0))),
norm_f(
Conv2d(
128,
512, (kernel_size, 1), (stride, 1),
padding=(get_padding(5, 1), 0))),
norm_f(
Conv2d(
512,
1024, (kernel_size, 1), (stride, 1),
padding=(get_padding(5, 1), 0))),
norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
])
self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

def forward(self, x):
fmap = []

# 1d to 2d
b, c, t = x.shape
if t % self.period != 0: # pad first
n_pad = self.period - (t % self.period)
x = F.pad(x, (0, n_pad), 'reflect')
t = t + n_pad
x = x.view(b, c, t // self.period, self.period)

for layer in self.convs:
x = layer(x)
x = F.leaky_relu(x, LRELU_SLOPE)
fmap.append(x)
x = self.conv_post(x)
fmap.append(x)
x = torch.flatten(x, 1, -1)

return x, fmap


class MultiPeriodDiscriminator(torch.nn.Module):

def __init__(self):
super(MultiPeriodDiscriminator, self).__init__()
self.discriminators = nn.ModuleList([
DiscriminatorP(2),
DiscriminatorP(3),
DiscriminatorP(5),
DiscriminatorP(7),
DiscriminatorP(11),
])

def forward(self, y, y_hat):
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for i, d in enumerate(self.discriminators):
y_d_r, fmap_r = d(y)
y_d_g, fmap_g = d(y_hat)
y_d_rs.append(y_d_r)
fmap_rs.append(fmap_r)
y_d_gs.append(y_d_g)
fmap_gs.append(fmap_g)

return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class DiscriminatorS(torch.nn.Module):

def __init__(self, use_spectral_norm=False):
super(DiscriminatorS, self).__init__()
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
self.convs = nn.ModuleList([
norm_f(Conv1d(1, 128, 15, 1, padding=7)),
norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
])
self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

def forward(self, x):
fmap = []
for layer in self.convs:
x = layer(x)
x = F.leaky_relu(x, LRELU_SLOPE)
fmap.append(x)
x = self.conv_post(x)
fmap.append(x)
x = torch.flatten(x, 1, -1)

return x, fmap


class MultiScaleDiscriminator(torch.nn.Module):

def __init__(self):
super(MultiScaleDiscriminator, self).__init__()
self.discriminators = nn.ModuleList([
DiscriminatorS(use_spectral_norm=True),
DiscriminatorS(),
DiscriminatorS(),
])
self.meanpools = nn.ModuleList(
[DWT1DForward(wave='db3', J=1),
DWT1DForward(wave='db3', J=1)])
self.convs = nn.ModuleList([
weight_norm(Conv1d(2, 1, 15, 1, padding=7)),
weight_norm(Conv1d(2, 1, 15, 1, padding=7))
])

def forward(self, y, y_hat):
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for i, d in enumerate(self.discriminators):
if i != 0:
yl, yh = self.meanpools[i - 1](y)
y = torch.cat([yl, yh[0]], dim=1)
y = self.convs[i - 1](y)
y = F.leaky_relu(y, LRELU_SLOPE)

yl_hat, yh_hat = self.meanpools[i - 1](y_hat)
y_hat = torch.cat([yl_hat, yh_hat[0]], dim=1)
y_hat = self.convs[i - 1](y_hat)
y_hat = F.leaky_relu(y_hat, LRELU_SLOPE)

y_d_r, fmap_r = d(y)
y_d_g, fmap_g = d(y_hat)
y_d_rs.append(y_d_r)
fmap_rs.append(fmap_r)
y_d_gs.append(y_d_g)
fmap_gs.append(fmap_g)

return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class DiscriminatorSTFT(torch.nn.Module):

def __init__(self,
kernel_size=11,
stride=2,
use_spectral_norm=False,
fft_size=1024,
shift_size=120,
win_length=600,
window='hann_window'):
super(DiscriminatorSTFT, self).__init__()
self.fft_size = fft_size
self.shift_size = shift_size
self.win_length = win_length
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
self.convs = nn.ModuleList([
norm_f(
Conv2d(
fft_size // 2 + 1,
32, (15, 1), (1, 1),
padding=(get_padding(15, 1), 0))),
norm_f(
Conv2d(
32,
32, (kernel_size, 1), (stride, 1),
padding=(get_padding(9, 1), 0))),
norm_f(
Conv2d(
32,
32, (kernel_size, 1), (stride, 1),
padding=(get_padding(9, 1), 0))),
norm_f(
Conv2d(
32,
32, (kernel_size, 1), (stride, 1),
padding=(get_padding(9, 1), 0))),
norm_f(Conv2d(32, 32, (5, 1), (1, 1), padding=(2, 0))),
])
self.conv_post = norm_f(Conv2d(32, 1, (3, 1), (1, 1), padding=(1, 0)))
self.register_buffer('window', getattr(torch, window)(win_length))

def forward(self, wav):
wav = torch.squeeze(wav, 1)
x_mag = stft(wav, self.fft_size, self.shift_size, self.win_length,
self.window)
x = torch.transpose(x_mag, 2, 1).unsqueeze(-1)
fmap = []
for layer in self.convs:
x = layer(x)
x = F.leaky_relu(x, LRELU_SLOPE)
fmap.append(x)
x = self.conv_post(x)
fmap.append(x)
x = x.squeeze(-1)

return x, fmap


class MultiSTFTDiscriminator(torch.nn.Module):

def __init__(
self,
fft_sizes=[1024, 2048, 512],
hop_sizes=[120, 240, 50],
win_lengths=[600, 1200, 240],
window='hann_window',
):
super(MultiSTFTDiscriminator, self).__init__()
self.discriminators = nn.ModuleList()
for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths):
self.discriminators += [
DiscriminatorSTFT(fft_size=fs, shift_size=ss, win_length=wl)
]

def forward(self, y, y_hat):
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for i, d in enumerate(self.discriminators):
y_d_r, fmap_r = d(y)
y_d_g, fmap_g = d(y_hat)
y_d_rs.append(y_d_r)
fmap_rs.append(fmap_r)
y_d_gs.append(y_d_g)
fmap_gs.append(fmap_g)

return y_d_rs, y_d_gs, fmap_rs, fmap_gs


def feature_loss(fmap_r, fmap_g):
loss = 0
for dr, dg in zip(fmap_r, fmap_g):
for rl, gl in zip(dr, dg):
loss += torch.mean(torch.abs(rl - gl))

return loss * 2


def discriminator_loss(disc_real_outputs, disc_generated_outputs):
loss = 0
r_losses = []
g_losses = []
for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
r_loss = torch.mean((1 - dr)**2)
g_loss = torch.mean(dg**2)
loss += (r_loss + g_loss)
r_losses.append(r_loss.item())
g_losses.append(g_loss.item())

return loss, r_losses, g_losses


def generator_loss(disc_outputs):
loss = 0
gen_losses = []
for dg in disc_outputs:
temp_loss = torch.mean((1 - dg)**2)
gen_losses.append(temp_loss)
loss += temp_loss

return loss, gen_losses

+ 59
- 0
modelscope/models/audio/tts/vocoder/models/utils.py View File

@@ -0,0 +1,59 @@
import glob
import os

import matplotlib
import matplotlib.pylab as plt
import torch
from torch.nn.utils import weight_norm

matplotlib.use('Agg')


def plot_spectrogram(spectrogram):
fig, ax = plt.subplots(figsize=(10, 2))
im = ax.imshow(
spectrogram, aspect='auto', origin='lower', interpolation='none')
plt.colorbar(im, ax=ax)

fig.canvas.draw()
plt.close()

return fig


def init_weights(m, mean=0.0, std=0.01):
classname = m.__class__.__name__
if classname.find('Conv') != -1:
m.weight.data.normal_(mean, std)


def apply_weight_norm(m):
classname = m.__class__.__name__
if classname.find('Conv') != -1:
weight_norm(m)


def get_padding(kernel_size, dilation=1):
return int((kernel_size * dilation - dilation) / 2)


def load_checkpoint(filepath, device):
assert os.path.isfile(filepath)
print("Loading '{}'".format(filepath))
checkpoint_dict = torch.load(filepath, map_location=device)
print('Complete.')
return checkpoint_dict


def save_checkpoint(filepath, obj):
print('Saving checkpoint to {}'.format(filepath))
torch.save(obj, filepath)
print('Complete.')


def scan_checkpoint(cp_dir, prefix):
pattern = os.path.join(cp_dir, prefix + '????????')
cp_list = glob.glob(pattern)
if len(cp_list) == 0:
return None
return sorted(cp_list)[-1]

+ 2
- 0
modelscope/models/base.py View File

@@ -62,4 +62,6 @@ class Model(ABC):
if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'):
model_cfg.type = model_cfg.model_type
model_cfg.model_dir = local_model_dir
for k, v in kwargs.items():
setattr(model_cfg, k, v)
return build_model(model_cfg, task_name)

+ 1
- 0
modelscope/pipelines/audio/__init__.py View File

@@ -1 +1,2 @@
from .linear_aec_pipeline import LinearAECPipeline
from .text_to_speech_pipeline import * # noqa F403

+ 46
- 0
modelscope/pipelines/audio/text_to_speech_pipeline.py View File

@@ -0,0 +1,46 @@
import time
from typing import Any, Dict, List

import numpy as np

from modelscope.models import Model
from modelscope.models.audio.tts.am import SambertNetHifi16k
from modelscope.models.audio.tts.vocoder import Hifigan16k
from modelscope.pipelines.base import Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import TextToTacotronSymbols, build_preprocessor
from modelscope.utils.constant import Fields, Tasks

__all__ = ['TextToSpeechSambertHifigan16kPipeline']


@PIPELINES.register_module(
Tasks.text_to_speech, module_name=r'tts-sambert-hifigan-16k')
class TextToSpeechSambertHifigan16kPipeline(Pipeline):

def __init__(self,
config_file: str = None,
model: List[Model] = None,
preprocessor: TextToTacotronSymbols = None,
**kwargs):
super().__init__(
config_file=config_file,
model=model,
preprocessor=preprocessor,
**kwargs)
assert len(model) == 2, 'two models are required: [acoustic model, vocoder]'
self._am = model[0]
self._vocoder = model[1]
self._preprocessor = preprocessor

def forward(self, inputs: Dict[str, Any]) -> Dict[str, np.ndarray]:
texts = inputs['texts']
audio_total = np.empty((0), dtype='int16')
for line in texts:
line = line.strip().split('\t')
audio = self._vocoder.forward(self._am.forward(line[1]))
audio_total = np.append(audio_total, audio, axis=0)
return {'output': audio_total}

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs

+ 1
- 0
modelscope/preprocessors/__init__.py View File

@@ -8,3 +8,4 @@ from .image import LoadImage, load_image
from .nlp import * # noqa F403
from .space.dialog_intent_prediction_preprocessor import * # noqa F403
from .space.dialog_modeling_preprocessor import * # noqa F403
from .text_to_speech import * # noqa F403

+ 2
- 1
modelscope/preprocessors/audio.py View File

@@ -5,7 +5,6 @@ from typing import Any, Dict
import numpy as np
import scipy.io.wavfile as wav
import torch
import torchaudio.compliance.kaldi as kaldi
from numpy.ctypeslib import ndpointer

from modelscope.utils.constant import Fields
@@ -123,6 +122,8 @@ class Feature:
if self.feat_type == 'raw':
return utt
elif self.feat_type == 'fbank':
# use a local import until the modelscope framework supports lazy loading
import torchaudio.compliance.kaldi as kaldi
if len(utt.shape) == 1:
utt = utt.unsqueeze(0)
feat = kaldi.fbank(utt, **self.fbank_config)


+ 53
- 0
modelscope/preprocessors/text_to_speech.py View File

@@ -0,0 +1,53 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import io
from typing import Any, Dict, Union

import ttsfrd

from modelscope.fileio import File
from modelscope.models.audio.tts.frontend import GenericTtsFrontend
from modelscope.models.base import Model
from modelscope.utils.audio.tts_exceptions import * # noqa F403
from modelscope.utils.constant import Fields
from .base import Preprocessor
from .builder import PREPROCESSORS

__all__ = ['TextToTacotronSymbols', 'text_to_tacotron_symbols']


@PREPROCESSORS.register_module(
Fields.audio, module_name=r'text_to_tacotron_symbols')
class TextToTacotronSymbols(Preprocessor):
"""extract tacotron symbols from text.

Args:
model_name (str): model id or local path of the TTS frontend resource
lang_type (str): language type, valid values are "pinyin" and "chenmix"
"""

def __init__(self, model_name, lang_type='pinyin'):
self._frontend_model = Model.from_pretrained(
model_name, lang_type=lang_type)
assert self._frontend_model is not None, 'load model from pretrained failed'

def __call__(self, data: str) -> Dict[str, Any]:
"""Call functions to load text and get tacotron symbols.

Args:
data (str): input text encoded in UTF-8
Returns:
Dict[str, Any]: a dict whose 'texts' field holds the lines in tacotron symbols format.
"""
return self._frontend_model.forward(data)


def text_to_tacotron_symbols(text='', path='./', lang='pinyin'):
""" simple interface to transform text to tacotron symbols

Args:
text (str): input text
path (str): model id or local path of the frontend resource
lang (str): language type, one of "pinyin" and "chenmix"
"""
transform = TextToTacotronSymbols(path, lang)
return transform(text)
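
A hedged usage sketch of the preprocessor registered above, following tests/preprocessors/test_text_to_speech.py (downloading the frontend model is assumed to succeed):

# illustrative only; the model id comes from the tests below
from modelscope.preprocessors import build_preprocessor
from modelscope.utils.constant import Fields

cfg = dict(
    type='text_to_tacotron_symbols',
    model_name='damo/speech_binary_tts_frontend_resource',
    lang_type='pinyin')
preprocessor = build_preprocessor(cfg, Fields.audio)
print(preprocessor('今天天气不错,我们去散步吧。')['texts'])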

+ 0
- 0
modelscope/utils/audio/__init__.py View File


+ 42
- 0
modelscope/utils/audio/tts_exceptions.py View File

@@ -0,0 +1,42 @@
"""
Define TTS exceptions
"""


class TtsException(Exception):
"""
TTS exception class.
"""
pass


class TtsFrontendException(TtsException):
"""
TTS frontend module level exceptions.
"""
pass


class TtsFrontendInitializeFailedException(TtsFrontendException):
"""
If the TTS frontend resource is invalid or does not exist, this exception will be raised.
"""
pass


class TtsFrontendLanguageTypeInvalidException(TtsFrontendException):
"""
If language type is invalid, this exception will be raised.
"""


class TtsVocoderException(TtsException):
"""
Vocoder exception
"""


class TtsVocoderMelspecShapeMismatchException(TtsVocoderException):
"""
If the vocoder's input melspec shape does not match the expected shape, this exception will be raised.
"""

+ 0
- 1
modelscope/utils/registry.py View File

@@ -67,7 +67,6 @@ class Registry(object):
if module_name in self._modules[group_key]:
raise KeyError(f'{module_name} is already registered in '
f'{self._name}[{group_key}]')

self._modules[group_key][module_name] = module_cls
module_cls.group_key = group_key



+ 1
- 0
requirements.txt View File

@@ -2,4 +2,5 @@
-r requirements/pipeline.txt
-r requirements/multi-modal.txt
-r requirements/nlp.txt
-r requirements/audio.txt
-r requirements/cv.txt

+ 26
- 0
requirements/audio.txt View File

@@ -0,0 +1,26 @@
#tts
h5py==2.10.0
#https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp36-cp36m-linux_x86_64.whl
https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp37-cp37m-linux_x86_64.whl
https://swap.oss-cn-hangzhou.aliyuncs.com/Jiaqi%2Fmaas%2Ftts%2Frequirements%2Fpytorch_wavelets-1.3.0-py3-none-any.whl?Expires=1685688388&OSSAccessKeyId=LTAI4Ffebq4d9jTVDwiSbY4L&Signature=jcQbg5EZ%2Bdys3%2F4BRn3srrKLdIg%3D
#https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp38-cp38-linux_x86_64.whl
#https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp39-cp39-linux_x86_64.whl
inflect
keras==2.2.4
librosa
lxml
matplotlib
nara_wpe
numpy==1.18.*
protobuf==3.20.*
ptflops
PyWavelets>=1.0.0
scikit-learn==0.23.2
sox
tensorboard
tensorflow==1.15.*
torch==1.10.*
torchaudio
torchvision
tqdm
unidecode

+ 60
- 0
tests/pipelines/test_text_to_speech.py View File

@@ -0,0 +1,60 @@
import time
import unittest

import json
import tensorflow as tf
# NOTICE: TensorFlow 1.15 is not fully compatible with PyTorch.
# A segmentation fault may be raised by the pytorch cpp library
# if 'import tensorflow' comes before 'import torch'.
# Putting an 'import torch' here bypasses this incompatibility.
import torch
from scipy.io.wavfile import write

from modelscope.fileio import File
from modelscope.models import Model, build_model
from modelscope.models.audio.tts.am import SambertNetHifi16k
from modelscope.models.audio.tts.vocoder import AttrDict, Hifigan16k
from modelscope.pipelines import pipeline
from modelscope.preprocessors import build_preprocessor
from modelscope.utils.constant import Fields, InputFields, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()


class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase):

def test_pipeline(self):
lang_type = 'pinyin'
text = '明天天气怎么样'
preprocessor_model_id = 'damo/speech_binary_tts_frontend_resource'
am_model_id = 'damo/speech_sambert16k_tts_zhitian_emo'
voc_model_id = 'damo/speech_hifigan16k_tts_zhitian_emo'

cfg_preprocessor = dict(
type='text_to_tacotron_symbols',
model_name=preprocessor_model_id,
lang_type=lang_type)
preprocessor = build_preprocessor(cfg_preprocessor, Fields.audio)
self.assertTrue(preprocessor is not None)

am = Model.from_pretrained(am_model_id)
self.assertTrue(am is not None)

voc = Model.from_pretrained(voc_model_id)
self.assertTrue(voc is not None)

sambert_tts = pipeline(
pipeline_name='tts-sambert-hifigan-16k',
config_file='',
model=[am, voc],
preprocessor=preprocessor)
self.assertTrue(sambert_tts is not None)

output = sambert_tts(text)
self.assertTrue(len(output['output']) > 0)
write('output.wav', 16000, output['output'])


if __name__ == '__main__':
unittest.main()

+ 28
- 0
tests/preprocessors/test_text_to_speech.py View File

@@ -0,0 +1,28 @@
import shutil
import unittest

from modelscope.preprocessors import build_preprocessor
from modelscope.utils.constant import Fields, InputFields
from modelscope.utils.logger import get_logger

logger = get_logger()


class TtsPreprocessorTest(unittest.TestCase):

def test_preprocess(self):
lang_type = 'pinyin'
text = '今天天气不错,我们去散步吧。'
cfg = dict(
type='text_to_tacotron_symbols',
model_name='damo/speech_binary_tts_frontend_resource',
lang_type=lang_type)
preprocessor = build_preprocessor(cfg, Fields.audio)
output = preprocessor(text)
self.assertTrue(output)
for line in output['texts']:
print(line)


if __name__ == '__main__':
unittest.main()

+ 6
- 0
tests/run.py View File

@@ -7,6 +7,12 @@ import sys
import unittest
from fnmatch import fnmatch

# NOTICE: TensorFlow 1.15 is not fully compatible with PyTorch.
# A segmentation fault may be raised by the pytorch cpp library
# if 'import tensorflow' comes before 'import torch'.
# Putting an 'import torch' here bypasses this incompatibility.
import torch

from modelscope.utils.logger import get_logger
from modelscope.utils.test_utils import set_test_level, test_level


