@@ -24,6 +24,7 @@ wheels/ | |||
.installed.cfg | |||
*.egg | |||
/package | |||
/temp | |||
MANIFEST | |||
# PyInstaller | |||
@@ -123,3 +124,7 @@ replace.sh | |||
# Pytorch | |||
*.pth | |||
# audio | |||
*.wav |
@@ -29,3 +29,15 @@ reference: [https://huggingface.co/docs/tokenizers/installation#installation-fro | |||
> ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. | |||
Version conflicts can occur because some dependency versions are mutually incompatible; in most cases this does not affect normal operation. | |||
### 3. Version error when installing PyTorch | |||
> ERROR: Ignored the following versions that require a different python version: 1.1.0 Requires-Python >=3.8; 1.1.0rc1 Requires-Python >=3.8; 1.1.1 Requires-Python >=3.8 | |||
> ERROR: Could not find a version that satisfies the requirement torch==1.8.1+cu111 (from versions: 1.0.0, 1.0.1, 1.0.1.post2, 1.1.0, 1.2.0, 1.3.0, 1.3.1, 1.4.0, 1.5.0, 1.5.1, 1.6.0, 1.7.0, 1.7.1, 1.8.0, 1.8.1, 1.9.0, 1.9.1, 1.10.0, 1.10.1, 1.10.2, 1.11.0) | |||
> ERROR: No matching distribution found for torch==1.8.1+cu111 | |||
Use the following command when installing: | |||
```shell | |||
pip install -f https://download.pytorch.org/whl/torch_stable.html -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt | |||
``` |
@@ -25,6 +25,10 @@ ModelScope Library currently supports the two major deep learning frameworks, TensorFlow and PyTorch, for | |||
* [Pytorch安装指导](https://pytorch.org/get-started/locally/) | |||
* [Tensorflow安装指导](https://www.tensorflow.org/install/pip) | |||
Some third-party dependencies require numpy to be installed in advance | |||
``` | |||
pip install numpy | |||
``` | |||
## ModelScope library 安装 | |||
@@ -1,5 +1,7 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from .audio.tts.am import SambertNetHifi16k | |||
from .audio.tts.vocoder import Hifigan16k | |||
from .base import Model | |||
from .builder import MODELS, build_model | |||
from .nlp import BertForSequenceClassification, SbertForSentenceSimilarity |
@@ -0,0 +1 @@ | |||
from .sambert_hifi_16k import * # noqa F403 |
@@ -0,0 +1,8 @@ | |||
from .robutrans import RobuTrans | |||
def create_model(name, hparams): | |||
if name == 'robutrans': | |||
return RobuTrans(hparams) | |||
else: | |||
raise ValueError('Unknown model: ' + name) | |||
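Not part of the original file, but to make the dispatch concrete, a minimal caller sketch; `hparams` is assumed to be whatever hyper-parameter object `RobuTrans` accepts, and nothing else below is defined by this diff.

```python
# Hypothetical caller; `hparams` must carry the fields RobuTrans reads
# (encoder_num_layers, predictor_filter_size, ...).
def build_tts_model(hparams):
    return create_model('robutrans', hparams)

# Any other name fails fast:
#   create_model('tacotron', hparams)  ->  ValueError: Unknown model: tacotron
```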
@@ -0,0 +1,82 @@ | |||
"""Functions for compatibility with different TensorFlow versions.""" | |||
import tensorflow as tf | |||
def is_tf2(): | |||
"""Returns ``True`` if running TensorFlow 2.0.""" | |||
return tf.__version__.startswith('2') | |||
def tf_supports(symbol): | |||
"""Returns ``True`` if TensorFlow defines :obj:`symbol`.""" | |||
return _string_to_tf_symbol(symbol) is not None | |||
def tf_any(*symbols): | |||
"""Returns the first supported symbol.""" | |||
for symbol in symbols: | |||
module = _string_to_tf_symbol(symbol) | |||
if module is not None: | |||
return module | |||
return None | |||
def tf_compat(v2=None, v1=None): # pylint: disable=invalid-name | |||
"""Returns the compatible symbol based on the current TensorFlow version. | |||
Args: | |||
v2: The candidate v2 symbol name. | |||
v1: The candidate v1 symbol name. | |||
Returns: | |||
A TensorFlow symbol. | |||
Raises: | |||
ValueError: if no symbol can be found. | |||
""" | |||
candidates = [] | |||
if v2 is not None: | |||
candidates.append(v2) | |||
if v1 is not None: | |||
candidates.append(v1) | |||
candidates.append('compat.v1.%s' % v1) | |||
symbol = tf_any(*candidates) | |||
if symbol is None: | |||
raise ValueError('Failed to resolve any TensorFlow symbol among: %s' % ', '.join(candidates)) | |||
return symbol | |||
def name_from_variable_scope(name=''): | |||
"""Creates a name prefixed by the current variable scope.""" | |||
var_scope = tf_compat(v1='get_variable_scope')().name | |||
compat_name = '' | |||
if name: | |||
compat_name = '%s/' % name | |||
if var_scope: | |||
compat_name = '%s/%s' % (var_scope, compat_name) | |||
return compat_name | |||
def reuse(): | |||
"""Returns ``True`` if the current variable scope is marked for reuse.""" | |||
return tf_compat(v1='get_variable_scope')().reuse | |||
def _string_to_tf_symbol(symbol): | |||
modules = symbol.split('.') | |||
namespace = tf | |||
for module in modules: | |||
namespace = getattr(namespace, module, None) | |||
if namespace is None: | |||
return None | |||
return namespace | |||
# pylint: disable=invalid-name | |||
gfile_copy = tf_compat(v2='io.gfile.copy', v1='gfile.Copy') | |||
gfile_exists = tf_compat(v2='io.gfile.exists', v1='gfile.Exists') | |||
gfile_open = tf_compat(v2='io.gfile.GFile', v1='gfile.GFile') | |||
is_tensor = tf_compat(v2='is_tensor', v1='contrib.framework.is_tensor') | |||
logging = tf_compat(v1='logging') | |||
nest = tf_compat(v2='nest', v1='contrib.framework.nest') |
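As an illustration only (not from the original file), the module-level aliases above can be used uniformly whichever major TensorFlow version is installed; the file path below is a placeholder.

```python
# Sketch: the aliases resolved by tf_compat behave the same under TF1 and TF2.
logging.info('running under TF2: %s', is_tf2())

with gfile_open('/tmp/compat_demo.txt', 'w') as f:  # illustrative path
    f.write('hello')
assert gfile_exists('/tmp/compat_demo.txt')
```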
@@ -0,0 +1,273 @@ | |||
import tensorflow as tf | |||
def build_sequence_mask(sequence_length, | |||
maximum_length=None, | |||
dtype=tf.float32): | |||
"""Builds the dot product mask. | |||
Args: | |||
sequence_length: The sequence length. | |||
maximum_length: Optional size of the returned time dimension. Otherwise | |||
it is the maximum of :obj:`sequence_length`. | |||
dtype: The type of the mask tensor. | |||
Returns: | |||
A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape | |||
``[batch_size, max_length]``. | |||
""" | |||
mask = tf.sequence_mask( | |||
sequence_length, maxlen=maximum_length, dtype=dtype) | |||
return mask | |||
def norm(inputs): | |||
"""Layer normalizes :obj:`inputs`.""" | |||
return tf.contrib.layers.layer_norm(inputs, begin_norm_axis=-1) | |||
def pad_in_time(x, padding_shape): | |||
"""Helper function to pad a tensor in the time dimension and retain the static depth dimension. | |||
Args: | |||
x: [Batch, Time, Frequency] | |||
padding_shape: [left, right] numbers of zero frames added around the time dimension | |||
Returns: | |||
padded x | |||
""" | |||
depth = x.get_shape().as_list()[-1] | |||
x = tf.pad(x, [[0, 0], padding_shape, [0, 0]]) | |||
x.set_shape((None, None, depth)) | |||
return x | |||
def pad_in_time_right(x, padding_length): | |||
"""Helper function to pad a tensor in the time dimension and retain the static depth dimension. | |||
Args: | |||
x: [Batch, Time, Frequency] | |||
padding_length: number of zero frames appended after the time dimension | |||
Returns: | |||
padded x | |||
""" | |||
depth = x.get_shape().as_list()[-1] | |||
x = tf.pad(x, [[0, 0], [0, padding_length], [0, 0]]) | |||
x.set_shape((None, None, depth)) | |||
return x | |||
def feed_forward(x, ffn_dim, memory_units, mode, dropout=0.0): | |||
"""Implements the Transformer's "Feed Forward" layer. | |||
.. math:: | |||
ffn(x) = max(0, x*W_1 + b_1)*W_2 | |||
Args: | |||
x: The input. | |||
ffn_dim: The number of units of the nonlinear transformation. | |||
memory_units: the number of units of linear transformation | |||
mode: A ``tf.estimator.ModeKeys`` mode. | |||
dropout: The probability to drop units from the inner transformation. | |||
Returns: | |||
The transformed input. | |||
""" | |||
inner = tf.layers.conv1d(x, ffn_dim, 1, activation=tf.nn.relu) | |||
inner = tf.layers.dropout( | |||
inner, rate=dropout, training=mode == tf.estimator.ModeKeys.TRAIN) | |||
outer = tf.layers.conv1d(inner, memory_units, 1, use_bias=False) | |||
return outer | |||
def drop_and_add(inputs, outputs, mode, dropout=0.0): | |||
"""Drops units in the outputs and adds the previous values. | |||
Args: | |||
inputs: The input of the previous layer. | |||
outputs: The output of the previous layer. | |||
mode: A ``tf.estimator.ModeKeys`` mode. | |||
dropout: The probability to drop units in :obj:`outputs`. | |||
Returns: | |||
The residual and normalized output. | |||
""" | |||
outputs = tf.layers.dropout(outputs, rate=dropout, training=mode) | |||
input_dim = inputs.get_shape().as_list()[-1] | |||
output_dim = outputs.get_shape().as_list()[-1] | |||
if input_dim == output_dim: | |||
outputs += inputs | |||
return outputs | |||
def MemoryBlock( | |||
inputs, | |||
filter_size, | |||
mode, | |||
mask=None, | |||
dropout=0.0, | |||
): | |||
""" | |||
Define the bidirectional memory block in FSMN | |||
Args: | |||
inputs: The output of the previous layer. [Batch, Time, Frequency] | |||
filter_size: memory block filter size | |||
mode: Training or Evaluation | |||
mask: A ``tf.Tensor`` applied to the memory block output | |||
dropout: The probability to drop units | |||
Returns: | |||
output: 3-D tensor ([Batch, Time, Frequency]) | |||
""" | |||
static_shape = inputs.get_shape().as_list() | |||
depth = static_shape[-1] | |||
inputs = tf.expand_dims(inputs, axis=1) # [Batch, 1, Time, Frequency] | |||
depthwise_filter = tf.get_variable( | |||
'depth_conv_w', | |||
shape=[1, filter_size, depth, 1], | |||
initializer=tf.glorot_uniform_initializer(), | |||
dtype=tf.float32) | |||
memory = tf.nn.depthwise_conv2d( | |||
input=inputs, | |||
filter=depthwise_filter, | |||
strides=[1, 1, 1, 1], | |||
padding='SAME', | |||
rate=[1, 1], | |||
data_format='NHWC') | |||
memory = memory + inputs | |||
output = tf.layers.dropout(memory, rate=dropout, training=mode) | |||
output = tf.reshape( | |||
output, | |||
[tf.shape(output)[0], tf.shape(output)[2], depth]) | |||
if mask is not None: | |||
output = output * tf.expand_dims(mask, -1) | |||
return output | |||
def MemoryBlockV2( | |||
inputs, | |||
filter_size, | |||
mode, | |||
shift=0, | |||
mask=None, | |||
dropout=0.0, | |||
): | |||
""" | |||
Define the bidirectional memory block in FSMN | |||
Args: | |||
inputs: The output of the previous layer. [Batch, Time, Frequency] | |||
filter_size: memory block filter size | |||
mode: Training or Evaluation | |||
shift: left padding, to control delay | |||
mask: A ``tf.Tensor`` applied to the memory block output | |||
dropout: The probability to drop units | |||
Returns: | |||
output: 3-D tensor ([Batch, Time, Frequency]) | |||
""" | |||
if mask is not None: | |||
inputs = inputs * tf.expand_dims(mask, -1) | |||
static_shape = inputs.get_shape().as_list() | |||
depth = static_shape[-1] | |||
# padding | |||
left_padding = int(round((filter_size - 1) / 2)) | |||
right_padding = int((filter_size - 1) / 2) | |||
if shift > 0: | |||
left_padding = left_padding + shift | |||
right_padding = right_padding - shift | |||
pad_inputs = pad_in_time(inputs, [left_padding, right_padding]) | |||
pad_inputs = tf.expand_dims( | |||
pad_inputs, axis=1) # [Batch, 1, Time, Frequency] | |||
depthwise_filter = tf.get_variable( | |||
'depth_conv_w', | |||
shape=[1, filter_size, depth, 1], | |||
initializer=tf.glorot_uniform_initializer(), | |||
dtype=tf.float32) | |||
memory = tf.nn.depthwise_conv2d( | |||
input=pad_inputs, | |||
filter=depthwise_filter, | |||
strides=[1, 1, 1, 1], | |||
padding='VALID', | |||
rate=[1, 1], | |||
data_format='NHWC') | |||
memory = tf.reshape( | |||
memory, | |||
[tf.shape(memory)[0], tf.shape(memory)[2], depth]) | |||
memory = memory + inputs | |||
output = tf.layers.dropout(memory, rate=dropout, training=mode) | |||
if mask is not None: | |||
output = output * tf.expand_dims(mask, -1) | |||
return output | |||
def UniMemoryBlock( | |||
inputs, | |||
filter_size, | |||
mode, | |||
cache=None, | |||
mask=None, | |||
dropout=0.0, | |||
): | |||
""" | |||
Define the unidirectional memory block in FSMN | |||
Args: | |||
inputs: The output of the previous layer. [Batch, Time, Frequency] | |||
filter_size: memory block filter size | |||
cache: query cache for streaming inference | |||
mode: Training or Evaluation | |||
mask: A ``tf.Tensor`` applied to the memory block output | |||
dropout: The probability to drop units | |||
Returns: | |||
output: 3-D tensor ([Batch, Time, Frequency]) | |||
""" | |||
if cache is not None: | |||
static_shape = cache['queries'].get_shape().as_list() | |||
depth = static_shape[-1] | |||
queries = tf.slice(cache['queries'], [0, 1, 0], [ | |||
tf.shape(cache['queries'])[0], | |||
tf.shape(cache['queries'])[1] - 1, depth | |||
]) | |||
queries = tf.concat([queries, inputs], axis=1) | |||
cache['queries'] = queries | |||
else: | |||
padding_length = filter_size - 1 | |||
queries = pad_in_time(inputs, [padding_length, 0]) | |||
queries = tf.expand_dims(queries, axis=1) # [Batch, 1, Time, Frequency] | |||
static_shape = queries.get_shape().as_list() | |||
depth = static_shape[-1] | |||
depthwise_filter = tf.get_variable( | |||
'depth_conv_w', | |||
shape=[1, filter_size, depth, 1], | |||
initializer=tf.glorot_uniform_initializer(), | |||
dtype=tf.float32) | |||
memory = tf.nn.depthwise_conv2d( | |||
input=queries, | |||
filter=depthwise_filter, | |||
strides=[1, 1, 1, 1], | |||
padding='VALID', | |||
rate=[1, 1], | |||
data_format='NHWC') | |||
memory = tf.reshape( | |||
memory, | |||
[tf.shape(memory)[0], tf.shape(memory)[2], depth]) | |||
memory = memory + inputs | |||
output = tf.layers.dropout(memory, rate=dropout, training=mode) | |||
if mask is not None: | |||
output = output * tf.expand_dims(mask, -1) | |||
return output |
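To make the padding arithmetic concrete, a hedged sketch (TF1 graph mode assumed, since the module relies on `tf.get_variable` and `tf.contrib`): with `filter_size=3` and `shift=0`, `MemoryBlockV2` pads one frame on each side, so the output keeps the input's time length.

```python
import tensorflow as tf

x = tf.random.normal([2, 7, 16])                      # [Batch, Time, Frequency]
mask = build_sequence_mask([7, 5], maximum_length=7)  # [2, 7] float mask
with tf.variable_scope('memory_demo'):
    y = MemoryBlockV2(x, filter_size=3, mode=False, mask=mask)
# y: [2, 7, 16]; frames beyond each true length are zeroed by the mask.
```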
@@ -0,0 +1,178 @@ | |||
import tensorflow as tf | |||
from . import fsmn | |||
class FsmnEncoder(): | |||
"""Encoder using Fsmn | |||
""" | |||
def __init__(self, | |||
filter_size, | |||
fsmn_num_layers, | |||
dnn_num_layers, | |||
num_memory_units=512, | |||
ffn_inner_dim=2048, | |||
dropout=0.0, | |||
position_encoder=None): | |||
"""Initializes the parameters of the encoder. | |||
Args: | |||
filter_size: the total order of memory block | |||
fsmn_num_layers: The number of fsmn layers. | |||
dnn_num_layers: The number of dnn layers | |||
num_memory_units: The number of memory units. | |||
ffn_inner_dim: The number of units of the inner linear transformation | |||
in the feed forward layer. | |||
dropout: The probability to drop units from the outputs. | |||
position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to | |||
apply on inputs or ``None``. | |||
""" | |||
super(FsmnEncoder, self).__init__() | |||
self.filter_size = filter_size | |||
self.fsmn_num_layers = fsmn_num_layers | |||
self.dnn_num_layers = dnn_num_layers | |||
self.num_memory_units = num_memory_units | |||
self.ffn_inner_dim = ffn_inner_dim | |||
self.dropout = dropout | |||
self.position_encoder = position_encoder | |||
def encode(self, inputs, sequence_length=None, mode=True): | |||
if self.position_encoder is not None: | |||
inputs = self.position_encoder(inputs) | |||
inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) | |||
mask = fsmn.build_sequence_mask( | |||
sequence_length, maximum_length=tf.shape(inputs)[1]) | |||
state = () | |||
for layer in range(self.fsmn_num_layers): | |||
with tf.variable_scope('fsmn_layer_{}'.format(layer)): | |||
with tf.variable_scope('ffn'): | |||
context = fsmn.feed_forward( | |||
inputs, | |||
self.ffn_inner_dim, | |||
self.num_memory_units, | |||
mode, | |||
dropout=self.dropout) | |||
with tf.variable_scope('memory'): | |||
memory = fsmn.MemoryBlock( | |||
context, | |||
self.filter_size, | |||
mode, | |||
mask=mask, | |||
dropout=self.dropout) | |||
memory = fsmn.drop_and_add( | |||
inputs, memory, mode, dropout=self.dropout) | |||
inputs = memory | |||
state += (tf.reduce_mean(inputs, axis=1), ) | |||
for layer in range(self.dnn_num_layers): | |||
with tf.variable_scope('dnn_layer_{}'.format(layer)): | |||
transformed = fsmn.feed_forward( | |||
inputs, | |||
self.ffn_inner_dim, | |||
self.num_memory_units, | |||
mode, | |||
dropout=self.dropout) | |||
inputs = transformed | |||
state += (tf.reduce_mean(inputs, axis=1), ) | |||
outputs = inputs | |||
return (outputs, state, sequence_length) | |||
class FsmnEncoderV2(): | |||
"""Encoder using Fsmn | |||
""" | |||
def __init__(self, | |||
filter_size, | |||
fsmn_num_layers, | |||
dnn_num_layers, | |||
num_memory_units=512, | |||
ffn_inner_dim=2048, | |||
dropout=0.0, | |||
shift=0, | |||
position_encoder=None): | |||
"""Initializes the parameters of the encoder. | |||
Args: | |||
filter_size: the total order of memory block | |||
fsmn_num_layers: The number of fsmn layers. | |||
dnn_num_layers: The number of dnn layers | |||
num_memory_units: The number of memory units. | |||
ffn_inner_dim: The number of units of the inner linear transformation | |||
in the feed forward layer. | |||
dropout: The probability to drop units from the outputs. | |||
shift: left padding, to control delay | |||
position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to | |||
apply on inputs or ``None``. | |||
""" | |||
super(FsmnEncoderV2, self).__init__() | |||
self.filter_size = filter_size | |||
self.fsmn_num_layers = fsmn_num_layers | |||
self.dnn_num_layers = dnn_num_layers | |||
self.num_memory_units = num_memory_units | |||
self.ffn_inner_dim = ffn_inner_dim | |||
self.dropout = dropout | |||
self.shift = shift | |||
if not isinstance(shift, list): | |||
self.shift = [shift for _ in range(self.fsmn_num_layers)] | |||
self.position_encoder = position_encoder | |||
def encode(self, inputs, sequence_length=None, mode=True): | |||
if self.position_encoder is not None: | |||
inputs = self.position_encoder(inputs) | |||
inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) | |||
mask = fsmn.build_sequence_mask( | |||
sequence_length, maximum_length=tf.shape(inputs)[1]) | |||
state = () | |||
for layer in range(self.fsmn_num_layers): | |||
with tf.variable_scope('fsmn_layer_{}'.format(layer)): | |||
with tf.variable_scope('ffn'): | |||
context = fsmn.feed_forward( | |||
inputs, | |||
self.ffn_inner_dim, | |||
self.num_memory_units, | |||
mode, | |||
dropout=self.dropout) | |||
with tf.variable_scope('memory'): | |||
memory = fsmn.MemoryBlockV2( | |||
context, | |||
self.filter_size, | |||
mode, | |||
shift=self.shift[layer], | |||
mask=mask, | |||
dropout=self.dropout) | |||
memory = fsmn.drop_and_add( | |||
inputs, memory, mode, dropout=self.dropout) | |||
inputs = memory | |||
state += (tf.reduce_mean(inputs, axis=1), ) | |||
for layer in range(self.dnn_num_layers): | |||
with tf.variable_scope('dnn_layer_{}'.format(layer)): | |||
transformed = fsmn.feed_forward( | |||
inputs, | |||
self.ffn_inner_dim, | |||
self.num_memory_units, | |||
mode, | |||
dropout=self.dropout) | |||
inputs = transformed | |||
state += (tf.reduce_mean(inputs, axis=1), ) | |||
outputs = inputs | |||
return (outputs, state, sequence_length) |
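A hedged usage sketch for the encoder above (TF1 graph mode; all sizes are illustrative, not values used by the model):

```python
import tensorflow as tf

encoder = FsmnEncoderV2(
    filter_size=11, fsmn_num_layers=2, dnn_num_layers=1,
    num_memory_units=32, ffn_inner_dim=64, dropout=0.1, shift=0)
feats = tf.random.normal([2, 20, 32])
outputs, state, lengths = encoder.encode(
    feats, sequence_length=[20, 16], mode=False)  # mode=False: dropout off
# outputs: [2, 20, 32]; state holds one mean-pooled vector per layer.
```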
@@ -0,0 +1,160 @@ | |||
import numpy as np | |||
import tensorflow as tf | |||
from tensorflow.contrib.seq2seq import Helper | |||
class VarTestHelper(Helper): | |||
def __init__(self, batch_size, inputs, dim): | |||
with tf.name_scope('VarTestHelper'): | |||
self._batch_size = batch_size | |||
self._inputs = inputs | |||
self._dim = dim | |||
num_steps = tf.shape(self._inputs)[1] | |||
self._lengths = tf.tile([num_steps], [self._batch_size]) | |||
self._inputs = tf.roll(inputs, shift=-1, axis=1) | |||
self._init_inputs = inputs[:, 0, :] | |||
@property | |||
def batch_size(self): | |||
return self._batch_size | |||
@property | |||
def sample_ids_shape(self): | |||
return tf.TensorShape([]) | |||
@property | |||
def sample_ids_dtype(self): | |||
return np.int32 | |||
def initialize(self, name=None): | |||
return (tf.tile([False], [self._batch_size]), | |||
_go_frames(self._batch_size, self._dim, self._init_inputs)) | |||
def sample(self, time, outputs, state, name=None): | |||
return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them | |||
def next_inputs(self, time, outputs, state, sample_ids, name=None): | |||
with tf.name_scope(name or 'VarTestHelper'): | |||
finished = (time + 1 >= self._lengths) | |||
next_inputs = tf.concat([outputs, self._inputs[:, time, :]], | |||
axis=-1) | |||
return (finished, next_inputs, state) | |||
class VarTrainingHelper(Helper): | |||
def __init__(self, targets, inputs, dim): | |||
with tf.name_scope('VarTrainingHelper'): | |||
self._targets = targets # [N, T_in, 1] | |||
self._batch_size = tf.shape(inputs)[0] # N | |||
self._inputs = inputs | |||
self._dim = dim | |||
num_steps = tf.shape(self._targets)[1] | |||
self._lengths = tf.tile([num_steps], [self._batch_size]) | |||
self._inputs = tf.roll(inputs, shift=-1, axis=1) | |||
self._init_inputs = inputs[:, 0, :] | |||
@property | |||
def batch_size(self): | |||
return self._batch_size | |||
@property | |||
def sample_ids_shape(self): | |||
return tf.TensorShape([]) | |||
@property | |||
def sample_ids_dtype(self): | |||
return np.int32 | |||
def initialize(self, name=None): | |||
return (tf.tile([False], [self._batch_size]), | |||
_go_frames(self._batch_size, self._dim, self._init_inputs)) | |||
def sample(self, time, outputs, state, name=None): | |||
return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them | |||
def next_inputs(self, time, outputs, state, sample_ids, name=None): | |||
with tf.name_scope(name or 'VarTrainingHelper'): | |||
finished = (time + 1 >= self._lengths) | |||
next_inputs = tf.concat( | |||
[self._targets[:, time, :], self._inputs[:, time, :]], axis=-1) | |||
return (finished, next_inputs, state) | |||
class VarTrainingSSHelper(Helper): | |||
def __init__(self, targets, inputs, dim, global_step, schedule_begin, | |||
alpha, decay_steps): | |||
with tf.name_scope('VarTrainingSSHelper'): | |||
self._targets = targets # [N, T_in, 1] | |||
self._batch_size = tf.shape(inputs)[0] # N | |||
self._inputs = inputs | |||
self._dim = dim | |||
num_steps = tf.shape(self._targets)[1] | |||
self._lengths = tf.tile([num_steps], [self._batch_size]) | |||
self._inputs = tf.roll(inputs, shift=-1, axis=1) | |||
self._init_inputs = inputs[:, 0, :] | |||
# for scheduled sampling | |||
self._global_step = global_step | |||
self._schedule_begin = schedule_begin | |||
self._alpha = alpha | |||
self._decay_steps = decay_steps | |||
@property | |||
def batch_size(self): | |||
return self._batch_size | |||
@property | |||
def sample_ids_shape(self): | |||
return tf.TensorShape([]) | |||
@property | |||
def sample_ids_dtype(self): | |||
return np.int32 | |||
def initialize(self, name=None): | |||
self._ratio = _tf_decay(self._global_step, self._schedule_begin, | |||
self._alpha, self._decay_steps) | |||
return (tf.tile([False], [self._batch_size]), | |||
_go_frames(self._batch_size, self._dim, self._init_inputs)) | |||
def sample(self, time, outputs, state, name=None): | |||
return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them | |||
def next_inputs(self, time, outputs, state, sample_ids, name=None): | |||
with tf.name_scope(name or 'VarTrainingSSHelper'): | |||
finished = (time + 1 >= self._lengths) | |||
next_inputs_tmp = tf.cond( | |||
tf.less( | |||
tf.random_uniform([], minval=0, maxval=1, | |||
dtype=tf.float32), self._ratio), | |||
lambda: self._targets[:, time, :], lambda: outputs) | |||
next_inputs = tf.concat( | |||
[next_inputs_tmp, self._inputs[:, time, :]], axis=-1) | |||
return (finished, next_inputs, state) | |||
def _go_frames(batch_size, dim, init_inputs): | |||
'''Returns the initial <GO> frames: zeros of width `dim` concatenated with the first input frame''' | |||
return tf.concat([tf.tile([[0.0]], [batch_size, dim]), init_inputs], | |||
axis=-1) | |||
def _tf_decay(global_step, schedule_begin, alpha, decay_steps): | |||
tfr = tf.train.exponential_decay( | |||
1.0, | |||
global_step=global_step - schedule_begin, | |||
decay_steps=decay_steps, | |||
decay_rate=alpha, | |||
name='tfr_decay') | |||
final_tfr = tf.cond( | |||
tf.less(global_step, schedule_begin), lambda: 1.0, lambda: tfr) | |||
return final_tfr |
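For intuition, a short sketch of the teacher-forcing schedule `_tf_decay` implements (TF1 session mode; the numbers are illustrative): the ratio stays at 1.0 until `schedule_begin`, then decays as `alpha**(step - schedule_begin)`.

```python
import tensorflow as tf

step = tf.placeholder(tf.int32, [])
ratio = _tf_decay(step, schedule_begin=10000, alpha=0.9999, decay_steps=1)
with tf.Session() as sess:
    for s in (0, 10000, 50000):
        print(s, sess.run(ratio, {step: s}))  # 1.0, 1.0, then ~0.018
```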
@@ -0,0 +1,461 @@ | |||
import tensorflow as tf | |||
from tensorflow.contrib.cudnn_rnn import CudnnLSTM | |||
from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops | |||
from tensorflow.contrib.rnn import LSTMBlockCell | |||
def encoder_prenet(inputs, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
dense_units, | |||
is_training, | |||
mask=None, | |||
scope='encoder_prenet'): | |||
x = inputs | |||
with tf.variable_scope(scope): | |||
for i in range(n_conv_layers): | |||
x = conv1d( | |||
x, | |||
filters, | |||
kernel_size, | |||
is_training, | |||
activation=tf.nn.relu, | |||
dropout=True, | |||
mask=mask, | |||
scope='conv1d_{}'.format(i)) | |||
x = tf.layers.dense( | |||
x, units=dense_units, activation=None, name='dense') | |||
return x | |||
def decoder_prenet(inputs, | |||
prenet_units, | |||
dense_units, | |||
is_training, | |||
scope='decoder_prenet'): | |||
x = inputs | |||
with tf.variable_scope(scope): | |||
for i, units in enumerate(prenet_units): | |||
x = tf.layers.dense( | |||
x, | |||
units=units, | |||
activation=tf.nn.relu, | |||
name='dense_{}'.format(i)) | |||
x = tf.layers.dropout( | |||
x, rate=0.5, training=is_training, name='dropout_{}'.format(i)) | |||
x = tf.layers.dense( | |||
x, units=dense_units, activation=None, name='dense') | |||
return x | |||
def encoder(inputs, | |||
input_lengths, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
is_training, | |||
embedded_inputs_speaker, | |||
mask=None, | |||
scope='encoder'): | |||
with tf.variable_scope(scope): | |||
x = conv_and_lstm( | |||
inputs, | |||
input_lengths, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
is_training, | |||
embedded_inputs_speaker, | |||
mask=mask) | |||
return x | |||
def prenet(inputs, prenet_units, is_training, scope='prenet'): | |||
x = inputs | |||
with tf.variable_scope(scope): | |||
for i, units in enumerate(prenet_units): | |||
x = tf.layers.dense( | |||
x, | |||
units=units, | |||
activation=tf.nn.relu, | |||
name='dense_{}'.format(i)) | |||
x = tf.layers.dropout( | |||
x, rate=0.5, training=is_training, name='dropout_{}'.format(i)) | |||
return x | |||
def postnet_residual_ulstm(inputs, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
output_units, | |||
is_training, | |||
scope='postnet_residual_ulstm'): | |||
with tf.variable_scope(scope): | |||
x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size, | |||
lstm_units, is_training) | |||
x = conv1d( | |||
x, | |||
output_units, | |||
kernel_size, | |||
is_training, | |||
activation=None, | |||
dropout=False, | |||
scope='conv1d_{}'.format(n_conv_layers - 1)) | |||
return x | |||
def postnet_residual_lstm(inputs, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
output_units, | |||
is_training, | |||
scope='postnet_residual_lstm'): | |||
with tf.variable_scope(scope): | |||
x = conv_and_lstm(inputs, None, n_conv_layers, filters, kernel_size, | |||
lstm_units, is_training) | |||
x = conv1d( | |||
x, | |||
output_units, | |||
kernel_size, | |||
is_training, | |||
activation=None, | |||
dropout=False, | |||
scope='conv1d_{}'.format(n_conv_layers - 1)) | |||
return x | |||
def postnet_linear_ulstm(inputs, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
output_units, | |||
is_training, | |||
scope='postnet_linear'): | |||
with tf.variable_scope(scope): | |||
x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size, | |||
lstm_units, is_training) | |||
x = tf.layers.dense(x, units=output_units) | |||
return x | |||
def postnet_linear_lstm(inputs, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
output_units, | |||
output_lengths, | |||
is_training, | |||
embedded_inputs_speaker2, | |||
mask=None, | |||
scope='postnet_linear'): | |||
with tf.variable_scope(scope): | |||
x = conv_and_lstm_dec( | |||
inputs, | |||
output_lengths, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
is_training, | |||
embedded_inputs_speaker2, | |||
mask=mask) | |||
x = tf.layers.dense(x, units=output_units) | |||
return x | |||
def postnet_linear(inputs, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
output_units, | |||
output_lengths, | |||
is_training, | |||
embedded_inputs_speaker2, | |||
mask=None, | |||
scope='postnet_linear'): | |||
with tf.variable_scope(scope): | |||
x = conv_dec( | |||
inputs, | |||
output_lengths, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
is_training, | |||
embedded_inputs_speaker2, | |||
mask=mask) | |||
return x | |||
def conv_and_lstm(inputs, | |||
sequence_lengths, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
is_training, | |||
embedded_inputs_speaker, | |||
mask=None, | |||
scope='conv_and_lstm'): | |||
x = inputs | |||
with tf.variable_scope(scope): | |||
for i in range(n_conv_layers): | |||
x = conv1d( | |||
x, | |||
filters, | |||
kernel_size, | |||
is_training, | |||
activation=tf.nn.relu, | |||
dropout=True, | |||
mask=mask, | |||
scope='conv1d_{}'.format(i)) | |||
x = tf.concat([x, embedded_inputs_speaker], axis=2) | |||
outputs, states = tf.nn.bidirectional_dynamic_rnn( | |||
LSTMBlockCell(lstm_units), | |||
LSTMBlockCell(lstm_units), | |||
x, | |||
sequence_length=sequence_lengths, | |||
dtype=tf.float32) | |||
x = tf.concat(outputs, axis=-1) | |||
return x | |||
def conv_and_lstm_dec(inputs, | |||
sequence_lengths, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
is_training, | |||
embedded_inputs_speaker2, | |||
mask=None, | |||
scope='conv_and_lstm'): | |||
x = inputs | |||
with tf.variable_scope(scope): | |||
for i in range(n_conv_layers): | |||
x = conv1d( | |||
x, | |||
filters, | |||
kernel_size, | |||
is_training, | |||
activation=tf.nn.relu, | |||
dropout=True, | |||
mask=mask, | |||
scope='conv1d_{}'.format(i)) | |||
x = tf.concat([x, embedded_inputs_speaker2], axis=2) | |||
outputs, states = tf.nn.bidirectional_dynamic_rnn( | |||
LSTMBlockCell(lstm_units), | |||
LSTMBlockCell(lstm_units), | |||
x, | |||
sequence_length=sequence_lengths, | |||
dtype=tf.float32) | |||
x = tf.concat(outputs, axis=-1) | |||
return x | |||
def conv_dec(inputs, | |||
sequence_lengths, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
is_training, | |||
embedded_inputs_speaker2, | |||
mask=None, | |||
scope='conv_and_lstm'): | |||
x = inputs | |||
with tf.variable_scope(scope): | |||
for i in range(n_conv_layers): | |||
x = conv1d( | |||
x, | |||
filters, | |||
kernel_size, | |||
is_training, | |||
activation=tf.nn.relu, | |||
dropout=True, | |||
mask=mask, | |||
scope='conv1d_{}'.format(i)) | |||
x = tf.concat([x, embedded_inputs_speaker2], axis=2) | |||
return x | |||
def conv_and_ulstm(inputs, | |||
sequence_lengths, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
is_training, | |||
scope='conv_and_ulstm'): | |||
x = inputs | |||
with tf.variable_scope(scope): | |||
for i in range(n_conv_layers): | |||
x = conv1d( | |||
x, | |||
filters, | |||
kernel_size, | |||
is_training, | |||
activation=tf.nn.relu, | |||
dropout=True, | |||
scope='conv1d_{}'.format(i)) | |||
outputs, states = tf.nn.dynamic_rnn( | |||
LSTMBlockCell(lstm_units), | |||
x, | |||
sequence_length=sequence_lengths, | |||
dtype=tf.float32) | |||
return outputs | |||
def conv1d(inputs, | |||
filters, | |||
kernel_size, | |||
is_training, | |||
activation=None, | |||
dropout=False, | |||
mask=None, | |||
scope='conv1d'): | |||
with tf.variable_scope(scope): | |||
if mask is not None: | |||
inputs = inputs * tf.expand_dims(mask, -1) | |||
x = tf.layers.conv1d( | |||
inputs, filters=filters, kernel_size=kernel_size, padding='same') | |||
if mask is not None: | |||
x = x * tf.expand_dims(mask, -1) | |||
x = tf.layers.batch_normalization(x, training=is_training) | |||
if activation is not None: | |||
x = activation(x) | |||
if dropout: | |||
x = tf.layers.dropout(x, rate=0.5, training=is_training) | |||
return x | |||
def conv1d_dp(inputs, | |||
filters, | |||
kernel_size, | |||
is_training, | |||
activation=None, | |||
dropout=False, | |||
dropoutrate=0.5, | |||
mask=None, | |||
scope='conv1d'): | |||
with tf.variable_scope(scope): | |||
if mask is not None: | |||
inputs = inputs * tf.expand_dims(mask, -1) | |||
x = tf.layers.conv1d( | |||
inputs, filters=filters, kernel_size=kernel_size, padding='same') | |||
if mask is not None: | |||
x = x * tf.expand_dims(mask, -1) | |||
x = tf.contrib.layers.layer_norm(x) | |||
if activation is not None: | |||
x = activation(x) | |||
if dropout: | |||
x = tf.layers.dropout(x, rate=dropoutrate, training=is_training) | |||
return x | |||
def duration_predictor(inputs, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
input_lengths, | |||
is_training, | |||
embedded_inputs_speaker, | |||
mask=None, | |||
scope='duration_predictor'): | |||
with tf.variable_scope(scope): | |||
x = inputs | |||
for i in range(n_conv_layers): | |||
x = conv1d_dp( | |||
x, | |||
filters, | |||
kernel_size, | |||
is_training, | |||
activation=tf.nn.relu, | |||
dropout=True, | |||
dropoutrate=0.1, | |||
mask=mask, | |||
scope='conv1d_{}'.format(i)) | |||
x = tf.concat([x, embedded_inputs_speaker], axis=2) | |||
outputs, states = tf.nn.bidirectional_dynamic_rnn( | |||
LSTMBlockCell(lstm_units), | |||
LSTMBlockCell(lstm_units), | |||
x, | |||
sequence_length=input_lengths, | |||
dtype=tf.float32) | |||
x = tf.concat(outputs, axis=-1) | |||
x = tf.layers.dense(x, units=1) | |||
x = tf.nn.relu(x) | |||
return x | |||
def duration_predictor2(inputs, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
input_lengths, | |||
is_training, | |||
mask=None, | |||
scope='duration_predictor'): | |||
with tf.variable_scope(scope): | |||
x = inputs | |||
for i in range(n_conv_layers): | |||
x = conv1d_dp( | |||
x, | |||
filters, | |||
kernel_size, | |||
is_training, | |||
activation=tf.nn.relu, | |||
dropout=True, | |||
dropoutrate=0.1, | |||
mask=mask, | |||
scope='conv1d_{}'.format(i)) | |||
x = tf.layers.dense(x, units=1) | |||
x = tf.nn.relu(x) | |||
return x | |||
def conv_prenet(inputs, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
is_training, | |||
mask=None, | |||
scope='conv_prenet'): | |||
x = inputs | |||
with tf.variable_scope(scope): | |||
for i in range(n_conv_layers): | |||
x = conv1d( | |||
x, | |||
filters, | |||
kernel_size, | |||
is_training, | |||
activation=tf.nn.relu, | |||
dropout=True, | |||
mask=mask, | |||
scope='conv1d_{}'.format(i)) | |||
return x |
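As a usage note (not from the original file): the blocks above compose directly; for example, the Tacotron-style `prenet` keeps its dropout controlled by `is_training`.

```python
import tensorflow as tf

x = tf.random.normal([4, 10, 80])  # [N, T, num_mels], sizes illustrative
y = prenet(x, prenet_units=[256, 128], is_training=True)
# y: [4, 10, 128]; dropout(rate=0.5) follows each ReLU dense layer.
```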
@@ -0,0 +1,174 @@ | |||
"""Define position encoder classes.""" | |||
import abc | |||
import math | |||
import tensorflow as tf | |||
from .reducer import SumReducer | |||
class PositionEncoder(tf.keras.layers.Layer): | |||
"""Base class for position encoders.""" | |||
def __init__(self, reducer=None, **kwargs): | |||
"""Initializes the position encoder. | |||
Args: | |||
reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position | |||
encodings. Defaults to :class:`opennmt.layers.SumReducer`. | |||
**kwargs: Additional layer keyword arguments. | |||
""" | |||
super(PositionEncoder, self).__init__(**kwargs) | |||
if reducer is None: | |||
reducer = SumReducer(dtype=kwargs.get('dtype')) | |||
self.reducer = reducer | |||
def call(self, inputs, position=None): # pylint: disable=arguments-differ | |||
"""Add position encodings to :obj:`inputs`. | |||
Args: | |||
inputs: The inputs to encode. | |||
position: The single position to encode, to use when this layer is called | |||
step by step. | |||
Returns: | |||
A ``tf.Tensor`` whose shape depends on the configured ``reducer``. | |||
""" | |||
batch_size = tf.shape(inputs)[0] | |||
timesteps = tf.shape(inputs)[1] | |||
input_dim = inputs.shape[-1].value | |||
positions = tf.range(timesteps) + 1 if position is None else [position] | |||
position_encoding = self._encode([positions], input_dim) | |||
position_encoding = tf.tile(position_encoding, [batch_size, 1, 1]) | |||
return self.reducer([inputs, position_encoding]) | |||
@abc.abstractmethod | |||
def _encode(self, positions, depth): | |||
"""Creates position encodings. | |||
Args: | |||
positions: The positions to encode of shape :math:`[B, ...]`. | |||
depth: The encoding depth :math:`D`. | |||
Returns: | |||
A ``tf.Tensor`` of shape :math:`[B, ..., D]`. | |||
""" | |||
raise NotImplementedError() | |||
class PositionEmbedder(PositionEncoder): | |||
"""Encodes position with a lookup table.""" | |||
def __init__(self, maximum_position=128, reducer=None, **kwargs): | |||
"""Initializes the position encoder. | |||
Args: | |||
maximum_position: The maximum position to embed. Positions greater | |||
than this value will be set to :obj:`maximum_position`. | |||
reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position | |||
encodings. Defaults to :class:`opennmt.layers.SumReducer`. | |||
**kwargs: Additional layer keyword arguments. | |||
""" | |||
super(PositionEmbedder, self).__init__(reducer=reducer, **kwargs) | |||
self.maximum_position = maximum_position | |||
self.embedding = None | |||
def build(self, input_shape): | |||
shape = [self.maximum_position + 1, input_shape[-1]] | |||
self.embedding = self.add_weight('position_embedding', shape) | |||
super(PositionEmbedder, self).build(input_shape) | |||
def _encode(self, positions, depth): | |||
positions = tf.minimum(positions, self.maximum_position) | |||
return tf.nn.embedding_lookup(self.embedding, positions) | |||
class SinusoidalPositionEncoder(PositionEncoder): | |||
"""Encodes positions with sine waves as described in | |||
https://arxiv.org/abs/1706.03762. | |||
""" | |||
def _encode(self, positions, depth): | |||
if depth % 2 != 0: | |||
raise ValueError( | |||
'SinusoidalPositionEncoder expects the depth to be divisible ' | |||
'by 2 but got %d' % depth) | |||
batch_size = tf.shape(positions)[0] | |||
positions = tf.cast(positions, tf.float32) | |||
log_timescale_increment = math.log(10000) / (depth / 2 - 1) | |||
inv_timescales = tf.exp( | |||
tf.range(depth / 2, dtype=tf.float32) * -log_timescale_increment) | |||
inv_timescales = tf.reshape( | |||
tf.tile(inv_timescales, [batch_size]), [batch_size, depth // 2]) | |||
scaled_time = tf.expand_dims(positions, -1) * tf.expand_dims( | |||
inv_timescales, 1) | |||
encoding = tf.concat( | |||
[tf.sin(scaled_time), tf.cos(scaled_time)], axis=2) | |||
return tf.cast(encoding, self.dtype) | |||
class SinusodalPositionalEncoding(tf.keras.layers.Layer): | |||
def __init__(self, name='SinusodalPositionalEncoding'): | |||
super(SinusodalPositionalEncoding, self).__init__(name=name) | |||
@staticmethod | |||
def positional_encoding(len, dim, step=1.): | |||
""" | |||
:param len: int scalar | |||
:param dim: int scalar | |||
:param step: float scalar, spacing between consecutive positions | |||
:return: position embedding, shape [len, dim] | |||
""" | |||
pos_mat = tf.tile( | |||
tf.expand_dims( | |||
tf.range(0, tf.cast(len, dtype=tf.float32), dtype=tf.float32) | |||
* step, | |||
axis=-1), [1, dim]) | |||
dim_mat = tf.tile( | |||
tf.expand_dims( | |||
tf.range(0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32), | |||
axis=0), [len, 1]) | |||
dim_mat_int = tf.cast(dim_mat, dtype=tf.int32) | |||
pos_encoding = tf.where( # [time, dims] | |||
tf.math.equal(tf.math.mod(dim_mat_int, 2), 0), | |||
x=tf.math.sin( | |||
pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))), | |||
y=tf.math.cos(pos_mat | |||
/ tf.pow(10000., | |||
(dim_mat - 1) / tf.cast(dim, tf.float32)))) | |||
return pos_encoding | |||
class BatchSinusodalPositionalEncoding(tf.keras.layers.Layer): | |||
def __init__(self, name='BatchSinusodalPositionalEncoding'): | |||
super(BatchSinusodalPositionalEncoding, self).__init__(name=name) | |||
@staticmethod | |||
def positional_encoding(batch_size, len, dim, pos_mat, step=1.): | |||
""" | |||
:param batch_size: int scalar | |||
:param len: int scalar | |||
:param dim: int scalar | |||
:param pos_mat: [B, len] position indices for each batch element | |||
:param step: float scalar, spacing between consecutive positions | |||
:return: position embedding, shape [B, len, dim] | |||
""" | |||
pos_mat = tf.tile( | |||
tf.expand_dims(tf.cast(pos_mat, dtype=tf.float32) * step, axis=-1), | |||
[1, 1, dim]) # [B, len, dim] | |||
dim_mat = tf.tile( | |||
tf.expand_dims( | |||
tf.expand_dims( | |||
tf.range( | |||
0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32), | |||
axis=0), | |||
axis=0), [batch_size, len, 1]) # [B, len, dim] | |||
dim_mat_int = tf.cast(dim_mat, dtype=tf.int32) | |||
pos_encoding = tf.where( # [B, time, dims] | |||
tf.math.equal(tf.mod(dim_mat_int, 2), 0), | |||
x=tf.math.sin( | |||
pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))), | |||
y=tf.math.cos(pos_mat | |||
/ tf.pow(10000., | |||
(dim_mat - 1) / tf.cast(dim, tf.float32)))) | |||
return pos_encoding |
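A small sketch of the table the static helper above produces (TF1 session; sizes are illustrative): even feature columns hold the sine terms, odd columns the matching cosine terms.

```python
import tensorflow as tf

pe = SinusodalPositionalEncoding.positional_encoding(len=50, dim=8)
with tf.Session() as sess:
    table = sess.run(pe)  # numpy array of shape (50, 8)
```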
@@ -0,0 +1,155 @@ | |||
"""Define reducers: objects that merge inputs.""" | |||
import abc | |||
import functools | |||
import tensorflow as tf | |||
def pad_in_time(x, padding_length): | |||
"""Helper function to pad a tensor in the time dimension and retain the static depth dimension.""" | |||
return tf.pad(x, [[0, 0], [0, padding_length], [0, 0]]) | |||
def align_in_time(x, length): | |||
"""Aligns the time dimension of :obj:`x` with :obj:`length`.""" | |||
time_dim = tf.shape(x)[1] | |||
return tf.cond( | |||
tf.less(time_dim, length), | |||
true_fn=lambda: pad_in_time(x, length - time_dim), | |||
false_fn=lambda: x[:, :length]) | |||
def pad_with_identity(x, | |||
sequence_length, | |||
max_sequence_length, | |||
identity_values=0, | |||
maxlen=None): | |||
"""Pads a tensor with identity values up to :obj:`max_sequence_length`. | |||
Args: | |||
x: A ``tf.Tensor`` of shape ``[batch_size, time, depth]``. | |||
sequence_length: The true sequence length of :obj:`x`. | |||
max_sequence_length: The sequence length up to which the tensor must contain | |||
:obj:`identity_values`. | |||
identity_values: The identity value. | |||
maxlen: Size of the output time dimension. Default is the maximum value in | |||
:obj:`max_sequence_length`. | |||
Returns: | |||
A ``tf.Tensor`` of shape ``[batch_size, maxlen, depth]``. | |||
""" | |||
if maxlen is None: | |||
maxlen = tf.reduce_max(max_sequence_length) | |||
mask = tf.sequence_mask(sequence_length, maxlen=maxlen, dtype=x.dtype) | |||
mask = tf.expand_dims(mask, axis=-1) | |||
mask_combined = tf.sequence_mask( | |||
max_sequence_length, maxlen=maxlen, dtype=x.dtype) | |||
mask_combined = tf.expand_dims(mask_combined, axis=-1) | |||
identity_mask = mask_combined * (1.0 - mask) | |||
x = pad_in_time(x, maxlen - tf.shape(x)[1]) | |||
x = x * mask + (identity_mask * identity_values) | |||
return x | |||
def pad_n_with_identity(inputs, sequence_lengths, identity_values=0): | |||
"""Pads each input tensors with identity values up to | |||
``max(sequence_lengths)`` for each batch. | |||
Args: | |||
inputs: A list of ``tf.Tensor``. | |||
sequence_lengths: A list of sequence length. | |||
identity_values: The identity value. | |||
Returns: | |||
A tuple ``(padded, max_sequence_length)``: respectively, a list of | |||
``tf.Tensor`` where each tensor is padded with identity values, and the | |||
combined sequence length. | |||
""" | |||
max_sequence_length = tf.reduce_max(sequence_lengths, axis=0) | |||
maxlen = tf.reduce_max([tf.shape(x)[1] for x in inputs]) | |||
padded = [ | |||
pad_with_identity( | |||
x, | |||
length, | |||
max_sequence_length, | |||
identity_values=identity_values, | |||
maxlen=maxlen) for x, length in zip(inputs, sequence_lengths) | |||
] | |||
return padded, max_sequence_length | |||
class Reducer(tf.keras.layers.Layer): | |||
"""Base class for reducers.""" | |||
def zip_and_reduce(self, x, y): | |||
"""Zips the :obj:`x` with :obj:`y` structures together and reduces all | |||
elements. If the structures are nested, they will be flattened first. | |||
Args: | |||
x: The first structure. | |||
y: The second structure. | |||
Returns: | |||
The same structure as :obj:`x` and :obj:`y` where each element from | |||
:obj:`x` is reduced with the corresponding element from :obj:`y`. | |||
Raises: | |||
ValueError: if the two structures are not the same. | |||
""" | |||
tf.nest.assert_same_structure(x, y) | |||
x_flat = tf.nest.flatten(x) | |||
y_flat = tf.nest.flatten(y) | |||
reduced = list(map(self, zip(x_flat, y_flat))) | |||
return tf.nest.pack_sequence_as(x, reduced) | |||
def call(self, inputs, sequence_length=None): # pylint: disable=arguments-differ | |||
"""Reduces all input elements. | |||
Args: | |||
inputs: A list of ``tf.Tensor``. | |||
sequence_length: The length of each input, if reducing sequences. | |||
Returns: | |||
If :obj:`sequence_length` is set, a tuple | |||
``(reduced_input, reduced_length)``, otherwise a reduced ``tf.Tensor`` | |||
only. | |||
""" | |||
if sequence_length is None: | |||
return self.reduce(inputs) | |||
else: | |||
return self.reduce_sequence( | |||
inputs, sequence_lengths=sequence_length) | |||
@abc.abstractmethod | |||
def reduce(self, inputs): | |||
"""See :meth:`opennmt.layers.Reducer.__call__`.""" | |||
raise NotImplementedError() | |||
@abc.abstractmethod | |||
def reduce_sequence(self, inputs, sequence_lengths): | |||
"""See :meth:`opennmt.layers.Reducer.__call__`.""" | |||
raise NotImplementedError() | |||
class SumReducer(Reducer): | |||
"""A reducer that sums the inputs.""" | |||
def reduce(self, inputs): | |||
if len(inputs) == 1: | |||
return inputs[0] | |||
if len(inputs) == 2: | |||
return inputs[0] + inputs[1] | |||
return tf.add_n(inputs) | |||
def reduce_sequence(self, inputs, sequence_lengths): | |||
padded, combined_length = pad_n_with_identity( | |||
inputs, sequence_lengths, identity_values=0) | |||
return self.reduce(padded), combined_length | |||
class MultiplyReducer(Reducer): | |||
"""A reducer that multiplies the inputs.""" | |||
def reduce(self, inputs): | |||
return functools.reduce(lambda a, x: a * x, inputs) | |||
def reduce_sequence(self, inputs, sequence_lengths): | |||
padded, combined_length = pad_n_with_identity( | |||
inputs, sequence_lengths, identity_values=1) | |||
return self.reduce(padded), combined_length |
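A hedged sketch of sequence reduction with `SumReducer`: inputs with different time lengths are padded with the identity value (0) to a common length before the element-wise sum.

```python
import tensorflow as tf

a = tf.ones([2, 5, 4])
b = tf.ones([2, 3, 4])
summed, length = SumReducer()([a, b], sequence_length=[[5, 4], [3, 2]])
# summed: [2, 5, 4]; length = [5, 4], the per-example max of the inputs.
```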
@@ -0,0 +1,240 @@ | |||
import numpy as np | |||
import tensorflow as tf | |||
from tensorflow.contrib.rnn import RNNCell | |||
from tensorflow.contrib.seq2seq import AttentionWrapperState | |||
from tensorflow.python.ops import rnn_cell_impl | |||
from .modules import prenet | |||
class VarPredictorCell(RNNCell): | |||
'''RNN cell wrapper: applies a prenet to the previous output, runs the wrapped cell, and projects to the variance dimension.''' | |||
def __init__(self, var_predictor_cell, is_training, dim, prenet_units): | |||
super(VarPredictorCell, self).__init__() | |||
self._var_predictor_cell = var_predictor_cell | |||
self._is_training = is_training | |||
self._dim = dim | |||
self._prenet_units = prenet_units | |||
@property | |||
def state_size(self): | |||
return tuple([self.output_size, self._var_predictor_cell.state_size]) | |||
@property | |||
def output_size(self): | |||
return self._dim | |||
def zero_state(self, batch_size, dtype): | |||
return tuple([ | |||
rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, | |||
dtype), | |||
self._var_predictor_cell.zero_state(batch_size, dtype) | |||
]) | |||
def call(self, inputs, state): | |||
'''Run one step of the variance predictor cell.''' | |||
super_cell_out, decoder_state = state | |||
# split | |||
prenet_input = inputs[:, 0:self._dim] | |||
encoder_output = inputs[:, self._dim:] | |||
# prenet and concat | |||
prenet_output = prenet( | |||
prenet_input, | |||
self._prenet_units, | |||
self._is_training, | |||
scope='var_prenet') | |||
decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) | |||
# decoder LSTM/GRU | |||
new_super_cell_out, new_decoder_state = self._var_predictor_cell( | |||
decoder_input, decoder_state) | |||
# projection | |||
new_super_cell_out = tf.layers.dense( | |||
new_super_cell_out, units=self._dim) | |||
new_states = tuple([new_super_cell_out, new_decoder_state]) | |||
return new_super_cell_out, new_states | |||
class DurPredictorCell(RNNCell): | |||
'''RNN cell wrapper for duration prediction: prenet, wrapped cell, then a ReLU-projected duration value.''' | |||
def __init__(self, var_predictor_cell, is_training, dim, prenet_units): | |||
super(DurPredictorCell, self).__init__() | |||
self._var_predictor_cell = var_predictor_cell | |||
self._is_training = is_training | |||
self._dim = dim | |||
self._prenet_units = prenet_units | |||
@property | |||
def state_size(self): | |||
return tuple([self.output_size, self._var_predictor_cell.state_size]) | |||
@property | |||
def output_size(self): | |||
return self._dim | |||
def zero_state(self, batch_size, dtype): | |||
return tuple([ | |||
rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, | |||
dtype), | |||
self._var_predictor_cell.zero_state(batch_size, dtype) | |||
]) | |||
def call(self, inputs, state): | |||
'''Run one step of the duration predictor cell.''' | |||
super_cell_out, decoder_state = state | |||
# split | |||
prenet_input = inputs[:, 0:self._dim] | |||
encoder_output = inputs[:, self._dim:] | |||
# prenet and concat | |||
prenet_output = prenet( | |||
prenet_input, | |||
self._prenet_units, | |||
self._is_training, | |||
scope='dur_prenet') | |||
decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) | |||
# decoder LSTM/GRU | |||
new_super_cell_out, new_decoder_state = self._var_predictor_cell( | |||
decoder_input, decoder_state) | |||
# projection | |||
new_super_cell_out = tf.layers.dense( | |||
new_super_cell_out, units=self._dim) | |||
new_super_cell_out = tf.nn.relu(new_super_cell_out) | |||
# new_super_cell_out = tf.log(tf.cast(tf.round(tf.exp(new_super_cell_out) - 1), tf.float32) + 1) | |||
new_states = tuple([new_super_cell_out, new_decoder_state]) | |||
return new_super_cell_out, new_states | |||
class DurPredictorCECell(RNNCell): | |||
'''Duration predictor cell with a cross-entropy head: embeds the previous duration class and outputs a softmax over max_dur classes.''' | |||
def __init__(self, var_predictor_cell, is_training, dim, prenet_units, | |||
max_dur, dur_embedding_dim): | |||
super(DurPredictorCECell, self).__init__() | |||
self._var_predictor_cell = var_predictor_cell | |||
self._is_training = is_training | |||
self._dim = dim | |||
self._prenet_units = prenet_units | |||
self._max_dur = max_dur | |||
self._dur_embedding_dim = dur_embedding_dim | |||
@property | |||
def state_size(self): | |||
return tuple([self.output_size, self._var_predictor_cell.state_size]) | |||
@property | |||
def output_size(self): | |||
return self._max_dur | |||
def zero_state(self, batch_size, dtype): | |||
return tuple([ | |||
rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, | |||
dtype), | |||
self._var_predictor_cell.zero_state(batch_size, dtype) | |||
]) | |||
def call(self, inputs, state): | |||
'''Run one step of the duration predictor (cross-entropy) cell.''' | |||
super_cell_out, decoder_state = state | |||
# split | |||
prenet_input = tf.squeeze( | |||
tf.cast(inputs[:, 0:self._dim], tf.int32), axis=-1) # [N] | |||
prenet_input = tf.one_hot( | |||
prenet_input, self._max_dur, on_value=1.0, off_value=0.0, | |||
axis=-1) # [N, 120] | |||
prenet_input = tf.layers.dense( | |||
prenet_input, units=self._dur_embedding_dim) | |||
encoder_output = inputs[:, self._dim:] | |||
# prenet and concat | |||
prenet_output = prenet( | |||
prenet_input, | |||
self._prenet_units, | |||
self._is_training, | |||
scope='dur_prenet') | |||
decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) | |||
# decoder LSTM/GRU | |||
new_super_cell_out, new_decoder_state = self._var_predictor_cell( | |||
decoder_input, decoder_state) | |||
# projection | |||
new_super_cell_out = tf.layers.dense( | |||
new_super_cell_out, units=self._max_dur) # [N, 120] | |||
new_super_cell_out = tf.nn.softmax(new_super_cell_out) # [N, 120] | |||
new_states = tuple([new_super_cell_out, new_decoder_state]) | |||
return new_super_cell_out, new_states | |||
class VarPredictorCell2(RNNCell): | |||
'''Variant of VarPredictorCell that applies ReLU only to the first output channel.''' | |||
def __init__(self, var_predictor_cell, is_training, dim, prenet_units): | |||
super(VarPredictorCell2, self).__init__() | |||
self._var_predictor_cell = var_predictor_cell | |||
self._is_training = is_training | |||
self._dim = dim | |||
self._prenet_units = prenet_units | |||
@property | |||
def state_size(self): | |||
return tuple([self.output_size, self._var_predictor_cell.state_size]) | |||
@property | |||
def output_size(self): | |||
return self._dim | |||
def zero_state(self, batch_size, dtype): | |||
return tuple([ | |||
rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, | |||
dtype), | |||
self._var_predictor_cell.zero_state(batch_size, dtype) | |||
]) | |||
def call(self, inputs, state): | |||
'''Run one step of the variance predictor cell.''' | |||
super_cell_out, decoder_state = state | |||
# split | |||
prenet_input = inputs[:, 0:self._dim] | |||
encoder_output = inputs[:, self._dim:] | |||
# prenet and concat | |||
prenet_output = prenet( | |||
prenet_input, | |||
self._prenet_units, | |||
self._is_training, | |||
scope='var_prenet') | |||
decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) | |||
# decoder LSTM/GRU | |||
new_super_cell_out, new_decoder_state = self._var_predictor_cell( | |||
decoder_input, decoder_state) | |||
# projection | |||
new_super_cell_out = tf.layers.dense( | |||
new_super_cell_out, units=self._dim) | |||
# split and relu | |||
new_super_cell_out = tf.concat([ | |||
tf.nn.relu(new_super_cell_out[:, 0:1]), new_super_cell_out[:, 1:] | |||
], axis=-1) # yapf:disable | |||
new_states = tuple([new_super_cell_out, new_decoder_state]) | |||
return new_super_cell_out, new_states |
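A hedged single-step sketch for the wrapper cells above (TF1 graph mode; dimensions are illustrative). Each step, the wrapped cell consumes `prenet(previous output)` concatenated with the current encoder frame.

```python
import tensorflow as tf
from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell

base = MultiRNNCell([LSTMBlockCell(64), LSTMBlockCell(64)])
cell = VarPredictorCell(base, is_training=False, dim=1, prenet_units=[32, 32])
step_in = tf.random.normal([8, 1 + 128])  # [previous value | encoder frame]
state = cell.zero_state(batch_size=8, dtype=tf.float32)
output, state = cell(step_in, state)
# output: [8, 1], the projected value for this decoding step.
```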
@@ -0,0 +1,760 @@ | |||
import tensorflow as tf | |||
from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell | |||
from tensorflow.contrib.seq2seq import BasicDecoder | |||
from tensorflow.python.ops.ragged.ragged_util import repeat | |||
from .fsmn_encoder import FsmnEncoderV2 | |||
from .helpers import VarTestHelper, VarTrainingHelper | |||
from .modules import conv_prenet, decoder_prenet, encoder_prenet | |||
from .position import (BatchSinusodalPositionalEncoding, | |||
SinusodalPositionalEncoding) | |||
from .rnn_wrappers import DurPredictorCell, VarPredictorCell | |||
from .self_attention_decoder import SelfAttentionDecoder | |||
from .self_attention_encoder import SelfAttentionEncoder | |||
class RobuTrans(): | |||
def __init__(self, hparams): | |||
self._hparams = hparams | |||
def initialize(self, | |||
inputs, | |||
inputs_emotion, | |||
inputs_speaker, | |||
input_lengths, | |||
output_lengths=None, | |||
mel_targets=None, | |||
durations=None, | |||
pitch_contours=None, | |||
uv_masks=None, | |||
pitch_scales=None, | |||
duration_scales=None, | |||
energy_contours=None, | |||
energy_scales=None): | |||
'''Initializes the model for inference. | |||
Sets "mel_outputs", "linear_outputs", "stop_token_outputs", and "alignments" fields. | |||
Args: | |||
inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of | |||
steps in the input time series, and values are character IDs | |||
input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths | |||
of each sequence in inputs. | |||
output_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths | |||
of each sequence in outputs. | |||
mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number | |||
of steps in the output time series, M is num_mels, and values are entries in the mel | |||
spectrogram. Only needed for training. | |||
''' | |||
with tf.variable_scope('inference'): | |||
is_training = mel_targets is not None | |||
batch_size = tf.shape(inputs)[0] | |||
hp = self._hparams | |||
input_mask = None | |||
if input_lengths is not None and is_training: | |||
input_mask = tf.sequence_mask( | |||
input_lengths, tf.shape(inputs)[1], dtype=tf.float32) | |||
if input_mask is not None: | |||
inputs = inputs * tf.expand_dims(input_mask, -1) | |||
# speaker embedding | |||
embedded_inputs_speaker = tf.layers.dense( | |||
inputs_speaker, | |||
32, | |||
activation=None, | |||
use_bias=False, | |||
kernel_initializer=tf.truncated_normal_initializer(stddev=0.5)) | |||
# emotion embedding | |||
embedded_inputs_emotion = tf.layers.dense( | |||
inputs_emotion, | |||
32, | |||
activation=None, | |||
use_bias=False, | |||
kernel_initializer=tf.truncated_normal_initializer(stddev=0.5)) | |||
# symbol embedding | |||
with tf.variable_scope('Embedding'): | |||
embedded_inputs = tf.layers.dense( | |||
inputs, | |||
hp.embedding_dim, | |||
activation=None, | |||
use_bias=False, | |||
kernel_initializer=tf.truncated_normal_initializer( | |||
stddev=0.5)) | |||
# Encoder | |||
with tf.variable_scope('Encoder'): | |||
Encoder = SelfAttentionEncoder( | |||
num_layers=hp.encoder_num_layers, | |||
num_units=hp.encoder_num_units, | |||
num_heads=hp.encoder_num_heads, | |||
ffn_inner_dim=hp.encoder_ffn_inner_dim, | |||
dropout=hp.encoder_dropout, | |||
attention_dropout=hp.encoder_attention_dropout, | |||
relu_dropout=hp.encoder_relu_dropout) | |||
encoder_outputs, state_mo, sequence_length_mo, attns = Encoder.encode( | |||
embedded_inputs, | |||
sequence_length=input_lengths, | |||
mode=is_training) | |||
encoder_outputs = tf.layers.dense( | |||
encoder_outputs, | |||
hp.encoder_projection_units, | |||
activation=None, | |||
use_bias=False, | |||
kernel_initializer=tf.truncated_normal_initializer( | |||
stddev=0.5)) | |||
# pitch and energy | |||
var_inputs = tf.concat([ | |||
encoder_outputs, embedded_inputs_speaker, | |||
embedded_inputs_emotion | |||
], 2) | |||
if input_mask is not None: | |||
var_inputs = var_inputs * tf.expand_dims(input_mask, -1) | |||
with tf.variable_scope('Pitch_Predictor'): | |||
Pitch_Predictor_FSMN = FsmnEncoderV2( | |||
filter_size=hp.predictor_filter_size, | |||
fsmn_num_layers=hp.predictor_fsmn_num_layers, | |||
dnn_num_layers=hp.predictor_dnn_num_layers, | |||
num_memory_units=hp.predictor_num_memory_units, | |||
ffn_inner_dim=hp.predictor_ffn_inner_dim, | |||
dropout=hp.predictor_dropout, | |||
shift=hp.predictor_shift, | |||
position_encoder=None) | |||
pitch_contour_outputs, _, _ = Pitch_Predictor_FSMN.encode( | |||
tf.concat([ | |||
encoder_outputs, embedded_inputs_speaker, | |||
embedded_inputs_emotion | |||
], 2), | |||
sequence_length=input_lengths, | |||
mode=is_training) | |||
pitch_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn( | |||
LSTMBlockCell(hp.predictor_lstm_units), | |||
LSTMBlockCell(hp.predictor_lstm_units), | |||
pitch_contour_outputs, | |||
sequence_length=input_lengths, | |||
dtype=tf.float32) | |||
pitch_contour_outputs = tf.concat( | |||
pitch_contour_outputs, axis=-1) | |||
pitch_contour_outputs = tf.layers.dense( | |||
pitch_contour_outputs, units=1) # [N, T_in, 1] | |||
pitch_contour_outputs = tf.squeeze( | |||
pitch_contour_outputs, axis=2) # [N, T_in] | |||
with tf.variable_scope('Energy_Predictor'): | |||
Energy_Predictor_FSMN = FsmnEncoderV2( | |||
filter_size=hp.predictor_filter_size, | |||
fsmn_num_layers=hp.predictor_fsmn_num_layers, | |||
dnn_num_layers=hp.predictor_dnn_num_layers, | |||
num_memory_units=hp.predictor_num_memory_units, | |||
ffn_inner_dim=hp.predictor_ffn_inner_dim, | |||
dropout=hp.predictor_dropout, | |||
shift=hp.predictor_shift, | |||
position_encoder=None) | |||
energy_contour_outputs, _, _ = Energy_Predictor_FSMN.encode( | |||
tf.concat([ | |||
encoder_outputs, embedded_inputs_speaker, | |||
embedded_inputs_emotion | |||
], 2), | |||
sequence_length=input_lengths, | |||
mode=is_training) | |||
energy_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn( | |||
LSTMBlockCell(hp.predictor_lstm_units), | |||
LSTMBlockCell(hp.predictor_lstm_units), | |||
energy_contour_outputs, | |||
sequence_length=input_lengths, | |||
dtype=tf.float32) | |||
energy_contour_outputs = tf.concat( | |||
energy_contour_outputs, axis=-1) | |||
energy_contour_outputs = tf.layers.dense( | |||
energy_contour_outputs, units=1) # [N, T_in, 1] | |||
energy_contour_outputs = tf.squeeze( | |||
energy_contour_outputs, axis=2) # [N, T_in] | |||
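# Teacher forcing for the variance adaptors: during training the ground-truth
# pitch/energy contours are embedded, while at inference the predicted contours
# are used after being scaled by the user-supplied pitch_scales/energy_scales.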
if is_training: | |||
pitch_embeddings = tf.expand_dims( | |||
pitch_contours, axis=2) # [N, T_in, 1] | |||
pitch_embeddings = tf.layers.conv1d( | |||
pitch_embeddings, | |||
filters=hp.encoder_projection_units, | |||
kernel_size=9, | |||
padding='same', | |||
name='pitch_embeddings') # [N, T_in, 32] | |||
energy_embeddings = tf.expand_dims( | |||
energy_contours, axis=2) # [N, T_in, 1] | |||
energy_embeddings = tf.layers.conv1d( | |||
energy_embeddings, | |||
filters=hp.encoder_projection_units, | |||
kernel_size=9, | |||
padding='same', | |||
name='energy_embeddings') # [N, T_in, 32] | |||
else: | |||
pitch_contour_outputs *= pitch_scales | |||
pitch_embeddings = tf.expand_dims( | |||
pitch_contour_outputs, axis=2) # [N, T_in, 1] | |||
pitch_embeddings = tf.layers.conv1d( | |||
pitch_embeddings, | |||
filters=hp.encoder_projection_units, | |||
kernel_size=9, | |||
padding='same', | |||
name='pitch_embeddings') # [N, T_in, 32] | |||
energy_contour_outputs *= energy_scales | |||
energy_embeddings = tf.expand_dims( | |||
energy_contour_outputs, axis=2) # [N, T_in, 1] | |||
energy_embeddings = tf.layers.conv1d( | |||
energy_embeddings, | |||
filters=hp.encoder_projection_units, | |||
kernel_size=9, | |||
padding='same', | |||
name='energy_embeddings') # [N, T_in, 32] | |||
encoder_outputs_ = encoder_outputs + pitch_embeddings + energy_embeddings | |||
# duration | |||
dur_inputs = tf.concat([ | |||
encoder_outputs_, embedded_inputs_speaker, | |||
embedded_inputs_emotion | |||
], 2) | |||
if input_mask is not None: | |||
dur_inputs = dur_inputs * tf.expand_dims(input_mask, -1) | |||
with tf.variable_scope('Duration_Predictor'): | |||
duration_predictor_cell = MultiRNNCell([ | |||
LSTMBlockCell(hp.predictor_lstm_units), | |||
LSTMBlockCell(hp.predictor_lstm_units) | |||
], state_is_tuple=True) # yapf:disable | |||
duration_output_cell = DurPredictorCell( | |||
duration_predictor_cell, is_training, 1, | |||
hp.predictor_prenet_units) | |||
duration_predictor_init_state = duration_output_cell.zero_state( | |||
batch_size=batch_size, dtype=tf.float32) | |||
if is_training: | |||
duration_helper = VarTrainingHelper( | |||
tf.expand_dims( | |||
tf.log(tf.cast(durations, tf.float32) + 1), | |||
axis=2), dur_inputs, 1) | |||
else: | |||
duration_helper = VarTestHelper(batch_size, dur_inputs, 1) | |||
( | |||
duration_outputs, _ | |||
), final_duration_predictor_state, _ = tf.contrib.seq2seq.dynamic_decode( | |||
BasicDecoder(duration_output_cell, duration_helper, | |||
duration_predictor_init_state), | |||
maximum_iterations=1000) | |||
duration_outputs = tf.squeeze( | |||
duration_outputs, axis=2) # [N, T_in] | |||
if input_mask is not None: | |||
duration_outputs = duration_outputs * input_mask | |||
duration_outputs_ = tf.exp(duration_outputs) - 1 | |||
# Length Regulator | |||
with tf.variable_scope('Length_Regulator'): | |||
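# The length regulator expands each phoneme-level vector durations[j] times
# along the time axis. The while loops also build a per-frame position index
# (1, ..., duration) for every phoneme, so the positional encoding applied
# below can restart at each phoneme boundary.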
if is_training: | |||
i = tf.constant(1) | |||
# position embedding | |||
j = tf.constant(1) | |||
dur_len = tf.shape(durations)[-1] | |||
embedded_position_i = tf.range(1, durations[0, 0] + 1) | |||
def condition_pos(j, e): | |||
return tf.less(j, dur_len) | |||
def loop_body_pos(j, embedded_position_i): | |||
embedded_position_i = tf.concat([ | |||
embedded_position_i, | |||
tf.range(1, durations[0, j] + 1) | |||
], axis=0) # yapf:disable | |||
return [j + 1, embedded_position_i] | |||
j, embedded_position_i = tf.while_loop( | |||
condition_pos, | |||
loop_body_pos, [j, embedded_position_i], | |||
shape_invariants=[ | |||
j.get_shape(), | |||
tf.TensorShape([None]) | |||
]) | |||
embedded_position = tf.reshape(embedded_position_i, | |||
(1, -1)) | |||
# others | |||
LR_outputs = repeat( | |||
encoder_outputs_[0:1, :, :], durations[0, :], axis=1) | |||
embedded_outputs_speaker = repeat( | |||
embedded_inputs_speaker[0:1, :, :], | |||
durations[0, :], | |||
axis=1) | |||
embedded_outputs_emotion = repeat( | |||
embedded_inputs_emotion[0:1, :, :], | |||
durations[0, :], | |||
axis=1) | |||
def condition(i, pos, layer, s, e): | |||
return tf.less(i, tf.shape(mel_targets)[0]) | |||
def loop_body(i, embedded_position, LR_outputs, | |||
embedded_outputs_speaker, | |||
embedded_outputs_emotion): | |||
# position embedding | |||
jj = tf.constant(1) | |||
embedded_position_i = tf.range(1, durations[i, 0] + 1) | |||
def condition_pos_i(j, e): | |||
return tf.less(j, dur_len) | |||
def loop_body_pos_i(j, embedded_position_i): | |||
embedded_position_i = tf.concat([ | |||
embedded_position_i, | |||
tf.range(1, durations[i, j] + 1) | |||
], axis=0) # yapf:disable | |||
return [j + 1, embedded_position_i] | |||
jj, embedded_position_i = tf.while_loop( | |||
condition_pos_i, | |||
loop_body_pos_i, [jj, embedded_position_i], | |||
shape_invariants=[ | |||
jj.get_shape(), | |||
tf.TensorShape([None]) | |||
]) | |||
embedded_position = tf.concat([ | |||
embedded_position, | |||
tf.reshape(embedded_position_i, (1, -1)) | |||
], 0) | |||
# others | |||
LR_outputs = tf.concat([ | |||
LR_outputs, | |||
repeat( | |||
encoder_outputs_[i:i + 1, :, :], | |||
durations[i, :], | |||
axis=1) | |||
], 0) | |||
embedded_outputs_speaker = tf.concat([ | |||
embedded_outputs_speaker, | |||
repeat( | |||
embedded_inputs_speaker[i:i + 1, :, :], | |||
durations[i, :], | |||
axis=1) | |||
], 0) | |||
embedded_outputs_emotion = tf.concat([ | |||
embedded_outputs_emotion, | |||
repeat( | |||
embedded_inputs_emotion[i:i + 1, :, :], | |||
durations[i, :], | |||
axis=1) | |||
], 0) | |||
return [ | |||
i + 1, embedded_position, LR_outputs, | |||
embedded_outputs_speaker, embedded_outputs_emotion | |||
] | |||
(i, embedded_position, LR_outputs,
embedded_outputs_speaker,
embedded_outputs_emotion) = tf.while_loop(
condition, | |||
loop_body, [ | |||
i, embedded_position, LR_outputs, | |||
embedded_outputs_speaker, embedded_outputs_emotion | |||
], | |||
shape_invariants=[ | |||
i.get_shape(), | |||
tf.TensorShape([None, None]), | |||
tf.TensorShape([None, None, None]), | |||
tf.TensorShape([None, None, None]), | |||
tf.TensorShape([None, None, None]) | |||
], | |||
parallel_iterations=hp.batch_size) | |||
ori_framenum = tf.shape(mel_targets)[1] | |||
else: | |||
# position | |||
j = tf.constant(1) | |||
dur_len = tf.shape(duration_outputs_)[-1] | |||
embedded_position_i = tf.range( | |||
1, | |||
tf.cast(tf.round(duration_outputs_)[0, 0], tf.int32) | |||
+ 1) | |||
def condition_pos(j, e): | |||
return tf.less(j, dur_len) | |||
def loop_body_pos(j, embedded_position_i): | |||
embedded_position_i = tf.concat([ | |||
embedded_position_i, | |||
tf.range( | |||
1, | |||
tf.cast( | |||
tf.round(duration_outputs_)[0, j], | |||
tf.int32) + 1) | |||
], axis=0) # yapf:disable | |||
return [j + 1, embedded_position_i] | |||
j, embedded_position_i = tf.while_loop( | |||
condition_pos, | |||
loop_body_pos, [j, embedded_position_i], | |||
shape_invariants=[ | |||
j.get_shape(), | |||
tf.TensorShape([None]) | |||
]) | |||
embedded_position = tf.reshape(embedded_position_i, | |||
(1, -1)) | |||
# others | |||
duration_outputs_ *= duration_scales | |||
LR_outputs = repeat( | |||
encoder_outputs_[0:1, :, :], | |||
tf.cast(tf.round(duration_outputs_)[0, :], tf.int32), | |||
axis=1) | |||
embedded_outputs_speaker = repeat( | |||
embedded_inputs_speaker[0:1, :, :], | |||
tf.cast(tf.round(duration_outputs_)[0, :], tf.int32), | |||
axis=1) | |||
embedded_outputs_emotion = repeat( | |||
embedded_inputs_emotion[0:1, :, :], | |||
tf.cast(tf.round(duration_outputs_)[0, :], tf.int32), | |||
axis=1) | |||
ori_framenum = tf.shape(LR_outputs)[1] | |||
left = hp.outputs_per_step - tf.mod( | |||
ori_framenum, hp.outputs_per_step) | |||
LR_outputs = tf.cond( | |||
tf.equal(left, | |||
hp.outputs_per_step), lambda: LR_outputs, | |||
lambda: tf.pad(LR_outputs, [[0, 0], [0, left], [0, 0]], | |||
'CONSTANT')) | |||
embedded_outputs_speaker = tf.cond( | |||
tf.equal(left, hp.outputs_per_step), | |||
lambda: embedded_outputs_speaker, lambda: tf.pad( | |||
embedded_outputs_speaker, [[0, 0], [0, left], | |||
[0, 0]], 'CONSTANT')) | |||
embedded_outputs_emotion = tf.cond( | |||
tf.equal(left, hp.outputs_per_step), | |||
lambda: embedded_outputs_emotion, lambda: tf.pad( | |||
embedded_outputs_emotion, [[0, 0], [0, left], | |||
[0, 0]], 'CONSTANT')) | |||
embedded_position = tf.cond( | |||
tf.equal(left, hp.outputs_per_step), | |||
lambda: embedded_position, | |||
lambda: tf.pad(embedded_position, [[0, 0], [0, left]], | |||
'CONSTANT')) | |||
# Pos_Embedding | |||
with tf.variable_scope('Position_Embedding'): | |||
Pos_Embedding = BatchSinusodalPositionalEncoding() | |||
position_embeddings = Pos_Embedding.positional_encoding( | |||
batch_size, | |||
tf.shape(LR_outputs)[1], hp.encoder_projection_units, | |||
embedded_position) | |||
LR_outputs += position_embeddings | |||
# multi-frame | |||
LR_outputs = tf.reshape(LR_outputs, [ | |||
batch_size, -1, | |||
hp.outputs_per_step * hp.encoder_projection_units | |||
]) | |||
embedded_outputs_speaker = tf.reshape( | |||
embedded_outputs_speaker, | |||
[batch_size, -1, hp.outputs_per_step * 32])[:, :, :32] | |||
embedded_outputs_emotion = tf.reshape( | |||
embedded_outputs_emotion, | |||
[batch_size, -1, hp.outputs_per_step * 32])[:, :, :32] | |||
# [N, T_out, D_LR_outputs] (D_LR_outputs = hp.outputs_per_step * hp.encoder_projection_units + 64) | |||
LR_outputs = tf.concat([ | |||
LR_outputs, embedded_outputs_speaker, embedded_outputs_emotion | |||
], -1) | |||
# auto bandwidth | |||
if is_training: | |||
durations_mask = tf.cast(durations, | |||
tf.float32) * input_mask # [N, T_in] | |||
else: | |||
durations_mask = duration_outputs_ | |||
X_band_width = tf.cast( | |||
tf.round(tf.reduce_max(durations_mask) / hp.outputs_per_step), | |||
tf.int32) | |||
H_band_width = X_band_width | |||
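# The band width equals the longest phoneme duration, converted to decoder
# steps of r = outputs_per_step frames, so the attention band is wide enough
# to cover any single phoneme.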
with tf.variable_scope('Decoder'): | |||
Decoder = SelfAttentionDecoder( | |||
num_layers=hp.decoder_num_layers, | |||
num_units=hp.decoder_num_units, | |||
num_heads=hp.decoder_num_heads, | |||
ffn_inner_dim=hp.decoder_ffn_inner_dim, | |||
dropout=hp.decoder_dropout, | |||
attention_dropout=hp.decoder_attention_dropout, | |||
relu_dropout=hp.decoder_relu_dropout, | |||
prenet_units=hp.prenet_units, | |||
dense_units=hp.prenet_proj_units, | |||
num_mels=hp.num_mels, | |||
outputs_per_step=hp.outputs_per_step, | |||
X_band_width=X_band_width, | |||
H_band_width=H_band_width, | |||
position_encoder=None) | |||
if is_training: | |||
if hp.free_run: | |||
r = hp.outputs_per_step | |||
init_decoder_input = tf.expand_dims( | |||
tf.tile([[0.0]], [batch_size, hp.num_mels]), | |||
axis=1) # [N, 1, hp.num_mels] | |||
decoder_input_lengths = tf.cast( | |||
output_lengths / r, tf.int32) | |||
decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search( | |||
init_decoder_input, | |||
maximum_iterations=tf.shape(LR_outputs)[1], | |||
mode=is_training, | |||
memory=LR_outputs, | |||
memory_sequence_length=decoder_input_lengths) | |||
else: | |||
r = hp.outputs_per_step | |||
decoder_input = mel_targets[:, r - 1:: | |||
r, :] # [N, T_out / r, hp.num_mels] | |||
init_decoder_input = tf.expand_dims( | |||
tf.tile([[0.0]], [batch_size, hp.num_mels]), | |||
axis=1) # [N, 1, hp.num_mels] | |||
decoder_input = tf.concat( | |||
[init_decoder_input, decoder_input], | |||
axis=1) # [N, T_out / r + 1, hp.num_mels] | |||
decoder_input = decoder_input[:, : | |||
-1, :] # [N, T_out / r, hp.num_mels] | |||
decoder_input_lengths = tf.cast( | |||
output_lengths / r, tf.int32) | |||
decoder_outputs, attention_x, attention_h = Decoder.decode_from_inputs( | |||
decoder_input, | |||
decoder_input_lengths, | |||
mode=is_training, | |||
memory=LR_outputs, | |||
memory_sequence_length=decoder_input_lengths) | |||
else: | |||
init_decoder_input = tf.expand_dims( | |||
tf.tile([[0.0]], [batch_size, hp.num_mels]), | |||
axis=1) # [N, 1, hp.num_mels] | |||
decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search( | |||
init_decoder_input, | |||
maximum_iterations=tf.shape(LR_outputs)[1], | |||
mode=is_training, | |||
memory=LR_outputs, | |||
memory_sequence_length=tf.expand_dims( | |||
tf.shape(LR_outputs)[1], axis=0)) | |||
if is_training: | |||
mel_outputs_ = tf.reshape(decoder_outputs, | |||
[batch_size, -1, hp.num_mels]) | |||
else: | |||
mel_outputs_ = tf.reshape( | |||
decoder_outputs, | |||
[batch_size, -1, hp.num_mels])[:, :ori_framenum, :] | |||
mel_outputs = mel_outputs_ | |||
with tf.variable_scope('Postnet'): | |||
Postnet_FSMN = FsmnEncoderV2( | |||
filter_size=hp.postnet_filter_size, | |||
fsmn_num_layers=hp.postnet_fsmn_num_layers, | |||
dnn_num_layers=hp.postnet_dnn_num_layers, | |||
num_memory_units=hp.postnet_num_memory_units, | |||
ffn_inner_dim=hp.postnet_ffn_inner_dim, | |||
dropout=hp.postnet_dropout, | |||
shift=hp.postnet_shift, | |||
position_encoder=None) | |||
if is_training: | |||
postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode( | |||
mel_outputs, | |||
sequence_length=output_lengths, | |||
mode=is_training) | |||
hidden_lstm_outputs, _ = tf.nn.dynamic_rnn( | |||
LSTMBlockCell(hp.postnet_lstm_units), | |||
postnet_fsmn_outputs, | |||
sequence_length=output_lengths, | |||
dtype=tf.float32) | |||
else: | |||
postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode( | |||
mel_outputs, | |||
sequence_length=[tf.shape(mel_outputs_)[1]], | |||
mode=is_training) | |||
hidden_lstm_outputs, _ = tf.nn.dynamic_rnn( | |||
LSTMBlockCell(hp.postnet_lstm_units), | |||
postnet_fsmn_outputs, | |||
sequence_length=[tf.shape(mel_outputs_)[1]], | |||
dtype=tf.float32) | |||
mel_residual_outputs = tf.layers.dense( | |||
hidden_lstm_outputs, units=hp.num_mels) | |||
mel_outputs += mel_residual_outputs | |||
self.inputs = inputs | |||
self.inputs_speaker = inputs_speaker | |||
self.inputs_emotion = inputs_emotion | |||
self.input_lengths = input_lengths | |||
self.durations = durations | |||
self.output_lengths = output_lengths | |||
self.mel_outputs_ = mel_outputs_ | |||
self.mel_outputs = mel_outputs | |||
self.mel_targets = mel_targets | |||
self.duration_outputs = duration_outputs | |||
self.duration_outputs_ = duration_outputs_ | |||
self.duration_scales = duration_scales | |||
self.pitch_contour_outputs = pitch_contour_outputs | |||
self.pitch_contours = pitch_contours | |||
self.pitch_scales = pitch_scales | |||
self.energy_contour_outputs = energy_contour_outputs | |||
self.energy_contours = energy_contours | |||
self.energy_scales = energy_scales | |||
self.uv_masks_ = uv_masks | |||
self.embedded_inputs_emotion = embedded_inputs_emotion | |||
self.embedding_fsmn_outputs = embedded_inputs | |||
self.encoder_outputs = encoder_outputs | |||
self.encoder_outputs_ = encoder_outputs_ | |||
self.LR_outputs = LR_outputs | |||
self.postnet_fsmn_outputs = postnet_fsmn_outputs | |||
self.pitch_embeddings = pitch_embeddings | |||
self.energy_embeddings = energy_embeddings | |||
self.attns = attns | |||
self.attention_x = attention_x | |||
self.attention_h = attention_h | |||
self.X_band_width = X_band_width | |||
self.H_band_width = H_band_width | |||
def add_loss(self): | |||
'''Adds loss to the model. Sets "loss" field. initialize must have been called.''' | |||
with tf.variable_scope('loss') as _: | |||
hp = self._hparams | |||
mask = tf.sequence_mask( | |||
self.output_lengths, | |||
tf.shape(self.mel_targets)[1], | |||
dtype=tf.float32) | |||
valid_outputs = tf.reduce_sum(mask) | |||
mask_input = tf.sequence_mask( | |||
self.input_lengths, | |||
tf.shape(self.durations)[1], | |||
dtype=tf.float32) | |||
valid_inputs = tf.reduce_sum(mask_input) | |||
# mel loss | |||
if self.uv_masks_ is not None: | |||
valid_outputs_mask = tf.reduce_sum( | |||
tf.expand_dims(mask, -1) * self.uv_masks_) | |||
self.mel_loss_ = tf.reduce_sum( | |||
tf.abs(self.mel_targets - self.mel_outputs_) | |||
* tf.expand_dims(mask, -1) * self.uv_masks_) / ( | |||
valid_outputs_mask * hp.num_mels) | |||
self.mel_loss = tf.reduce_sum( | |||
tf.abs(self.mel_targets - self.mel_outputs) | |||
* tf.expand_dims(mask, -1) * self.uv_masks_) / ( | |||
valid_outputs_mask * hp.num_mels) | |||
else: | |||
self.mel_loss_ = tf.reduce_sum( | |||
tf.abs(self.mel_targets - self.mel_outputs_) | |||
* tf.expand_dims(mask, -1)) / ( | |||
valid_outputs * hp.num_mels) | |||
self.mel_loss = tf.reduce_sum( | |||
tf.abs(self.mel_targets - self.mel_outputs) | |||
* tf.expand_dims(mask, -1)) / ( | |||
valid_outputs * hp.num_mels) | |||
# duration loss | |||
self.duration_loss = tf.reduce_sum( | |||
tf.abs( | |||
tf.log(tf.cast(self.durations, tf.float32) + 1) | |||
- self.duration_outputs) * mask_input) / valid_inputs | |||
# pitch contour loss | |||
self.pitch_contour_loss = tf.reduce_sum( | |||
tf.abs(self.pitch_contours - self.pitch_contour_outputs) | |||
* mask_input) / valid_inputs | |||
# energy contour loss | |||
self.energy_contour_loss = tf.reduce_sum( | |||
tf.abs(self.energy_contours - self.energy_contour_outputs) | |||
* mask_input) / valid_inputs | |||
# final loss | |||
self.loss = self.mel_loss_ + self.mel_loss + self.duration_loss \ | |||
+ self.pitch_contour_loss + self.energy_contour_loss | |||
# guided attention loss | |||
self.guided_attention_loss = tf.constant(0.0) | |||
if hp.guided_attention: | |||
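# Diagonal guided-attention penalty: the weight 1 - exp(-(n/N - t/T)^2 / 2g^2)
# grows with distance from the diagonal, so attention mass far from a
# monotonic alignment is penalized (guided_attention_2g_squared holds the
# 2g^2 denominator).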
i0 = tf.constant(0) | |||
loss0 = tf.constant(0.0) | |||
def c(i, _):
return tf.less(i, tf.shape(self.mel_targets)[0])
def loop_body(i, loss): | |||
decoder_input_lengths = tf.cast( | |||
self.output_lengths / hp.outputs_per_step, tf.int32) | |||
input_len = decoder_input_lengths[i] | |||
output_len = decoder_input_lengths[i] | |||
input_w = tf.expand_dims( | |||
tf.range(tf.cast(input_len, dtype=tf.float32)), | |||
axis=1) / tf.cast( | |||
input_len, dtype=tf.float32) # [T_in, 1] | |||
output_w = tf.expand_dims( | |||
tf.range(tf.cast(output_len, dtype=tf.float32)), | |||
axis=0) / tf.cast( | |||
output_len, dtype=tf.float32) # [1, T_out] | |||
guided_attention_w = 1.0 - tf.exp( | |||
-(1 / hp.guided_attention_2g_squared) | |||
* tf.square(input_w - output_w)) # [T_in, T_out] | |||
guided_attention_w = tf.expand_dims( | |||
guided_attention_w, axis=0) # [1, T_in, T_out] | |||
# [hp.decoder_num_heads, T_in, T_out] | |||
guided_attention_w = tf.tile(guided_attention_w, | |||
[hp.decoder_num_heads, 1, 1]) | |||
loss_i = tf.constant(0.0) | |||
for j in range(hp.decoder_num_layers): | |||
loss_i += tf.reduce_mean( | |||
self.attention_h[j][i, :, :input_len, :output_len] | |||
* guided_attention_w) | |||
return [tf.add(i, 1), tf.add(loss, loss_i)] | |||
_, loss = tf.while_loop( | |||
c, | |||
loop_body, | |||
loop_vars=[i0, loss0], | |||
parallel_iterations=hp.batch_size) | |||
self.guided_attention_loss = loss / hp.batch_size | |||
self.loss += hp.guided_attention_loss_weight * self.guided_attention_loss | |||
def add_optimizer(self, global_step): | |||
'''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called. | |||
Args: | |||
global_step: int32 scalar Tensor representing current global step in training | |||
''' | |||
with tf.variable_scope('optimizer') as _: | |||
hp = self._hparams | |||
if hp.decay_learning_rate: | |||
self.learning_rate = _learning_rate_decay( | |||
hp.initial_learning_rate, global_step) | |||
else: | |||
self.learning_rate = tf.convert_to_tensor( | |||
hp.initial_learning_rate) | |||
optimizer = tf.train.AdamOptimizer(self.learning_rate, | |||
hp.adam_beta1, hp.adam_beta2) | |||
gradients, variables = zip(*optimizer.compute_gradients(self.loss)) | |||
self.gradients = gradients | |||
clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0) | |||
# Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See: | |||
# https://github.com/tensorflow/tensorflow/issues/1122 | |||
with tf.control_dependencies( | |||
tf.get_collection(tf.GraphKeys.UPDATE_OPS)): | |||
self.optimize = optimizer.apply_gradients( | |||
zip(clipped_gradients, variables), global_step=global_step) | |||
def _learning_rate_decay(init_lr, global_step): | |||
# Noam scheme from tensor2tensor: | |||
warmup_steps = 4000.0 | |||
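# lr(step) = init_lr * warmup_steps**0.5 * min(step * warmup_steps**-1.5, step**-0.5):
# rises linearly to init_lr at step == warmup_steps, then decays as
# init_lr * sqrt(warmup_steps / step).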
step = tf.cast(global_step + 1, dtype=tf.float32) | |||
return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5, | |||
step**-0.5) |
@@ -0,0 +1,817 @@ | |||
"""Define self-attention decoder.""" | |||
import sys | |||
import tensorflow as tf | |||
from . import compat, transformer | |||
from .modules import decoder_prenet | |||
from .position import SinusoidalPositionEncoder | |||
class SelfAttentionDecoder(): | |||
"""Decoder using self-attention as described in | |||
https://arxiv.org/abs/1706.03762. | |||
""" | |||
def __init__(self, | |||
num_layers, | |||
num_units=512, | |||
num_heads=8, | |||
ffn_inner_dim=2048, | |||
dropout=0.1, | |||
attention_dropout=0.1, | |||
relu_dropout=0.1, | |||
prenet_units=256, | |||
dense_units=128, | |||
num_mels=80, | |||
outputs_per_step=3, | |||
X_band_width=None, | |||
H_band_width=None, | |||
position_encoder=SinusoidalPositionEncoder(), | |||
self_attention_type='scaled_dot'): | |||
"""Initializes the parameters of the decoder. | |||
Args: | |||
num_layers: The number of layers. | |||
num_units: The number of hidden units. | |||
num_heads: The number of heads in the multi-head attention. | |||
ffn_inner_dim: The number of units of the inner linear transformation | |||
in the feed forward layer. | |||
dropout: The probability to drop units from the outputs. | |||
attention_dropout: The probability to drop units from the attention. | |||
relu_dropout: The probability to drop units from the ReLU activation in | |||
the feed forward layer. | |||
prenet_units: The number of units of the decoder prenet layers.
dense_units: The number of units of the projection applied to the
concatenated memory and prenet outputs.
num_mels: The number of mel bins predicted per frame.
outputs_per_step: The number of frames predicted per decoder step (r).
X_band_width: The width of the band mask on decoder self-attention, or
``None`` to disable banding.
H_band_width: The width of the band mask on memory attention, or
``None`` to disable banding.
position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to
apply on inputs or ``None``.
self_attention_type: Type of self attention, "scaled_dot" or "average" (case | |||
insensitive). | |||
Raises: | |||
ValueError: if :obj:`self_attention_type` is invalid. | |||
""" | |||
super(SelfAttentionDecoder, self).__init__() | |||
self.num_layers = num_layers | |||
self.num_units = num_units | |||
self.num_heads = num_heads | |||
self.ffn_inner_dim = ffn_inner_dim | |||
self.dropout = dropout | |||
self.attention_dropout = attention_dropout | |||
self.relu_dropout = relu_dropout | |||
self.position_encoder = position_encoder | |||
self.self_attention_type = self_attention_type.lower() | |||
if self.self_attention_type not in ('scaled_dot', 'average'): | |||
raise ValueError('invalid attention type %s' | |||
% self.self_attention_type) | |||
if self.self_attention_type == 'average': | |||
tf.logging.warning( | |||
'Support for average attention network is experimental ' | |||
'and may change in future versions.') | |||
self.prenet_units = prenet_units | |||
self.dense_units = dense_units | |||
self.num_mels = num_mels | |||
self.outputs_per_step = outputs_per_step | |||
self.X_band_width = X_band_width | |||
self.H_band_width = H_band_width | |||
@property | |||
def output_size(self): | |||
"""Returns the decoder output size.""" | |||
return self.num_units | |||
@property | |||
def support_alignment_history(self): | |||
return True | |||
@property | |||
def support_multi_source(self): | |||
return True | |||
def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1): | |||
cache = {} | |||
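# Per-layer cache for incremental decoding: self-attention keys/values grow
# along the time axis (initialized with length 0), and memory keys/values are
# cached so they are projected once and reused at every step.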
for layer in range(self.num_layers): | |||
proj_cache_shape = [ | |||
batch_size, self.num_heads, 0, self.num_units // self.num_heads | |||
] | |||
layer_cache = {} | |||
layer_cache['memory'] = [{ | |||
'memory_keys': | |||
tf.zeros(proj_cache_shape, dtype=dtype), | |||
'memory_values': | |||
tf.zeros(proj_cache_shape, dtype=dtype) | |||
} for _ in range(num_sources)] | |||
if self.self_attention_type == 'scaled_dot': | |||
layer_cache['self_keys'] = tf.zeros( | |||
proj_cache_shape, dtype=dtype) | |||
layer_cache['self_values'] = tf.zeros( | |||
proj_cache_shape, dtype=dtype) | |||
elif self.self_attention_type == 'average': | |||
layer_cache['prev_g'] = tf.zeros( | |||
[batch_size, 1, self.num_units], dtype=dtype) | |||
cache['layer_{}'.format(layer)] = layer_cache | |||
return cache | |||
def _init_attn(self, dtype=tf.float32): | |||
attn = [] | |||
for layer in range(self.num_layers): | |||
attn.append(tf.TensorArray(tf.float32, size=0, dynamic_size=True)) | |||
return attn | |||
def _self_attention_stack(self, | |||
inputs, | |||
sequence_length=None, | |||
mode=True, | |||
cache=None, | |||
memory=None, | |||
memory_sequence_length=None, | |||
step=None): | |||
# [N, T_out, self.dense_units] or [N, 1, self.dense_units] | |||
prenet_outputs = decoder_prenet(inputs, self.prenet_units, | |||
self.dense_units, mode) | |||
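# PNCA-style decoder input: at each step the prenet output of the previous
# acoustic frame is concatenated with the matching frame of the
# length-regulated memory, then projected back to dense_units.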
if step is None: | |||
decoder_inputs = tf.concat( | |||
[memory, prenet_outputs], | |||
axis=-1) # [N, T_out, memory_size + self.dense_units] | |||
else: | |||
decoder_inputs = tf.concat( | |||
[memory[:, step:step + 1, :], prenet_outputs], | |||
axis=-1) # [N, 1, memory_size + self.dense_units] | |||
decoder_inputs = tf.layers.dense( | |||
decoder_inputs, units=self.dense_units) | |||
inputs = decoder_inputs | |||
inputs *= self.num_units**0.5 | |||
if self.position_encoder is not None: | |||
inputs = self.position_encoder( | |||
inputs, position=step + 1 if step is not None else None) | |||
inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) | |||
decoder_mask = None | |||
memory_mask = None | |||
# last_attention = None | |||
X_band_width_tmp = -1 | |||
H_band_width_tmp = -1 | |||
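# -1 disables banding. When a band width is set but the memory is shorter
# than the band, banding is also disabled; otherwise the masks below restrict
# self-attention (X) and memory attention (H) to a diagonal band.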
if self.X_band_width is not None: | |||
X_band_width_tmp = tf.cast( | |||
tf.cond( | |||
tf.less(tf.shape(memory)[1], self.X_band_width), | |||
lambda: -1, lambda: self.X_band_width), | |||
dtype=tf.int64) | |||
if self.H_band_width is not None: | |||
H_band_width_tmp = tf.cast( | |||
tf.cond( | |||
tf.less(tf.shape(memory)[1], self.H_band_width), | |||
lambda: -1, lambda: self.H_band_width), | |||
dtype=tf.int64) | |||
if self.self_attention_type == 'scaled_dot': | |||
if sequence_length is not None: | |||
decoder_mask = transformer.build_future_mask( | |||
sequence_length, | |||
num_heads=self.num_heads, | |||
maximum_length=tf.shape(inputs)[1], | |||
band=X_band_width_tmp) # [N, 1, T_out, T_out] | |||
elif self.self_attention_type == 'average': | |||
if cache is None: | |||
if sequence_length is None: | |||
sequence_length = tf.fill([tf.shape(inputs)[0]], | |||
tf.shape(inputs)[1]) | |||
decoder_mask = transformer.cumulative_average_mask( | |||
sequence_length, | |||
maximum_length=tf.shape(inputs)[1], | |||
dtype=inputs.dtype) | |||
if memory is not None and not tf.contrib.framework.nest.is_sequence( | |||
memory): | |||
memory = (memory, ) | |||
if memory_sequence_length is not None: | |||
if not tf.contrib.framework.nest.is_sequence( | |||
memory_sequence_length): | |||
memory_sequence_length = (memory_sequence_length, ) | |||
if step is None: | |||
memory_mask = [ | |||
transformer.build_history_mask( | |||
length, | |||
num_heads=self.num_heads, | |||
maximum_length=tf.shape(m)[1], | |||
band=H_band_width_tmp) | |||
for m, length in zip(memory, memory_sequence_length) | |||
] | |||
else: | |||
memory_mask = [ | |||
transformer.build_history_mask( | |||
length, | |||
num_heads=self.num_heads, | |||
maximum_length=tf.shape(m)[1], | |||
band=H_band_width_tmp)[:, :, step:step + 1, :] | |||
for m, length in zip(memory, memory_sequence_length) | |||
] | |||
# last_attention = None | |||
attns_x = [] | |||
attns_h = [] | |||
for layer in range(self.num_layers): | |||
layer_name = 'layer_{}'.format(layer) | |||
layer_cache = cache[layer_name] if cache is not None else None | |||
with tf.variable_scope(layer_name): | |||
if memory is not None: | |||
for i, (mem, mask) in enumerate(zip(memory, memory_mask)): | |||
memory_cache = None | |||
if layer_cache is not None: | |||
memory_cache = layer_cache['memory'][i] | |||
scope_name = 'multi_head_{}'.format(i) | |||
if i == 0: | |||
scope_name = 'multi_head' | |||
with tf.variable_scope(scope_name): | |||
encoded, attn_x, attn_h = transformer.multi_head_attention_PNCA( | |||
self.num_heads, | |||
transformer.norm(inputs), | |||
mem, | |||
mode, | |||
num_units=self.num_units, | |||
mask=decoder_mask, | |||
mask_h=mask, | |||
cache=layer_cache, | |||
cache_h=memory_cache, | |||
dropout=self.attention_dropout, | |||
return_attention=True, | |||
layer_name=layer_name, | |||
X_band_width=self.X_band_width) | |||
attns_x.append(attn_x) | |||
attns_h.append(attn_h) | |||
context = transformer.drop_and_add( | |||
inputs, encoded, mode, dropout=self.dropout) | |||
with tf.variable_scope('ffn'): | |||
transformed = transformer.feed_forward_ori( | |||
transformer.norm(context), | |||
self.ffn_inner_dim, | |||
mode, | |||
dropout=self.relu_dropout) | |||
transformed = transformer.drop_and_add( | |||
context, transformed, mode, dropout=self.dropout) | |||
inputs = transformed | |||
outputs = transformer.norm(inputs) | |||
outputs = tf.layers.dense( | |||
outputs, units=self.num_mels * self.outputs_per_step) | |||
return outputs, attns_x, attns_h | |||
def decode_from_inputs(self, | |||
inputs, | |||
sequence_length, | |||
initial_state=None, | |||
mode=True, | |||
memory=None, | |||
memory_sequence_length=None): | |||
outputs, attention_x, attention_h = self._self_attention_stack( | |||
inputs, | |||
sequence_length=sequence_length, | |||
mode=mode, | |||
memory=memory, | |||
memory_sequence_length=memory_sequence_length) | |||
return outputs, attention_x, attention_h | |||
def step_fn(self, | |||
mode, | |||
batch_size, | |||
initial_state=None, | |||
memory=None, | |||
memory_sequence_length=None, | |||
dtype=tf.float32): | |||
if memory is None: | |||
num_sources = 0 | |||
elif tf.contrib.framework.nest.is_sequence(memory): | |||
num_sources = len(memory) | |||
else: | |||
num_sources = 1 | |||
cache = self._init_cache( | |||
batch_size, dtype=dtype, num_sources=num_sources) | |||
attention_x = self._init_attn(dtype=dtype) | |||
attention_h = self._init_attn(dtype=dtype) | |||
def _fn(step, inputs, cache): | |||
outputs, attention_x, attention_h = self._self_attention_stack( | |||
inputs, | |||
mode=mode, | |||
cache=cache, | |||
memory=memory, | |||
memory_sequence_length=memory_sequence_length, | |||
step=step) | |||
attention_x_tmp = [] | |||
for layer in range(len(attention_h)): | |||
attention_x_tmp_l = tf.zeros_like(attention_h[layer]) | |||
if self.X_band_width is not None: | |||
pred = tf.less(step, self.X_band_width + 1) | |||
attention_x_tmp_l_1 = tf.cond(pred, # yapf:disable | |||
lambda: attention_x_tmp_l[:, :, :, :step + 1] + attention_x[layer], | |||
lambda: tf.concat([ | |||
attention_x_tmp_l[:, :, :, | |||
:step - self.X_band_width], | |||
attention_x_tmp_l[:, :, :, | |||
step - self.X_band_width:step + 1] | |||
+ attention_x[layer]], | |||
axis=-1)) # yapf:disable | |||
attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:] | |||
attention_x_tmp.append( | |||
tf.concat([attention_x_tmp_l_1, attention_x_tmp_l_2], | |||
axis=-1)) | |||
else: | |||
attention_x_tmp_l_1 = attention_x_tmp_l[:, :, :, :step + 1] | |||
attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:] | |||
attention_x_tmp.append( | |||
tf.concat([ | |||
attention_x_tmp_l_1 + attention_x[layer], | |||
attention_x_tmp_l_2 | |||
], axis=-1)) # yapf:disable | |||
attention_x = attention_x_tmp | |||
return outputs, cache, attention_x, attention_h | |||
return _fn, cache, attention_x, attention_h | |||
def dynamic_decode_and_search(self, init_decoder_input, maximum_iterations, | |||
mode, memory, memory_sequence_length): | |||
batch_size = tf.shape(init_decoder_input)[0] | |||
step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn( | |||
mode, | |||
batch_size, | |||
memory=memory, | |||
memory_sequence_length=memory_sequence_length) | |||
outputs, attention_x, attention_h, cache = self.dynamic_decode( | |||
step_fn, | |||
init_decoder_input, | |||
init_cache=init_cache, | |||
init_attn_x=init_attn_x, | |||
init_attn_h=init_attn_h, | |||
maximum_iterations=maximum_iterations, | |||
batch_size=batch_size) | |||
return outputs, attention_x, attention_h | |||
def dynamic_decode_and_search_teacher_forcing(self, decoder_input, | |||
maximum_iterations, mode, | |||
memory, | |||
memory_sequence_length): | |||
batch_size = tf.shape(decoder_input)[0] | |||
step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn( | |||
mode, | |||
batch_size, | |||
memory=memory, | |||
memory_sequence_length=memory_sequence_length) | |||
outputs, attention_x, attention_h, cache = self.dynamic_decode_teacher_forcing( | |||
step_fn, | |||
decoder_input, | |||
init_cache=init_cache, | |||
init_attn_x=init_attn_x, | |||
init_attn_h=init_attn_h, | |||
maximum_iterations=maximum_iterations, | |||
batch_size=batch_size) | |||
return outputs, attention_x, attention_h | |||
def dynamic_decode(self, | |||
step_fn, | |||
init_decoder_input, | |||
init_cache=None, | |||
init_attn_x=None, | |||
init_attn_h=None, | |||
maximum_iterations=None, | |||
batch_size=None): | |||
def _cond(step, cache, inputs, outputs, attention_x, attention_h): # pylint: disable=unused-argument | |||
return tf.less(step, maximum_iterations) | |||
def _body(step, cache, inputs, outputs, attention_x, attention_h): | |||
# output: [1, 1, num_mels * r] | |||
# attn: [1, 1, T_out] | |||
output, cache, attn_x, attn_h = step_fn( | |||
step, inputs, cache) # outputs, cache, attention, attns | |||
for layer in range(len(attention_x)): | |||
attention_x[layer] = attention_x[layer].write( | |||
step, tf.cast(attn_x[layer], tf.float32)) | |||
for layer in range(len(attention_h)): | |||
attention_h[layer] = attention_h[layer].write( | |||
step, tf.cast(attn_h[layer], tf.float32)) | |||
outputs = outputs.write(step, tf.cast(output, tf.float32)) | |||
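# Of the num_mels * r values predicted at this step, only the last frame
# (the final num_mels values) is fed back as the next decoder input.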
return (step + 1, cache, output[:, :, -self.num_mels:], outputs,
attention_x, attention_h)
step = tf.constant(0, dtype=tf.int32) | |||
outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True) | |||
_, cache, _, outputs, attention_x, attention_h = tf.while_loop( | |||
_cond, | |||
_body, | |||
loop_vars=(step, init_cache, init_decoder_input, outputs, | |||
init_attn_x, init_attn_h), | |||
shape_invariants=(step.shape, | |||
compat.nest.map_structure( | |||
self._get_shape_invariants, init_cache), | |||
compat.nest.map_structure( | |||
self._get_shape_invariants, | |||
init_decoder_input), tf.TensorShape(None), | |||
compat.nest.map_structure( | |||
self._get_shape_invariants, init_attn_x), | |||
compat.nest.map_structure( | |||
self._get_shape_invariants, init_attn_h)), | |||
parallel_iterations=1, | |||
back_prop=False, | |||
maximum_iterations=maximum_iterations) | |||
# element of outputs: [N, 1, num_mels * r] | |||
outputs_stack = outputs.stack() # [T_out, N, 1, num_mels * r] | |||
outputs_stack = tf.transpose( | |||
outputs_stack, perm=[2, 1, 0, 3]) # [1, N, T_out, num_mels * r] | |||
outputs_stack = tf.squeeze( | |||
outputs_stack, axis=0) # [N, T_out, num_mels * r] | |||
attention_x_stack = [] | |||
for layer in range(len(attention_x)): | |||
attention_x_stack_tmp = attention_x[layer].stack( | |||
) # [T_out, N, H, 1, T_out] | |||
attention_x_stack_tmp = tf.transpose( | |||
attention_x_stack_tmp, perm=[3, 1, 2, 0, | |||
4]) # [1, N, H, T_out, T_out] | |||
attention_x_stack_tmp = tf.squeeze( | |||
attention_x_stack_tmp, axis=0) # [N, H, T_out, T_out] | |||
attention_x_stack.append(attention_x_stack_tmp) | |||
attention_h_stack = [] | |||
for layer in range(len(attention_h)): | |||
attention_h_stack_tmp = attention_h[layer].stack( | |||
) # [T_out, N, H, 1, T_out] | |||
attention_h_stack_tmp = tf.transpose( | |||
attention_h_stack_tmp, perm=[3, 1, 2, 0, | |||
4]) # [1, N, H, T_out, T_out] | |||
attention_h_stack_tmp = tf.squeeze( | |||
attention_h_stack_tmp, axis=0) # [N, H, T_out, T_out] | |||
attention_h_stack.append(attention_h_stack_tmp) | |||
return outputs_stack, attention_x_stack, attention_h_stack, cache | |||
def dynamic_decode_teacher_forcing(self, | |||
step_fn, | |||
decoder_input, | |||
init_cache=None, | |||
init_attn_x=None, | |||
init_attn_h=None, | |||
maximum_iterations=None, | |||
batch_size=None): | |||
def _cond(step, cache, inputs, outputs, attention_x, attention_h): # pylint: disable=unused-argument | |||
return tf.less(step, maximum_iterations) | |||
def _body(step, cache, inputs, outputs, attention_x, attention_h): | |||
# output: [1, 1, num_mels * r] | |||
# attn: [1, 1, T_out] | |||
output, cache, attn_x, attn_h = step_fn( | |||
step, inputs[:, step:step + 1, :], | |||
cache) # outputs, cache, attention, attns | |||
for layer in range(len(attention_x)): | |||
attention_x[layer] = attention_x[layer].write( | |||
step, tf.cast(attn_x[layer], tf.float32)) | |||
for layer in range(len(attention_h)): | |||
attention_h[layer] = attention_h[layer].write( | |||
step, tf.cast(attn_h[layer], tf.float32)) | |||
outputs = outputs.write(step, tf.cast(output, tf.float32)) | |||
return step + 1, cache, inputs, outputs, attention_x, attention_h | |||
step = tf.constant(0, dtype=tf.int32) | |||
outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True) | |||
_, cache, _, outputs, attention_x, attention_h = tf.while_loop( | |||
_cond, | |||
_body, | |||
loop_vars=(step, init_cache, decoder_input, outputs, init_attn_x, | |||
init_attn_h), | |||
shape_invariants=(step.shape, | |||
compat.nest.map_structure( | |||
self._get_shape_invariants, | |||
init_cache), decoder_input.shape, | |||
tf.TensorShape(None), | |||
compat.nest.map_structure( | |||
self._get_shape_invariants, init_attn_x), | |||
compat.nest.map_structure( | |||
self._get_shape_invariants, init_attn_h)), | |||
parallel_iterations=1, | |||
back_prop=False, | |||
maximum_iterations=maximum_iterations) | |||
# element of outputs: [N, 1, num_mels * r] | |||
outputs_stack = outputs.stack() # [T_out, N, 1, num_mels * r] | |||
outputs_stack = tf.transpose( | |||
outputs_stack, perm=[2, 1, 0, 3]) # [1, N, T_out, num_mels * r] | |||
outputs_stack = tf.squeeze( | |||
outputs_stack, axis=0) # [N, T_out, num_mels * r] | |||
attention_x_stack = [] | |||
for layer in range(len(attention_x)): | |||
attention_x_stack_tmp = attention_x[layer].stack( | |||
) # [T_out, N, H, 1, T_out] | |||
attention_x_stack_tmp = tf.transpose( | |||
attention_x_stack_tmp, perm=[3, 1, 2, 0, | |||
4]) # [1, N, H, T_out, T_out] | |||
attention_x_stack_tmp = tf.squeeze( | |||
attention_x_stack_tmp, axis=0) # [N, H, T_out, T_out] | |||
attention_x_stack.append(attention_x_stack_tmp) | |||
attention_h_stack = [] | |||
for layer in range(len(attention_h)): | |||
attention_h_stack_tmp = attention_h[layer].stack( | |||
) # [T_out, N, H, 1, T_out] | |||
attention_h_stack_tmp = tf.transpose( | |||
attention_h_stack_tmp, perm=[3, 1, 2, 0, | |||
4]) # [1, N, H, T_out, T_out] | |||
attention_h_stack_tmp = tf.squeeze( | |||
attention_h_stack_tmp, axis=0) # [N, H, T_out, T_out] | |||
attention_h_stack.append(attention_h_stack_tmp) | |||
return outputs_stack, attention_x_stack, attention_h_stack, cache | |||
def _get_shape_invariants(self, tensor): | |||
"""Returns the shape of the tensor but sets middle dims to None.""" | |||
if isinstance(tensor, tf.TensorArray): | |||
shape = None | |||
else: | |||
shape = tensor.shape.as_list() | |||
for i in range(1, len(shape) - 1): | |||
shape[i] = None | |||
return tf.TensorShape(shape) | |||
class SelfAttentionDecoderOri(): | |||
"""Decoder using self-attention as described in | |||
https://arxiv.org/abs/1706.03762. | |||
""" | |||
def __init__(self, | |||
num_layers, | |||
num_units=512, | |||
num_heads=8, | |||
ffn_inner_dim=2048, | |||
dropout=0.1, | |||
attention_dropout=0.1, | |||
relu_dropout=0.1, | |||
position_encoder=SinusoidalPositionEncoder(), | |||
self_attention_type='scaled_dot'): | |||
"""Initializes the parameters of the decoder. | |||
Args: | |||
num_layers: The number of layers. | |||
num_units: The number of hidden units. | |||
num_heads: The number of heads in the multi-head attention. | |||
ffn_inner_dim: The number of units of the inner linear transformation | |||
in the feed forward layer. | |||
dropout: The probability to drop units from the outputs. | |||
attention_dropout: The probability to drop units from the attention. | |||
relu_dropout: The probability to drop units from the ReLU activation in | |||
the feed forward layer. | |||
position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to | |||
apply on inputs or ``None``. | |||
self_attention_type: Type of self attention, "scaled_dot" or "average" (case | |||
insensitive). | |||
Raises: | |||
ValueError: if :obj:`self_attention_type` is invalid. | |||
""" | |||
super(SelfAttentionDecoderOri, self).__init__() | |||
self.num_layers = num_layers | |||
self.num_units = num_units | |||
self.num_heads = num_heads | |||
self.ffn_inner_dim = ffn_inner_dim | |||
self.dropout = dropout | |||
self.attention_dropout = attention_dropout | |||
self.relu_dropout = relu_dropout | |||
self.position_encoder = position_encoder | |||
self.self_attention_type = self_attention_type.lower() | |||
if self.self_attention_type not in ('scaled_dot', 'average'): | |||
raise ValueError('invalid attention type %s' | |||
% self.self_attention_type) | |||
if self.self_attention_type == 'average': | |||
tf.logging.warning( | |||
'Support for average attention network is experimental ' | |||
'and may change in future versions.') | |||
@property | |||
def output_size(self): | |||
"""Returns the decoder output size.""" | |||
return self.num_units | |||
@property | |||
def support_alignment_history(self): | |||
return True | |||
@property | |||
def support_multi_source(self): | |||
return True | |||
def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1): | |||
cache = {} | |||
for layer in range(self.num_layers): | |||
proj_cache_shape = [ | |||
batch_size, self.num_heads, 0, self.num_units // self.num_heads | |||
] | |||
layer_cache = {} | |||
layer_cache['memory'] = [{ | |||
'memory_keys': | |||
tf.zeros(proj_cache_shape, dtype=dtype), | |||
'memory_values': | |||
tf.zeros(proj_cache_shape, dtype=dtype) | |||
} for _ in range(num_sources)] | |||
if self.self_attention_type == 'scaled_dot': | |||
layer_cache['self_keys'] = tf.zeros( | |||
proj_cache_shape, dtype=dtype) | |||
layer_cache['self_values'] = tf.zeros( | |||
proj_cache_shape, dtype=dtype) | |||
elif self.self_attention_type == 'average': | |||
layer_cache['prev_g'] = tf.zeros( | |||
[batch_size, 1, self.num_units], dtype=dtype) | |||
cache['layer_{}'.format(layer)] = layer_cache | |||
return cache | |||
def _self_attention_stack(self, | |||
inputs, | |||
sequence_length=None, | |||
mode=True, | |||
cache=None, | |||
memory=None, | |||
memory_sequence_length=None, | |||
step=None): | |||
inputs *= self.num_units**0.5 | |||
if self.position_encoder is not None: | |||
inputs = self.position_encoder( | |||
inputs, position=step + 1 if step is not None else None) | |||
inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) | |||
decoder_mask = None | |||
memory_mask = None | |||
last_attention = None | |||
if self.self_attention_type == 'scaled_dot': | |||
if sequence_length is not None: | |||
decoder_mask = transformer.build_future_mask( | |||
sequence_length, | |||
num_heads=self.num_heads, | |||
maximum_length=tf.shape(inputs)[1]) | |||
elif self.self_attention_type == 'average': | |||
if cache is None: | |||
if sequence_length is None: | |||
sequence_length = tf.fill([tf.shape(inputs)[0]], | |||
tf.shape(inputs)[1]) | |||
decoder_mask = transformer.cumulative_average_mask( | |||
sequence_length, | |||
maximum_length=tf.shape(inputs)[1], | |||
dtype=inputs.dtype) | |||
if memory is not None and not tf.contrib.framework.nest.is_sequence( | |||
memory): | |||
memory = (memory, ) | |||
if memory_sequence_length is not None: | |||
if not tf.contrib.framework.nest.is_sequence( | |||
memory_sequence_length): | |||
memory_sequence_length = (memory_sequence_length, ) | |||
memory_mask = [ | |||
transformer.build_sequence_mask( | |||
length, | |||
num_heads=self.num_heads, | |||
maximum_length=tf.shape(m)[1]) | |||
for m, length in zip(memory, memory_sequence_length) | |||
] | |||
for layer in range(self.num_layers): | |||
layer_name = 'layer_{}'.format(layer) | |||
layer_cache = cache[layer_name] if cache is not None else None | |||
with tf.variable_scope(layer_name): | |||
if self.self_attention_type == 'scaled_dot': | |||
with tf.variable_scope('masked_multi_head'): | |||
encoded = transformer.multi_head_attention( | |||
self.num_heads, | |||
transformer.norm(inputs), | |||
None, | |||
mode, | |||
num_units=self.num_units, | |||
mask=decoder_mask, | |||
cache=layer_cache, | |||
dropout=self.attention_dropout) | |||
last_context = transformer.drop_and_add( | |||
inputs, encoded, mode, dropout=self.dropout) | |||
elif self.self_attention_type == 'average': | |||
with tf.variable_scope('average_attention'): | |||
# Cumulative average. | |||
x = transformer.norm(inputs) | |||
y = transformer.cumulative_average( | |||
x, | |||
decoder_mask if cache is None else step, | |||
cache=layer_cache) | |||
# FFN. | |||
y = transformer.feed_forward( | |||
y, | |||
self.ffn_inner_dim, | |||
mode, | |||
dropout=self.relu_dropout) | |||
# Gating layer. | |||
z = tf.layers.dense( | |||
tf.concat([x, y], -1), self.num_units * 2) | |||
i, f = tf.split(z, 2, axis=-1) | |||
y = tf.sigmoid(i) * x + tf.sigmoid(f) * y | |||
last_context = transformer.drop_and_add( | |||
inputs, y, mode, dropout=self.dropout) | |||
if memory is not None: | |||
for i, (mem, mask) in enumerate(zip(memory, memory_mask)): | |||
memory_cache = layer_cache['memory'][i] if layer_cache is not None else None # yapf:disable | |||
with tf.variable_scope('multi_head' if i | |||
== 0 else 'multi_head_%d' % i): # yapf:disable | |||
context, last_attention = transformer.multi_head_attention( | |||
self.num_heads, | |||
transformer.norm(last_context), | |||
mem, | |||
mode, | |||
mask=mask, | |||
cache=memory_cache, | |||
dropout=self.attention_dropout, | |||
return_attention=True) | |||
last_context = transformer.drop_and_add( | |||
last_context, | |||
context, | |||
mode, | |||
dropout=self.dropout) | |||
if i > 0: # Do not return attention in case of multi source. | |||
last_attention = None | |||
with tf.variable_scope('ffn'): | |||
transformed = transformer.feed_forward_ori( | |||
transformer.norm(last_context), | |||
self.ffn_inner_dim, | |||
mode, | |||
dropout=self.relu_dropout) | |||
transformed = transformer.drop_and_add( | |||
last_context, transformed, mode, dropout=self.dropout) | |||
inputs = transformed | |||
if last_attention is not None: | |||
# The first head of the last layer is returned. | |||
first_head_attention = last_attention[:, 0] | |||
else: | |||
first_head_attention = None | |||
outputs = transformer.norm(inputs) | |||
return outputs, first_head_attention | |||
def decode_from_inputs(self, | |||
inputs, | |||
sequence_length, | |||
initial_state=None, | |||
mode=True, | |||
memory=None, | |||
memory_sequence_length=None): | |||
outputs, attention = self._self_attention_stack( | |||
inputs, | |||
sequence_length=sequence_length, | |||
mode=mode, | |||
memory=memory, | |||
memory_sequence_length=memory_sequence_length) | |||
return outputs, None, attention | |||
def step_fn(self, | |||
mode, | |||
batch_size, | |||
initial_state=None, | |||
memory=None, | |||
memory_sequence_length=None, | |||
dtype=tf.float32): | |||
if memory is None: | |||
num_sources = 0 | |||
elif tf.contrib.framework.nest.is_sequence(memory): | |||
num_sources = len(memory) | |||
else: | |||
num_sources = 1 | |||
cache = self._init_cache( | |||
batch_size, dtype=dtype, num_sources=num_sources) | |||
def _fn(step, inputs, cache, mode): | |||
inputs = tf.expand_dims(inputs, 1) | |||
outputs, attention = self._self_attention_stack( | |||
inputs, | |||
mode=mode, | |||
cache=cache, | |||
memory=memory, | |||
memory_sequence_length=memory_sequence_length, | |||
step=step) | |||
outputs = tf.squeeze(outputs, axis=1) | |||
if attention is not None: | |||
attention = tf.squeeze(attention, axis=1) | |||
return outputs, cache, attention | |||
return _fn, cache |
@@ -0,0 +1,182 @@ | |||
"""Define the self-attention encoder.""" | |||
import tensorflow as tf | |||
from . import transformer | |||
from .position import SinusoidalPositionEncoder | |||
class SelfAttentionEncoder(): | |||
"""Encoder using self-attention as described in | |||
https://arxiv.org/abs/1706.03762. | |||
""" | |||
def __init__(self, | |||
num_layers, | |||
num_units=512, | |||
num_heads=8, | |||
ffn_inner_dim=2048, | |||
dropout=0.1, | |||
attention_dropout=0.1, | |||
relu_dropout=0.1, | |||
position_encoder=SinusoidalPositionEncoder()): | |||
"""Initializes the parameters of the encoder. | |||
Args: | |||
num_layers: The number of layers. | |||
num_units: The number of hidden units. | |||
num_heads: The number of heads in the multi-head attention. | |||
ffn_inner_dim: The number of units of the inner linear transformation | |||
in the feed forward layer. | |||
dropout: The probability to drop units from the outputs. | |||
attention_dropout: The probability to drop units from the attention. | |||
relu_dropout: The probability to drop units from the ReLU activation in | |||
the feed forward layer. | |||
position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to | |||
apply on inputs or ``None``. | |||
""" | |||
super(SelfAttentionEncoder, self).__init__() | |||
self.num_layers = num_layers | |||
self.num_units = num_units | |||
self.num_heads = num_heads | |||
self.ffn_inner_dim = ffn_inner_dim | |||
self.dropout = dropout | |||
self.attention_dropout = attention_dropout | |||
self.relu_dropout = relu_dropout | |||
self.position_encoder = position_encoder | |||
def encode(self, inputs, sequence_length=None, mode=True): | |||
inputs *= self.num_units**0.5 | |||
if self.position_encoder is not None: | |||
inputs = self.position_encoder(inputs) | |||
inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) | |||
mask = transformer.build_sequence_mask( | |||
sequence_length, | |||
num_heads=self.num_heads, | |||
maximum_length=tf.shape(inputs)[1]) | |||
mask_FF = tf.squeeze( | |||
transformer.build_sequence_mask( | |||
sequence_length, maximum_length=tf.shape(inputs)[1]), | |||
axis=1) | |||
state = () | |||
attns = [] | |||
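# Each layer contributes a mean-pooled summary vector to the returned state
# and its attention weights to attns.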
for layer in range(self.num_layers): | |||
with tf.variable_scope('layer_{}'.format(layer)): | |||
with tf.variable_scope('multi_head'): | |||
context, attn = transformer.multi_head_attention( | |||
self.num_heads, | |||
transformer.norm(inputs), | |||
None, | |||
mode, | |||
num_units=self.num_units, | |||
mask=mask, | |||
dropout=self.attention_dropout, | |||
return_attention=True) | |||
attns.append(attn) | |||
context = transformer.drop_and_add( | |||
inputs, context, mode, dropout=self.dropout) | |||
with tf.variable_scope('ffn'): | |||
transformed = transformer.feed_forward( | |||
transformer.norm(context), | |||
self.ffn_inner_dim, | |||
mode, | |||
dropout=self.relu_dropout, | |||
mask=mask_FF) | |||
transformed = transformer.drop_and_add( | |||
context, transformed, mode, dropout=self.dropout) | |||
inputs = transformed | |||
state += (tf.reduce_mean(inputs, axis=1), ) | |||
outputs = transformer.norm(inputs) | |||
return (outputs, state, sequence_length, attns) | |||
class SelfAttentionEncoderOri(): | |||
"""Encoder using self-attention as described in | |||
https://arxiv.org/abs/1706.03762. | |||
""" | |||
def __init__(self, | |||
num_layers, | |||
num_units=512, | |||
num_heads=8, | |||
ffn_inner_dim=2048, | |||
dropout=0.1, | |||
attention_dropout=0.1, | |||
relu_dropout=0.1, | |||
position_encoder=SinusoidalPositionEncoder()): | |||
"""Initializes the parameters of the encoder. | |||
Args: | |||
num_layers: The number of layers. | |||
num_units: The number of hidden units. | |||
num_heads: The number of heads in the multi-head attention. | |||
ffn_inner_dim: The number of units of the inner linear transformation | |||
in the feed forward layer. | |||
dropout: The probability to drop units from the outputs. | |||
attention_dropout: The probability to drop units from the attention. | |||
relu_dropout: The probability to drop units from the ReLU activation in | |||
the feed forward layer. | |||
position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to | |||
apply on inputs or ``None``. | |||
""" | |||
super(SelfAttentionEncoderOri, self).__init__() | |||
self.num_layers = num_layers | |||
self.num_units = num_units | |||
self.num_heads = num_heads | |||
self.ffn_inner_dim = ffn_inner_dim | |||
self.dropout = dropout | |||
self.attention_dropout = attention_dropout | |||
self.relu_dropout = relu_dropout | |||
self.position_encoder = position_encoder | |||
def encode(self, inputs, sequence_length=None, mode=True): | |||
inputs *= self.num_units**0.5 | |||
if self.position_encoder is not None: | |||
inputs = self.position_encoder(inputs) | |||
inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) | |||
mask = transformer.build_sequence_mask( | |||
sequence_length, | |||
num_heads=self.num_heads, | |||
maximum_length=tf.shape(inputs)[1]) # [N, 1, 1, T_out] | |||
state = () | |||
attns = [] | |||
for layer in range(self.num_layers): | |||
with tf.variable_scope('layer_{}'.format(layer)): | |||
with tf.variable_scope('multi_head'): | |||
context, attn = transformer.multi_head_attention( | |||
self.num_heads, | |||
transformer.norm(inputs), | |||
None, | |||
mode, | |||
num_units=self.num_units, | |||
mask=mask, | |||
dropout=self.attention_dropout, | |||
return_attention=True) | |||
attns.append(attn) | |||
context = transformer.drop_and_add( | |||
inputs, context, mode, dropout=self.dropout) | |||
with tf.variable_scope('ffn'): | |||
transformed = transformer.feed_forward_ori( | |||
transformer.norm(context), | |||
self.ffn_inner_dim, | |||
mode, | |||
dropout=self.relu_dropout) | |||
transformed = transformer.drop_and_add( | |||
context, transformed, mode, dropout=self.dropout) | |||
inputs = transformed | |||
state += (tf.reduce_mean(inputs, axis=1), ) | |||
outputs = transformer.norm(inputs) | |||
return (outputs, state, sequence_length, attns) |
@@ -0,0 +1,255 @@ | |||
import io | |||
import os | |||
from typing import Any, Dict, Optional, Union | |||
import numpy as np | |||
import tensorflow as tf | |||
from sklearn.preprocessing import MultiLabelBinarizer | |||
from modelscope.models.base import Model | |||
from modelscope.models.builder import MODELS | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
from .models import create_model | |||
from .text.symbols import load_symbols | |||
from .text.symbols_dict import SymbolsDict | |||
__all__ = ['SambertNetHifi16k'] | |||
def multi_label_symbol_to_sequence(my_classes, my_symbol): | |||
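"""Converts a space-separated symbol string into a multi-hot matrix.
Tokens may combine several classes with '&'; e.g. (illustrative only) with
my_classes = ('a', 'b', 'c'), the string 'a&b c' maps to
[[1, 1, 0], [0, 0, 1]].
"""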
one_hot = MultiLabelBinarizer(classes=my_classes) | |||
tokens = my_symbol.strip().split(' ') | |||
sequences = [] | |||
for token in tokens: | |||
sequences.append(tuple(token.split('&'))) | |||
# sequences.append(tuple(['~'])) # sequence length minus 1 to ignore EOS ~ | |||
return one_hot.fit_transform(sequences) | |||
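# Illustrative example (class and symbol names are hypothetical): with | |||
# my_classes=('happy', 'sad', 'neutral') and my_symbol='happy neutral&sad', | |||
# the single-label token 'happy' maps to (1, 0, 0) and the '&'-joined | |||
# multi-label token 'neutral&sad' maps to (0, 1, 1). | |||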
@MODELS.register_module(Tasks.text_to_speech, module_name=r'sambert_hifi_16k') | |||
class SambertNetHifi16k(Model): | |||
def __init__(self, | |||
model_dir, | |||
pitch_control_str='', | |||
duration_control_str='', | |||
energy_control_str='', | |||
*args, | |||
**kwargs): | |||
tf.reset_default_graph() | |||
local_ckpt_path = os.path.join(ModelFile.TF_CHECKPOINT_FOLDER, 'ckpt') | |||
self._ckpt_path = os.path.join(model_dir, local_ckpt_path) | |||
self._dict_path = os.path.join(model_dir, 'dicts') | |||
self._hparams = tf.contrib.training.HParams(**kwargs) | |||
values = self._hparams.values() | |||
hp = [' {}:{}'.format(name, values[name]) for name in sorted(values)] | |||
print('Hyperparameters:\n' + '\n'.join(hp)) | |||
super().__init__(self._ckpt_path, *args, **kwargs) | |||
model_name = 'robutrans' | |||
self._lfeat_type_list = self._hparams.lfeat_type_list.strip().split( | |||
',') | |||
sy, tone, syllable_flag, word_segment, emo_category, speaker = load_symbols( | |||
self._dict_path) | |||
self._sy = sy | |||
self._tone = tone | |||
self._syllable_flag = syllable_flag | |||
self._word_segment = word_segment | |||
self._emo_category = emo_category | |||
self._speaker = speaker | |||
self._inputs_dim = dict() | |||
for lfeat_type in self._lfeat_type_list: | |||
if lfeat_type == 'sy': | |||
self._inputs_dim[lfeat_type] = len(sy) | |||
elif lfeat_type == 'tone': | |||
self._inputs_dim[lfeat_type] = len(tone) | |||
elif lfeat_type == 'syllable_flag': | |||
self._inputs_dim[lfeat_type] = len(syllable_flag) | |||
elif lfeat_type == 'word_segment': | |||
self._inputs_dim[lfeat_type] = len(word_segment) | |||
elif lfeat_type == 'emo_category': | |||
self._inputs_dim[lfeat_type] = len(emo_category) | |||
elif lfeat_type == 'speaker': | |||
self._inputs_dim[lfeat_type] = len(speaker) | |||
self._symbols_dict = SymbolsDict(sy, tone, syllable_flag, word_segment, | |||
emo_category, speaker, | |||
self._inputs_dim, | |||
self._lfeat_type_list) | |||
dim_inputs = sum(self._inputs_dim.values( | |||
)) - self._inputs_dim['speaker'] - self._inputs_dim['emo_category'] | |||
inputs = tf.placeholder(tf.float32, [1, None, dim_inputs], 'inputs') | |||
inputs_emotion = tf.placeholder( | |||
tf.float32, [1, None, self._inputs_dim['emo_category']], | |||
'inputs_emotion') | |||
inputs_speaker = tf.placeholder(tf.float32, | |||
[1, None, self._inputs_dim['speaker']], | |||
'inputs_speaker') | |||
input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths') | |||
pitch_contours_scale = tf.placeholder(tf.float32, [1, None], | |||
'pitch_contours_scale') | |||
energy_contours_scale = tf.placeholder(tf.float32, [1, None], | |||
'energy_contours_scale') | |||
duration_scale = tf.placeholder(tf.float32, [1, None], | |||
'duration_scale') | |||
with tf.variable_scope('model') as _: | |||
self._model = create_model(model_name, self._hparams) | |||
self._model.initialize( | |||
inputs, | |||
inputs_emotion, | |||
inputs_speaker, | |||
input_lengths, | |||
duration_scales=duration_scale, | |||
pitch_scales=pitch_contours_scale, | |||
energy_scales=energy_contours_scale) | |||
self._mel_spec = self._model.mel_outputs[0] | |||
self._duration_outputs = self._model.duration_outputs[0] | |||
self._duration_outputs_ = self._model.duration_outputs_[0] | |||
self._pitch_contour_outputs = self._model.pitch_contour_outputs[0] | |||
self._energy_contour_outputs = self._model.energy_contour_outputs[ | |||
0] | |||
self._embedded_inputs_emotion = self._model.embedded_inputs_emotion[ | |||
0] | |||
self._embedding_fsmn_outputs = self._model.embedding_fsmn_outputs[ | |||
0] | |||
self._encoder_outputs = self._model.encoder_outputs[0] | |||
self._pitch_embeddings = self._model.pitch_embeddings[0] | |||
self._energy_embeddings = self._model.energy_embeddings[0] | |||
self._LR_outputs = self._model.LR_outputs[0] | |||
self._postnet_fsmn_outputs = self._model.postnet_fsmn_outputs[0] | |||
self._attention_h = self._model.attention_h | |||
self._attention_x = self._model.attention_x | |||
print('Loading checkpoint: %s' % self._ckpt_path) | |||
config = tf.ConfigProto() | |||
config.gpu_options.allow_growth = True | |||
self._session = tf.Session(config=config) | |||
self._session.run(tf.global_variables_initializer()) | |||
saver = tf.train.Saver() | |||
saver.restore(self._session, self._ckpt_path) | |||
duration_cfg_lst = [] | |||
if len(duration_control_str) != 0: | |||
for item in duration_control_str.strip().split('|'): | |||
percent, scale = item.lstrip('(').rstrip(')').split(',') | |||
duration_cfg_lst.append((float(percent), float(scale))) | |||
self._duration_cfg_lst = duration_cfg_lst | |||
pitch_contours_cfg_lst = [] | |||
if len(pitch_control_str) != 0: | |||
for item in pitch_control_str.strip().split('|'): | |||
percent, scale = item.lstrip('(').rstrip(')').split(',') | |||
pitch_contours_cfg_lst.append( | |||
(float(percent), float(scale))) | |||
self._pitch_contours_cfg_lst = pitch_contours_cfg_lst | |||
energy_contours_cfg_lst = [] | |||
if len(energy_control_str) != 0: | |||
for item in energy_control_str.strip().split('|'): | |||
percent, scale = item.lstrip('(').rstrip(')').split(',') | |||
energy_contours_cfg_lst.append( | |||
(float(percent), float(scale))) | |||
self._energy_contours_cfg_lst = energy_contours_cfg_lst | |||
def forward(self, text): | |||
cleaner_names = [x.strip() for x in self._hparams.cleaners.split(',')] | |||
lfeat_symbol = text.strip().split(' ') | |||
lfeat_symbol_separate = [''] * int(len(self._lfeat_type_list)) | |||
for this_lfeat_symbol in lfeat_symbol: | |||
this_lfeat_symbol = this_lfeat_symbol.strip('{').strip('}').split( | |||
'$') | |||
if len(this_lfeat_symbol) != len(self._lfeat_type_list): | |||
raise Exception( | |||
'Length of this_lfeat_symbol in training data' | |||
+ ' is not equal to the length of lfeat_type_list, ' | |||
+ str(len(this_lfeat_symbol)) + ' VS. ' | |||
+ str(len(self._lfeat_type_list))) | |||
index = 0 | |||
while index < len(lfeat_symbol_separate): | |||
lfeat_symbol_separate[index] = lfeat_symbol_separate[ | |||
index] + this_lfeat_symbol[index] + ' ' | |||
index = index + 1 | |||
index = 0 | |||
lfeat_type = self._lfeat_type_list[index] | |||
sequence = self._symbols_dict.symbol_to_sequence( | |||
lfeat_symbol_separate[index].strip(), lfeat_type, cleaner_names) | |||
sequence_array = np.asarray( | |||
sequence[:-1], | |||
dtype=np.int32) # sequence length minus 1 to ignore EOS ~ | |||
inputs = np.eye( | |||
self._inputs_dim[lfeat_type], dtype=np.float32)[sequence_array] | |||
index = index + 1 | |||
while index < len(self._lfeat_type_list) - 2: | |||
lfeat_type = self._lfeat_type_list[index] | |||
sequence = self._symbols_dict.symbol_to_sequence( | |||
lfeat_symbol_separate[index].strip(), lfeat_type, | |||
cleaner_names) | |||
sequence_array = np.asarray( | |||
sequence[:-1], | |||
dtype=np.int32) # sequence length minus 1 to ignore EOS ~ | |||
inputs_temp = np.eye( | |||
self._inputs_dim[lfeat_type], dtype=np.float32)[sequence_array] | |||
inputs = np.concatenate((inputs, inputs_temp), axis=1) | |||
index = index + 1 | |||
seq = inputs | |||
lfeat_type = 'emo_category' | |||
inputs_emotion = multi_label_symbol_to_sequence( | |||
self._emo_category, lfeat_symbol_separate[index].strip()) | |||
# inputs_emotion = inputs_emotion * 1.5 | |||
index = index + 1 | |||
lfeat_type = 'speaker' | |||
inputs_speaker = multi_label_symbol_to_sequence( | |||
self._speaker, lfeat_symbol_separate[index].strip()) | |||
duration_scale = np.ones((len(seq), ), dtype=np.float32) | |||
start_idx = 0 | |||
for (percent, scale) in self._duration_cfg_lst: | |||
duration_scale[start_idx:start_idx | |||
+ int(percent * len(seq))] = scale | |||
start_idx += int(percent * len(seq)) | |||
pitch_contours_scale = np.ones((len(seq), ), dtype=np.float32) | |||
start_idx = 0 | |||
for (percent, scale) in self._pitch_contours_cfg_lst: | |||
pitch_contours_scale[start_idx:start_idx | |||
+ int(percent * len(seq))] = scale | |||
start_idx += int(percent * len(seq)) | |||
energy_contours_scale = np.ones((len(seq), ), dtype=np.float32) | |||
start_idx = 0 | |||
for (percent, scale) in self._energy_contours_cfg_lst: | |||
energy_contours_scale[start_idx:start_idx | |||
+ int(percent * len(seq))] = scale | |||
start_idx += int(percent * len(seq)) | |||
feed_dict = { | |||
self._model.inputs: [np.asarray(seq, dtype=np.float32)], | |||
self._model.inputs_emotion: | |||
[np.asarray(inputs_emotion, dtype=np.float32)], | |||
self._model.inputs_speaker: | |||
[np.asarray(inputs_speaker, dtype=np.float32)], | |||
self._model.input_lengths: | |||
np.asarray([len(seq)], dtype=np.int32), | |||
self._model.duration_scales: [duration_scale], | |||
self._model.pitch_scales: [pitch_contours_scale], | |||
self._model.energy_scales: [energy_contours_scale] | |||
} | |||
result = self._session.run([ | |||
self._mel_spec, self._duration_outputs, self._duration_outputs_, | |||
self._pitch_contour_outputs, self._embedded_inputs_emotion, | |||
self._embedding_fsmn_outputs, self._encoder_outputs, | |||
self._pitch_embeddings, self._LR_outputs, | |||
self._postnet_fsmn_outputs, self._energy_contour_outputs, | |||
self._energy_embeddings, self._attention_x, self._attention_h | |||
], feed_dict=feed_dict) # yapf:disable | |||
return result[0] |
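# Illustrative input (symbol values are hypothetical): `forward` expects a | |||
# space-separated sequence of '{...}' tokens whose '$'-separated fields follow | |||
# lfeat_type_list, e.g. a token for sy, tone, syllable_flag, word_segment, | |||
# emo_category and speaker could look like | |||
# '{jin1$1$s_begin$word_begin$emotion_neutral$F7}'. | |||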
@@ -0,0 +1,89 @@ | |||
''' | |||
Cleaners are transformations that run over the input text at both training and eval time. | |||
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" | |||
hyperparameter. Some cleaners are English-specific. You'll typically want to use: | |||
1. "english_cleaners" for English text | |||
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using | |||
the Unidecode library (https://pypi.python.org/pypi/Unidecode) | |||
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update | |||
the symbols in symbols.py to match your data). | |||
''' | |||
import re | |||
from unidecode import unidecode | |||
from .numbers import normalize_numbers | |||
# Regular expression matching whitespace: | |||
_whitespace_re = re.compile(r'\s+') | |||
# List of (regular expression, replacement) pairs for abbreviations: | |||
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) | |||
for x in [ | |||
('mrs', 'misess'), | |||
('mr', 'mister'), | |||
('dr', 'doctor'), | |||
('st', 'saint'), | |||
('co', 'company'), | |||
('jr', 'junior'), | |||
('maj', 'major'), | |||
('gen', 'general'), | |||
('drs', 'doctors'), | |||
('rev', 'reverend'), | |||
('lt', 'lieutenant'), | |||
('hon', 'honorable'), | |||
('sgt', 'sergeant'), | |||
('capt', 'captain'), | |||
('esq', 'esquire'), | |||
('ltd', 'limited'), | |||
('col', 'colonel'), | |||
('ft', 'fort'), ]] # yapf:disable | |||
def expand_abbreviations(text): | |||
for regex, replacement in _abbreviations: | |||
text = re.sub(regex, replacement, text) | |||
return text | |||
def expand_numbers(text): | |||
return normalize_numbers(text) | |||
def lowercase(text): | |||
return text.lower() | |||
def collapse_whitespace(text): | |||
return re.sub(_whitespace_re, ' ', text) | |||
def convert_to_ascii(text): | |||
return unidecode(text) | |||
def basic_cleaners(text): | |||
'''Basic pipeline that lowercases and collapses whitespace without transliteration.''' | |||
text = lowercase(text) | |||
text = collapse_whitespace(text) | |||
return text | |||
def transliteration_cleaners(text): | |||
'''Pipeline for non-English text that transliterates to ASCII.''' | |||
text = convert_to_ascii(text) | |||
text = lowercase(text) | |||
text = collapse_whitespace(text) | |||
return text | |||
def english_cleaners(text): | |||
'''Pipeline for English text, including number and abbreviation expansion.''' | |||
text = convert_to_ascii(text) | |||
text = lowercase(text) | |||
text = expand_numbers(text) | |||
text = expand_abbreviations(text) | |||
text = collapse_whitespace(text) | |||
return text |
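# Illustrative example: english_cleaners('Dr. Smith spent $3.50 on the 2nd') | |||
# lowercases and transliterates the text, expands '$3.50' to | |||
# 'three dollars, fifty cents', '2nd' to 'second' and 'dr.' to 'doctor', | |||
# yielding roughly 'doctor smith spent three dollars, fifty cents on the second'. | |||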
@@ -0,0 +1,64 @@ | |||
import re | |||
valid_symbols = [ | |||
'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', | |||
'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', | |||
'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', | |||
'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', | |||
'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', | |||
'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', | |||
'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', | |||
'Y', 'Z', 'ZH' | |||
] | |||
_valid_symbol_set = set(valid_symbols) | |||
class CMUDict: | |||
'''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' | |||
def __init__(self, file_or_path, keep_ambiguous=True): | |||
if isinstance(file_or_path, str): | |||
with open(file_or_path, encoding='latin-1') as f: | |||
entries = _parse_cmudict(f) | |||
else: | |||
entries = _parse_cmudict(file_or_path) | |||
if not keep_ambiguous: | |||
entries = { | |||
word: pron | |||
for word, pron in entries.items() if len(pron) == 1 | |||
} | |||
self._entries = entries | |||
def __len__(self): | |||
return len(self._entries) | |||
def lookup(self, word): | |||
'''Returns list of ARPAbet pronunciations of the given word.''' | |||
return self._entries.get(word.upper()) | |||
_alt_re = re.compile(r'\([0-9]+\)') | |||
def _parse_cmudict(file): | |||
cmudict = {} | |||
for line in file: | |||
if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): | |||
parts = line.split('  ')  # CMUdict separates word and pronunciation by two spaces | |||
word = re.sub(_alt_re, '', parts[0]) | |||
pronunciation = _get_pronunciation(parts[1]) | |||
if pronunciation: | |||
if word in cmudict: | |||
cmudict[word].append(pronunciation) | |||
else: | |||
cmudict[word] = [pronunciation] | |||
return cmudict | |||
def _get_pronunciation(s): | |||
parts = s.strip().split(' ') | |||
for part in parts: | |||
if part not in _valid_symbol_set: | |||
return None | |||
return ' '.join(parts) |
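# Illustrative usage (the dictionary path is hypothetical): | |||
#   cmu = CMUDict('/path/to/cmudict-0.7b') | |||
#   cmu.lookup('street')  # -> ['S T R IY1 T']; returns None if absent | |||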
@@ -0,0 +1,70 @@ | |||
import re | |||
import inflect | |||
_inflect = inflect.engine() | |||
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') | |||
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') | |||
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') | |||
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') | |||
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') | |||
_number_re = re.compile(r'[0-9]+') | |||
def _remove_commas(m): | |||
return m.group(1).replace(',', '') | |||
def _expand_decimal_point(m): | |||
return m.group(1).replace('.', ' point ') | |||
def _expand_dollars(m): | |||
match = m.group(1) | |||
parts = match.split('.') | |||
if len(parts) > 2: | |||
return match + ' dollars' # Unexpected format | |||
dollars = int(parts[0]) if parts[0] else 0 | |||
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 | |||
if dollars and cents: | |||
dollar_unit = 'dollar' if dollars == 1 else 'dollars' | |||
cent_unit = 'cent' if cents == 1 else 'cents' | |||
return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) | |||
elif dollars: | |||
dollar_unit = 'dollar' if dollars == 1 else 'dollars' | |||
return '%s %s' % (dollars, dollar_unit) | |||
elif cents: | |||
cent_unit = 'cent' if cents == 1 else 'cents' | |||
return '%s %s' % (cents, cent_unit) | |||
else: | |||
return 'zero dollars' | |||
def _expand_ordinal(m): | |||
return _inflect.number_to_words(m.group(0)) | |||
def _expand_number(m): | |||
num = int(m.group(0)) | |||
if num > 1000 and num < 3000: | |||
if num == 2000: | |||
return 'two thousand' | |||
elif num > 2000 and num < 2010: | |||
return 'two thousand ' + _inflect.number_to_words(num % 100) | |||
elif num % 100 == 0: | |||
return _inflect.number_to_words(num // 100) + ' hundred' | |||
else: | |||
return _inflect.number_to_words( | |||
num, andword='', zero='oh', group=2).replace(', ', ' ') | |||
else: | |||
return _inflect.number_to_words(num, andword='') | |||
def normalize_numbers(text): | |||
text = re.sub(_comma_number_re, _remove_commas, text) | |||
text = re.sub(_pounds_re, r'\1 pounds', text) | |||
text = re.sub(_dollars_re, _expand_dollars, text) | |||
text = re.sub(_decimal_number_re, _expand_decimal_point, text) | |||
text = re.sub(_ordinal_re, _expand_ordinal, text) | |||
text = re.sub(_number_re, _expand_number, text) | |||
return text |
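# Illustrative examples: | |||
#   normalize_numbers('$100')     # -> 'one hundred dollars' | |||
#   normalize_numbers('in 1984')  # -> 'in nineteen eighty-four' | |||
#   normalize_numbers('3rd')      # -> 'third' | |||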
@@ -0,0 +1,95 @@ | |||
''' | |||
Defines the sets of symbols used in text input to the model. | |||
The symbol sets are loaded from per-feature dict files (sy, tone, | |||
syllable_flag, word_segment, emo_category, speaker) under a dictionary | |||
directory, plus the padding, EOS and mask tokens defined below. | |||
''' | |||
import codecs | |||
import os | |||
_pad = '_' | |||
_eos = '~' | |||
_mask = '@[MASK]' | |||
def _load_dict_lines(dict_path, file_name): | |||
"""Reads one symbol per line from a dict file under dict_path.""" | |||
symbols = [] | |||
with codecs.open(os.path.join(dict_path, file_name), 'r') as f: | |||
for line in f: | |||
symbols.append(line.strip('\r\n')) | |||
return symbols | |||
def load_symbols(dict_path): | |||
# Phoneme symbols are prefixed with '@' to distinguish them from plain | |||
# characters; every symbol set ends with the pad, EOS and mask tokens. | |||
sy = ['@' + s for s in _load_dict_lines(dict_path, 'sy_dict.txt')] | |||
sy += [_pad, _eos, _mask] | |||
tone = _load_dict_lines(dict_path, 'tone_dict.txt') + [_pad, _eos, _mask] | |||
syllable_flag = _load_dict_lines( | |||
dict_path, 'syllable_flag_dict.txt') + [_pad, _eos, _mask] | |||
word_segment = _load_dict_lines( | |||
dict_path, 'word_segment_dict.txt') + [_pad, _eos, _mask] | |||
emo_category = _load_dict_lines( | |||
dict_path, 'emo_category_dict.txt') + [_pad, _eos, _mask] | |||
speaker = _load_dict_lines( | |||
dict_path, 'speaker_dict.txt') + [_pad, _eos, _mask] | |||
return sy, tone, syllable_flag, word_segment, emo_category, speaker | |||
@@ -0,0 +1,200 @@ | |||
import re | |||
import sys | |||
from .cleaners import (basic_cleaners, english_cleaners, | |||
transliteration_cleaners) | |||
class SymbolsDict: | |||
def __init__(self, sy, tone, syllable_flag, word_segment, emo_category, | |||
speaker, inputs_dim, lfeat_type_list): | |||
self._inputs_dim = inputs_dim | |||
self._lfeat_type_list = lfeat_type_list | |||
self._sy_to_id = {s: i for i, s in enumerate(sy)} | |||
self._id_to_sy = {i: s for i, s in enumerate(sy)} | |||
self._tone_to_id = {s: i for i, s in enumerate(tone)} | |||
self._id_to_tone = {i: s for i, s in enumerate(tone)} | |||
self._syllable_flag_to_id = {s: i for i, s in enumerate(syllable_flag)} | |||
self._id_to_syllable_flag = {i: s for i, s in enumerate(syllable_flag)} | |||
self._word_segment_to_id = {s: i for i, s in enumerate(word_segment)} | |||
self._id_to_word_segment = {i: s for i, s in enumerate(word_segment)} | |||
self._emo_category_to_id = {s: i for i, s in enumerate(emo_category)} | |||
self._id_to_emo_category = {i: s for i, s in enumerate(emo_category)} | |||
self._speaker_to_id = {s: i for i, s in enumerate(speaker)} | |||
self._id_to_speaker = {i: s for i, s in enumerate(speaker)} | |||
print('_sy_to_id: ') | |||
print(self._sy_to_id) | |||
print('_tone_to_id: ') | |||
print(self._tone_to_id) | |||
print('_syllable_flag_to_id: ') | |||
print(self._syllable_flag_to_id) | |||
print('_word_segment_to_id: ') | |||
print(self._word_segment_to_id) | |||
print('_emo_category_to_id: ') | |||
print(self._emo_category_to_id) | |||
print('_speaker_to_id: ') | |||
print(self._speaker_to_id) | |||
self._curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') | |||
self._cleaners = { | |||
basic_cleaners.__name__: basic_cleaners, | |||
transliteration_cleaners.__name__: transliteration_cleaners, | |||
english_cleaners.__name__: english_cleaners | |||
} | |||
def _clean_text(self, text, cleaner_names): | |||
for name in cleaner_names: | |||
cleaner = self._cleaners.get(name) | |||
if not cleaner: | |||
raise Exception('Unknown cleaner: %s' % name) | |||
text = cleaner(text) | |||
return text | |||
def _sy_to_sequence(self, sy): | |||
return [self._sy_to_id[s] for s in sy if self._should_keep_sy(s)] | |||
def _arpabet_to_sequence(self, text): | |||
return self._sy_to_sequence(['@' + s for s in text.split()]) | |||
def _should_keep_sy(self, s): | |||
return s in self._sy_to_id and s != '_' and s != '~' | |||
def symbol_to_sequence(self, this_lfeat_symbol, lfeat_type, cleaner_names): | |||
sequence = [] | |||
if lfeat_type == 'sy': | |||
this_lfeat_symbol = this_lfeat_symbol.strip().split(' ') | |||
this_lfeat_symbol_format = '' | |||
index = 0 | |||
while index < len(this_lfeat_symbol): | |||
this_lfeat_symbol_format = this_lfeat_symbol_format + '{' + this_lfeat_symbol[ | |||
index] + '}' + ' ' | |||
index = index + 1 | |||
sequence = self.text_to_sequence(this_lfeat_symbol_format, | |||
cleaner_names) | |||
elif lfeat_type == 'tone': | |||
sequence = self.tone_to_sequence(this_lfeat_symbol) | |||
elif lfeat_type == 'syllable_flag': | |||
sequence = self.syllable_flag_to_sequence(this_lfeat_symbol) | |||
elif lfeat_type == 'word_segment': | |||
sequence = self.word_segment_to_sequence(this_lfeat_symbol) | |||
elif lfeat_type == 'emo_category': | |||
sequence = self.emo_category_to_sequence(this_lfeat_symbol) | |||
elif lfeat_type == 'speaker': | |||
sequence = self.speaker_to_sequence(this_lfeat_symbol) | |||
else: | |||
raise Exception('Unknown lfeat type: %s' % lfeat_type) | |||
return sequence | |||
def text_to_sequence(self, text, cleaner_names): | |||
'''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. | |||
The text can optionally have ARPAbet sequences enclosed in curly braces embedded | |||
in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." | |||
Args: | |||
text: string to convert to a sequence | |||
cleaner_names: names of the cleaner functions to run the text through | |||
Returns: | |||
List of integers corresponding to the symbols in the text | |||
''' | |||
sequence = [] | |||
# Check for curly braces and treat their contents as ARPAbet: | |||
while len(text): | |||
m = self._curly_re.match(text) | |||
if not m: | |||
sequence += self._sy_to_sequence( | |||
self._clean_text(text, cleaner_names)) | |||
break | |||
sequence += self._sy_to_sequence( | |||
self._clean_text(m.group(1), cleaner_names)) | |||
sequence += self._arpabet_to_sequence(m.group(2)) | |||
text = m.group(3) | |||
# Append EOS token | |||
sequence.append(self._sy_to_id['~']) | |||
return sequence | |||
def tone_to_sequence(self, tone): | |||
tones = tone.strip().split(' ') | |||
sequence = [] | |||
for this_tone in tones: | |||
sequence.append(self._tone_to_id[this_tone]) | |||
sequence.append(self._tone_to_id['~']) | |||
return sequence | |||
def syllable_flag_to_sequence(self, syllable_flag): | |||
syllable_flags = syllable_flag.strip().split(' ') | |||
sequence = [] | |||
for this_syllable_flag in syllable_flags: | |||
sequence.append(self._syllable_flag_to_id[this_syllable_flag]) | |||
sequence.append(self._syllable_flag_to_id['~']) | |||
return sequence | |||
def word_segment_to_sequence(self, word_segment): | |||
word_segments = word_segment.strip().split(' ') | |||
sequence = [] | |||
for this_word_segment in word_segments: | |||
sequence.append(self._word_segment_to_id[this_word_segment]) | |||
sequence.append(self._word_segment_to_id['~']) | |||
return sequence | |||
def emo_category_to_sequence(self, emo_type): | |||
emo_categories = emo_type.strip().split(' ') | |||
sequence = [] | |||
for this_category in emo_categories: | |||
sequence.append(self._emo_category_to_id[this_category]) | |||
sequence.append(self._emo_category_to_id['~']) | |||
return sequence | |||
def speaker_to_sequence(self, speaker): | |||
speakers = speaker.strip().split(' ') | |||
sequence = [] | |||
for this_speaker in speakers: | |||
sequence.append(self._speaker_to_id[this_speaker]) | |||
sequence.append(self._speaker_to_id['~']) | |||
return sequence | |||
def sequence_to_symbol(self, sequence): | |||
result = '' | |||
pre_lfeat_dim = 0 | |||
for lfeat_type in self._lfeat_type_list: | |||
current_one_hot_sequence = sequence[:, pre_lfeat_dim:pre_lfeat_dim | |||
+ self._inputs_dim[lfeat_type]] | |||
current_sequence = current_one_hot_sequence.argmax(1) | |||
length = current_sequence.shape[0] | |||
index = 0 | |||
while index < length: | |||
this_sequence = current_sequence[index] | |||
s = '' | |||
if lfeat_type == 'sy': | |||
s = self._id_to_sy[this_sequence] | |||
if len(s) > 1 and s[0] == '@': | |||
s = s[1:] | |||
elif lfeat_type == 'tone': | |||
s = self._id_to_tone[this_sequence] | |||
elif lfeat_type == 'syllable_flag': | |||
s = self._id_to_syllable_flag[this_sequence] | |||
elif lfeat_type == 'word_segment': | |||
s = self._id_to_word_segment[this_sequence] | |||
elif lfeat_type == 'emo_category': | |||
s = self._id_to_emo_category[this_sequence] | |||
elif lfeat_type == 'speaker': | |||
s = self._id_to_speaker[this_sequence] | |||
else: | |||
raise Exception('Unknown lfeat type: %s' % lfeat_type) | |||
if index == 0: | |||
result = result + lfeat_type + ': ' | |||
result = result + '{' + s + '}' | |||
if index == length - 1: | |||
result = result + '; ' | |||
index = index + 1 | |||
pre_lfeat_dim = pre_lfeat_dim + self._inputs_dim[lfeat_type] | |||
return result |
@@ -0,0 +1 @@ | |||
from .generic_text_to_speech_frontend import * # noqa F403 |
@@ -0,0 +1,39 @@ | |||
import os | |||
import zipfile | |||
from typing import Any, Dict, List | |||
import ttsfrd | |||
from modelscope.models.base import Model | |||
from modelscope.models.builder import MODELS | |||
from modelscope.utils.audio.tts_exceptions import ( | |||
TtsFrontendInitializeFailedException, | |||
TtsFrontendLanguageTypeInvalidException) | |||
from modelscope.utils.constant import Tasks | |||
__all__ = ['GenericTtsFrontend'] | |||
@MODELS.register_module( | |||
Tasks.text_to_speech, module_name=r'generic_tts_frontend') | |||
class GenericTtsFrontend(Model): | |||
def __init__(self, model_dir='.', lang_type='pinyin', *args, **kwargs): | |||
super().__init__(model_dir, *args, **kwargs) | |||
frontend = ttsfrd.TtsFrontendEngine() | |||
zip_file = os.path.join(model_dir, 'resource.zip') | |||
self._res_path = os.path.join(model_dir, 'resource') | |||
with zipfile.ZipFile(zip_file, 'r') as zip_ref: | |||
zip_ref.extractall(model_dir) | |||
if not frontend.initialize(self._res_path): | |||
raise TtsFrontendInitializeFailedException( | |||
'resource invalid: {}'.format(self._res_path)) | |||
if not frontend.set_lang_type(lang_type): | |||
raise TtsFrontendLanguageTypeInvalidException( | |||
'invalid language type: {}, valid types are pinyin and chenmix'. | |||
format(lang_type)) | |||
self._frontend = frontend | |||
def forward(self, data: str) -> Dict[str, List]: | |||
result = self._frontend.gen_tacotron_symbols(data) | |||
return {'texts': [s for s in result.splitlines() if s != '']} |
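# Illustrative usage (the model directory is hypothetical and must contain | |||
# a resource.zip understood by ttsfrd): | |||
#   frontend = GenericTtsFrontend(model_dir='/path/to/frontend_res') | |||
#   symbols = frontend.forward('今天天气不错')['texts'] | |||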
@@ -0,0 +1 @@ | |||
from .hifigan16k import * # noqa F403 |
@@ -0,0 +1,73 @@ | |||
from __future__ import (absolute_import, division, print_function, | |||
unicode_literals) | |||
import argparse | |||
import glob | |||
import os | |||
import time | |||
import json | |||
import numpy as np | |||
import torch | |||
from scipy.io.wavfile import write | |||
from modelscope.models.base import Model | |||
from modelscope.models.builder import MODELS | |||
from modelscope.utils.audio.tts_exceptions import \ | |||
TtsVocoderMelspecShapeMismatchException | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
from .models import Generator | |||
__all__ = ['Hifigan16k', 'AttrDict'] | |||
MAX_WAV_VALUE = 32768.0 | |||
def load_checkpoint(filepath, device): | |||
assert os.path.isfile(filepath) | |||
print("Loading '{}'".format(filepath)) | |||
checkpoint_dict = torch.load(filepath, map_location=device) | |||
print('Complete.') | |||
return checkpoint_dict | |||
class AttrDict(dict): | |||
def __init__(self, *args, **kwargs): | |||
super(AttrDict, self).__init__(*args, **kwargs) | |||
self.__dict__ = self | |||
@MODELS.register_module(Tasks.text_to_speech, module_name=r'hifigan16k') | |||
class Hifigan16k(Model): | |||
def __init__(self, model_dir, *args, **kwargs): | |||
self._ckpt_path = os.path.join(model_dir, | |||
ModelFile.TORCH_MODEL_BIN_FILE) | |||
self._config = AttrDict(**kwargs) | |||
super().__init__(self._ckpt_path, *args, **kwargs) | |||
if torch.cuda.is_available(): | |||
torch.manual_seed(self._config.seed) | |||
self._device = torch.device('cuda') | |||
else: | |||
self._device = torch.device('cpu') | |||
self._generator = Generator(self._config).to(self._device) | |||
state_dict_g = load_checkpoint(self._ckpt_path, self._device) | |||
self._generator.load_state_dict(state_dict_g['generator']) | |||
self._generator.eval() | |||
self._generator.remove_weight_norm() | |||
def forward(self, melspec): | |||
n_mels = melspec.shape[-1] | |||
if n_mels != 80: | |||
raise TtsVocoderMelspecShapeMismatchException( | |||
'input melspec last dim must be 80 but got {}'.format(n_mels)) | |||
with torch.no_grad(): | |||
x = melspec.T | |||
x = torch.FloatTensor(x).to(self._device) | |||
if len(x.shape) == 2: | |||
x = x.unsqueeze(0) | |||
y_g_hat = self._generator(x) | |||
audio = y_g_hat.squeeze() | |||
audio = audio * MAX_WAV_VALUE | |||
audio = audio.cpu().numpy().astype('int16') | |||
return audio |
@@ -0,0 +1 @@ | |||
from .models import Generator |
@@ -0,0 +1,516 @@ | |||
from distutils.version import LooseVersion | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
from pytorch_wavelets import DWT1DForward | |||
from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d | |||
from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm | |||
from .utils import get_padding, init_weights | |||
is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion('1.7') | |||
def stft(x, fft_size, hop_size, win_length, window): | |||
"""Perform STFT and convert to magnitude spectrogram. | |||
Args: | |||
x (Tensor): Input signal tensor (B, T). | |||
fft_size (int): FFT size. | |||
hop_size (int): Hop size. | |||
win_length (int): Window length. | |||
window (str): Window function type. | |||
Returns: | |||
Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). | |||
""" | |||
if is_pytorch_17plus: | |||
x_stft = torch.stft( | |||
x, fft_size, hop_size, win_length, window, return_complex=False) | |||
else: | |||
x_stft = torch.stft(x, fft_size, hop_size, win_length, window) | |||
real = x_stft[..., 0] | |||
imag = x_stft[..., 1] | |||
# NOTE(kan-bayashi): clamp is needed to avoid nan or inf | |||
return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1) | |||
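# Illustrative shapes: for x of shape (1, 16000) with fft_size=1024, | |||
# hop_size=120 and win_length=600, torch.stft (center=True by default) | |||
# produces 1 + 16000 // 120 = 134 frames, so the returned magnitude | |||
# spectrogram has shape (1, 134, 513). | |||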
LRELU_SLOPE = 0.1 | |||
def get_padding_causal(kernel_size, dilation=1): | |||
return int(kernel_size * dilation - dilation) | |||
class Conv1dCausal(torch.nn.Module): | |||
def __init__(self, | |||
in_channels, | |||
out_channels, | |||
kernel_size, | |||
stride=1, | |||
padding=0, | |||
dilation=1, | |||
groups=1, | |||
bias=True, | |||
padding_mode='zeros'): | |||
super(Conv1dCausal, self).__init__() | |||
self.pad = padding | |||
self.conv1d = weight_norm( | |||
Conv1d( | |||
in_channels, | |||
out_channels, | |||
kernel_size, | |||
stride, | |||
padding=0, | |||
dilation=dilation, | |||
groups=groups, | |||
bias=bias, | |||
padding_mode=padding_mode)) | |||
self.conv1d.apply(init_weights) | |||
def forward(self, x):  # x: (B, D, T) | |||
# F.pad pads dimensions starting from the last one, so this left-pads | |||
# the time axis only, keeping the convolution causal. | |||
x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), 'constant') | |||
x = self.conv1d(x) | |||
return x | |||
def remove_weight_norm(self): | |||
remove_weight_norm(self.conv1d) | |||
class ConvTranspose1dCausal(torch.nn.Module): | |||
"""CausalConvTranspose1d module with customized initialization.""" | |||
def __init__(self, | |||
in_channels, | |||
out_channels, | |||
kernel_size, | |||
stride, | |||
padding=0): | |||
"""Initialize CausalConvTranspose1d module.""" | |||
super(ConvTranspose1dCausal, self).__init__() | |||
self.deconv = weight_norm( | |||
ConvTranspose1d(in_channels, out_channels, kernel_size, stride)) | |||
self.stride = stride | |||
self.deconv.apply(init_weights) | |||
self.pad = kernel_size - stride | |||
def forward(self, x): | |||
"""Calculate forward propagation. | |||
Args: | |||
x (Tensor): Input tensor (B, in_channels, T_in). | |||
Returns: | |||
Tensor: Output tensor (B, out_channels, T_out). | |||
""" | |||
# x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), "constant") | |||
return self.deconv(x)[:, :, :-self.pad] | |||
def remove_weight_norm(self): | |||
remove_weight_norm(self.deconv) | |||
class ResBlock1(torch.nn.Module): | |||
def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): | |||
super(ResBlock1, self).__init__() | |||
self.h = h | |||
self.convs1 = nn.ModuleList([ | |||
Conv1dCausal( | |||
channels, | |||
channels, | |||
kernel_size, | |||
1, | |||
dilation=dilation[i], | |||
padding=get_padding_causal(kernel_size, dilation[i])) | |||
for i in range(len(dilation)) | |||
]) | |||
self.convs2 = nn.ModuleList([ | |||
Conv1dCausal( | |||
channels, | |||
channels, | |||
kernel_size, | |||
1, | |||
dilation=1, | |||
padding=get_padding_causal(kernel_size, 1)) | |||
for i in range(len(dilation)) | |||
]) | |||
def forward(self, x): | |||
for c1, c2 in zip(self.convs1, self.convs2): | |||
xt = F.leaky_relu(x, LRELU_SLOPE) | |||
xt = c1(xt) | |||
xt = F.leaky_relu(xt, LRELU_SLOPE) | |||
xt = c2(xt) | |||
x = xt + x | |||
return x | |||
def remove_weight_norm(self): | |||
for layer in self.convs1: | |||
layer.remove_weight_norm() | |||
for layer in self.convs2: | |||
layer.remove_weight_norm() | |||
class Generator(torch.nn.Module): | |||
def __init__(self, h): | |||
super(Generator, self).__init__() | |||
self.h = h | |||
self.num_kernels = len(h.resblock_kernel_sizes) | |||
self.num_upsamples = len(h.upsample_rates) | |||
print('num_kernels={}, num_upsamples={}'.format( | |||
self.num_kernels, self.num_upsamples)) | |||
self.conv_pre = Conv1dCausal( | |||
80, h.upsample_initial_channel, 7, 1, padding=7 - 1) | |||
# Only ResBlock1 is defined in this file; guard against configs that | |||
# request the unavailable ResBlock2 variant. | |||
if h.resblock != '1': | |||
raise ValueError('unsupported resblock type: {}'.format(h.resblock)) | |||
resblock = ResBlock1 | |||
self.ups = nn.ModuleList() | |||
self.repeat_ups = nn.ModuleList() | |||
for i, (u, k) in enumerate( | |||
zip(h.upsample_rates, h.upsample_kernel_sizes)): | |||
upsample = nn.Sequential( | |||
nn.Upsample(mode='nearest', scale_factor=u), | |||
nn.LeakyReLU(LRELU_SLOPE), | |||
Conv1dCausal( | |||
h.upsample_initial_channel // (2**i), | |||
h.upsample_initial_channel // (2**(i + 1)), | |||
kernel_size=7, | |||
stride=1, | |||
padding=7 - 1)) | |||
self.repeat_ups.append(upsample) | |||
self.ups.append( | |||
ConvTranspose1dCausal( | |||
h.upsample_initial_channel // (2**i), | |||
h.upsample_initial_channel // (2**(i + 1)), | |||
k, | |||
u, | |||
padding=(k - u) // 2)) | |||
self.resblocks = nn.ModuleList() | |||
for i in range(len(self.ups)): | |||
ch = h.upsample_initial_channel // (2**(i + 1)) | |||
for j, (k, d) in enumerate( | |||
zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): | |||
self.resblocks.append(resblock(h, ch, k, d)) | |||
self.conv_post = Conv1dCausal(ch, 1, 7, 1, padding=7 - 1) | |||
def forward(self, x): | |||
x = self.conv_pre(x) | |||
for i in range(self.num_upsamples): | |||
x = torch.sin(x) + x | |||
# transconv | |||
x1 = F.leaky_relu(x, LRELU_SLOPE) | |||
x1 = self.ups[i](x1) | |||
# repeat | |||
x2 = self.repeat_ups[i](x) | |||
x = x1 + x2 | |||
xs = None | |||
for j in range(self.num_kernels): | |||
if xs is None: | |||
xs = self.resblocks[i * self.num_kernels + j](x) | |||
else: | |||
xs += self.resblocks[i * self.num_kernels + j](x) | |||
x = xs / self.num_kernels | |||
x = F.leaky_relu(x) | |||
x = self.conv_post(x) | |||
x = torch.tanh(x) | |||
return x | |||
def remove_weight_norm(self): | |||
print('Removing weight norm...') | |||
for layer in self.ups: | |||
layer.remove_weight_norm() | |||
for layer in self.repeat_ups: | |||
layer[-1].remove_weight_norm() | |||
for layer in self.resblocks: | |||
layer.remove_weight_norm() | |||
self.conv_pre.remove_weight_norm() | |||
self.conv_post.remove_weight_norm() | |||
class DiscriminatorP(torch.nn.Module): | |||
def __init__(self, | |||
period, | |||
kernel_size=5, | |||
stride=3, | |||
use_spectral_norm=False): | |||
super(DiscriminatorP, self).__init__() | |||
self.period = period | |||
norm_f = weight_norm if use_spectral_norm is False else spectral_norm | |||
self.convs = nn.ModuleList([ | |||
norm_f( | |||
Conv2d( | |||
1, | |||
32, (kernel_size, 1), (stride, 1), | |||
padding=(get_padding(5, 1), 0))), | |||
norm_f( | |||
Conv2d( | |||
32, | |||
128, (kernel_size, 1), (stride, 1), | |||
padding=(get_padding(5, 1), 0))), | |||
norm_f( | |||
Conv2d( | |||
128, | |||
512, (kernel_size, 1), (stride, 1), | |||
padding=(get_padding(5, 1), 0))), | |||
norm_f( | |||
Conv2d( | |||
512, | |||
1024, (kernel_size, 1), (stride, 1), | |||
padding=(get_padding(5, 1), 0))), | |||
norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), | |||
]) | |||
self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) | |||
def forward(self, x): | |||
fmap = [] | |||
# 1d to 2d | |||
b, c, t = x.shape | |||
if t % self.period != 0: # pad first | |||
n_pad = self.period - (t % self.period) | |||
x = F.pad(x, (0, n_pad), 'reflect') | |||
t = t + n_pad | |||
x = x.view(b, c, t // self.period, self.period) | |||
for layer in self.convs: | |||
x = layer(x) | |||
x = F.leaky_relu(x, LRELU_SLOPE) | |||
fmap.append(x) | |||
x = self.conv_post(x) | |||
fmap.append(x) | |||
x = torch.flatten(x, 1, -1) | |||
return x, fmap | |||
class MultiPeriodDiscriminator(torch.nn.Module): | |||
def __init__(self): | |||
super(MultiPeriodDiscriminator, self).__init__() | |||
self.discriminators = nn.ModuleList([ | |||
DiscriminatorP(2), | |||
DiscriminatorP(3), | |||
DiscriminatorP(5), | |||
DiscriminatorP(7), | |||
DiscriminatorP(11), | |||
]) | |||
def forward(self, y, y_hat): | |||
y_d_rs = [] | |||
y_d_gs = [] | |||
fmap_rs = [] | |||
fmap_gs = [] | |||
for i, d in enumerate(self.discriminators): | |||
y_d_r, fmap_r = d(y) | |||
y_d_g, fmap_g = d(y_hat) | |||
y_d_rs.append(y_d_r) | |||
fmap_rs.append(fmap_r) | |||
y_d_gs.append(y_d_g) | |||
fmap_gs.append(fmap_g) | |||
return y_d_rs, y_d_gs, fmap_rs, fmap_gs | |||
class DiscriminatorS(torch.nn.Module): | |||
def __init__(self, use_spectral_norm=False): | |||
super(DiscriminatorS, self).__init__() | |||
norm_f = weight_norm if use_spectral_norm is False else spectral_norm | |||
self.convs = nn.ModuleList([ | |||
norm_f(Conv1d(1, 128, 15, 1, padding=7)), | |||
norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), | |||
norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), | |||
norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)), | |||
norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), | |||
norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), | |||
norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), | |||
]) | |||
self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) | |||
def forward(self, x): | |||
fmap = [] | |||
for layer in self.convs: | |||
x = layer(x) | |||
x = F.leaky_relu(x, LRELU_SLOPE) | |||
fmap.append(x) | |||
x = self.conv_post(x) | |||
fmap.append(x) | |||
x = torch.flatten(x, 1, -1) | |||
return x, fmap | |||
class MultiScaleDiscriminator(torch.nn.Module): | |||
def __init__(self): | |||
super(MultiScaleDiscriminator, self).__init__() | |||
self.discriminators = nn.ModuleList([ | |||
DiscriminatorS(use_spectral_norm=True), | |||
DiscriminatorS(), | |||
DiscriminatorS(), | |||
]) | |||
self.meanpools = nn.ModuleList( | |||
[DWT1DForward(wave='db3', J=1), | |||
DWT1DForward(wave='db3', J=1)]) | |||
self.convs = nn.ModuleList([ | |||
weight_norm(Conv1d(2, 1, 15, 1, padding=7)), | |||
weight_norm(Conv1d(2, 1, 15, 1, padding=7)) | |||
]) | |||
def forward(self, y, y_hat): | |||
y_d_rs = [] | |||
y_d_gs = [] | |||
fmap_rs = [] | |||
fmap_gs = [] | |||
for i, d in enumerate(self.discriminators): | |||
if i != 0: | |||
yl, yh = self.meanpools[i - 1](y) | |||
y = torch.cat([yl, yh[0]], dim=1) | |||
y = self.convs[i - 1](y) | |||
y = F.leaky_relu(y, LRELU_SLOPE) | |||
yl_hat, yh_hat = self.meanpools[i - 1](y_hat) | |||
y_hat = torch.cat([yl_hat, yh_hat[0]], dim=1) | |||
y_hat = self.convs[i - 1](y_hat) | |||
y_hat = F.leaky_relu(y_hat, LRELU_SLOPE) | |||
y_d_r, fmap_r = d(y) | |||
y_d_g, fmap_g = d(y_hat) | |||
y_d_rs.append(y_d_r) | |||
fmap_rs.append(fmap_r) | |||
y_d_gs.append(y_d_g) | |||
fmap_gs.append(fmap_g) | |||
return y_d_rs, y_d_gs, fmap_rs, fmap_gs | |||
class DiscriminatorSTFT(torch.nn.Module): | |||
def __init__(self, | |||
kernel_size=11, | |||
stride=2, | |||
use_spectral_norm=False, | |||
fft_size=1024, | |||
shift_size=120, | |||
win_length=600, | |||
window='hann_window'): | |||
super(DiscriminatorSTFT, self).__init__() | |||
self.fft_size = fft_size | |||
self.shift_size = shift_size | |||
self.win_length = win_length | |||
norm_f = weight_norm if use_spectral_norm is False else spectral_norm | |||
self.convs = nn.ModuleList([ | |||
norm_f( | |||
Conv2d( | |||
fft_size // 2 + 1, | |||
32, (15, 1), (1, 1), | |||
padding=(get_padding(15, 1), 0))), | |||
norm_f( | |||
Conv2d( | |||
32, | |||
32, (kernel_size, 1), (stride, 1), | |||
padding=(get_padding(9, 1), 0))), | |||
norm_f( | |||
Conv2d( | |||
32, | |||
32, (kernel_size, 1), (stride, 1), | |||
padding=(get_padding(9, 1), 0))), | |||
norm_f( | |||
Conv2d( | |||
32, | |||
32, (kernel_size, 1), (stride, 1), | |||
padding=(get_padding(9, 1), 0))), | |||
norm_f(Conv2d(32, 32, (5, 1), (1, 1), padding=(2, 0))), | |||
]) | |||
self.conv_post = norm_f(Conv2d(32, 1, (3, 1), (1, 1), padding=(1, 0))) | |||
self.register_buffer('window', getattr(torch, window)(win_length)) | |||
def forward(self, wav): | |||
wav = torch.squeeze(wav, 1) | |||
x_mag = stft(wav, self.fft_size, self.shift_size, self.win_length, | |||
self.window) | |||
x = torch.transpose(x_mag, 2, 1).unsqueeze(-1) | |||
fmap = [] | |||
for layer in self.convs: | |||
x = layer(x) | |||
x = F.leaky_relu(x, LRELU_SLOPE) | |||
fmap.append(x) | |||
x = self.conv_post(x) | |||
fmap.append(x) | |||
x = x.squeeze(-1) | |||
return x, fmap | |||
class MultiSTFTDiscriminator(torch.nn.Module): | |||
def __init__( | |||
self, | |||
fft_sizes=[1024, 2048, 512], | |||
hop_sizes=[120, 240, 50], | |||
win_lengths=[600, 1200, 240], | |||
window='hann_window', | |||
): | |||
super(MultiSTFTDiscriminator, self).__init__() | |||
self.discriminators = nn.ModuleList() | |||
for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): | |||
self.discriminators += [ | |||
DiscriminatorSTFT(fft_size=fs, shift_size=ss, win_length=wl) | |||
] | |||
def forward(self, y, y_hat): | |||
y_d_rs = [] | |||
y_d_gs = [] | |||
fmap_rs = [] | |||
fmap_gs = [] | |||
for i, d in enumerate(self.discriminators): | |||
y_d_r, fmap_r = d(y) | |||
y_d_g, fmap_g = d(y_hat) | |||
y_d_rs.append(y_d_r) | |||
fmap_rs.append(fmap_r) | |||
y_d_gs.append(y_d_g) | |||
fmap_gs.append(fmap_g) | |||
return y_d_rs, y_d_gs, fmap_rs, fmap_gs | |||
def feature_loss(fmap_r, fmap_g): | |||
loss = 0 | |||
for dr, dg in zip(fmap_r, fmap_g): | |||
for rl, gl in zip(dr, dg): | |||
loss += torch.mean(torch.abs(rl - gl)) | |||
return loss * 2 | |||
def discriminator_loss(disc_real_outputs, disc_generated_outputs): | |||
loss = 0 | |||
r_losses = [] | |||
g_losses = [] | |||
for dr, dg in zip(disc_real_outputs, disc_generated_outputs): | |||
r_loss = torch.mean((1 - dr)**2) | |||
g_loss = torch.mean(dg**2) | |||
loss += (r_loss + g_loss) | |||
r_losses.append(r_loss.item()) | |||
g_losses.append(g_loss.item()) | |||
return loss, r_losses, g_losses | |||
def generator_loss(disc_outputs): | |||
loss = 0 | |||
gen_losses = [] | |||
for dg in disc_outputs: | |||
temp_loss = torch.mean((1 - dg)**2) | |||
gen_losses.append(temp_loss) | |||
loss += temp_loss | |||
return loss, gen_losses |
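# Illustrative GAN training step (a sketch of typical HiFi-GAN-style usage; | |||
# tensor names are hypothetical): | |||
#   y_d_rs, y_d_gs, fmap_rs, fmap_gs = mpd(y, y_g_hat.detach()) | |||
#   loss_disc, _, _ = discriminator_loss(y_d_rs, y_d_gs) | |||
#   # generator side: re-run the discriminator on the non-detached output | |||
#   _, y_d_gs, fmap_rs, fmap_gs = mpd(y, y_g_hat) | |||
#   loss_gen = generator_loss(y_d_gs)[0] + feature_loss(fmap_rs, fmap_gs) | |||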
@@ -0,0 +1,59 @@ | |||
import glob | |||
import os | |||
import matplotlib | |||
matplotlib.use('Agg')  # select a non-interactive backend before pyplot loads | |||
import matplotlib.pylab as plt | |||
import torch | |||
from torch.nn.utils import weight_norm | |||
def plot_spectrogram(spectrogram): | |||
fig, ax = plt.subplots(figsize=(10, 2)) | |||
im = ax.imshow( | |||
spectrogram, aspect='auto', origin='lower', interpolation='none') | |||
plt.colorbar(im, ax=ax) | |||
fig.canvas.draw() | |||
plt.close() | |||
return fig | |||
def init_weights(m, mean=0.0, std=0.01): | |||
classname = m.__class__.__name__ | |||
if classname.find('Conv') != -1: | |||
m.weight.data.normal_(mean, std) | |||
def apply_weight_norm(m): | |||
classname = m.__class__.__name__ | |||
if classname.find('Conv') != -1: | |||
weight_norm(m) | |||
def get_padding(kernel_size, dilation=1): | |||
return int((kernel_size * dilation - dilation) / 2) | |||
def load_checkpoint(filepath, device): | |||
assert os.path.isfile(filepath) | |||
print("Loading '{}'".format(filepath)) | |||
checkpoint_dict = torch.load(filepath, map_location=device) | |||
print('Complete.') | |||
return checkpoint_dict | |||
def save_checkpoint(filepath, obj): | |||
print('Saving checkpoint to {}'.format(filepath)) | |||
torch.save(obj, filepath) | |||
print('Complete.') | |||
def scan_checkpoint(cp_dir, prefix): | |||
pattern = os.path.join(cp_dir, prefix + '????????') | |||
cp_list = glob.glob(pattern) | |||
if len(cp_list) == 0: | |||
return None | |||
return sorted(cp_list)[-1] |
@@ -62,4 +62,6 @@ class Model(ABC): | |||
if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'): | |||
model_cfg.type = model_cfg.model_type | |||
model_cfg.model_dir = local_model_dir | |||
for k, v in kwargs.items(): | |||
setattr(model_cfg, k, v) | |||
return build_model(model_cfg, task_name) |
@@ -1 +1,2 @@ | |||
from .linear_aec_pipeline import LinearAECPipeline | |||
from .text_to_speech_pipeline import * # noqa F403 |
@@ -0,0 +1,46 @@ | |||
import time | |||
from typing import Any, Dict, List | |||
import numpy as np | |||
from modelscope.models import Model | |||
from modelscope.models.audio.tts.am import SambertNetHifi16k | |||
from modelscope.models.audio.tts.vocoder import Hifigan16k | |||
from modelscope.pipelines.base import Pipeline | |||
from modelscope.pipelines.builder import PIPELINES | |||
from modelscope.preprocessors import TextToTacotronSymbols, build_preprocessor | |||
from modelscope.utils.constant import Fields, Tasks | |||
__all__ = ['TextToSpeechSambertHifigan16kPipeline'] | |||
@PIPELINES.register_module( | |||
Tasks.text_to_speech, module_name=r'tts-sambert-hifigan-16k') | |||
class TextToSpeechSambertHifigan16kPipeline(Pipeline): | |||
def __init__(self, | |||
config_file: str = None, | |||
model: List[Model] = None, | |||
preprocessor: TextToTacotronSymbols = None, | |||
**kwargs): | |||
super().__init__( | |||
config_file=config_file, | |||
model=model, | |||
preprocessor=preprocessor, | |||
**kwargs) | |||
assert len(model) == 2, 'model list must contain exactly two models: am and vocoder' | |||
self._am = model[0] | |||
self._vocoder = model[1] | |||
self._preprocessor = preprocessor | |||
def forward(self, inputs: Dict[str, Any]) -> Dict[str, np.ndarray]: | |||
texts = inputs['texts'] | |||
audio_total = np.empty((0), dtype='int16') | |||
for line in texts: | |||
line = line.strip().split('\t') | |||
audio = self._vocoder.forward(self._am.forward(line[1])) | |||
audio_total = np.append(audio_total, audio, axis=0) | |||
return {'output': audio_total} | |||
def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
return inputs |
@@ -8,3 +8,4 @@ from .image import LoadImage, load_image | |||
from .nlp import * # noqa F403 | |||
from .space.dialog_intent_prediction_preprocessor import * # noqa F403 | |||
from .space.dialog_modeling_preprocessor import * # noqa F403 | |||
from .text_to_speech import * # noqa F403 |
@@ -5,7 +5,6 @@ from typing import Any, Dict | |||
import numpy as np | |||
import scipy.io.wavfile as wav | |||
import torch | |||
import torchaudio.compliance.kaldi as kaldi | |||
from numpy.ctypeslib import ndpointer | |||
from modelscope.utils.constant import Fields | |||
@@ -123,6 +122,8 @@ class Feature: | |||
if self.feat_type == 'raw': | |||
return utt | |||
elif self.feat_type == 'fbank': | |||
# use a local import until the modelscope framework supports lazy loading | |||
import torchaudio.compliance.kaldi as kaldi | |||
if len(utt.shape) == 1: | |||
utt = utt.unsqueeze(0) | |||
feat = kaldi.fbank(utt, **self.fbank_config) | |||
@@ -0,0 +1,53 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import io | |||
from typing import Any, Dict, Union | |||
import ttsfrd | |||
from modelscope.fileio import File | |||
from modelscope.models.audio.tts.frontend import GenericTtsFrontend | |||
from modelscope.models.base import Model | |||
from modelscope.utils.audio.tts_exceptions import * # noqa F403 | |||
from modelscope.utils.constant import Fields | |||
from .base import Preprocessor | |||
from .builder import PREPROCESSORS | |||
__all__ = ['TextToTacotronSymbols', 'text_to_tacotron_symbols'] | |||
@PREPROCESSORS.register_module( | |||
Fields.audio, module_name=r'text_to_tacotron_symbols') | |||
class TextToTacotronSymbols(Preprocessor): | |||
"""extract tacotron symbols from text. | |||
Args: | |||
res_path (str): TTS frontend resource url | |||
lang_type (str): language type, valid values are "pinyin" and "chenmix" | |||
""" | |||
def __init__(self, model_name, lang_type='pinyin'): | |||
self._frontend_model = Model.from_pretrained( | |||
model_name, lang_type=lang_type) | |||
assert self._frontend_model is not None, 'loading model from pretrained failed' | |||
def __call__(self, data: str) -> Dict[str, Any]: | |||
"""Call functions to load text and get tacotron symbols. | |||
Args: | |||
input (str): text with utf-8 | |||
Returns: | |||
symbos (list[str]): texts in tacotron symbols format. | |||
""" | |||
return self._frontend_model.forward(data) | |||
def text_to_tacotron_symbols(text='', path='./', lang='pinyin'): | |||
""" simple interface to transform text to tacotron symbols | |||
Args: | |||
text (str): input text | |||
path (str): resource path | |||
lang (str): language type from one of "pinyin" and "chenmix" | |||
""" | |||
transform = TextToTacotronSymbols(path, lang) | |||
return transform(text) |
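# Illustrative usage (model id taken from the repository's unit tests): | |||
#   out = text_to_tacotron_symbols( | |||
#       '今天天气不错', 'damo/speech_binary_tts_frontend_resource') | |||
#   for line in out['texts']: | |||
#       print(line) | |||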
@@ -0,0 +1,42 @@ | |||
""" | |||
Define TTS exceptions | |||
""" | |||
class TtsException(Exception): | |||
""" | |||
TTS exception class. | |||
""" | |||
pass | |||
class TtsFrontendException(TtsException): | |||
""" | |||
TTS frontend module level exceptions. | |||
""" | |||
pass | |||
class TtsFrontendInitializeFailedException(TtsFrontendException): | |||
""" | |||
If tts frontend resource is invalid or not exist, this exception will be raised. | |||
""" | |||
pass | |||
class TtsFrontendLanguageTypeInvalidException(TtsFrontendException): | |||
""" | |||
If language type is invalid, this exception will be raised. | |||
""" | |||
class TtsVocoderException(TtsException): | |||
""" | |||
Vocoder exception | |||
""" | |||
class TtsVocoderMelspecShapeMismatchException(TtsVocoderException): | |||
""" | |||
If vocoder's input melspec shape mismatch, this exception will be raised. | |||
""" |
@@ -67,7 +67,6 @@ class Registry(object): | |||
if module_name in self._modules[group_key]: | |||
raise KeyError(f'{module_name} is already registered in ' | |||
f'{self._name}[{group_key}]') | |||
self._modules[group_key][module_name] = module_cls | |||
module_cls.group_key = group_key | |||
@@ -2,4 +2,5 @@ | |||
-r requirements/pipeline.txt | |||
-r requirements/multi-modal.txt | |||
-r requirements/nlp.txt | |||
-r requirements/audio.txt | |||
-r requirements/cv.txt |
@@ -0,0 +1,26 @@ | |||
#tts | |||
h5py==2.10.0 | |||
#https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp36-cp36m-linux_x86_64.whl | |||
https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp37-cp37m-linux_x86_64.whl | |||
https://swap.oss-cn-hangzhou.aliyuncs.com/Jiaqi%2Fmaas%2Ftts%2Frequirements%2Fpytorch_wavelets-1.3.0-py3-none-any.whl?Expires=1685688388&OSSAccessKeyId=LTAI4Ffebq4d9jTVDwiSbY4L&Signature=jcQbg5EZ%2Bdys3%2F4BRn3srrKLdIg%3D | |||
#https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp38-cp38-linux_x86_64.whl | |||
#https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp39-cp39-linux_x86_64.whl | |||
inflect | |||
keras==2.2.4 | |||
librosa | |||
lxml | |||
matplotlib | |||
nara_wpe | |||
numpy==1.18.* | |||
protobuf==3.20.* | |||
ptflops | |||
PyWavelets>=1.0.0 | |||
scikit-learn==0.23.2 | |||
sox | |||
tensorboard | |||
tensorflow==1.15.* | |||
torch==1.10.* | |||
torchaudio | |||
torchvision | |||
tqdm | |||
unidecode |
@@ -0,0 +1,60 @@ | |||
import time | |||
import unittest | |||
import json | |||
import tensorflow as tf | |||
# NOTICE: Tensorflow 1.15 does not seem fully compatible with pytorch: | |||
# a segmentation fault may be raised by the pytorch cpp library | |||
# if 'import tensorflow' comes before 'import torch'. | |||
# Putting an 'import torch' here bypasses this incompatibility. | |||
import torch | |||
from scipy.io.wavfile import write | |||
from modelscope.fileio import File | |||
from modelscope.models import Model, build_model | |||
from modelscope.models.audio.tts.am import SambertNetHifi16k | |||
from modelscope.models.audio.tts.vocoder import AttrDict, Hifigan16k | |||
from modelscope.pipelines import pipeline | |||
from modelscope.preprocessors import build_preprocessor | |||
from modelscope.utils.constant import Fields, InputFields, Tasks | |||
from modelscope.utils.logger import get_logger | |||
logger = get_logger() | |||
class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase): | |||
def test_pipeline(self): | |||
lang_type = 'pinyin' | |||
text = '明天天气怎么样' | |||
preprocessor_model_id = 'damo/speech_binary_tts_frontend_resource' | |||
am_model_id = 'damo/speech_sambert16k_tts_zhitian_emo' | |||
voc_model_id = 'damo/speech_hifigan16k_tts_zhitian_emo' | |||
cfg_preprocessor = dict( | |||
type='text_to_tacotron_symbols', | |||
model_name=preprocessor_model_id, | |||
lang_type=lang_type) | |||
preprocessor = build_preprocessor(cfg_preprocessor, Fields.audio) | |||
self.assertTrue(preprocessor is not None) | |||
am = Model.from_pretrained(am_model_id) | |||
self.assertTrue(am is not None) | |||
voc = Model.from_pretrained(voc_model_id) | |||
self.assertTrue(voc is not None) | |||
sambert_tts = pipeline( | |||
pipeline_name='tts-sambert-hifigan-16k', | |||
config_file='', | |||
model=[am, voc], | |||
preprocessor=preprocessor) | |||
self.assertTrue(sambert_tts is not None) | |||
output = sambert_tts(text) | |||
self.assertTrue(len(output['output']) > 0) | |||
write('output.wav', 16000, output['output']) | |||
if __name__ == '__main__': | |||
unittest.main() |
@@ -0,0 +1,28 @@ | |||
import shutil | |||
import unittest | |||
from modelscope.preprocessors import build_preprocessor | |||
from modelscope.utils.constant import Fields, InputFields | |||
from modelscope.utils.logger import get_logger | |||
logger = get_logger() | |||
class TtsPreprocessorTest(unittest.TestCase): | |||
def test_preprocess(self): | |||
lang_type = 'pinyin' | |||
text = '今天天气不错,我们去散步吧。' | |||
cfg = dict( | |||
type='text_to_tacotron_symbols', | |||
model_name='damo/speech_binary_tts_frontend_resource', | |||
lang_type=lang_type) | |||
preprocessor = build_preprocessor(cfg, Fields.audio) | |||
output = preprocessor(text) | |||
self.assertTrue(output) | |||
for line in output['texts']: | |||
print(line) | |||
if __name__ == '__main__': | |||
unittest.main() |
@@ -7,6 +7,12 @@ import sys | |||
import unittest | |||
from fnmatch import fnmatch | |||
# NOTICE: Tensorflow 1.15 does not seem fully compatible with pytorch: | |||
# a segmentation fault may be raised by the pytorch cpp library | |||
# if 'import tensorflow' comes before 'import torch'. | |||
# Putting an 'import torch' here bypasses this incompatibility. | |||
import torch | |||
from modelscope.utils.logger import get_logger | |||
from modelscope.utils.test_utils import set_test_level, test_level | |||