diff --git a/modelscope/metrics/text_generation_metric.py b/modelscope/metrics/text_generation_metric.py index ae61d225..3e5c1f93 100644 --- a/modelscope/metrics/text_generation_metric.py +++ b/modelscope/metrics/text_generation_metric.py @@ -1,8 +1,5 @@ from typing import Dict -import numpy as np -from rouge_score import rouge_scorer - from ..metainfo import Metrics from ..utils.registry import default_group from .base import Metric @@ -18,6 +15,7 @@ class TextGenerationMetric(Metric): def __init__(self): self.preds = [] self.tgts = [] + from rouge_score import rouge_scorer self.scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True) def add(self, outputs: Dict, inputs: Dict): diff --git a/modelscope/models/audio/tts/models/am_models.py b/modelscope/models/audio/tts/models/am_models.py index 1433fd7e..cd43ff12 100755 --- a/modelscope/models/audio/tts/models/am_models.py +++ b/modelscope/models/audio/tts/models/am_models.py @@ -1,7 +1,4 @@ import tensorflow as tf -from tensorflow.contrib.cudnn_rnn import CudnnLSTM -from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops -from tensorflow.contrib.rnn import LSTMBlockCell def encoder_prenet(inputs, @@ -207,6 +204,7 @@ def conv_and_lstm(inputs, embedded_inputs_speaker, mask=None, scope='conv_and_lstm'): + from tensorflow.contrib.rnn import LSTMBlockCell x = inputs with tf.variable_scope(scope): for i in range(n_conv_layers): @@ -244,6 +242,7 @@ def conv_and_lstm_dec(inputs, mask=None, scope='conv_and_lstm'): x = inputs + from tensorflow.contrib.rnn import LSTMBlockCell with tf.variable_scope(scope): for i in range(n_conv_layers): x = conv1d( diff --git a/modelscope/models/audio/tts/models/helpers.py b/modelscope/models/audio/tts/models/helpers.py index f3e53277..371000a4 100755 --- a/modelscope/models/audio/tts/models/helpers.py +++ b/modelscope/models/audio/tts/models/helpers.py @@ -1,9 +1,8 @@ import numpy as np import tensorflow as tf -from tensorflow.contrib.seq2seq import Helper -class VarTestHelper(Helper): +class VarTestHelper(tf.contrib.seq2seq.Helper): def __init__(self, batch_size, inputs, dim): with tf.name_scope('VarTestHelper'): @@ -44,7 +43,7 @@ class VarTestHelper(Helper): return (finished, next_inputs, state) -class VarTrainingHelper(Helper): +class VarTrainingHelper(tf.contrib.seq2seq.Helper): def __init__(self, targets, inputs, dim): with tf.name_scope('VarTrainingHelper'): @@ -86,7 +85,7 @@ class VarTrainingHelper(Helper): return (finished, next_inputs, state) -class VarTrainingSSHelper(Helper): +class VarTrainingSSHelper(tf.contrib.seq2seq.Helper): def __init__(self, targets, inputs, dim, global_step, schedule_begin, alpha, decay_steps): diff --git a/modelscope/models/audio/tts/models/rnn_wrappers.py b/modelscope/models/audio/tts/models/rnn_wrappers.py index 85a6b335..6c487bab 100755 --- a/modelscope/models/audio/tts/models/rnn_wrappers.py +++ b/modelscope/models/audio/tts/models/rnn_wrappers.py @@ -1,14 +1,11 @@ -import numpy as np import tensorflow as tf -from tensorflow.contrib.rnn import RNNCell -from tensorflow.contrib.seq2seq import AttentionWrapperState from tensorflow.python.ops import rnn_cell_impl from .am_models import prenet -class VarPredictorCell(RNNCell): - '''Wrapper wrapper knock knock.''' +class VarPredictorCell(tf.contrib.rnn.RNNCell): + """Wrapper wrapper knock knock.""" def __init__(self, var_predictor_cell, is_training, dim, prenet_units): super(VarPredictorCell, self).__init__() @@ -33,7 +30,7 @@ class VarPredictorCell(RNNCell): ]) def call(self, inputs, state): - '''Run the Tacotron2 super decoder cell.''' + """Run the Tacotron2 super decoder cell.""" super_cell_out, decoder_state = state # split @@ -61,8 +58,8 @@ class VarPredictorCell(RNNCell): return new_super_cell_out, new_states -class DurPredictorCell(RNNCell): - '''Wrapper wrapper knock knock.''' +class DurPredictorCell(tf.contrib.rnn.RNNCell): + """Wrapper wrapper knock knock.""" def __init__(self, var_predictor_cell, is_training, dim, prenet_units): super(DurPredictorCell, self).__init__() @@ -87,7 +84,7 @@ class DurPredictorCell(RNNCell): ]) def call(self, inputs, state): - '''Run the Tacotron2 super decoder cell.''' + """Run the Tacotron2 super decoder cell.""" super_cell_out, decoder_state = state # split @@ -117,8 +114,8 @@ class DurPredictorCell(RNNCell): return new_super_cell_out, new_states -class DurPredictorCECell(RNNCell): - '''Wrapper wrapper knock knock.''' +class DurPredictorCECell(tf.contrib.rnn.RNNCell): + """Wrapper wrapper knock knock.""" def __init__(self, var_predictor_cell, is_training, dim, prenet_units, max_dur, dur_embedding_dim): @@ -146,7 +143,7 @@ class DurPredictorCECell(RNNCell): ]) def call(self, inputs, state): - '''Run the Tacotron2 super decoder cell.''' + """Run the Tacotron2 super decoder cell.""" super_cell_out, decoder_state = state # split @@ -181,8 +178,8 @@ class DurPredictorCECell(RNNCell): return new_super_cell_out, new_states -class VarPredictorCell2(RNNCell): - '''Wrapper wrapper knock knock.''' +class VarPredictorCell2(tf.contrib.rnn.RNNCell): + """Wrapper wrapper knock knock.""" def __init__(self, var_predictor_cell, is_training, dim, prenet_units): super(VarPredictorCell2, self).__init__() diff --git a/modelscope/models/audio/tts/models/robutrans.py b/modelscope/models/audio/tts/models/robutrans.py index d5bafcec..ab9fdfcc 100755 --- a/modelscope/models/audio/tts/models/robutrans.py +++ b/modelscope/models/audio/tts/models/robutrans.py @@ -1,14 +1,8 @@ import tensorflow as tf -from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell -from tensorflow.contrib.seq2seq import BasicDecoder from tensorflow.python.ops.ragged.ragged_util import repeat -from .am_models import conv_prenet, decoder_prenet, encoder_prenet from .fsmn_encoder import FsmnEncoderV2 -from .helpers import VarTestHelper, VarTrainingHelper -from .position import (BatchSinusodalPositionalEncoding, - SinusodalPositionalEncoding) -from .rnn_wrappers import DurPredictorCell, VarPredictorCell +from .position import BatchSinusodalPositionalEncoding from .self_attention_decoder import SelfAttentionDecoder from .self_attention_encoder import SelfAttentionEncoder @@ -32,7 +26,7 @@ class RobuTrans(): duration_scales=None, energy_contours=None, energy_scales=None): - '''Initializes the model for inference. + """Initializes the model for inference. Sets "mel_outputs", "linear_outputs", "stop_token_outputs", and "alignments" fields. @@ -46,7 +40,10 @@ class RobuTrans(): mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. - ''' + """ + from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell + from tensorflow.contrib.seq2seq import BasicDecoder + with tf.variable_scope('inference') as _: is_training = mel_targets is not None batch_size = tf.shape(inputs)[0] @@ -229,17 +226,20 @@ class RobuTrans(): LSTMBlockCell(hp.predictor_lstm_units), LSTMBlockCell(hp.predictor_lstm_units) ], state_is_tuple=True) # yapf:disable + from .rnn_wrappers import DurPredictorCell duration_output_cell = DurPredictorCell( duration_predictor_cell, is_training, 1, hp.predictor_prenet_units) duration_predictor_init_state = duration_output_cell.zero_state( batch_size=batch_size, dtype=tf.float32) if is_training: + from .helpers import VarTrainingHelper duration_helper = VarTrainingHelper( tf.expand_dims( tf.log(tf.cast(durations, tf.float32) + 1), axis=2), dur_inputs, 1) else: + from .helpers import VarTestHelper duration_helper = VarTestHelper(batch_size, dur_inputs, 1) ( duration_outputs, _ diff --git a/modelscope/models/audio/tts/sambert_hifi.py b/modelscope/models/audio/tts/sambert_hifi.py index 401e32c9..79f8068e 100644 --- a/modelscope/models/audio/tts/sambert_hifi.py +++ b/modelscope/models/audio/tts/sambert_hifi.py @@ -1,14 +1,10 @@ from __future__ import (absolute_import, division, print_function, unicode_literals) -import io import os -import time import zipfile -from typing import Any, Dict, Optional, Union import json import numpy as np -import torch from modelscope.metainfo import Models from modelscope.models.base import Model @@ -16,8 +12,8 @@ from modelscope.models.builder import MODELS from modelscope.utils.audio.tts_exceptions import ( TtsFrontendInitializeFailedException, TtsFrontendLanguageTypeInvalidException, TtsModelConfigurationExcetion, - TtsVocoderMelspecShapeMismatchException, TtsVoiceNotExistsException) -from modelscope.utils.constant import ModelFile, Tasks + TtsVoiceNotExistsException) +from modelscope.utils.constant import Tasks from .voice import Voice import tensorflow as tf # isort:skip diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index 8c260ece..d674052d 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -23,8 +23,8 @@ logger = get_logger() class Pipeline(ABC): def initiate_single_model(self, model): - logger.info(f'initiate model from {model}') if isinstance(model, str) and is_official_hub_path(model): + logger.info(f'initiate model from location {model}.') # expecting model has been prefetched to local cache beforehand return Model.from_pretrained( model, model_prefetched=True) if is_model(model) else model diff --git a/modelscope/pipelines/cv/cmdssl_video_embedding_pipleline.py b/modelscope/pipelines/cv/cmdssl_video_embedding_pipleline.py index 47d90d71..1d208841 100644 --- a/modelscope/pipelines/cv/cmdssl_video_embedding_pipleline.py +++ b/modelscope/pipelines/cv/cmdssl_video_embedding_pipleline.py @@ -1,11 +1,9 @@ import os.path as osp from typing import Any, Dict -import decord import numpy as np import torch import torchvision.transforms.functional as TF -from decord import VideoReader, cpu from PIL import Image from modelscope.metainfo import Pipelines @@ -49,6 +47,7 @@ class CMDSSLVideoEmbeddingPipeline(Pipeline): logger.info('load model done') def preprocess(self, input: Input) -> Dict[str, Any]: + import decord decord.bridge.set_bridge('native') transforms = VCompose([ @@ -60,7 +59,7 @@ class CMDSSLVideoEmbeddingPipeline(Pipeline): clip_len = (self.cfg.DATA.video_frames - 1) * self.cfg.DATA.video_stride + 1 - vr = VideoReader(input, ctx=cpu(0)) + vr = decord.VideoReader(input, ctx=decord.cpu(0)) if len(vr) <= clip_len: init_frames = np.zeros(self.cfg.DATA.multi_crop, dtype=int) else: diff --git a/modelscope/pipelines/cv/ocr_detection_pipeline.py b/modelscope/pipelines/cv/ocr_detection_pipeline.py index d8b31389..ed8bcccb 100644 --- a/modelscope/pipelines/cv/ocr_detection_pipeline.py +++ b/modelscope/pipelines/cv/ocr_detection_pipeline.py @@ -16,11 +16,6 @@ from ..base import Pipeline from ..builder import PIPELINES from .ocr_utils import model_resnet_mutex_v4_linewithchar, ops, utils -if tf.__version__ >= '2.0': - import tf_slim as slim -else: - from tensorflow.contrib import slim - if tf.__version__ >= '2.0': tf = tf.compat.v1 tf.compat.v1.disable_eager_execution() diff --git a/modelscope/preprocessors/video.py b/modelscope/preprocessors/video.py index 262fdaa5..33a92c1c 100644 --- a/modelscope/preprocessors/video.py +++ b/modelscope/preprocessors/video.py @@ -1,15 +1,11 @@ import math -import os import random -import decord import numpy as np import torch -import torch.nn as nn import torch.utils.data import torch.utils.dlpack as dlpack import torchvision.transforms._transforms_video as transforms -from decord import VideoReader from torchvision.transforms import Compose @@ -128,6 +124,7 @@ def _decode_video(cfg, path): Returns: frames (Tensor): video tensor data """ + from decord import VideoReader vr = VideoReader(path) num_clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS