diff --git a/modelscope/preprocessors/audio.py b/modelscope/preprocessors/audio.py
index cdee968b..10057034 100644
--- a/modelscope/preprocessors/audio.py
+++ b/modelscope/preprocessors/audio.py
@@ -1,12 +1,13 @@
 import io
 import os
-from typing import Any, Dict
+from typing import Any, Dict, Tuple, Union
 
 import numpy as np
 import scipy.io.wavfile as wav
 import torch
 
 from modelscope.utils.constant import Fields
+from . import Preprocessor
 from .builder import PREPROCESSORS
 
 
@@ -115,7 +116,7 @@ class Feature:
 
 
 @PREPROCESSORS.register_module(Fields.audio)
-class LinearAECAndFbank:
+class LinearAECAndFbank(Preprocessor):
     SAMPLE_RATE = 16000
 
     def __init__(self, io_config):
@@ -127,18 +128,27 @@ class LinearAECAndFbank:
         self.mitaec = MinDAEC.load()
         self.mask_on_mic = io_config['mask_on'] == 'nearend_mic'
 
-    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
-        """ linear filtering the near end mic and far end audio, then extract the feature
-        :param data: dict with two keys and correspond audios: "nearend_mic" and "farend_speech"
-        :return: dict with two keys and Tensor values: "base" linear filtered audio,and "feature"
+    def __call__(self, data: Union[Tuple, Dict[str, Any]]) -> Dict[str, Any]:
+        """ Linearly filter the near end mic and far end audio, then extract the feature.
+
+        Args:
+            data: Dict with audios keyed by "nearend_mic" and "farend_speech", or a tuple of both audios.
+
+        Returns:
+            Dict with Tensor values: "base" the linear filtered audio, and "feature" the fbank feature.
         """
-        # read files
-        nearend_mic, fs = self.load_wav(data['nearend_mic'])
-        farend_speech, fs = self.load_wav(data['farend_speech'])
-        if 'nearend_speech' in data:
-            nearend_speech, fs = self.load_wav(data['nearend_speech'])
-        else:
+        if isinstance(data, tuple):
+            nearend_mic, fs = self.load_wav(data[0])
+            farend_speech, fs = self.load_wav(data[1])
             nearend_speech = np.zeros_like(nearend_mic)
+        else:
+            # read files
+            nearend_mic, fs = self.load_wav(data['nearend_mic'])
+            farend_speech, fs = self.load_wav(data['farend_speech'])
+            if 'nearend_speech' in data:
+                nearend_speech, fs = self.load_wav(data['nearend_speech'])
+            else:
+                nearend_speech = np.zeros_like(nearend_mic)
 
         out_mic, out_ref, out_linear, out_echo = self.mitaec.do_linear_aec(
             nearend_mic, farend_speech)
diff --git a/tests/pipelines/test_speech_signal_process.py b/tests/pipelines/test_speech_signal_process.py
index 22dac2b6..4c056a86 100644
--- a/tests/pipelines/test_speech_signal_process.py
+++ b/tests/pipelines/test_speech_signal_process.py
@@ -68,6 +68,25 @@ class SpeechSignalProcessTest(unittest.TestCase):
         aec(input, output_path=output_path)
         print(f'Processed audio saved to {output_path}')
 
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_aec_tuple_bytes(self):
+        # Download audio files
+        download(NEAREND_MIC_URL, NEAREND_MIC_FILE)
+        download(FAREND_SPEECH_URL, FAREND_SPEECH_FILE)
+        model_id = 'damo/speech_dfsmn_aec_psm_16k'
+        with open(NEAREND_MIC_FILE, 'rb') as f:
+            nearend_bytes = f.read()
+        with open(FAREND_SPEECH_FILE, 'rb') as f:
+            farend_bytes = f.read()
+        inputs = (nearend_bytes, farend_bytes)
+        aec = pipeline(
+            Tasks.acoustic_echo_cancellation,
+            model=model_id,
+            pipeline_name=Pipelines.speech_dfsmn_aec_psm_16k)
+        output_path = os.path.abspath('output.wav')
+        aec(inputs, output_path=output_path)
+        print(f'Processed audio saved to {output_path}')
+
     @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_ans(self):
         # Download audio files
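
Usage sketch for the new tuple input path (not part of the diff above): the call below mirrors test_aec_tuple_bytes, reading two wav files as raw bytes and passing them as a (nearend_mic, farend_speech) tuple. The local wav paths are placeholders; the task, model id, and pipeline name are taken from the test, and the import locations for pipeline, Tasks, and Pipelines are assumed to follow the usual modelscope layout.

import os

from modelscope.metainfo import Pipelines
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Placeholder paths: any 16 kHz wav files for the near end mic recording
# and the far end (reference) speech.
with open('nearend_mic.wav', 'rb') as f:
    nearend_bytes = f.read()
with open('farend_speech.wav', 'rb') as f:
    farend_bytes = f.read()

aec = pipeline(
    Tasks.acoustic_echo_cancellation,
    model='damo/speech_dfsmn_aec_psm_16k',
    pipeline_name=Pipelines.speech_dfsmn_aec_psm_16k)

# The preprocessor now accepts a (nearend_mic, farend_speech) tuple of wav
# bytes in addition to the original dict input; no near end speech label is
# available in this form, so zeros are used internally.
aec((nearend_bytes, farend_bytes), output_path=os.path.abspath('output.wav'))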