
support asr new models

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10919277

* support new asr paraformer model

* support asr conformer model
master^2
jiangyu.xzy, wenmeng.zwm · 2 years ago
commit 9bfc77c178
4 changed files with 59 additions and 13 deletions:
  1. modelscope/pipelines/audio/asr_inference_pipeline.py (+9, -7)
  2. modelscope/preprocessors/asr.py (+14, -5)
  3. requirements/audio.txt (+1, -1)
  4. tests/pipelines/test_automatic_speech_recognition.py (+35, -0)
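
The newly supported Conformer and Paraformer models are exposed through the regular ASR pipeline. As a rough orientation, a call along the lines below should exercise one of them once this change is in place. This is a minimal sketch assuming the usual ModelScope pipeline entry points; the model ID and wav path are copied from the test entries added in this commit, and the exact output schema depends on the model.

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Build an ASR pipeline for one of the newly added Paraformer models
# (model ID taken from tests/pipelines/test_automatic_speech_recognition.py).
asr = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')

# Run recognition on a local 16 kHz wav file; the result is typically a dict
# containing the recognized text.
result = asr('data/test/audios/asr_example_id.wav')
print(result)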

modelscope/pipelines/audio/asr_inference_pipeline.py (+9, -7)

@@ -110,6 +110,7 @@ class AutomaticSpeechRecognitionPipeline(Pipeline):
             'sampled_lengths': 'seq2seq/sampled_lengths',
             'lang': 'zh-cn',
             'code_base': inputs['code_base'],
+            'mode': inputs['mode'],
             'fs': {
                 'audio_fs': inputs['audio_fs'],
                 'model_fs': 16000
@@ -233,15 +234,16 @@ class AutomaticSpeechRecognitionPipeline(Pipeline):
     def run_inference(self, cmd):
         asr_result = []
         if self.framework == Frameworks.torch and cmd['code_base'] == 'funasr':
-            from funasr.bin import asr_inference_paraformer_modelscope
+            if cmd['mode'] == 'asr':
+                from funasr.bin import asr_inference_modelscope as asr_inference
+            else:
+                from funasr.bin import asr_inference_paraformer_modelscope as asr_inference
 
-            if hasattr(asr_inference_paraformer_modelscope, 'set_parameters'):
-                asr_inference_paraformer_modelscope.set_parameters(
-                    sample_rate=cmd['fs'])
-                asr_inference_paraformer_modelscope.set_parameters(
-                    language=cmd['lang'])
+            if hasattr(asr_inference, 'set_parameters'):
+                asr_inference.set_parameters(sample_rate=cmd['fs'])
+                asr_inference.set_parameters(language=cmd['lang'])
 
-            asr_result = asr_inference_paraformer_modelscope.asr_inference(
+            asr_result = asr_inference.asr_inference(
                 batch_size=cmd['batch_size'],
                 maxlenratio=cmd['maxlenratio'],
                 minlenratio=cmd['minlenratio'],
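
For readability, the backend selection this hunk introduces in run_inference can be read in isolation as the sketch below. Illustration only: the helper name select_asr_backend is made up, the funasr module names are the ones imported in the diff above, and the shape of cmd follows the surrounding code.

def select_asr_backend(cmd):
    # cmd['mode'] picks which funasr decoding entry point is used.
    if cmd['mode'] == 'asr':
        # generic encoder-decoder entry point (presumably the path used by the
        # newly added Conformer recipes)
        from funasr.bin import asr_inference_modelscope as asr_inference
    else:
        # Paraformer-style entry point
        from funasr.bin import asr_inference_paraformer_modelscope as asr_inference

    # Optional runtime parameters, guarded because not every funasr build
    # exposes set_parameters.
    if hasattr(asr_inference, 'set_parameters'):
        asr_inference.set_parameters(sample_rate=cmd['fs'])
        asr_inference.set_parameters(language=cmd['lang'])
    return asr_inference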


modelscope/preprocessors/asr.py (+14, -5)

@@ -103,6 +103,12 @@ class WavToScp(Preprocessor):
         else:
             code_base = None
         inputs['code_base'] = code_base
+        # decoding mode
+        if 'mode' in inputs['model_config']:
+            mode = inputs['model_config']['mode']
+        else:
+            mode = None
+        inputs['mode'] = mode
 
         if inputs['model_type'] == Frameworks.torch:
             assert inputs['model_config'].__contains__(
@@ -111,8 +117,6 @@ class WavToScp(Preprocessor):
                 'am_model_config'), 'am_model_config does not exist'
             assert inputs['model_config'].__contains__(
                 'asr_model_config'), 'asr_model_config does not exist'
-            assert inputs['model_config'].__contains__(
-                'asr_model_wav_config'), 'asr_model_wav_config does not exist'
 
             am_model_config: str = os.path.join(
                 inputs['model_workspace'],
@@ -127,9 +131,14 @@ class WavToScp(Preprocessor):
             assert os.path.exists(
                 asr_model_config), 'asr_model_config does not exist'
 
-            asr_model_wav_config: str = os.path.join(
-                inputs['model_workspace'],
-                inputs['model_config']['asr_model_wav_config'])
+            if 'asr_model_wav_config' in inputs['model_config']:
+                asr_model_wav_config: str = os.path.join(
+                    inputs['model_workspace'],
+                    inputs['model_config']['asr_model_wav_config'])
+            else:
+                asr_model_wav_config: str = os.path.join(
+                    inputs['model_workspace'],
+                    inputs['model_config']['asr_model_config'])
             assert os.path.exists(
                 asr_model_wav_config), 'asr_model_wav_config does not exist'
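
Taken together, these hunks relax what WavToScp expects in a model's model_config: 'mode' becomes an optional field (treated as None when absent) and 'asr_model_wav_config' becomes optional, falling back to 'asr_model_config'. A hypothetical excerpt of the keys the preprocessor now reads; only the key names come from the code above, the values are placeholders.

# Placeholder values; key names are taken from the preprocessor code above.
model_config = {
    'code_base': 'funasr',            # routes decoding through funasr
    'mode': 'asr',                    # new, optional; treated as None when absent
    'am_model_config': 'am_config.yaml',
    'asr_model_config': 'decoding.yaml',
    # 'asr_model_wav_config': 'wav_decoding.yaml',
    # ^ now optional: when missing, asr_model_config is also used for wav input
}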



requirements/audio.txt (+1, -1)

@@ -1,6 +1,6 @@
 easyasr>=0.0.2
 espnet==202204
-funasr>=0.1.0
+funasr>=0.1.3
 h5py
 inflect
 keras


tests/pipelines/test_automatic_speech_recognition.py (+35, -0)

@@ -217,6 +217,41 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase,
             'damo/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline',
             'wav_path': 'data/test/audios/asr_example_id.wav'
         },
+        {
+            'model_id':
+            'damo/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch',
+            'wav_path': 'data/test/audios/asr_example_id.wav'
+        },
+        {
+            'model_id':
+            'damo/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch',
+            'wav_path': 'data/test/audios/asr_example_id.wav'
+        },
+        {
+            'model_id':
+            'damo/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch',
+            'wav_path': 'data/test/audios/asr_example_id.wav'
+        },
+        {
+            'model_id':
+            'damo/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch',
+            'wav_path': 'data/test/audios/asr_example_id.wav'
+        },
+        {
+            'model_id':
+            'damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch',
+            'wav_path': 'data/test/audios/asr_example_id.wav'
+        },
+        {
+            'model_id':
+            'damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch',
+            'wav_path': 'data/test/audios/asr_example_id.wav'
+        },
+        {
+            'model_id':
+            'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
+            'wav_path': 'data/test/audios/asr_example_id.wav'
+        },
     ]
 
     def setUp(self) -> None:

