|
|
|
@@ -55,7 +55,8 @@ class OfaASRPreprocessor(OfaBasePreprocessor): |
|
|
|
|
|
|
|
def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: |
|
|
|
speed = random.choice([0.9, 1.0, 1.1]) |
|
|
|
wav, sr = librosa.load(data[self.column_map['wav']], 16000, mono=True) |
|
|
|
audio_bytes = self.get_audio_bytes(data[self.column_map['wav']]) |
|
|
|
wav, sr = librosa.load(audio_bytes, 16000, mono=True) |
|
|
|
fbank = self.prepare_fbank( |
|
|
|
torch.tensor([wav], dtype=torch.float32), |
|
|
|
sr, |
|
|
|
@@ -91,7 +92,8 @@ class OfaASRPreprocessor(OfaBasePreprocessor): |
|
|
|
|
|
|
|
def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: |
|
|
|
speed = 1.0 |
|
|
|
wav, sr = librosa.load(data[self.column_map['wav']], 16000, mono=True) |
|
|
|
audio_bytes = self.get_audio_bytes(data[self.column_map['wav']]) |
|
|
|
wav, sr = librosa.load(audio_bytes, 16000, mono=True) |
|
|
|
fbank = self.prepare_fbank( |
|
|
|
torch.tensor([wav], dtype=torch.float32), |
|
|
|
sr, |
|
|
|
|