You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

SpeechTranscription.cs 10 kB

1 year ago
1 year ago
1 year ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197
  1. using NAudio.Wave;
  2. using Whisper.net;
  3. namespace LLama.Examples.Examples
  4. {
  5. public class SpeechTranscription
  6. {
  /// <summary>
  /// Runs the speech transcription example: lets the user pick a Whisper model, starts an
  /// <see cref="AudioServer"/> that echoes transcriptions to the console, and keeps it alive
  /// until a key is pressed.
  /// </summary>
  /// <returns>A task that completes when the user exits, or immediately when no model is available.</returns>
  public static async Task Run()
  {
      if (ConsoleStyleHelpers.SelectAudioModel() is not string model) { return; }

      bool loadFinished = false;
      var loading = ConsoleStyleHelpers.LoadPrint("Loading model...", () => loadFinished);

      AudioServer audioServer;
      try
      {
          audioServer = new AudioServer(model);
      }
      finally
      {
          // Always stop the progress printer, even when the model fails to load;
          // otherwise it would keep writing dots while the exception propagates.
          loadFinished = true;
          await loading;
      }

      using (audioServer)
      {
          audioServer.ServiceUsers.Add(new AudioEchoer());
          await ConsoleStyleHelpers.WaitUntilExit();
      }
  }
  /// <summary>
  /// Sample <see cref="IAudioServiceUser"/> that prints every transcription to the console,
  /// except those mentioning "Artificial Intelligence", which it reports as skipped.
  /// </summary>
  class AudioEchoer : IAudioServiceUser
  {
      /// <summary>Rejects (and reports) transcriptions containing "Artificial Intelligence"; accepts everything else.</summary>
      /// <param name="AudioTranscription">The transcribed text to inspect.</param>
      /// <returns><c>false</c> when the text contains the filtered phrase, otherwise <c>true</c>.</returns>
      bool IAudioServiceUser.IsOfInterest(string AudioTranscription)
      {
          // Ordinal comparison: the phrase is a fixed keyword, and culture-sensitive casing
          // (e.g. the Turkish dotted/dotless 'i') would stop it from matching.
          if (!AudioTranscription.Contains("Artificial Intelligence", StringComparison.OrdinalIgnoreCase))
          {
              return true;
          }

          Console.ForegroundColor = ConsoleColor.DarkRed;
          Console.WriteLine($"Skipped text because it's not of interest: {AudioTranscription}");
          Console.ForegroundColor = ConsoleColor.White;
          return false;
      }

      /// <summary>Writes the transcription to the console in yellow.</summary>
      /// <param name="AudioTranscription">The transcribed text to print.</param>
      void IAudioServiceUser.ProcessText(string AudioTranscription)
      {
          Console.ForegroundColor = ConsoleColor.Yellow;
          Console.WriteLine(AudioTranscription);
          Console.ForegroundColor = ConsoleColor.White;
      }
  }
  /// <summary>
  /// A consumer of transcribed speech. The <see cref="AudioServer"/> first asks each registered
  /// user whether a transcription is of interest and only then hands it over for processing.
  /// </summary>
  public interface IAudioServiceUser
  {
      /// <summary>Decides whether this user wants to receive the given transcription.</summary>
      /// <param name="AudioTranscription">The transcribed text.</param>
      /// <returns><c>true</c> when <see cref="ProcessText"/> should be called with this text.</returns>
      bool IsOfInterest(string AudioTranscription);

      /// <summary>Handles a transcription that was accepted by <see cref="IsOfInterest"/>.</summary>
      /// <param name="AudioTranscription">The transcribed text.</param>
      void ProcessText(string AudioTranscription);
  }
  /// <summary>
  /// Records microphone audio in fixed-length clips, detects spoken messages by peak volume,
  /// transcribes them with Whisper and serves the text to every registered
  /// <see cref="IAudioServiceUser"/>.
  /// </summary>
  public class AudioServer : IDisposable
  {
      const int clipLength = 250; // ms
      const float voiceDetectionThreshold = 0.01f;
      const int maxBufferedBytes = 110000000; // Upper bound of the recording cache before it gets trimmed.
      const int bufferTrimBytes = 50000000;   // Amount of (oldest) bytes dropped when the cache is trimmed.
      const int paddingClips = 2;             // Extra clips captured on top of the detected speech.
      const int maxWhisperThreads = 16;

      readonly string[] knownFalsePositives = ["[BLANK_AUDIO]", "Thank you", "[silence]"];
      readonly WaveInEvent waveIn;
      readonly WaveFormat waveFormat = new(16000, 16, 1); // 16KHz, 16 bits, Mono Channel
      readonly List<byte> recordedBytes = [];
      readonly WhisperFactory whisperFactory;
      readonly WhisperProcessor processor;

      // Transcriptions share one temp file and one processor, so they must not overlap.
      // Intentionally not disposed: an in-flight transcription may still release it after Dispose.
      readonly SemaphoreSlim transcriptionGate = new(1, 1);

      readonly string whisperPrompt = """
          The short audio comes from a user that is speaking to an AI Language Model in real time.
          Pay extra attentions for commands like 'ok stop' or just 'stop'.
          In case of inaudible sentences that might be, assume they're saying 'stop'.
          """.Trim();

      // Tracked stats for Speech Recognition, Parsing, and Serving.
      int currentBlankClips;  // Ideally would work with milliseconds,
      int totalNonBlankClips; // ..but for example's sake they work on a
      int nonIdleTime;        // ..clip-based quant-length (1 = clipLength).
      bool disposed;

      /// <summary>
      /// Detection settings, measured in clips. Default: a speech of 750ms (3 clips),
      /// followed by a pause of 500ms (2 clips).
      /// </summary>
      public (int minBlanksPerSeperation, int minNonBlanksForValidMessages) detectionSettings = (2, 3);

      /// <summary>The consumers that receive every accepted transcription.</summary>
      public HashSet<IAudioServiceUser> ServiceUsers = [];

      /// <summary>Loads the Whisper model and immediately starts recording from the default input device.</summary>
      /// <param name="modelPath">Path of the ggml Whisper model file.</param>
      public AudioServer(string modelPath)
      {
          // Adjust the path based on your GPU's type. On your build you ideally want just the correct runtime build for your project, but here we're having all references, so it's getting confused.
          string? libPath = @$"{Environment.GetFolderPath(Environment.SpecialFolder.UserProfile)}\.nuget\packages\whisper.net.runtime.cublas\1.5.0\build\win-x64\whisper.dll"; // Defaulting to cuBlas.
          if (!File.Exists(libPath))
          {
              Console.Error.WriteLine($"Could not find dll file at {libPath}.\nWhisper will load with the default runtime (possibly CPU).");
              libPath = null;
          }

          whisperFactory = WhisperFactory.FromPath(modelPath, libraryPath: libPath);

          // Never ask for more threads than the machine has cores.
          var threads = Math.Clamp(Environment.ProcessorCount, 1, maxWhisperThreads);
          processor = whisperFactory.CreateBuilder().WithThreads(threads).WithPrompt(whisperPrompt).WithLanguage("en").Build();

          waveIn = new WaveInEvent() { BufferMilliseconds = clipLength, WaveFormat = waveFormat };
          waveIn.DataAvailable += WaveIn_DataAvailable;
          waveIn.StartRecording();
      }

      /// <summary>
      /// Handles one recorded clip: caches it, classifies it as speech or blank by its peak volume,
      /// and starts a transcription once enough speech is followed by a long enough pause.
      /// </summary>
      void WaveIn_DataAvailable(object? sender, WaveInEventArgs e)
      {
          // Cache the recorded bytes
          recordedBytes.AddRange(e.Buffer[..e.BytesRecorded]);
          if (recordedBytes.Count > maxBufferedBytes) { recordedBytes.RemoveRange(0, bufferTrimBytes); }

          // Get the max volume contained inside the clip
          // This byte->sample algorithm is from: https://github.com/naudio/NAudio/blob/master/Docs/RecordingLevelMeter.md#calculating-peak-values
          var maxVolume = 0f;
          for (int i = 0; i + 1 < e.BytesRecorded; i += 2) // 'i + 1' keeps an odd byte count from reading past the recorded data.
          {
              var sample = (short)((e.Buffer[i + 1] << 8) | e.Buffer[i]);
              maxVolume = Math.Max(maxVolume, Math.Abs(sample / 32768f));
          }

          // Compare the volume with the threshold and act accordingly.
          if (maxVolume > voiceDetectionThreshold)
          {
              currentBlankClips = 0;
              totalNonBlankClips++;
              nonIdleTime++;
              return;
          }

          if (++currentBlankClips < detectionSettings.minBlanksPerSeperation)
          {
              // A short pause inside a message still counts towards its length.
              nonIdleTime++;
              return;
          }

          // Once an interesting and 'full' set of clips pops up, serve it.
          // Anything shorter might be a false-positive -- knock, noise, cough, anything.
          if (totalNonBlankClips > 0 && totalNonBlankClips >= detectionSettings.minNonBlanksForValidMessages)
          {
              // Capture synchronously: the stats are reset right below and the cache keeps growing.
              var capturedClipBytes = CaptureRecentAudio(nonIdleTime + paddingClips);
              _ = TranscribeAndServeAsync(capturedClipBytes);
          }

          (currentBlankClips, totalNonBlankClips, nonIdleTime) = (0, 0, 0);
      }

      /// <summary>Copies the most recent <paramref name="clipCount"/> clips (or less, if not available) out of the cache.</summary>
      byte[] CaptureRecentAudio(int clipCount)
      {
          var bytesPerClip = waveFormat.AverageBytesPerSecond * clipLength / 1000;
          var byteCount = (int)Math.Min(recordedBytes.Count, (long)bytesPerClip * clipCount);
          var captured = new byte[byteCount];
          recordedBytes.CopyTo(recordedBytes.Count - byteCount, captured, 0, byteCount);
          return captured;
      }

      /// <summary>
      /// Transcribes the captured audio and serves the text to every interested user.
      /// Runs fire-and-forget, so failures are reported on the error stream instead of being thrown.
      /// </summary>
      async Task TranscribeAndServeAsync(byte[] capturedClipBytes)
      {
          try
          {
              await transcriptionGate.WaitAsync();
              try
              {
                  var transcribedText = await ProcessAudio(capturedClipBytes, Path.Combine("Assets", "temp.wav")); // Save to temporary file.
                  if (IsFalsePositive(transcribedText)) { return; } // False positive.. yikes!
                  foreach (var user in ServiceUsers.Where(x => x.IsOfInterest(transcribedText))) { user.ProcessText(transcribedText); }
              }
              finally
              {
                  transcriptionGate.Release();
              }
          }
          catch (Exception ex)
          {
              // Nobody awaits this task; an escaping exception would otherwise go unnoticed.
              Console.Error.WriteLine($"Transcription failed: {ex.Message}");
          }
      }

      /// <summary>Tells whether the text is empty or one of the known false positives, ignoring surrounding whitespace, trailing punctuation and casing.</summary>
      bool IsFalsePositive(string transcribedText)
      {
          var normalized = transcribedText.Trim().TrimEnd('.', '!', '?');
          return normalized.Length == 0 || knownFalsePositives.Contains(normalized, StringComparer.OrdinalIgnoreCase);
      }

      /// <summary> Requests a transcription and responds with the text. </summary>
      /// <param name="bytes">Raw audio samples in the recording format.</param>
      /// <param name="tempWavFilePath">Path of the wav file the audio is written to before being processed.</param>
      /// <returns>The text of all transcribed segments, joined by spaces.</returns>
      async Task<string> ProcessAudio(byte[] bytes, string tempWavFilePath)
      {
          using (var writer = new WaveFileWriter(tempWavFilePath, waveFormat)) { writer.Write(bytes, 0, bytes.Length); }

          using var wavStream = new MemoryStream();
          using (var fileStream = File.OpenRead(tempWavFilePath)) { await fileStream.CopyToAsync(wavStream); }
          wavStream.Seek(0, SeekOrigin.Begin);

          Console.Beep();
          var segments = await processor.ProcessAsync(wavStream).Select(x => x.Text).ToListAsync();
          return string.Join(' ', segments);
      }

      /// <summary>Stops listening to the recorder and releases the recorder, the processor and the Whisper factory.</summary>
      void IDisposable.Dispose()
      {
          if (disposed) { return; }
          disposed = true;

          waveIn.DataAvailable -= WaveIn_DataAvailable;
          waveIn.Dispose();
          processor.Dispose();
          whisperFactory.Dispose();
      }
  }
  /// <summary>Console helpers shared by the example: model selection, a loading indicator and an exit prompt.</summary>
  public static class ConsoleStyleHelpers
  {
      /// <summary>
      /// Picks a model file (<c>*bin</c>) from the <c>Assets</c> directory. A single model is returned
      /// directly; with several models the user chooses one by pressing its number (a single key
      /// press, so only the first nine entries are selectable); without any model, download
      /// instructions are printed.
      /// </summary>
      /// <returns>The path of the chosen model, or <c>null</c> when no model is available.</returns>
      public static string? SelectAudioModel()
      {
          // A missing directory is treated like an empty one, so the instructions below are shown.
          string[] models = Directory.Exists("Assets") ? Directory.GetFiles("Assets", "*bin") : [];

          if (models.Length == 0)
          {
              Console.ForegroundColor = ConsoleColor.Red;
              Console.WriteLine($"Download a non-quantized model and place it in the executing directory:");
              Console.ForegroundColor = ConsoleColor.Yellow;
              Console.WriteLine($"\t{Path.Combine(Environment.CurrentDirectory, "Assets")}");
              Console.ForegroundColor = ConsoleColor.Red;
              Console.WriteLine("You can find the official ggml models in whisper.cpp's huggingface repository: ");
              Console.ForegroundColor = ConsoleColor.Blue;
              Console.WriteLine("\thttps://huggingface.co/ggerganov/whisper.cpp/tree/main");
              Console.ForegroundColor = ConsoleColor.White;
              return null;
          }

          if (models.Length == 1) { return models[0]; }

          for (int i = 0; i < models.Length; i++)
          {
              Console.ForegroundColor = ConsoleColor.Blue;
              Console.Write($"{i + 1}: ");
              Console.ForegroundColor = ConsoleColor.Yellow;
              Console.WriteLine(Path.GetFileName(models[i]));
          }

          while (true)
          {
              Console.ForegroundColor = ConsoleColor.DarkCyan;
              Console.Write($"Please choose a model (1-{models.Length}): ");
              var key = Console.ReadKey().KeyChar;
              Console.WriteLine();

              // Only '1'..'9' within the listed range are valid; notably '0' must be rejected,
              // as it would otherwise index before the start of the array.
              var choice = key - '0';
              if (key is < '1' or > '9' || choice > models.Length) { continue; }

              Console.ForegroundColor = ConsoleColor.White;
              return models[choice - 1];
          }
      }

      /// <summary>
      /// Prints <paramref name="initialText"/>, then a dot every 100ms until
      /// <paramref name="ShouldContinue"/> returns <c>true</c>, and finally the elapsed time.
      /// </summary>
      /// <param name="initialText">The text printed before the progress dots.</param>
      /// <param name="ShouldContinue">Polled before each dot; returning <c>true</c> ends the loading indicator.</param>
      /// <returns>A task that completes once the indicator has finished printing.</returns>
      public static async Task LoadPrint(string initialText, Func<bool> ShouldContinue)
      {
          // A stopwatch is monotonic, unlike wall-clock differences which jump with clock adjustments.
          var stopwatch = System.Diagnostics.Stopwatch.StartNew();
          Console.WriteLine(initialText);
          while (!ShouldContinue())
          {
              Console.Write(".");
              await Task.Delay(100);
          }
          Console.WriteLine($" Completed in {stopwatch.Elapsed.TotalSeconds:f2}s.");
      }

      /// <summary>Announces that voice capture is active and waits until the user presses a key.</summary>
      /// <returns>A task that completes after a key has been pressed.</returns>
      public static async Task WaitUntilExit()
      {
          Console.ForegroundColor = ConsoleColor.Green;
          Console.WriteLine("Voice active. Begin talking to transcribe. Press any key at any time to exit.");
          Console.ForegroundColor = ConsoleColor.White;
          await Task.Delay(1000);
          Console.ReadKey();
      }
  }
  172. }
  173. }