
Prefill.cs 4.7 kB

#pragma warning disable CS8618

using System.Text;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Engines;
using BenchmarkDotNet.Jobs;
using LLama.Abstractions;
using LLama.Common;
using LLama.Native;

namespace LLama.Benchmark.LLamaExecutorBenchmark
{
#if WINDOWS
    [BenchmarkDotNet.Diagnostics.Windows.Configs.NativeMemoryProfiler]
#endif
    [BenchmarkCategory("Executor", "LLama")]
    [SimpleJob(RunStrategy.Monitoring, runtimeMoniker: RuntimeMoniker.Net80)]
    [MemoryDiagnoser]
    [MinIterationCount(1)]
    [MaxIterationCount(16)]
    [RPlotExporter]
    public class PrefillBenchmark
    {
        /// <summary>
        /// (prompt length, context length)
        /// </summary>
        public IEnumerable<(int, uint)> PromptAndContextLengths => new (int, uint)[]
        {
            (512, 2048),
            (2024, 2048)
        };

        /// <summary>
        /// (model path, gpu layer count)
        /// </summary>
        // TODO: specify the native library to load here to test the CPU case better.
        public IEnumerable<(string, int)> ModelAndGpuLayerCounts => new (string, int)[]
        {
            (Path.Combine(Constants.ModelDir, Constants.Generative7BModelPath), 0),
            (Path.Combine(Constants.ModelDir, Constants.Generative7BModelPath), 10),
            (Path.Combine(Constants.ModelDir, Constants.Generative7BModelPath), 20)
        };

        public IEnumerable<ExecutorType> ExecutorTypes => new ExecutorType[]
        {
            ExecutorType.Interactive,
            ExecutorType.Stateless
        };

        [ParamsSource(nameof(PromptAndContextLengths))]
        public (int, uint) PromptAndContextLength { get; set; }

        [ParamsSource(nameof(ModelAndGpuLayerCounts))]
        public (string, int) ModelAndGpuLayerCount { get; set; }

        [ParamsSource(nameof(ExecutorTypes))]
        public ExecutorType ExecutorType { get; set; }

        /// <summary>
        /// Params used to create a model.
        /// </summary>
        public ModelParams ModelParams { get; set; }

        /// <summary>
        /// Params used in inference.
        /// </summary>
        public InferenceParams InferenceParams { get; set; }

        /// <summary>
        /// Prompt used to run text generation.
        /// </summary>
        public string Prompt { get; set; }

        public ILLamaExecutor Executor { get; set; }

        private void InitializeParamsAndModel()
        {
            ModelParams = new ModelParams(ModelAndGpuLayerCount.Item1)
            {
                ContextSize = PromptAndContextLength.Item2,
                GpuLayerCount = ModelAndGpuLayerCount.Item2
            };
            Prompt = File.ReadAllText(Constants.TextCompletionPromptsFilePath).Substring(0, PromptAndContextLength.Item1);
            InferenceParams = new InferenceParams()
            {
                Temperature = 0.6f,
                MaxTokens = 1 // Only prefill, no generation here.
            };

            LLamaWeights weights = LLamaWeights.LoadFromFile(ModelParams);
            LLamaContext context = weights.CreateContext(ModelParams);
            Executor = ExecutorType switch
            {
                ExecutorType.Interactive => new InteractiveExecutor(context),
                ExecutorType.Instruct => new InstructExecutor(context),
                ExecutorType.Stateless => new StatelessExecutor(weights, ModelParams),
                _ => throw new NotSupportedException()
            };
        }

        [GlobalSetup(Targets = [nameof(Basic)])]
        public void GlobalSetup()
        {
            var showLLamaCppLogs = true;
            NativeLibraryConfig
                .Instance
                .WithLogCallback((level, message) =>
                {
                    if (showLLamaCppLogs)
                        Console.WriteLine($"[llama {level}]: {message.TrimEnd('\n')}");
                }).WithCuda().SkipCheck().WithAutoFallback(false);

            // Calling this method forces loading to occur now.
            NativeApi.llama_empty_call();
            InitializeParamsAndModel();
        }

        [IterationCleanup(Targets = [nameof(Basic)])]
        public void GlobalCleanup()
        {
            if (ExecutorType != ExecutorType.Stateless) // the stateless executor always disposes its `Context` property
            {
                Executor.Context.NativeHandle.KvCacheClear();
            }
        }

        [Benchmark]
        public async Task<string> Basic()
        {
            StringBuilder sb = new();
            await foreach (var text in Executor.InferAsync(Prompt, InferenceParams))
            {
                sb.Append(text);
            }
            return sb.ToString();
        }
    }
}
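
For reference, a minimal sketch of an entry point that would run this benchmark through BenchmarkDotNet; the benchmark project's actual `Program.cs` is not shown here, so this is an assumption about the wiring, not the project's own code. BenchmarkDotNet expands the three `[ParamsSource]` properties into the full cross-product of cases automatically.

// Hypothetical Program.cs: runs PrefillBenchmark via the standard
// BenchmarkDotNet runner. The real project may use a different entry point.
using BenchmarkDotNet.Running;
using LLama.Benchmark.LLamaExecutorBenchmark;

BenchmarkRunner.Run<PrefillBenchmark>();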