#pragma warning disable CS8618
using System.Text;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Engines;
using BenchmarkDotNet.Jobs;
using LLama.Abstractions;
using LLama.Common;
using LLama.Native;
namespace LLama.Benchmark.LLamaExecutorBenchmark
{
#if WINDOWS
[BenchmarkDotNet.Diagnostics.Windows.Configs.NativeMemoryProfiler]
#endif
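/// <summary>
/// Measures prompt prefill (prompt processing) cost across executor types,
/// GPU layer counts and prompt/context lengths.
/// </summary>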
[BenchmarkCategory("Executor", "LLama")]
[SimpleJob(RunStrategy.Monitoring, runtimeMoniker: RuntimeMoniker.Net80)]
[MemoryDiagnoser]
[MinIterationCount(1)]
[MaxIterationCount(16)]
[RPlotExporter]
public class PrefillBenchmark
{
/// <summary>
/// (prompt length, context length)
/// </summary>
public IEnumerable<(int, uint)> PromptAndContextLengths => new (int, uint)[]
{
(512, 2048),
(2024, 2048)
};
/// <summary>
/// (model path, gpu layer count)
/// </summary>
public IEnumerable<(string, int)> ModelAndGpuLayerCounts => new (string, int)[]
// TODO: specify the native library to load here to test cpu case better.
{
(Path.Combine(Constants.ModelDir, Constants.Generative7BModelPath), 0),
(Path.Combine(Constants.ModelDir, Constants.Generative7BModelPath), 10),
(Path.Combine(Constants.ModelDir, Constants.Generative7BModelPath), 20)
};
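/// <summary>
/// Executor implementations to benchmark.
/// </summary>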
public IEnumerable<ExecutorType> ExecutorTypes => new ExecutorType[]
{
ExecutorType.Interactive,
ExecutorType.Stateless
};
[ParamsSource(nameof(PromptAndContextLengths))]
public (int, uint) PromptAndContextLength { get; set; }
[ParamsSource(nameof(ModelAndGpuLayerCounts))]
public (string, int) ModelAndGpuLayerCount { get; set; }
[ParamsSource(nameof(ExecutorTypes))]
public ExecutorType ExecutorType { get; set; }
/// <summary>
/// Params used to create a model.
/// </summary>
public ModelParams ModelParams { get; set; }
/// <summary>
/// Params used in inference.
/// </summary>
public InferenceParams InferenceParams { get; set; }
/// <summary>
/// Prompt used to run text generation.
/// </summary>
public string Prompt { get; set; }
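/// <summary>
/// The executor under benchmark.
/// </summary>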
public ILLamaExecutor Executor { get; set; }
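/// <summary>
/// Builds <see cref="ModelParams"/> and <see cref="InferenceParams"/>, loads the weights
/// and creates the executor for the current parameter combination.
/// </summary>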
private void InitializeParamsAndModel()
{
ModelParams = new ModelParams(ModelAndGpuLayerCount.Item1)
{
ContextSize = PromptAndContextLength.Item2,
GpuLayerCount = ModelAndGpuLayerCount.Item2
};
Prompt = File.ReadAllText(Constants.TextCompletionPromptsFilePath).Substring(0, PromptAndContextLength.Item1);
InferenceParams = new InferenceParams()
{
Temperature = 0.6f,
MaxTokens = 1 // Only prefill, no generation here.
};
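// Load the weights and create a shared context; the stateless executor is built
// from the weights and model params instead of the shared context.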
LLamaWeights weights = LLamaWeights.LoadFromFile(ModelParams);
LLamaContext context = weights.CreateContext(ModelParams);
Executor = ExecutorType switch
{
ExecutorType.Interactive => new InteractiveExecutor(context),
ExecutorType.Instruct => new InstructExecutor(context),
ExecutorType.Stateless => new StatelessExecutor(weights, ModelParams),
_ => throw new NotSupportedException()
};
}
[GlobalSetup(Targets = [nameof(Basic)])]
public void GlobalSetup()
{
var showLLamaCppLogs = true;
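// Forward llama.cpp native logs to the console and force the CUDA backend,
// skipping the compatibility check and disabling automatic fallback.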
NativeLibraryConfig
.Instance
.WithLogCallback((level, message) =>
{
if (showLLamaCppLogs)
Console.WriteLine($"[llama {level}]: {message.TrimEnd('\n')}");
}).WithCuda().SkipCheck().WithAutoFallback(false);
// Calling this method forces loading to occur now.
NativeApi.llama_empty_call();
InitializeParamsAndModel();
}
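/// <summary>
/// Clears the KV cache between iterations so each run starts from an empty cache.
/// </summary>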
[IterationCleanup(Targets = [nameof(Basic)])]
public void GlobalCleanup()
{
if (ExecutorType != ExecutorType.Stateless) // the stateless executor always disposes its own `Context` property
{
Executor.Context.NativeHandle.KvCacheClear();
}
}
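/// <summary>
/// Runs a single prefill: the prompt is evaluated and only one token is generated
/// (MaxTokens = 1), so the measurement is dominated by prompt processing.
/// </summary>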
[Benchmark]
public async Task<string> Basic()
{
StringBuilder sb = new();
await foreach(var text in Executor.InferAsync(Prompt, InferenceParams))
{
sb.Append(text);
}
return sb.ToString();
}
}
}