
Prefill.cs 4.7 kB

#pragma warning disable CS8618

using System.Text;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Engines;
using BenchmarkDotNet.Jobs;
using LLama.Abstractions;
using LLama.Common;
using LLama.Native;

namespace LLama.Benchmark.LLamaExecutorBenchmark
{
#if WINDOWS
    [BenchmarkDotNet.Diagnostics.Windows.Configs.NativeMemoryProfiler]
#endif
    [BenchmarkCategory("Executor", "LLama")]
    [SimpleJob(RunStrategy.Monitoring, runtimeMoniker: RuntimeMoniker.Net80)]
    [MemoryDiagnoser]
    [MinIterationCount(1)]
    [MaxIterationCount(16)]
    [RPlotExporter]
    public class PrefillBenchmark
    {
        /// <summary>
        /// (prompt length, context length)
        /// </summary>
        public IEnumerable<(int, uint)> PromptAndContextLengths => new (int, uint)[]
        {
            (512, 2048),
            (2024, 2048)
        };

        /// <summary>
        /// (model path, gpu layer count)
        /// </summary>
        // TODO: specify the native library to load here to test the CPU case better.
        public IEnumerable<(string, int)> ModelAndGpuLayerCounts => new (string, int)[]
        {
            (Path.Combine(Constants.ModelDir, Constants.Generative7BModelPath), 0),
            (Path.Combine(Constants.ModelDir, Constants.Generative7BModelPath), 10),
            (Path.Combine(Constants.ModelDir, Constants.Generative7BModelPath), 20)
        };

        public IEnumerable<ExecutorType> ExecutorTypes => new ExecutorType[]
        {
            ExecutorType.Interactive,
            ExecutorType.Stateless
        };

        [ParamsSource(nameof(PromptAndContextLengths))]
        public (int, uint) PromptAndContextLength { get; set; }

        [ParamsSource(nameof(ModelAndGpuLayerCounts))]
        public (string, int) ModelAndGpuLayerCount { get; set; }

        [ParamsSource(nameof(ExecutorTypes))]
        public ExecutorType ExecutorType { get; set; }

        /// <summary>
        /// Params used to create a model.
        /// </summary>
        public ModelParams ModelParams { get; set; }

        /// <summary>
        /// Params used in inference.
        /// </summary>
        public InferenceParams InferenceParams { get; set; }

        /// <summary>
        /// Prompt used to run text generation.
        /// </summary>
        public string Prompt { get; set; }

        public ILLamaExecutor Executor { get; set; }

        private void InitializeParamsAndModel()
        {
            ModelParams = new ModelParams(ModelAndGpuLayerCount.Item1)
            {
                ContextSize = PromptAndContextLength.Item2,
                GpuLayerCount = ModelAndGpuLayerCount.Item2
            };
            Prompt = File.ReadAllText(Constants.TextCompletionPromptsFilePath).Substring(0, PromptAndContextLength.Item1);
            InferenceParams = new InferenceParams()
            {
                Temperature = 0.6f,
                MaxTokens = 1 // Only prefill, no generation here.
            };

            LLamaWeights weights = LLamaWeights.LoadFromFile(ModelParams);
            LLamaContext context = weights.CreateContext(ModelParams);
            Executor = ExecutorType switch
            {
                ExecutorType.Interactive => new InteractiveExecutor(context),
                ExecutorType.Instruct => new InstructExecutor(context),
                ExecutorType.Stateless => new StatelessExecutor(weights, ModelParams),
                _ => throw new NotSupportedException()
            };
        }

        [GlobalSetup(Targets = [nameof(Basic)])]
        public void GlobalSetup()
        {
            var showLLamaCppLogs = true;
            NativeLibraryConfig
                .Instance
                .WithLogCallback((level, message) =>
                {
                    if (showLLamaCppLogs)
                        Console.WriteLine($"[llama {level}]: {message.TrimEnd('\n')}");
                }).WithCuda().SkipCheck().WithAutoFallback(false);

            // Calling this method forces loading to occur now.
            NativeApi.llama_empty_call();
            InitializeParamsAndModel();
        }

        [IterationCleanup(Targets = [nameof(Basic)])]
        public void GlobalCleanup()
        {
            if (ExecutorType != ExecutorType.Stateless) // the stateless executor always disposes its `Context` property
            {
                Executor.Context.NativeHandle.KvCacheClear();
            }
        }

        [Benchmark]
        public async Task<string> Basic()
        {
            StringBuilder sb = new();
            await foreach (var text in Executor.InferAsync(Prompt, InferenceParams))
            {
                sb.Append(text);
            }
            return sb.ToString();
        }
    }
}
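
For reference, a minimal sketch of an entry point that would run this benchmark through BenchmarkDotNet; the benchmark project's actual `Program.cs` is not shown here, so this is an assumption about the wiring, not the project's own code. BenchmarkDotNet expands the three `[ParamsSource]` properties into the full cross-product of cases automatically.

// Hypothetical Program.cs: runs PrefillBenchmark via the standard
// BenchmarkDotNet runner. The real project may use a different entry point.
using BenchmarkDotNet.Running;
using LLama.Benchmark.LLamaExecutorBenchmark;

BenchmarkRunner.Run<PrefillBenchmark>();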