| @@ -10,5 +10,7 @@ extend-exclude = [ | |||
| "_typos.toml", | |||
| "docs/xmldocs/", | |||
| "LLama.Web/wwwroot/", | |||
| "LLama/runtimes/deps/" | |||
| "LLama/runtimes/deps/", | |||
| "LLama.Benchmark/Assets/", | |||
| "LLama.Examples/Assets/" | |||
| ] | |||
| @@ -0,0 +1,20 @@ | |||
from huggingface_hub import hf_hub_download
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-list', type=str, required=True)
    parser.add_argument('--model-dir', type=str, required=True)
    parser.add_argument('--endpoint', type=str, default='https://huggingface.co')
    args = parser.parse_args()

    # Each line of the model list file is "<repo_id>,<filename>".
    # Fixes: the previous code used f.readline(), which (a) kept the trailing
    # newline on `filename` (requesting a non-existent file from the hub) and
    # (b) downloaded only the first listed model despite "--model-list".
    with open(args.model_list, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # skip blank lines / trailing newline at EOF
            repo_id, filename = (part.strip() for part in line.split(',', 1))
            hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir=args.model_dir,
                local_dir_use_symlinks=False,
                endpoint=args.endpoint,
            )
| @@ -0,0 +1,74 @@ | |||
| name: Benchmark Test | |||
| on: | |||
| push: | |||
| branches: [master] | |||
| pull_request: | |||
| branches: [master] | |||
| concurrency: | |||
| group: ${{ github.workflow }}-${{ github.ref }}-benchmark | |||
| cancel-in-progress: true | |||
| jobs: | |||
| linux-benchmark-cuda: | |||
| if: contains(github.event.pull_request.labels.*.name, 'benchmark') | |||
| runs-on: [self-hosted, linux, gpu] | |||
| strategy: | |||
| fail-fast: false | |||
| matrix: | |||
| build: [cuda11] | |||
| include: | |||
| - build: cuda11 | |||
| image: nvidia/cuda:11.7.1-devel-ubuntu22.04 | |||
| modeldir: /llamasharp_ci/models_benchmark | |||
| # - build: cuda12 | |||
| # image: nvidia/cuda:12.1.1-runtime-ubuntu22.04 | |||
| container: | |||
| image: ${{ matrix.image }} | |||
| env: | |||
| BENCHMARK_MODEL_DIR: ${{ matrix.modeldir }} | |||
| ports: | |||
| - 80 | |||
| volumes: | |||
| - /llamasharp_ci:/llamasharp_ci | |||
| options: --gpus=all --ipc=host --runtime=nvidia | |||
| steps: | |||
| - uses: actions/checkout@v4 | |||
| - name: Install libraries | |||
| run: | | |||
| apt update | |||
| apt install -y curl libicu-dev | |||
| apt-get install -y wget | |||
| wget https://packages.microsoft.com/config/ubuntu/22.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb | |||
| dpkg -i packages-microsoft-prod.deb | |||
| rm packages-microsoft-prod.deb | |||
| apt-get update && apt-get install -y dotnet-sdk-8.0 | |||
| - name: Prepare models | |||
| run: | | |||
| apt-get update | |||
| apt-get install -y python3.10 python3-pip | |||
| python3 --version | |||
| pip install huggingface_hub | |||
| python3 .github/download_models.py --model-dir ${{ matrix.modeldir }} --model-list LLama.Benchmark/Assets/models.txt --endpoint https://hf-mirror.com | |||
| - name: Clear package cache | |||
| run: dotnet clean LLamaSharp.sln && dotnet nuget locals all --clear | |||
| - name: Restore packages | |||
| run: dotnet restore LLamaSharp.sln | |||
| - name: Build | |||
| run: | | |||
| dotnet clean | |||
| dotnet build LLama/LLamaSharp.csproj -c Release --no-restore | |||
| dotnet build LLama.Benchmark/LLama.Benchmark.csproj -c Release --no-restore | |||
| - name: Run benchmark test | |||
| run: dotnet run --project LLama.Benchmark/LLama.Benchmark.csproj -c Release --anyCategories LLama | |||
| - name: Upload artifacts | |||
| if: always() | |||
| uses: actions/upload-artifact@v4 | |||
| with: | |||
| name: Benchmark_Results | |||
| path: BenchmarkDotNet.Artifacts/results/* | |||
| @@ -1,4 +1,4 @@ | |||
| name: CI | |||
| name: Unit Test | |||
| on: | |||
| push: | |||
| branches: [master] | |||
| @@ -346,3 +346,5 @@ site/ | |||
| /LLama.Unittest/Models/*.bin | |||
| /LLama.Unittest/Models/*.gguf | |||
| /LLama.Benchmark/Models/*.bin | |||
| /LLama.Benchmark/Models/*.gguf | |||
| @@ -0,0 +1 @@ | |||
| TheBloke/Llama-2-7b-Chat-GGUF,llama-2-7b-chat.Q3_K_S.gguf | |||
| @@ -0,0 +1,10 @@ | |||
namespace LLama.Benchmark
{
    /// <summary>
    /// Selects which executor implementation a benchmark should exercise.
    /// Each value maps to one concrete executor (InteractiveExecutor,
    /// InstructExecutor or StatelessExecutor) when the benchmark builds it.
    /// </summary>
    public enum ExecutorType
    {
        /// <summary>Maps to <c>InteractiveExecutor</c> (keeps a context across calls).</summary>
        Interactive,
        /// <summary>Maps to <c>InstructExecutor</c> (keeps a context across calls).</summary>
        Instruct,
        /// <summary>Maps to <c>StatelessExecutor</c> (disposes its context after each call).</summary>
        Stateless
    }
}
| @@ -0,0 +1,23 @@ | |||
namespace LLama.Benchmark
{
    /// <summary>
    /// Shared file-system locations used by the benchmark suite.
    /// </summary>
    internal static class Constants
    {
        /// <summary>
        /// Root directory holding the benchmark models, taken from the
        /// BENCHMARK_MODEL_DIR environment variable. Empty string when unset,
        /// which makes the model paths resolve relative to the working directory.
        /// </summary>
        public static string ModelDir
        {
            get
            {
                return Environment.GetEnvironmentVariable("BENCHMARK_MODEL_DIR") ?? "";
            }
        }

        /// <summary>Full path of the 7B chat model used by the generation benchmarks.</summary>
        public static string Generative7BModelPath => Path.Combine(ModelDir, "llama-2-7b-chat.Q3_K_S.gguf");

        /// <summary>Full path of the embedding model.</summary>
        public static string EmbeddingModelPath => Path.Combine(ModelDir, "all-MiniLM-L12-v2.Q8_0.gguf");

        // Fix: these two previously used a single-argument Path.Combine (a no-op),
        // omitting the ModelDir prefix that every other model path includes.
        /// <summary>Full path of the LLaVA model weights.</summary>
        public static string LLavaModelPath => Path.Combine(ModelDir, "llava-v1.6-mistral-7b.Q3_K_XS.gguf");

        /// <summary>Full path of the LLaVA multi-modal projection weights.</summary>
        public static string LLavaMmpPath => Path.Combine(ModelDir, "mmproj-model-f16.gguf");

        /// <summary>Sample image shipped with the benchmark assets (relative path).</summary>
        public static string LLavaImage => "Assets/extreme-ironing-taxi-610x427.jpg";

        /// <summary>Prompt corpus used by the text-completion benchmarks (relative path).</summary>
        public static string TextCompletionPromptsFilePath => "Assets/TextCompletionPrompts.txt";
    }
}
| @@ -0,0 +1,30 @@ | |||
| <Project Sdk="Microsoft.NET.Sdk"> | |||
| <Import Project="..\LLama\LLamaSharp.Runtime.targets" /> | |||
| <PropertyGroup> | |||
| <OutputType>Exe</OutputType> | |||
| <TargetFramework>net8.0</TargetFramework> | |||
| <ImplicitUsings>enable</ImplicitUsings> | |||
| <Nullable>enable</Nullable> | |||
| <Configuration>Release</Configuration> | |||
| </PropertyGroup> | |||
| <ItemGroup> | |||
| <PackageReference Include="BenchmarkDotNet" Version="0.13.12" /> | |||
| <PackageReference Include="BenchmarkDotNet.Diagnostics.Windows" Version="0.13.12" /> | |||
| </ItemGroup> | |||
| <ItemGroup> | |||
| <ProjectReference Include="..\LLama\LLamaSharp.csproj" /> | |||
| </ItemGroup> | |||
| <ItemGroup> | |||
| <None Update="Assets\TextCompletionPrompts.txt"> | |||
| <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> | |||
| </None> | |||
| <None Update="Assets\extreme-ironing-taxi-610x427.jpg"> | |||
| <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> | |||
| </None> | |||
| </ItemGroup> | |||
| </Project> | |||
| @@ -0,0 +1,126 @@ | |||
#pragma warning disable CS8618

using System.Text;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Engines;
using BenchmarkDotNet.Jobs;
using LLama.Abstractions;
using LLama.Common;

namespace LLama.Benchmark.LLamaExecutorBenchmark
{
    /// <summary>
    /// Measures the cost of prompt prefill (MaxTokens = 1, so only a single token
    /// is generated) across prompt/context lengths, GPU layer counts and executor types.
    /// </summary>
#if WINDOWS
    [BenchmarkDotNet.Diagnostics.Windows.Configs.NativeMemoryProfiler]
#endif
    [BenchmarkCategory("Executor", "LLama")]
    [SimpleJob(RunStrategy.Monitoring, runtimeMoniker: RuntimeMoniker.Net80)]
    [MemoryDiagnoser]
    [MinIterationCount(1)]
    [MaxIterationCount(16)]
    [RPlotExporter]
    public class PrefillBenchmark
    {
        /// <summary>
        /// (prompt length, context length)
        /// NOTE(review): 2024 looks like a typo for 2048 — confirm the intended prompt length.
        /// </summary>
        public IEnumerable<(int, uint)> PromptAndContextLengths => new (int, uint)[]
        {
            (512, 2048),
            (2024, 2048)
        };

        /// <summary>
        /// (model path, gpu layer count)
        /// Fix: Constants.Generative7BModelPath already includes Constants.ModelDir, so it
        /// must not be combined with ModelDir again. The old code produced a doubled
        /// directory prefix and only worked by accident when ModelDir was an absolute
        /// path (Path.Combine discards earlier parts when a later part is rooted).
        /// </summary>
        public IEnumerable<(string, int)> ModelAndGpuLayerCounts => new (string, int)[]
        // TODO: specify the native library to load here to test cpu case better.
        {
            (Constants.Generative7BModelPath, 0),
            (Constants.Generative7BModelPath, 10),
            (Constants.Generative7BModelPath, 20)
        };

        /// <summary>
        /// Executor variants under test. (Instruct is handled by the switch below but
        /// deliberately not benchmarked here.)
        /// </summary>
        public IEnumerable<ExecutorType> ExecutorTypes => new ExecutorType[]
        {
            ExecutorType.Interactive,
            ExecutorType.Stateless
        };

        [ParamsSource(nameof(PromptAndContextLengths))]
        public (int, uint) PromptAndContextLength { get; set; }

        [ParamsSource(nameof(ModelAndGpuLayerCounts))]
        public (string, int) ModelAndGpuLayerCount { get; set; }

        [ParamsSource(nameof(ExecutorTypes))]
        public ExecutorType ExecutorType { get; set; }

        /// <summary>
        /// Params used to create a model.
        /// </summary>
        public ModelParams ModelParams { get; set; }

        /// <summary>
        /// Params used in inference.
        /// </summary>
        public InferenceParams InferenceParams { get; set; }

        /// <summary>
        /// Prompt used to run text generation, truncated to the configured prompt length.
        /// </summary>
        public string Prompt { get; set; }

        /// <summary>
        /// The executor under test; created once in <see cref="GlobalSetup"/>.
        /// </summary>
        public ILLamaExecutor Executor { get; set; }

        // Loads the weights, creates a context and builds the executor selected
        // by ExecutorType from the current parameter combination.
        private void InitializeParamsAndModel()
        {
            ModelParams = new ModelParams(ModelAndGpuLayerCount.Item1)
            {
                ContextSize = PromptAndContextLength.Item2,
                GpuLayerCount = ModelAndGpuLayerCount.Item2
            };
            Prompt = File.ReadAllText(Constants.TextCompletionPromptsFilePath).Substring(0, PromptAndContextLength.Item1);
            InferenceParams = new InferenceParams()
            {
                Temperature = 0.6f,
                MaxTokens = 1 // Only prefill, no generation here.
            };

            LLamaWeights weights = LLamaWeights.LoadFromFile(ModelParams);
            LLamaContext context = weights.CreateContext(ModelParams);
            Executor = ExecutorType switch
            {
                ExecutorType.Interactive => new InteractiveExecutor(context),
                ExecutorType.Instruct => new InstructExecutor(context),
                ExecutorType.Stateless => new StatelessExecutor(weights, ModelParams),
                _ => throw new NotSupportedException()
            };
        }

        [GlobalSetup(Targets = [nameof(Basic)])]
        public void GlobalSetup()
        {
            InitializeParamsAndModel();
        }

        [IterationCleanup(Targets = [nameof(Basic)])]
        public void GlobalCleanup()
        {
            if (ExecutorType != ExecutorType.Stateless) // stateless executor always dispose its `Context` property
            {
                // Reset the KV cache so each iteration measures a cold prefill.
                Executor.Context.NativeHandle.KvCacheClear();
            }
        }

        /// <summary>
        /// Runs one inference over the prompt and collects the (single-token) output.
        /// </summary>
        [Benchmark]
        public async Task<string> Basic()
        {
            StringBuilder sb = new();
            await foreach (var text in Executor.InferAsync(Prompt, InferenceParams))
            {
                sb.Append(text);
            }
            return sb.ToString();
        }
    }
}
| @@ -0,0 +1,13 @@ | |||
using BenchmarkDotNet.Running;

namespace LLama.Benchmark
{
    /// <summary>
    /// Entry point: hands command-line arguments to BenchmarkDotNet, which selects
    /// and runs the benchmarks defined in this assembly.
    /// </summary>
    public class Program
    {
        public static void Main(string[] args)
        {
            var switcher = BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly);
            var summary = switcher.Run(args);
            Console.WriteLine(summary);
        }
    }
}
| @@ -17,6 +17,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LLamaSharp.SemanticKernel", | |||
| EndProject | |||
| Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LLamaSharp.KernelMemory", "LLama.KernelMemory\LLamaSharp.KernelMemory.csproj", "{E5589AE7-B86F-4343-A1CC-8E5D34596E52}" | |||
| EndProject | |||
| Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LLama.Benchmark", "LLama.Benchmark\LLama.Benchmark.csproj", "{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}" | |||
| EndProject | |||
| Global | |||
| GlobalSection(SolutionConfigurationPlatforms) = preSolution | |||
| Debug|Any CPU = Debug|Any CPU | |||
| @@ -111,6 +113,18 @@ Global | |||
| {E5589AE7-B86F-4343-A1CC-8E5D34596E52}.Release|Any CPU.Build.0 = Release|Any CPU | |||
| {E5589AE7-B86F-4343-A1CC-8E5D34596E52}.Release|x64.ActiveCfg = Release|Any CPU | |||
| {E5589AE7-B86F-4343-A1CC-8E5D34596E52}.Release|x64.Build.0 = Release|Any CPU | |||
| {90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU | |||
| {90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|Any CPU.Build.0 = Debug|Any CPU | |||
| {90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|x64.ActiveCfg = Debug|Any CPU | |||
| {90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|x64.Build.0 = Debug|Any CPU | |||
| {90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|Any CPU.ActiveCfg = Debug|Any CPU | |||
| {90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|Any CPU.Build.0 = Debug|Any CPU | |||
| {90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|x64.ActiveCfg = Debug|Any CPU | |||
| {90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|x64.Build.0 = Debug|Any CPU | |||
| {90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|Any CPU.ActiveCfg = Release|Any CPU | |||
| {90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|Any CPU.Build.0 = Release|Any CPU | |||
| {90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|x64.ActiveCfg = Release|Any CPU | |||
| {90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|x64.Build.0 = Release|Any CPU | |||
| EndGlobalSection | |||
| GlobalSection(SolutionProperties) = preSolution | |||
| HideSolutionNode = FALSE | |||