| @@ -10,5 +10,7 @@ extend-exclude = [ | |||
| "_typos.toml", | |||
| "docs/xmldocs/", | |||
| "LLama.Web/wwwroot/", | |||
| "LLama/runtimes/deps/" | |||
| "LLama/runtimes/deps/", | |||
| "LLama.Benchmark/Assets/", | |||
| "LLama.Examples/Assets/" | |||
| ] | |||
| @@ -0,0 +1,20 @@ | |||
from huggingface_hub import hf_hub_download
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-list', type=str, required=True)
    parser.add_argument('--model-dir', type=str, required=True)
    parser.add_argument('--endpoint', type=str, default='https://huggingface.co')
    args = parser.parse_args()

    # Each line of the model list file is "<repo_id>,<filename>".
    # Fixes: the previous code used f.readline(), which (a) kept the trailing
    # newline on `filename` (requesting a non-existent file from the hub) and
    # (b) downloaded only the first listed model despite "--model-list".
    with open(args.model_list, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # skip blank lines / trailing newline at EOF
            repo_id, filename = (part.strip() for part in line.split(',', 1))
            hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir=args.model_dir,
                local_dir_use_symlinks=False,
                endpoint=args.endpoint,
            )
| @@ -0,0 +1,74 @@ | |||
| name: Benchmark Test | |||
| on: | |||
| push: | |||
| branches: [master] | |||
| pull_request: | |||
| branches: [master] | |||
| concurrency: | |||
| group: ${{ github.workflow }}-${{ github.ref }}-benchmark | |||
| cancel-in-progress: true | |||
| jobs: | |||
| linux-benchmark-cuda: | |||
| if: contains(github.event.pull_request.labels.*.name, 'benchmark') | |||
| runs-on: [self-hosted, linux, gpu] | |||
| strategy: | |||
| fail-fast: false | |||
| matrix: | |||
| build: [cuda11] | |||
| include: | |||
| - build: cuda11 | |||
| image: nvidia/cuda:11.7.1-devel-ubuntu22.04 | |||
| modeldir: /llamasharp_ci/models_benchmark | |||
| # - build: cuda12 | |||
| # image: nvidia/cuda:12.1.1-runtime-ubuntu22.04 | |||
| container: | |||
| image: ${{ matrix.image }} | |||
| env: | |||
| BENCHMARK_MODEL_DIR: ${{ matrix.modeldir }} | |||
| ports: | |||
| - 80 | |||
| volumes: | |||
| - /llamasharp_ci:/llamasharp_ci | |||
| options: --gpus=all --ipc=host --runtime=nvidia | |||
| steps: | |||
| - uses: actions/checkout@v4 | |||
| - name: Install libraries | |||
| run: | | |||
| apt update | |||
| apt install -y curl libicu-dev | |||
| apt-get install -y wget | |||
| wget https://packages.microsoft.com/config/ubuntu/22.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb | |||
| dpkg -i packages-microsoft-prod.deb | |||
| rm packages-microsoft-prod.deb | |||
| apt-get update && apt-get install -y dotnet-sdk-8.0 | |||
| - name: Prepare models | |||
| run: | | |||
| apt-get update | |||
| apt-get install -y python3.10 python3-pip | |||
| python3 --version | |||
| pip install huggingface_hub | |||
| python3 .github/download_models.py --model-dir ${{ matrix.modeldir }} --model-list LLama.Benchmark/Assets/models.txt --endpoint https://hf-mirror.com | |||
| - name: Clear package cache | |||
| run: dotnet clean LLamaSharp.sln && dotnet nuget locals all --clear | |||
| - name: Restore packages | |||
| run: dotnet restore LLamaSharp.sln | |||
| - name: Build | |||
| run: | | |||
| dotnet clean | |||
| dotnet build LLama/LLamaSharp.csproj -c Release --no-restore | |||
| dotnet build LLama.Benchmark/LLama.Benchmark.csproj -c Release --no-restore | |||
| - name: Run benchmark test | |||
| run: dotnet run --project LLama.Benchmark/LLama.Benchmark.csproj -c Release --anyCategories LLama | |||
| - name: Upload artifacts | |||
| if: always() | |||
| uses: actions/upload-artifact@v4 | |||
| with: | |||
| name: Benchmark_Results | |||
| path: BenchmarkDotNet.Artifacts/results/* | |||
| @@ -1,4 +1,4 @@ | |||
| name: CI | |||
| name: Unit Test | |||
| on: | |||
| push: | |||
| branches: [master] | |||
| @@ -346,3 +346,5 @@ site/ | |||
| /LLama.Unittest/Models/*.bin | |||
| /LLama.Unittest/Models/*.gguf | |||
| /LLama.Benchmark/Models/*.bin | |||
| /LLama.Benchmark/Models/*.gguf | |||
| @@ -0,0 +1 @@ | |||
| TheBloke/Llama-2-7b-Chat-GGUF,llama-2-7b-chat.Q3_K_S.gguf | |||
| @@ -0,0 +1,10 @@ | |||
namespace LLama.Benchmark
{
    /// <summary>
    /// Selects which executor implementation a benchmark should exercise.
    /// Each value maps to one concrete executor (InteractiveExecutor,
    /// InstructExecutor or StatelessExecutor) when the benchmark builds it.
    /// </summary>
    public enum ExecutorType
    {
        /// <summary>Maps to <c>InteractiveExecutor</c> (keeps a context across calls).</summary>
        Interactive,
        /// <summary>Maps to <c>InstructExecutor</c> (keeps a context across calls).</summary>
        Instruct,
        /// <summary>Maps to <c>StatelessExecutor</c> (disposes its context after each call).</summary>
        Stateless
    }
}
| @@ -0,0 +1,23 @@ | |||
namespace LLama.Benchmark
{
    /// <summary>
    /// Shared file-system locations used by the benchmark suite.
    /// </summary>
    internal static class Constants
    {
        /// <summary>
        /// Root directory holding the benchmark models, taken from the
        /// BENCHMARK_MODEL_DIR environment variable. Empty string when unset,
        /// which makes the model paths resolve relative to the working directory.
        /// </summary>
        public static string ModelDir
        {
            get
            {
                return Environment.GetEnvironmentVariable("BENCHMARK_MODEL_DIR") ?? "";
            }
        }

        /// <summary>Full path of the 7B chat model used by the generation benchmarks.</summary>
        public static string Generative7BModelPath => Path.Combine(ModelDir, "llama-2-7b-chat.Q3_K_S.gguf");

        /// <summary>Full path of the embedding model.</summary>
        public static string EmbeddingModelPath => Path.Combine(ModelDir, "all-MiniLM-L12-v2.Q8_0.gguf");

        // Fix: these two previously used a single-argument Path.Combine (a no-op),
        // omitting the ModelDir prefix that every other model path includes.
        /// <summary>Full path of the LLaVA model weights.</summary>
        public static string LLavaModelPath => Path.Combine(ModelDir, "llava-v1.6-mistral-7b.Q3_K_XS.gguf");

        /// <summary>Full path of the LLaVA multi-modal projection weights.</summary>
        public static string LLavaMmpPath => Path.Combine(ModelDir, "mmproj-model-f16.gguf");

        /// <summary>Sample image shipped with the benchmark assets (relative path).</summary>
        public static string LLavaImage => "Assets/extreme-ironing-taxi-610x427.jpg";

        /// <summary>Prompt corpus used by the text-completion benchmarks (relative path).</summary>
        public static string TextCompletionPromptsFilePath => "Assets/TextCompletionPrompts.txt";
    }
}
| @@ -0,0 +1,30 @@ | |||
| <Project Sdk="Microsoft.NET.Sdk"> | |||
| <Import Project="..\LLama\LLamaSharp.Runtime.targets" /> | |||
| <PropertyGroup> | |||
| <OutputType>Exe</OutputType> | |||
| <TargetFramework>net8.0</TargetFramework> | |||
| <ImplicitUsings>enable</ImplicitUsings> | |||
| <Nullable>enable</Nullable> | |||
| <Configuration>Release</Configuration> | |||
| </PropertyGroup> | |||
| <ItemGroup> | |||
| <PackageReference Include="BenchmarkDotNet" Version="0.13.12" /> | |||
| <PackageReference Include="BenchmarkDotNet.Diagnostics.Windows" Version="0.13.12" /> | |||
| </ItemGroup> | |||
| <ItemGroup> | |||
| <ProjectReference Include="..\LLama\LLamaSharp.csproj" /> | |||
| </ItemGroup> | |||
| <ItemGroup> | |||
| <None Update="Assets\TextCompletionPrompts.txt"> | |||
| <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> | |||
| </None> | |||
| <None Update="Assets\extreme-ironing-taxi-610x427.jpg"> | |||
| <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> | |||
| </None> | |||
| </ItemGroup> | |||
| </Project> | |||
| @@ -0,0 +1,126 @@ | |||
#pragma warning disable CS8618

using System.Text;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Engines;
using BenchmarkDotNet.Jobs;
using LLama.Abstractions;
using LLama.Common;

namespace LLama.Benchmark.LLamaExecutorBenchmark
{
    /// <summary>
    /// Measures the cost of prompt prefill (MaxTokens = 1, so only a single token
    /// is generated) across prompt/context lengths, GPU layer counts and executor types.
    /// </summary>
#if WINDOWS
    [BenchmarkDotNet.Diagnostics.Windows.Configs.NativeMemoryProfiler]
#endif
    [BenchmarkCategory("Executor", "LLama")]
    [SimpleJob(RunStrategy.Monitoring, runtimeMoniker: RuntimeMoniker.Net80)]
    [MemoryDiagnoser]
    [MinIterationCount(1)]
    [MaxIterationCount(16)]
    [RPlotExporter]
    public class PrefillBenchmark
    {
        /// <summary>
        /// (prompt length, context length)
        /// NOTE(review): 2024 looks like a typo for 2048 — confirm the intended prompt length.
        /// </summary>
        public IEnumerable<(int, uint)> PromptAndContextLengths => new (int, uint)[]
        {
            (512, 2048),
            (2024, 2048)
        };

        /// <summary>
        /// (model path, gpu layer count)
        /// Fix: Constants.Generative7BModelPath already includes Constants.ModelDir, so it
        /// must not be combined with ModelDir again. The old code produced a doubled
        /// directory prefix and only worked by accident when ModelDir was an absolute
        /// path (Path.Combine discards earlier parts when a later part is rooted).
        /// </summary>
        public IEnumerable<(string, int)> ModelAndGpuLayerCounts => new (string, int)[]
        // TODO: specify the native library to load here to test cpu case better.
        {
            (Constants.Generative7BModelPath, 0),
            (Constants.Generative7BModelPath, 10),
            (Constants.Generative7BModelPath, 20)
        };

        /// <summary>
        /// Executor variants under test. (Instruct is handled by the switch below but
        /// deliberately not benchmarked here.)
        /// </summary>
        public IEnumerable<ExecutorType> ExecutorTypes => new ExecutorType[]
        {
            ExecutorType.Interactive,
            ExecutorType.Stateless
        };

        [ParamsSource(nameof(PromptAndContextLengths))]
        public (int, uint) PromptAndContextLength { get; set; }

        [ParamsSource(nameof(ModelAndGpuLayerCounts))]
        public (string, int) ModelAndGpuLayerCount { get; set; }

        [ParamsSource(nameof(ExecutorTypes))]
        public ExecutorType ExecutorType { get; set; }

        /// <summary>
        /// Params used to create a model.
        /// </summary>
        public ModelParams ModelParams { get; set; }

        /// <summary>
        /// Params used in inference.
        /// </summary>
        public InferenceParams InferenceParams { get; set; }

        /// <summary>
        /// Prompt used to run text generation, truncated to the configured prompt length.
        /// </summary>
        public string Prompt { get; set; }

        /// <summary>
        /// The executor under test; created once in <see cref="GlobalSetup"/>.
        /// </summary>
        public ILLamaExecutor Executor { get; set; }

        // Loads the weights, creates a context and builds the executor selected
        // by ExecutorType from the current parameter combination.
        private void InitializeParamsAndModel()
        {
            ModelParams = new ModelParams(ModelAndGpuLayerCount.Item1)
            {
                ContextSize = PromptAndContextLength.Item2,
                GpuLayerCount = ModelAndGpuLayerCount.Item2
            };
            Prompt = File.ReadAllText(Constants.TextCompletionPromptsFilePath).Substring(0, PromptAndContextLength.Item1);
            InferenceParams = new InferenceParams()
            {
                Temperature = 0.6f,
                MaxTokens = 1 // Only prefill, no generation here.
            };

            LLamaWeights weights = LLamaWeights.LoadFromFile(ModelParams);
            LLamaContext context = weights.CreateContext(ModelParams);
            Executor = ExecutorType switch
            {
                ExecutorType.Interactive => new InteractiveExecutor(context),
                ExecutorType.Instruct => new InstructExecutor(context),
                ExecutorType.Stateless => new StatelessExecutor(weights, ModelParams),
                _ => throw new NotSupportedException()
            };
        }

        [GlobalSetup(Targets = [nameof(Basic)])]
        public void GlobalSetup()
        {
            InitializeParamsAndModel();
        }

        [IterationCleanup(Targets = [nameof(Basic)])]
        public void GlobalCleanup()
        {
            if (ExecutorType != ExecutorType.Stateless) // stateless executor always dispose its `Context` property
            {
                // Reset the KV cache so each iteration measures a cold prefill.
                Executor.Context.NativeHandle.KvCacheClear();
            }
        }

        /// <summary>
        /// Runs one inference over the prompt and collects the (single-token) output.
        /// </summary>
        [Benchmark]
        public async Task<string> Basic()
        {
            StringBuilder sb = new();
            await foreach (var text in Executor.InferAsync(Prompt, InferenceParams))
            {
                sb.Append(text);
            }
            return sb.ToString();
        }
    }
}
| @@ -0,0 +1,13 @@ | |||
using BenchmarkDotNet.Running;

namespace LLama.Benchmark
{
    /// <summary>
    /// Entry point: hands command-line arguments to BenchmarkDotNet, which selects
    /// and runs the benchmarks defined in this assembly.
    /// </summary>
    public class Program
    {
        public static void Main(string[] args)
        {
            var switcher = BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly);
            var summary = switcher.Run(args);
            Console.WriteLine(summary);
        }
    }
}
| @@ -17,6 +17,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LLamaSharp.SemanticKernel", | |||
| EndProject | |||
| Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LLamaSharp.KernelMemory", "LLama.KernelMemory\LLamaSharp.KernelMemory.csproj", "{E5589AE7-B86F-4343-A1CC-8E5D34596E52}" | |||
| EndProject | |||
| Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LLama.Benchmark", "LLama.Benchmark\LLama.Benchmark.csproj", "{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}" | |||
| EndProject | |||
| Global | |||
| GlobalSection(SolutionConfigurationPlatforms) = preSolution | |||
| Debug|Any CPU = Debug|Any CPU | |||
| @@ -111,6 +113,18 @@ Global | |||
| {E5589AE7-B86F-4343-A1CC-8E5D34596E52}.Release|Any CPU.Build.0 = Release|Any CPU | |||
| {E5589AE7-B86F-4343-A1CC-8E5D34596E52}.Release|x64.ActiveCfg = Release|Any CPU | |||
| {E5589AE7-B86F-4343-A1CC-8E5D34596E52}.Release|x64.Build.0 = Release|Any CPU | |||
| {90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU | |||
| {90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|Any CPU.Build.0 = Debug|Any CPU | |||
| {90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|x64.ActiveCfg = Debug|Any CPU | |||
| {90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|x64.Build.0 = Debug|Any CPU | |||
| {90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|Any CPU.ActiveCfg = Debug|Any CPU | |||
| {90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|Any CPU.Build.0 = Debug|Any CPU | |||
| {90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|x64.ActiveCfg = Debug|Any CPU | |||
| {90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|x64.Build.0 = Debug|Any CPU | |||
| {90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|Any CPU.ActiveCfg = Release|Any CPU | |||
| {90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|Any CPU.Build.0 = Release|Any CPU | |||
| {90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|x64.ActiveCfg = Release|Any CPU | |||
| {90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|x64.Build.0 = Release|Any CPU | |||
| EndGlobalSection | |||
| GlobalSection(SolutionProperties) = preSolution | |||
| HideSolutionNode = FALSE | |||