@@ -10,5 +10,7 @@ extend-exclude = [
     "_typos.toml",
     "docs/xmldocs/",
     "LLama.Web/wwwroot/",
-    "LLama/runtimes/deps/"
+    "LLama/runtimes/deps/",
+    "LLama.Benchmark/Assets/",
+    "LLama.Examples/Assets/"
 ]
@@ -0,0 +1,20 @@
+from huggingface_hub import hf_hub_download
+import argparse
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model-list', type=str, required=True)
+    parser.add_argument('--model-dir', type=str, required=True)
+    parser.add_argument('--endpoint', type=str, default='https://huggingface.co')
+    args = parser.parse_args()
+
+    with open(args.model_list, 'r') as f:
+        # strip() guards against a trailing newline in the model list file.
+        repo_id, filename = f.readline().strip().split(',')
+        hf_hub_download(
+            repo_id=repo_id,
+            filename=filename,
+            local_dir=args.model_dir,
+            local_dir_use_symlinks=False,
+            endpoint=args.endpoint
+        )
@@ -0,0 +1,74 @@
+name: Benchmark Test
+
+on:
+  push:
+    branches: [master]
+  pull_request:
+    branches: [master]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-benchmark
+  cancel-in-progress: true
+
+jobs:
+  linux-benchmark-cuda:
+    # The label gate means this job only runs on pull requests carrying the 'benchmark' label.
+    if: contains(github.event.pull_request.labels.*.name, 'benchmark')
+    runs-on: [self-hosted, linux, gpu]
+    strategy:
+      fail-fast: false
+      matrix:
+        build: [cuda11]
+        include:
+          - build: cuda11
+            image: nvidia/cuda:11.7.1-devel-ubuntu22.04
+            modeldir: /llamasharp_ci/models_benchmark
+          # - build: cuda12
+          #   image: nvidia/cuda:12.1.1-runtime-ubuntu22.04
+    container:
+      image: ${{ matrix.image }}
+      env:
+        BENCHMARK_MODEL_DIR: ${{ matrix.modeldir }}
+      ports:
+        - 80
+      volumes:
+        - /llamasharp_ci:/llamasharp_ci
+      options: --gpus=all --ipc=host --runtime=nvidia
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install libraries
+        run: |
+          apt update
+          apt install -y curl libicu-dev
+          apt-get install -y wget
+          wget https://packages.microsoft.com/config/ubuntu/22.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb
+          dpkg -i packages-microsoft-prod.deb
+          rm packages-microsoft-prod.deb
+          apt-get update && apt-get install -y dotnet-sdk-8.0
+      - name: Prepare models
+        run: |
+          apt-get update
+          apt-get install -y python3.10 python3-pip
+          python3 --version
+          pip install huggingface_hub
+          python3 .github/download_models.py --model-dir ${{ matrix.modeldir }} --model-list LLama.Benchmark/Assets/models.txt --endpoint https://hf-mirror.com
+      - name: Clear package cache
+        run: dotnet clean LLamaSharp.sln && dotnet nuget locals all --clear
+      - name: Restore packages
+        run: dotnet restore LLamaSharp.sln
+      - name: Build
+        run: |
+          dotnet clean
+          dotnet build LLama/LLamaSharp.csproj -c Release --no-restore
+          dotnet build LLama.Benchmark/LLama.Benchmark.csproj -c Release --no-restore
+      - name: Run benchmark test
+        run: dotnet run --project LLama.Benchmark/LLama.Benchmark.csproj -c Release --anyCategories LLama
+      - name: Upload artifacts
+        if: always()
+        uses: actions/upload-artifact@v3
+        with:
+          name: Benchmark_Results
+          path: BenchmarkDotNet.Artifacts/results/*
@@ -1,4 +1,4 @@
-name: CI
+name: Unit Test
 on:
   push:
     branches: [master]
@@ -346,3 +346,5 @@ site/
 /LLama.Unittest/Models/*.bin
 /LLama.Unittest/Models/*.gguf
+/LLama.Benchmark/Models/*.bin
+/LLama.Benchmark/Models/*.gguf
@@ -0,0 +1 @@
+TheBloke/Llama-2-7b-Chat-GGUF,llama-2-7b-chat.Q3_K_S.gguf
@@ -0,0 +1,10 @@
+namespace LLama.Benchmark
+{
+    public enum ExecutorType
+    {
+        Interactive,
+        Instruct,
+        Stateless
+    }
+}
@@ -0,0 +1,23 @@
+namespace LLama.Benchmark
+{
+    internal static class Constants
+    {
+        // Root directory for benchmark model files, set by the CI container.
+        public static string ModelDir
+        {
+            get
+            {
+                return Environment.GetEnvironmentVariable("BENCHMARK_MODEL_DIR") ?? "";
+            }
+        }
+
+        public static string Generative7BModelPath => Path.Combine(ModelDir, "llama-2-7b-chat.Q3_K_S.gguf");
+        public static string EmbeddingModelPath => Path.Combine(ModelDir, "all-MiniLM-L12-v2.Q8_0.gguf");
+        public static string LLavaModelPath => Path.Combine(ModelDir, "llava-v1.6-mistral-7b.Q3_K_XS.gguf");
+        public static string LLavaMmpPath => Path.Combine(ModelDir, "mmproj-model-f16.gguf");
+
+        public static string LLavaImage => "Assets/extreme-ironing-taxi-610x427.jpg";
+        public static string TextCompletionPromptsFilePath => "Assets/TextCompletionPrompts.txt";
+    }
+}
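Every model path above silently falls back to the working directory when `BENCHMARK_MODEL_DIR` is unset, which surfaces as an opaque native load error. A minimal fail-fast guard one might add before the benchmarks start (hypothetical, not part of this PR):

```csharp
using System.IO;
using LLama.Benchmark;

// Hypothetical startup check: raise a clear error instead of a cryptic
// native failure when the benchmark model file is absent.
if (!File.Exists(Constants.Generative7BModelPath))
{
    throw new FileNotFoundException(
        "Benchmark model not found; set BENCHMARK_MODEL_DIR and run " +
        ".github/download_models.py first.",
        Constants.Generative7BModelPath);
}
```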
@@ -0,0 +1,30 @@
+<Project Sdk="Microsoft.NET.Sdk">
+  <Import Project="..\LLama\LLamaSharp.Runtime.targets" />
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework>net8.0</TargetFramework>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <Nullable>enable</Nullable>
+    <Configuration>Release</Configuration>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <PackageReference Include="BenchmarkDotNet" Version="0.13.12" />
+    <PackageReference Include="BenchmarkDotNet.Diagnostics.Windows" Version="0.13.12" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <ProjectReference Include="..\LLama\LLamaSharp.csproj" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <None Update="Assets\TextCompletionPrompts.txt">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
+    <!-- The image lives under Assets\ (see Constants.LLavaImage), not Models\. -->
+    <None Update="Assets\extreme-ironing-taxi-610x427.jpg">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
+  </ItemGroup>
+</Project>
@@ -0,0 +1,126 @@
+#pragma warning disable CS8618
+
+using System.Text;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Engines;
+using BenchmarkDotNet.Jobs;
+using LLama.Abstractions;
+using LLama.Common;
+
+namespace LLama.Benchmark.LLamaExecutorBenchmark
+{
+#if WINDOWS
+    [BenchmarkDotNet.Diagnostics.Windows.Configs.NativeMemoryProfiler]
+#endif
+    [BenchmarkCategory("Executor", "LLama")]
+    [SimpleJob(RunStrategy.Monitoring, runtimeMoniker: RuntimeMoniker.Net80)]
+    [MemoryDiagnoser]
+    [MinIterationCount(1)]
+    [MaxIterationCount(16)]
+    [RPlotExporter]
+    public class PrefillBenchmark
+    {
+        /// <summary>
+        /// (prompt length, context length)
+        /// </summary>
+        public IEnumerable<(int, uint)> PromptAndContextLengths => new (int, uint)[]
+        {
+            (512, 2048),
+            (2024, 2048)
+        };
+
+        /// <summary>
+        /// (model path, gpu layer count)
+        /// </summary>
+        // TODO: specify the native library to load here to test the CPU case better.
+        public IEnumerable<(string, int)> ModelAndGpuLayerCounts => new (string, int)[]
+        {
+            // Generative7BModelPath already includes ModelDir, so no extra Path.Combine is needed.
+            (Constants.Generative7BModelPath, 0),
+            (Constants.Generative7BModelPath, 10),
+            (Constants.Generative7BModelPath, 20)
+        };
+
+        public IEnumerable<ExecutorType> ExecutorTypes => new ExecutorType[]
+        {
+            ExecutorType.Interactive,
+            ExecutorType.Stateless
+        };
+
+        [ParamsSource(nameof(PromptAndContextLengths))]
+        public (int, uint) PromptAndContextLength { get; set; }
+
+        [ParamsSource(nameof(ModelAndGpuLayerCounts))]
+        public (string, int) ModelAndGpuLayerCount { get; set; }
+
+        [ParamsSource(nameof(ExecutorTypes))]
+        public ExecutorType ExecutorType { get; set; }
+
+        /// <summary>
+        /// Params used to create a model.
+        /// </summary>
+        public ModelParams ModelParams { get; set; }
+
+        /// <summary>
+        /// Params used in inference.
+        /// </summary>
+        public InferenceParams InferenceParams { get; set; }
+
+        /// <summary>
+        /// Prompt used to run text generation.
+        /// </summary>
+        public string Prompt { get; set; }
+
+        public ILLamaExecutor Executor { get; set; }
+
+        private void InitializeParamsAndModel()
+        {
+            ModelParams = new ModelParams(ModelAndGpuLayerCount.Item1)
+            {
+                ContextSize = PromptAndContextLength.Item2,
+                GpuLayerCount = ModelAndGpuLayerCount.Item2
+            };
+            Prompt = File.ReadAllText(Constants.TextCompletionPromptsFilePath).Substring(0, PromptAndContextLength.Item1);
+            InferenceParams = new InferenceParams()
+            {
+                Temperature = 0.6f,
+                MaxTokens = 1 // Only prefill, no generation here.
+            };
+
+            LLamaWeights weights = LLamaWeights.LoadFromFile(ModelParams);
+            LLamaContext context = weights.CreateContext(ModelParams);
+            Executor = ExecutorType switch
+            {
+                ExecutorType.Interactive => new InteractiveExecutor(context),
+                ExecutorType.Instruct => new InstructExecutor(context),
+                ExecutorType.Stateless => new StatelessExecutor(weights, ModelParams),
+                _ => throw new NotSupportedException()
+            };
+        }
+
+        [GlobalSetup(Targets = [nameof(Basic)])]
+        public void GlobalSetup()
+        {
+            InitializeParamsAndModel();
+        }
+
+        [IterationCleanup(Targets = [nameof(Basic)])]
+        public void IterationCleanup()
+        {
+            // The stateless executor always disposes its `Context` property, so only
+            // stateful executors need their KV cache cleared between iterations.
+            if (ExecutorType != ExecutorType.Stateless)
+            {
+                Executor.Context.NativeHandle.KvCacheClear();
+            }
+        }
+
+        [Benchmark]
+        public async Task<string> Basic()
+        {
+            StringBuilder sb = new();
+            await foreach (var text in Executor.InferAsync(Prompt, InferenceParams))
+            {
+                sb.Append(text);
+            }
+            return sb.ToString();
+        }
+    }
+}
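For local iteration it can be handy to run just this class instead of going through the CLI switcher. A minimal sketch using BenchmarkDotNet's standard generic runner (assumes the project is built in Release, since BenchmarkDotNet refuses Debug builds by default):

```csharp
using BenchmarkDotNet.Running;
using LLama.Benchmark.LLamaExecutorBenchmark;

// Runs every [Benchmark] method in PrefillBenchmark across the full
// parameter matrix (prompt/context lengths x GPU layer counts x executor types).
var summary = BenchmarkRunner.Run<PrefillBenchmark>();
```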
@@ -0,0 +1,13 @@
+using BenchmarkDotNet.Running;
+
+namespace LLama.Benchmark
+{
+    public class Program
+    {
+        public static void Main(string[] args)
+        {
+            var summary = BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args);
+            Console.WriteLine(summary);
+        }
+    }
+}
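This switcher is what consumes the `--anyCategories LLama` flag in the workflow above: it selects every class whose `[BenchmarkCategory]` list contains `LLama`, such as `PrefillBenchmark`. The same filtering can be expressed programmatically; a small sketch of the equivalent call:

```csharp
using BenchmarkDotNet.Running;
using LLama.Benchmark;

// Equivalent to `dotnet run -c Release --anyCategories LLama`:
// only benchmarks tagged with the "LLama" category are executed.
BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly)
                 .Run(new[] { "--anyCategories", "LLama" });
```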
@@ -17,6 +17,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LLamaSharp.SemanticKernel",
 EndProject
 Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LLamaSharp.KernelMemory", "LLama.KernelMemory\LLamaSharp.KernelMemory.csproj", "{E5589AE7-B86F-4343-A1CC-8E5D34596E52}"
 EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LLama.Benchmark", "LLama.Benchmark\LLama.Benchmark.csproj", "{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Any CPU = Debug|Any CPU
@@ -111,6 +113,18 @@ Global
 	{E5589AE7-B86F-4343-A1CC-8E5D34596E52}.Release|Any CPU.Build.0 = Release|Any CPU
 	{E5589AE7-B86F-4343-A1CC-8E5D34596E52}.Release|x64.ActiveCfg = Release|Any CPU
 	{E5589AE7-B86F-4343-A1CC-8E5D34596E52}.Release|x64.Build.0 = Release|Any CPU
+	{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+	{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|Any CPU.Build.0 = Debug|Any CPU
+	{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|x64.ActiveCfg = Debug|Any CPU
+	{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Debug|x64.Build.0 = Debug|Any CPU
+	{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|Any CPU.ActiveCfg = Debug|Any CPU
+	{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|Any CPU.Build.0 = Debug|Any CPU
+	{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|x64.ActiveCfg = Debug|Any CPU
+	{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.GPU|x64.Build.0 = Debug|Any CPU
+	{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|Any CPU.ActiveCfg = Release|Any CPU
+	{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|Any CPU.Build.0 = Release|Any CPU
+	{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|x64.ActiveCfg = Release|Any CPU
+	{90D38FEE-68EA-459E-A4EE-268B9DFA1CD5}.Release|x64.Build.0 = Release|Any CPU
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE