From 15a98b36d85810cc98be2d621d83c84b69499448 Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Thu, 1 Feb 2024 16:35:05 +0000 Subject: [PATCH] Updated everything to work with llama.cpp ce32060198b7e2d6a13a9b8e1e1369e3c295ae2a --- LLama.Unittest/LLamaContextTests.cs | 2 +- LLama.Web/Common/ModelOptions.cs | 3 ++ LLama/Abstractions/IModelParams.cs | 23 +++++++++++-- LLama/Common/ModelParams.cs | 3 ++ LLama/Extensions/IContextParamsExtensions.cs | 3 ++ LLama/Extensions/IModelParamsExtensions.cs | 9 ++--- LLama/LLamaContext.cs | 4 +-- LLama/LLamaExecutorBase.cs | 4 +-- LLama/LLamaInteractExecutor.cs | 2 +- LLama/Native/GPUSplitMode.cs | 23 +++++++++++++ LLama/Native/LLamaContextParams.cs | 26 +++++++++----- LLama/Native/LLamaFtype.cs | 25 ++++++++++++++ LLama/Native/LLamaModelParams.cs | 8 ++++- LLama/Native/LLamaModelQuantizeParams.cs | 6 ++++ LLama/Native/NativeApi.Quantize.cs | 3 +- LLama/Native/NativeApi.Sampling.cs | 18 ++++++++-- LLama/Native/NativeApi.cs | 36 ++++++++++++++++---- LLama/Native/SafeLLamaContextHandle.cs | 2 +- README.md | 1 + 19 files changed, 168 insertions(+), 33 deletions(-) create mode 100644 LLama/Native/GPUSplitMode.cs diff --git a/LLama.Unittest/LLamaContextTests.cs b/LLama.Unittest/LLamaContextTests.cs index 345e518d..ab27d988 100644 --- a/LLama.Unittest/LLamaContextTests.cs +++ b/LLama.Unittest/LLamaContextTests.cs @@ -28,7 +28,7 @@ namespace LLama.Unittest [Fact] public void CheckProperties() { - Assert.Equal(768, _context.ContextSize); + Assert.Equal(768u, _context.ContextSize); Assert.Equal(4096, _context.EmbeddingSize); Assert.Equal(32000, _context.VocabCount); } diff --git a/LLama.Web/Common/ModelOptions.cs b/LLama.Web/Common/ModelOptions.cs index 7b770b38..e462401a 100644 --- a/LLama.Web/Common/ModelOptions.cs +++ b/LLama.Web/Common/ModelOptions.cs @@ -23,6 +23,9 @@ namespace LLama.Web.Common /// public int MainGpu { get; set; } = 0; + /// + public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None; + /// public int GpuLayerCount { get; set; } = 20; diff --git a/LLama/Abstractions/IModelParams.cs b/LLama/Abstractions/IModelParams.cs index 3ef41bec..73e03b99 100644 --- a/LLama/Abstractions/IModelParams.cs +++ b/LLama/Abstractions/IModelParams.cs @@ -16,9 +16,28 @@ namespace LLama.Abstractions public interface IModelParams { /// - /// the GPU that is used for scratch and small tensors + /// main_gpu interpretation depends on split_mode: + /// + /// + /// None + /// The GPU that is used for the entire mode. + /// + /// + /// Row + /// The GPU that is used for small tensors and intermediate results. + /// + /// + /// Layer + /// Ignored. 
+ /// + /// /// - int MainGpu { get; } + int MainGpu { get; set; } + + /// + /// How to split the model across multiple GPUs + /// + GPUSplitMode SplitMode { get; } /// /// Number of layers to run in VRAM / GPU memory (n_gpu_layers) diff --git a/LLama/Common/ModelParams.cs b/LLama/Common/ModelParams.cs index b124b84d..3afee9cb 100644 --- a/LLama/Common/ModelParams.cs +++ b/LLama/Common/ModelParams.cs @@ -18,6 +18,9 @@ namespace LLama.Common /// public int MainGpu { get; set; } = 0; + /// + public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None; + /// public int GpuLayerCount { get; set; } = 20; diff --git a/LLama/Extensions/IContextParamsExtensions.cs b/LLama/Extensions/IContextParamsExtensions.cs index 21273617..cd3075ab 100644 --- a/LLama/Extensions/IContextParamsExtensions.cs +++ b/LLama/Extensions/IContextParamsExtensions.cs @@ -36,6 +36,9 @@ namespace LLama.Extensions result.yarn_orig_ctx = @params.YarnOriginalContext ?? 0; result.rope_scaling_type = @params.YarnScalingType ?? RopeScalingType.LLAMA_ROPE_SCALING_UNSPECIFIED; + result.cb_eval = IntPtr.Zero; + result.cb_eval_user_data = IntPtr.Zero; + result.type_k = @params.TypeK ?? GGMLType.GGML_TYPE_F16; result.type_k = @params.TypeV ?? GGMLType.GGML_TYPE_F16; result.offload_kqv = !@params.NoKqvOffload; diff --git a/LLama/Extensions/IModelParamsExtensions.cs b/LLama/Extensions/IModelParamsExtensions.cs index f7fadece..69b9e288 100644 --- a/LLama/Extensions/IModelParamsExtensions.cs +++ b/LLama/Extensions/IModelParamsExtensions.cs @@ -21,15 +21,16 @@ public static class IModelParamsExtensions /// public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLamaModelParams result) { - if (@params.UseMemoryLock && !NativeApi.llama_mlock_supported()) - throw new NotSupportedException("'UseMemoryLock' is not supported (llama_mlock_supported() == false)"); - if (@params.UseMemorymap && !NativeApi.llama_mmap_supported()) - throw new NotSupportedException("'UseMemorymap' is not supported (llama_mmap_supported() == false)"); + if (@params.UseMemoryLock && !NativeApi.llama_supports_mlock()) + throw new NotSupportedException("'UseMemoryLock' is not supported (llama_supports_mlock() == false)"); + if (@params.UseMemorymap && !NativeApi.llama_supports_mmap()) + throw new NotSupportedException("'UseMemorymap' is not supported (llama_supports_mmap() == false)"); var disposer = new GroupDisposable(); result = NativeApi.llama_model_default_params(); result.main_gpu = @params.MainGpu; + result.split_mode = @params.SplitMode; result.n_gpu_layers = @params.GpuLayerCount < 0 ? 
int.MaxValue : @params.GpuLayerCount; result.use_mlock = @params.UseMemoryLock; result.use_mmap = @params.UseMemorymap; diff --git a/LLama/LLamaContext.cs b/LLama/LLamaContext.cs index 6d39a8f9..5d026b67 100644 --- a/LLama/LLamaContext.cs +++ b/LLama/LLamaContext.cs @@ -33,7 +33,7 @@ namespace LLama /// /// Total number of tokens in the context /// - public int ContextSize => NativeHandle.ContextSize; + public uint ContextSize => NativeHandle.ContextSize; /// /// Dimension of embedding vectors @@ -323,7 +323,7 @@ namespace LLama var candidates_p = LLamaTokenDataArray.Create(logits); // Extract most recently returned tokens - var last_n_repeat = Math.Min(ContextSize, repeatLastTokensCount); + var last_n_repeat = Math.Min((int)ContextSize, repeatLastTokensCount); var last_n_array = lastTokens.TakeLast(last_n_repeat).ToArray(); // Apply penalties to candidates diff --git a/LLama/LLamaExecutorBase.cs b/LLama/LLamaExecutorBase.cs index 4713166e..3a697507 100644 --- a/LLama/LLamaExecutorBase.cs +++ b/LLama/LLamaExecutorBase.cs @@ -83,7 +83,7 @@ namespace LLama _pastTokensCount = 0; _consumedTokensCount = 0; _n_session_consumed = 0; - _last_n_tokens = new FixedSizeQueue(Context.ContextSize); + _last_n_tokens = new FixedSizeQueue((int)Context.ContextSize); _decoder = new StreamingTokenDecoder(context); } @@ -170,7 +170,7 @@ namespace LLama _pastTokensCount = Math.Max(1, tokensToKeep); // insert n_left/2 tokens at the start of embed from last_n_tokens - _embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip(Context.ContextSize - n_left / 2 - _embeds.Count)); + _embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip((int)Context.ContextSize - n_left / 2 - _embeds.Count)); // stop saving session if we run out of context _pathSession = string.Empty; diff --git a/LLama/LLamaInteractExecutor.cs b/LLama/LLamaInteractExecutor.cs index 2e72c7ae..7d742c81 100644 --- a/LLama/LLamaInteractExecutor.cs +++ b/LLama/LLamaInteractExecutor.cs @@ -179,7 +179,7 @@ namespace LLama if (_embed_inps.Count <= _consumedTokensCount && !args.WaitForInput) { - var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? Context.ContextSize : inferenceParams.RepeatLastTokensCount; + var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? 
(int)Context.ContextSize : inferenceParams.RepeatLastTokensCount; // optionally save the session on first sample (for faster prompt loading next time) if (!string.IsNullOrEmpty(_pathSession) && args.NeedToSaveSession) diff --git a/LLama/Native/GPUSplitMode.cs b/LLama/Native/GPUSplitMode.cs new file mode 100644 index 00000000..96957d0f --- /dev/null +++ b/LLama/Native/GPUSplitMode.cs @@ -0,0 +1,23 @@ +namespace LLama.Native; + +/// +/// +/// +/// llama_split_mode +public enum GPUSplitMode +{ + /// + /// Single GPU + /// + None = 0, + + /// + /// Split layers and KV across GPUs + /// + Layer = 1, + + /// + /// split rows across GPUs + /// + Row = 2, +} \ No newline at end of file diff --git a/LLama/Native/LLamaContextParams.cs b/LLama/Native/LLamaContextParams.cs index bfd39ea4..118dd540 100644 --- a/LLama/Native/LLamaContextParams.cs +++ b/LLama/Native/LLamaContextParams.cs @@ -8,7 +8,8 @@ namespace LLama.Native /// /// /// - public delegate void LlamaProgressCallback(float progress, IntPtr ctx); + /// llama_progress_callback + public delegate bool LlamaProgressCallback(float progress, IntPtr ctx); /// /// A C# representation of the llama.cpp `llama_context_params` struct @@ -46,37 +47,46 @@ namespace LLama.Native /// public RopeScalingType rope_scaling_type; - /// /// RoPE base frequency, 0 = from model /// - public float rope_freq_base; + public float rope_freq_base; /// /// RoPE frequency scaling factor, 0 = from model /// - public float rope_freq_scale; + public float rope_freq_scale; /// /// YaRN extrapolation mix factor, negative = from model /// - public float yarn_ext_factor; + public float yarn_ext_factor; /// /// YaRN magnitude scaling factor /// - public float yarn_attn_factor; + public float yarn_attn_factor; /// /// YaRN low correction dim /// - public float yarn_beta_fast; + public float yarn_beta_fast; /// /// YaRN high correction dim /// - public float yarn_beta_slow; + public float yarn_beta_slow; /// /// YaRN original context size /// public uint yarn_orig_ctx; + /// + /// ggml_backend_sched_eval_callback + /// + public IntPtr cb_eval; + + /// + /// User data passed into cb_eval + /// + public IntPtr cb_eval_user_data; + /// /// data type for K cache /// diff --git a/LLama/Native/LLamaFtype.cs b/LLama/Native/LLamaFtype.cs index 0fa0fbe9..8eb0a8b9 100644 --- a/LLama/Native/LLamaFtype.cs +++ b/LLama/Native/LLamaFtype.cs @@ -106,6 +106,31 @@ /// Benchmark@7B: 5.15GB, +0.0044 ppl LLAMA_FTYPE_MOSTLY_Q6_K = 18, + /// + /// except 1d tensors + /// + LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, + + /// + /// except 1d tensors + /// + LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, + + /// + /// except 1d tensors + /// + LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, + + /// + /// except 1d tensors + /// + LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, + + /// + /// except 1d tensors + /// + LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, + /// /// File type was not specified /// diff --git a/LLama/Native/LLamaModelParams.cs b/LLama/Native/LLamaModelParams.cs index ed7b6043..a7cdd1a2 100644 --- a/LLama/Native/LLamaModelParams.cs +++ b/LLama/Native/LLamaModelParams.cs @@ -14,6 +14,11 @@ namespace LLama.Native /// public int n_gpu_layers; + /// + /// how to split the model across multiple GPUs + /// + public GPUSplitMode split_mode; + /// /// the GPU that is used for scratch and small tensors /// @@ -25,7 +30,8 @@ namespace LLama.Native public float* tensor_split; /// - /// called with a progress value between 0 and 1, pass NULL to disable + /// called with a progress value between 0 and 1, pass NULL to disable. 
If the provided progress_callback + /// returns true, model loading continues. If it returns false, model loading is immediately aborted. /// public LlamaProgressCallback progress_callback; diff --git a/LLama/Native/LLamaModelQuantizeParams.cs b/LLama/Native/LLamaModelQuantizeParams.cs index 39702b5a..34c1a974 100644 --- a/LLama/Native/LLamaModelQuantizeParams.cs +++ b/LLama/Native/LLamaModelQuantizeParams.cs @@ -6,6 +6,7 @@ namespace LLama.Native /// /// Quantizer parameters used in the native API /// + /// llama_model_quantize_params [StructLayout(LayoutKind.Sequential)] public struct LLamaModelQuantizeParams { @@ -58,5 +59,10 @@ namespace LLama.Native set => _pure = Convert.ToSByte(value); } private sbyte _pure; + + /// + /// pointer to importance matrix data + /// + public IntPtr imatrix; } } diff --git a/LLama/Native/NativeApi.Quantize.cs b/LLama/Native/NativeApi.Quantize.cs index b849e38d..1c4909bf 100644 --- a/LLama/Native/NativeApi.Quantize.cs +++ b/LLama/Native/NativeApi.Quantize.cs @@ -10,9 +10,8 @@ namespace LLama.Native /// /// /// - /// not great API - very likely to change /// Returns 0 on success [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern unsafe int llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param); + public static extern unsafe uint llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param); } } diff --git a/LLama/Native/NativeApi.Sampling.cs b/LLama/Native/NativeApi.Sampling.cs index 7128441e..a52edc66 100644 --- a/LLama/Native/NativeApi.Sampling.cs +++ b/LLama/Native/NativeApi.Sampling.cs @@ -27,11 +27,12 @@ namespace LLama.Native /// Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 /// /// - /// A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. - /// A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. + /// Logits extracted from the original generation context. + /// Logits extracted from a separate context from the same model. + /// Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. /// Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern void llama_sample_classifier_free_guidance(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, SafeLLamaContextHandle guidance_ctx, float scale); + public static extern unsafe void llama_sample_apply_guidance(SafeLLamaContextHandle ctx, float* logits, float* logits_guidance, float scale); /// /// Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. @@ -92,6 +93,17 @@ namespace LLama.Native [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern void llama_sample_typical(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float p, ulong min_keep); + /// + /// Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772. 
+ /// + /// + /// Pointer to LLamaTokenDataArray + /// + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern void llama_sample_typical(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float min_temp, float max_temp, float exponent_val); + /// /// Modify logits by temperature /// diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs index bb28e7ab..c953cb23 100644 --- a/LLama/Native/NativeApi.cs +++ b/LLama/Native/NativeApi.cs @@ -23,7 +23,7 @@ namespace LLama.Native /// public static void llama_empty_call() { - llama_mmap_supported(); + llama_max_devices(); } /// @@ -31,7 +31,7 @@ namespace LLama.Native /// /// [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern int llama_max_devices(); + public static extern long llama_max_devices(); /// /// Create a LLamaModelParams with default values @@ -59,14 +59,21 @@ namespace LLama.Native /// /// [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern bool llama_mmap_supported(); + public static extern bool llama_supports_mmap(); /// - /// Check if memory lockingis supported + /// Check if memory locking is supported /// /// [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern bool llama_mlock_supported(); + public static extern bool llama_supports_mlock(); + + /// + /// Check if GPU offload is supported + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern bool llama_supports_gpu_offload(); /// /// Initialize the llama + ggml backend @@ -163,7 +170,10 @@ namespace LLama.Native /// /// [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern int llama_n_ctx(SafeLLamaContextHandle ctx); + public static extern uint llama_n_ctx(SafeLLamaContextHandle ctx); + + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern uint llama_n_batch(SafeLLamaContextHandle ctx); /// /// Token logits obtained from the last call to llama_eval() @@ -380,6 +390,20 @@ namespace LLama.Native [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern void llama_kv_cache_seq_shift(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, LLamaPos delta); + /// + /// Integer division of the positions by factor of `d > 1` + /// If the KV cache is RoPEd, the KV data is updated accordingly + /// p0 < 0 : [0, p1] + /// p1 < 0 : [p0, inf) + /// + /// + /// + /// + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern void llama_kv_cache_seq_div(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int d); + /// /// Allocates a batch of tokens on the heap /// Each token can be assigned up to n_seq_max sequence ids diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs index 2c5d8288..d90d46d5 100644 --- a/LLama/Native/SafeLLamaContextHandle.cs +++ b/LLama/Native/SafeLLamaContextHandle.cs @@ -21,7 +21,7 @@ namespace LLama.Native /// /// Total number of tokens in the context /// - public int ContextSize => NativeApi.llama_n_ctx(this); + public uint ContextSize => NativeApi.llama_n_ctx(this); /// /// Dimension of embedding vectors diff --git a/README.md b/README.md index a73fb3c7..c20a523e 100644 --- a/README.md +++ b/README.md @@ -222,6 +222,7 @@ If you want to compile llama.cpp yourself you **must** use the exact 
commit ID l
 | v0.7.0, v0.8.0 | [Thespis-13B](https://huggingface.co/TheBloke/Thespis-13B-v0.5-GGUF/tree/main?not-for-all-audiences=true), [LLaMA2-7B](https://huggingface.co/TheBloke/llama-2-7B-Guanaco-QLoRA-GGUF) | [`207b519`](https://github.com/ggerganov/llama.cpp/commit/207b51900e15cc7f89763a3bb1c565fe11cbb45d) |
 | v0.8.1 | | [`e937066`](https://github.com/ggerganov/llama.cpp/commit/e937066420b79a757bf80e9836eb12b88420a218) |
 | v0.9.0, v0.9.1 | [Mixtral-8x7B](https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF) | [`9fb13f9`](https://github.com/ggerganov/llama.cpp/blob/9fb13f95840c722ad419f390dc8a9c86080a3700) |
+| v0.10.0 | [Phi2](https://huggingface.co/TheBloke/phi-2-GGUF) | [`d71ac90`](https://github.com/ggerganov/llama.cpp/tree/d71ac90985854b0905e1abba778e407e17f9f887) |
 
 ## License
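
For downstream code, a minimal sketch of how the new `SplitMode` option and the `ContextSize` type change surface when configuring a model through `ModelParams`; the model path and the chosen values are illustrative placeholders:

```csharp
using LLama;
using LLama.Common;
using LLama.Native;

// Illustrative only: the path and values are placeholders.
var parameters = new ModelParams("path/to/model.gguf")
{
    // Distribute layers (and KV) across the available GPUs.
    SplitMode = GPUSplitMode.Layer,
    // MainGpu is ignored when SplitMode is Layer (see the IModelParams docs above).
    MainGpu = 0,
    GpuLayerCount = 20,
};

using var weights = LLamaWeights.LoadFromFile(parameters);
using var context = weights.CreateContext(parameters);

// ContextSize is now a uint; cast where an int is still required.
var contextSize = (int)context.ContextSize;
```

`GPUSplitMode.None` keeps the whole model on `MainGpu`, while `Layer` and `Row` spread the work across devices as described in `GPUSplitMode.cs`.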
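
The progress callback is another breaking change worth illustrating: `LlamaProgressCallback` now returns a `bool`, and returning `false` aborts the load. A hedged sketch of a compatible callback for code that fills in `LLamaModelParams.progress_callback` directly (the logging is illustrative, and the delegate must be kept alive for the duration of the native call):

```csharp
using System;
using LLama.Native;

static class ProgressCallbackSketch
{
    // Matches the updated LlamaProgressCallback signature: return true to keep
    // loading, or false to abort model loading immediately.
    private static bool OnProgress(float progress, IntPtr ctx)
    {
        Console.WriteLine($"Model loading: {progress:P0}");
        return true;
    }

    public static void Attach(ref LLamaModelParams modelParams)
    {
        modelParams.progress_callback = OnProgress;
    }
}
```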
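
Finally, the renamed capability checks (`llama_supports_mmap`, `llama_supports_mlock`) and the new `llama_supports_gpu_offload` can be probed before configuring `UseMemorymap`, `UseMemoryLock` or `GpuLayerCount`; a small sketch using only the entry points declared in `NativeApi.cs`:

```csharp
using System;
using LLama.Native;

// The old llama_mmap_supported()/llama_mlock_supported() names no longer exist.
if (!NativeApi.llama_supports_gpu_offload())
    Console.WriteLine("This llama.cpp build cannot offload layers to the GPU.");

if (!NativeApi.llama_supports_mmap())
    Console.WriteLine("Memory-mapped model loading (use_mmap) is not available.");

if (!NativeApi.llama_supports_mlock())
    Console.WriteLine("Memory locking (use_mlock) is not available.");
```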