@@ -28,7 +28,7 @@ namespace LLama.Unittest
 [Fact]
 public void CheckProperties()
 {
-Assert.Equal(768, _context.ContextSize);
+Assert.Equal(768u, _context.ContextSize);
 Assert.Equal(4096, _context.EmbeddingSize);
 Assert.Equal(32000, _context.VocabCount);
 }
@@ -23,6 +23,9 @@ namespace LLama.Web.Common
 /// <inheritdoc />
 public int MainGpu { get; set; } = 0;
+/// <inheritdoc />
+public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;
 /// <inheritdoc />
 public int GpuLayerCount { get; set; } = 20;
@@ -16,9 +16,28 @@ namespace LLama.Abstractions
 public interface IModelParams
 {
 /// <summary>
-/// the GPU that is used for scratch and small tensors
+/// main_gpu interpretation depends on split_mode:
+/// <list type="bullet">
+/// <item>
+/// <term>None</term>
+/// <description>The GPU that is used for the entire model.</description>
+/// </item>
+/// <item>
+/// <term>Row</term>
+/// <description>The GPU that is used for small tensors and intermediate results.</description>
+/// </item>
+/// <item>
+/// <term>Layer</term>
+/// <description>Ignored.</description>
+/// </item>
+/// </list>
 /// </summary>
-int MainGpu { get; }
+int MainGpu { get; set; }
+/// <summary>
+/// How to split the model across multiple GPUs
+/// </summary>
+GPUSplitMode SplitMode { get; }
 /// <summary>
 /// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
@@ -18,6 +18,9 @@ namespace LLama.Common
 /// <inheritdoc />
 public int MainGpu { get; set; } = 0;
+/// <inheritdoc />
+public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;
 /// <inheritdoc />
 public int GpuLayerCount { get; set; } = 20;
@@ -36,6 +36,9 @@ namespace LLama.Extensions
 result.yarn_orig_ctx = @params.YarnOriginalContext ?? 0;
 result.rope_scaling_type = @params.YarnScalingType ?? RopeScalingType.LLAMA_ROPE_SCALING_UNSPECIFIED;
+result.cb_eval = IntPtr.Zero;
+result.cb_eval_user_data = IntPtr.Zero;
 result.type_k = @params.TypeK ?? GGMLType.GGML_TYPE_F16;
 result.type_v = @params.TypeV ?? GGMLType.GGML_TYPE_F16;
 result.offload_kqv = !@params.NoKqvOffload;
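The new cb_eval fields sit alongside the existing K/V cache type options. As an illustration, a minimal sketch of selecting quantized K/V cache types through ModelParams; it assumes ModelParams exposes nullable TypeK/TypeV properties and that GGMLType has a GGML_TYPE_Q8_0 member, neither of which is shown in this diff:

```csharp
using LLama.Common;
using LLama.Native;

// Sketch only: quantize the KV cache to 8 bits to reduce VRAM use.
// When TypeK/TypeV are left null, the extension above falls back to GGML_TYPE_F16.
var parameters = new ModelParams("model.gguf")   // path is a placeholder
{
    TypeK = GGMLType.GGML_TYPE_Q8_0,   // assumed enum member (mirrors ggml_type)
    TypeV = GGMLType.GGML_TYPE_Q8_0,
};
```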
@@ -21,15 +21,16 @@ public static class IModelParamsExtensions
 /// <exception cref="ArgumentException"></exception>
 public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLamaModelParams result)
 {
-if (@params.UseMemoryLock && !NativeApi.llama_mlock_supported())
-throw new NotSupportedException("'UseMemoryLock' is not supported (llama_mlock_supported() == false)");
-if (@params.UseMemorymap && !NativeApi.llama_mmap_supported())
-throw new NotSupportedException("'UseMemorymap' is not supported (llama_mmap_supported() == false)");
+if (@params.UseMemoryLock && !NativeApi.llama_supports_mlock())
+throw new NotSupportedException("'UseMemoryLock' is not supported (llama_supports_mlock() == false)");
+if (@params.UseMemorymap && !NativeApi.llama_supports_mmap())
+throw new NotSupportedException("'UseMemorymap' is not supported (llama_supports_mmap() == false)");
 var disposer = new GroupDisposable();
 result = NativeApi.llama_model_default_params();
 result.main_gpu = @params.MainGpu;
+result.split_mode = @params.SplitMode;
 result.n_gpu_layers = @params.GpuLayerCount < 0 ? int.MaxValue : @params.GpuLayerCount;
 result.use_mlock = @params.UseMemoryLock;
 result.use_mmap = @params.UseMemorymap;
@@ -33,7 +33,7 @@ namespace LLama
 /// <summary>
 /// Total number of tokens in the context
 /// </summary>
-public int ContextSize => NativeHandle.ContextSize;
+public uint ContextSize => NativeHandle.ContextSize;
 /// <summary>
 /// Dimension of embedding vectors
@@ -323,7 +323,7 @@ namespace LLama
 var candidates_p = LLamaTokenDataArray.Create(logits);
 // Extract most recently returned tokens
-var last_n_repeat = Math.Min(ContextSize, repeatLastTokensCount);
+var last_n_repeat = Math.Min((int)ContextSize, repeatLastTokensCount);
 var last_n_array = lastTokens.TakeLast(last_n_repeat).ToArray();
 // Apply penalties to candidates
@@ -83,7 +83,7 @@ namespace LLama
 _pastTokensCount = 0;
 _consumedTokensCount = 0;
 _n_session_consumed = 0;
-_last_n_tokens = new FixedSizeQueue<LLamaToken>(Context.ContextSize);
+_last_n_tokens = new FixedSizeQueue<LLamaToken>((int)Context.ContextSize);
 _decoder = new StreamingTokenDecoder(context);
 }
@@ -170,7 +170,7 @@ namespace LLama
 _pastTokensCount = Math.Max(1, tokensToKeep);
 // insert n_left/2 tokens at the start of embed from last_n_tokens
-_embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip(Context.ContextSize - n_left / 2 - _embeds.Count));
+_embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip((int)Context.ContextSize - n_left / 2 - _embeds.Count));
 // stop saving session if we run out of context
 _pathSession = string.Empty;
@@ -179,7 +179,7 @@ namespace LLama
 if (_embed_inps.Count <= _consumedTokensCount && !args.WaitForInput)
 {
-var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? Context.ContextSize : inferenceParams.RepeatLastTokensCount;
+var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? (int)Context.ContextSize : inferenceParams.RepeatLastTokensCount;
 // optionally save the session on first sample (for faster prompt loading next time)
 if (!string.IsNullOrEmpty(_pathSession) && args.NeedToSaveSession)
@@ -0,0 +1,23 @@
+namespace LLama.Native;
+/// <summary>
+/// How to split the model across multiple GPUs
+/// </summary>
+/// <remarks>llama_split_mode</remarks>
+public enum GPUSplitMode
+{
+/// <summary>
+/// Single GPU
+/// </summary>
+None = 0,
+/// <summary>
+/// Split layers and KV across GPUs
+/// </summary>
+Layer = 1,
+/// <summary>
+/// Split rows across GPUs
+/// </summary>
+Row = 2,
+}
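To show how the new enum combines with the MainGpu semantics documented in IModelParams above, a minimal sketch using the ModelParams properties shown in this diff (the model path is a placeholder):

```csharp
using LLama.Common;
using LLama.Native;

// Split weight rows across the available GPUs; GPU 0 keeps the small tensors
// and intermediate results (see the main_gpu notes in IModelParams).
var parameters = new ModelParams("model.gguf")
{
    SplitMode = GPUSplitMode.Row,
    MainGpu = 0,
    GpuLayerCount = -1,   // negative values are mapped to int.MaxValue (offload everything)
};
```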
@@ -8,7 +8,8 @@ namespace LLama.Native
 /// </summary>
 /// <param name="progress"></param>
 /// <param name="ctx"></param>
-public delegate void LlamaProgressCallback(float progress, IntPtr ctx);
+/// <remarks>llama_progress_callback</remarks>
+public delegate bool LlamaProgressCallback(float progress, IntPtr ctx);
 /// <summary>
 /// A C# representation of the llama.cpp `llama_context_params` struct
@@ -46,37 +47,46 @@ namespace LLama.Native
 /// </summary>
 public RopeScalingType rope_scaling_type;
 /// <summary>
 /// RoPE base frequency, 0 = from model
 /// </summary>
 public float rope_freq_base;
 /// <summary>
 /// RoPE frequency scaling factor, 0 = from model
 /// </summary>
 public float rope_freq_scale;
 /// <summary>
 /// YaRN extrapolation mix factor, negative = from model
 /// </summary>
 public float yarn_ext_factor;
 /// <summary>
 /// YaRN magnitude scaling factor
 /// </summary>
 public float yarn_attn_factor;
 /// <summary>
 /// YaRN low correction dim
 /// </summary>
 public float yarn_beta_fast;
 /// <summary>
 /// YaRN high correction dim
 /// </summary>
 public float yarn_beta_slow;
 /// <summary>
 /// YaRN original context size
 /// </summary>
 public uint yarn_orig_ctx;
+/// <summary>
+/// ggml_backend_sched_eval_callback
+/// </summary>
+public IntPtr cb_eval;
+/// <summary>
+/// User data passed into cb_eval
+/// </summary>
+public IntPtr cb_eval_user_data;
 /// <summary>
 /// data type for K cache
 /// </summary>
@@ -106,6 +106,31 @@
 /// <remarks>Benchmark@7B: 5.15GB, +0.0044 ppl</remarks>
 LLAMA_FTYPE_MOSTLY_Q6_K = 18,
+/// <summary>
+/// except 1d tensors
+/// </summary>
+LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19,
+/// <summary>
+/// except 1d tensors
+/// </summary>
+LLAMA_FTYPE_MOSTLY_IQ2_XS = 20,
+/// <summary>
+/// except 1d tensors
+/// </summary>
+LLAMA_FTYPE_MOSTLY_Q2_K_S = 21,
+/// <summary>
+/// except 1d tensors
+/// </summary>
+LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22,
+/// <summary>
+/// except 1d tensors
+/// </summary>
+LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23,
 /// <summary>
 /// File type was not specified
 /// </summary>
@@ -14,6 +14,11 @@ namespace LLama.Native
 /// </summary>
 public int n_gpu_layers;
+/// <summary>
+/// how to split the model across multiple GPUs
+/// </summary>
+public GPUSplitMode split_mode;
 /// <summary>
 /// the GPU that is used for scratch and small tensors
 /// </summary>
@@ -25,7 +30,8 @@ namespace LLama.Native
 public float* tensor_split;
 /// <summary>
-/// called with a progress value between 0 and 1, pass NULL to disable
+/// called with a progress value between 0 and 1, pass NULL to disable. If the provided progress_callback
+/// returns true, model loading continues. If it returns false, model loading is immediately aborted.
 /// </summary>
 public LlamaProgressCallback progress_callback;
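A small sketch of a callback honouring the new bool contract, populating the native struct directly; how a higher-level API would surface this is not part of this diff:

```csharp
using System;
using System.Threading;
using LLama.Native;

var cts = new CancellationTokenSource();

// Return true to continue loading, false to abort immediately.
LlamaProgressCallback onProgress = (progress, ctx) =>
{
    Console.WriteLine($"Loading: {progress:P0}");
    return !cts.IsCancellationRequested;
};

var modelParams = NativeApi.llama_model_default_params();
modelParams.progress_callback = onProgress;
// Keep `onProgress` reachable for as long as native code may call it,
// otherwise the GC may collect the delegate behind the function pointer.
```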
@@ -6,6 +6,7 @@ namespace LLama.Native
 /// <summary>
 /// Quantizer parameters used in the native API
 /// </summary>
+/// <remarks>llama_model_quantize_params</remarks>
 [StructLayout(LayoutKind.Sequential)]
 public struct LLamaModelQuantizeParams
 {
@@ -58,5 +59,10 @@ namespace LLama.Native
 set => _pure = Convert.ToSByte(value);
 }
 private sbyte _pure;
+/// <summary>
+/// pointer to importance matrix data
+/// </summary>
+public IntPtr imatrix;
 }
 }
@@ -10,9 +10,8 @@ namespace LLama.Native
 /// <param name="fname_inp"></param>
 /// <param name="fname_out"></param>
 /// <param name="param"></param>
-/// <remarks>not great API - very likely to change</remarks>
 /// <returns>Returns 0 on success</returns>
 [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-public static extern unsafe int llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param);
+public static extern unsafe uint llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param);
 }
 }
@@ -27,11 +27,12 @@ namespace LLama.Native
 /// Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
 /// </summary>
 /// <param name="ctx"></param>
-/// <param name="candidates">A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.</param>
-/// <param name="guidance_ctx">A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.</param>
+/// <param name="logits">Logits extracted from the original generation context.</param>
+/// <param name="logits_guidance">Logits extracted from a separate context from the same model.
+/// Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.</param>
 /// <param name="scale">Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.</param>
 [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-public static extern void llama_sample_classifier_free_guidance(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, SafeLLamaContextHandle guidance_ctx, float scale);
+public static extern unsafe void llama_sample_apply_guidance(SafeLLamaContextHandle ctx, float* logits, float* logits_guidance, float scale);
 /// <summary>
 /// Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
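A rough sketch of calling the reworked guidance entry point; it assumes the caller has already copied the raw logits out of the main context and the guidance context into float arrays (obtaining them is outside this diff):

```csharp
using LLama.Native;

static unsafe void ApplyGuidance(SafeLLamaContextHandle ctx, float[] logits, float[] guidanceLogits, float scale)
{
    // Both arrays are expected to be vocabulary-sized logit vectors.
    fixed (float* logitsPtr = logits)
    fixed (float* guidancePtr = guidanceLogits)
    {
        // scale = 1.0f means no guidance; larger values pull harder towards
        // the guided distribution, as described in the doc comment above.
        NativeApi.llama_sample_apply_guidance(ctx, logitsPtr, guidancePtr, scale);
    }
}
```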
@@ -92,6 +93,17 @@ namespace LLama.Native
 [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
 public static extern void llama_sample_typical(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float p, ulong min_keep);
+/// <summary>
+/// Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
+/// </summary>
+/// <param name="ctx"></param>
+/// <param name="candidates">Pointer to LLamaTokenDataArray</param>
+/// <param name="min_temp"></param>
+/// <param name="max_temp"></param>
+/// <param name="exponent_val"></param>
+[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+public static extern void llama_sample_entropy(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float min_temp, float max_temp, float exponent_val);
 /// <summary>
 /// Modify logits by temperature
 /// </summary>
@@ -23,7 +23,7 @@ namespace LLama.Native
 /// <returns></returns>
 public static void llama_empty_call()
 {
-llama_mmap_supported();
+llama_max_devices();
 }
 /// <summary>
@@ -31,7 +31,7 @@ namespace LLama.Native
 /// </summary>
 /// <returns></returns>
 [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-public static extern int llama_max_devices();
+public static extern long llama_max_devices();
 /// <summary>
 /// Create a LLamaModelParams with default values
@@ -59,14 +59,21 @@ namespace LLama.Native
 /// </summary>
 /// <returns></returns>
 [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-public static extern bool llama_mmap_supported();
+public static extern bool llama_supports_mmap();
 /// <summary>
-/// Check if memory lockingis supported
+/// Check if memory locking is supported
 /// </summary>
 /// <returns></returns>
 [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-public static extern bool llama_mlock_supported();
+public static extern bool llama_supports_mlock();
+/// <summary>
+/// Check if GPU offload is supported
+/// </summary>
+/// <returns></returns>
+[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+public static extern bool llama_supports_gpu_offload();
 /// <summary>
 /// Initialize the llama + ggml backend
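For reference, a minimal sketch of gating configuration on the renamed capability checks (mirroring the guards in IModelParamsExtensions earlier in this change); the UseMemoryLock setter on ModelParams is assumed:

```csharp
using System;
using LLama.Common;
using LLama.Native;

var parameters = new ModelParams("model.gguf");   // path is a placeholder

// Only request GPU offload / memory locking when the loaded native library supports them.
if (!NativeApi.llama_supports_gpu_offload())
    parameters.GpuLayerCount = 0;

if (NativeApi.llama_supports_mlock())
    parameters.UseMemoryLock = true;

Console.WriteLine($"mmap supported: {NativeApi.llama_supports_mmap()}");
Console.WriteLine($"max devices: {NativeApi.llama_max_devices()}");
```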
@@ -163,7 +170,10 @@ namespace LLama.Native
 /// <param name="ctx"></param>
 /// <returns></returns>
 [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-public static extern int llama_n_ctx(SafeLLamaContextHandle ctx);
+public static extern uint llama_n_ctx(SafeLLamaContextHandle ctx);
+[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+public static extern uint llama_n_batch(SafeLLamaContextHandle ctx);
 /// <summary>
 /// Token logits obtained from the last call to llama_eval()
@@ -380,6 +390,20 @@ namespace LLama.Native
 [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
 public static extern void llama_kv_cache_seq_shift(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, LLamaPos delta);
+/// <summary>
+/// Integer division of the positions by factor of `d > 1`
+/// If the KV cache is RoPEd, the KV data is updated accordingly
+/// p0 < 0 : [0, p1]
+/// p1 < 0 : [p0, inf)
+/// </summary>
+/// <param name="ctx"></param>
+/// <param name="seq"></param>
+/// <param name="p0"></param>
+/// <param name="p1"></param>
+/// <param name="d"></param>
+[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+public static extern void llama_kv_cache_seq_div(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int d);
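A sketch of what a caller of the new division routine could look like, e.g. for self-extend style position compression; it assumes LLamaSeqId and LLamaPos can be constructed from plain ints, which is not shown in this diff:

```csharp
using LLama.Native;

// Divide the positions of the first `n` cached tokens of sequence 0 by 2,
// halving their effective positions (the RoPEd KV data is updated accordingly).
static void CompressPositions(SafeLLamaContextHandle ctx, int n)
{
    var seq = new LLamaSeqId(0);   // assumed int constructor
    var p0 = new LLamaPos(0);      // assumed int constructor
    var p1 = new LLamaPos(n);
    NativeApi.llama_kv_cache_seq_div(ctx, seq, p0, p1, 2);
}
```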
 /// <summary>
 /// Allocates a batch of tokens on the heap
 /// Each token can be assigned up to n_seq_max sequence ids
@@ -21,7 +21,7 @@ namespace LLama.Native
 /// <summary>
 /// Total number of tokens in the context
 /// </summary>
-public int ContextSize => NativeApi.llama_n_ctx(this);
+public uint ContextSize => NativeApi.llama_n_ctx(this);
 /// <summary>
 /// Dimension of embedding vectors
@@ -222,6 +222,7 @@ If you want to compile llama.cpp yourself you **must** use the exact commit ID l
 | v0.7.0, v0.8.0 | [Thespis-13B](https://huggingface.co/TheBloke/Thespis-13B-v0.5-GGUF/tree/main?not-for-all-audiences=true), [LLaMA2-7B](https://huggingface.co/TheBloke/llama-2-7B-Guanaco-QLoRA-GGUF) | [`207b519`](https://github.com/ggerganov/llama.cpp/commit/207b51900e15cc7f89763a3bb1c565fe11cbb45d) |
 | v0.8.1 | | [`e937066`](https://github.com/ggerganov/llama.cpp/commit/e937066420b79a757bf80e9836eb12b88420a218) |
 | v0.9.0, v0.9.1 | [Mixtral-8x7B](https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF) | [`9fb13f9`](https://github.com/ggerganov/llama.cpp/blob/9fb13f95840c722ad419f390dc8a9c86080a3700) |
+| v0.10.0 | [Phi2](https://huggingface.co/TheBloke/phi-2-GGUF) | [`d71ac90`](https://github.com/ggerganov/llama.cpp/tree/d71ac90985854b0905e1abba778e407e17f9f887) |
 ## License