@@ -28,7 +28,7 @@ namespace LLama.Unittest
 [Fact]
 public void CheckProperties()
 {
-Assert.Equal(768, _context.ContextSize);
+Assert.Equal(768u, _context.ContextSize);
 Assert.Equal(4096, _context.EmbeddingSize);
 Assert.Equal(32000, _context.VocabCount);
 }
@@ -23,6 +23,9 @@ namespace LLama.Web.Common
 /// <inheritdoc />
 public int MainGpu { get; set; } = 0;
+/// <inheritdoc />
+public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;
 /// <inheritdoc />
 public int GpuLayerCount { get; set; } = 20;
@@ -16,9 +16,28 @@ namespace LLama.Abstractions
 public interface IModelParams
 {
 /// <summary>
-/// the GPU that is used for scratch and small tensors
+/// main_gpu interpretation depends on split_mode:
+/// <list type="bullet">
+/// <item>
+/// <term>None</term>
+/// <description>The GPU that is used for the entire model.</description>
+/// </item>
+/// <item>
+/// <term>Row</term>
+/// <description>The GPU that is used for small tensors and intermediate results.</description>
+/// </item>
+/// <item>
+/// <term>Layer</term>
+/// <description>Ignored.</description>
+/// </item>
+/// </list>
 /// </summary>
-int MainGpu { get; }
+int MainGpu { get; set; }
+/// <summary>
+/// How to split the model across multiple GPUs
+/// </summary>
+GPUSplitMode SplitMode { get; }
 /// <summary>
 /// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
@@ -18,6 +18,9 @@ namespace LLama.Common
 /// <inheritdoc />
 public int MainGpu { get; set; } = 0;
+/// <inheritdoc />
+public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;
 /// <inheritdoc />
 public int GpuLayerCount { get; set; } = 20;
@@ -36,6 +36,9 @@ namespace LLama.Extensions
 result.yarn_orig_ctx = @params.YarnOriginalContext ?? 0;
 result.rope_scaling_type = @params.YarnScalingType ?? RopeScalingType.LLAMA_ROPE_SCALING_UNSPECIFIED;
+result.cb_eval = IntPtr.Zero;
+result.cb_eval_user_data = IntPtr.Zero;
 result.type_k = @params.TypeK ?? GGMLType.GGML_TYPE_F16;
 result.type_v = @params.TypeV ?? GGMLType.GGML_TYPE_F16;
 result.offload_kqv = !@params.NoKqvOffload;
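The new cb_eval fields sit alongside the existing K/V cache type options. As an illustration, a minimal sketch of selecting quantized K/V cache types through ModelParams; it assumes ModelParams exposes nullable TypeK/TypeV properties and that GGMLType has a GGML_TYPE_Q8_0 member, neither of which is shown in this diff:

```csharp
using LLama.Common;
using LLama.Native;

// Sketch only: quantize the KV cache to 8 bits to reduce VRAM use.
// When TypeK/TypeV are left null, the extension above falls back to GGML_TYPE_F16.
var parameters = new ModelParams("model.gguf")   // path is a placeholder
{
    TypeK = GGMLType.GGML_TYPE_Q8_0,   // assumed enum member (mirrors ggml_type)
    TypeV = GGMLType.GGML_TYPE_Q8_0,
};
```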
@@ -21,15 +21,16 @@ public static class IModelParamsExtensions
 /// <exception cref="ArgumentException"></exception>
 public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLamaModelParams result)
 {
-if (@params.UseMemoryLock && !NativeApi.llama_mlock_supported())
-throw new NotSupportedException("'UseMemoryLock' is not supported (llama_mlock_supported() == false)");
-if (@params.UseMemorymap && !NativeApi.llama_mmap_supported())
-throw new NotSupportedException("'UseMemorymap' is not supported (llama_mmap_supported() == false)");
+if (@params.UseMemoryLock && !NativeApi.llama_supports_mlock())
+throw new NotSupportedException("'UseMemoryLock' is not supported (llama_supports_mlock() == false)");
+if (@params.UseMemorymap && !NativeApi.llama_supports_mmap())
+throw new NotSupportedException("'UseMemorymap' is not supported (llama_supports_mmap() == false)");
 var disposer = new GroupDisposable();
 result = NativeApi.llama_model_default_params();
 result.main_gpu = @params.MainGpu;
+result.split_mode = @params.SplitMode;
 result.n_gpu_layers = @params.GpuLayerCount < 0 ? int.MaxValue : @params.GpuLayerCount;
 result.use_mlock = @params.UseMemoryLock;
 result.use_mmap = @params.UseMemorymap;
@@ -33,7 +33,7 @@ namespace LLama
 /// <summary>
 /// Total number of tokens in the context
 /// </summary>
-public int ContextSize => NativeHandle.ContextSize;
+public uint ContextSize => NativeHandle.ContextSize;
 /// <summary>
 /// Dimension of embedding vectors
@@ -323,7 +323,7 @@ namespace LLama
 var candidates_p = LLamaTokenDataArray.Create(logits);
 // Extract most recently returned tokens
-var last_n_repeat = Math.Min(ContextSize, repeatLastTokensCount);
+var last_n_repeat = Math.Min((int)ContextSize, repeatLastTokensCount);
 var last_n_array = lastTokens.TakeLast(last_n_repeat).ToArray();
 // Apply penalties to candidates
@@ -83,7 +83,7 @@ namespace LLama
 _pastTokensCount = 0;
 _consumedTokensCount = 0;
 _n_session_consumed = 0;
-_last_n_tokens = new FixedSizeQueue<LLamaToken>(Context.ContextSize);
+_last_n_tokens = new FixedSizeQueue<LLamaToken>((int)Context.ContextSize);
 _decoder = new StreamingTokenDecoder(context);
 }
@@ -170,7 +170,7 @@ namespace LLama
 _pastTokensCount = Math.Max(1, tokensToKeep);
 // insert n_left/2 tokens at the start of embed from last_n_tokens
-_embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip(Context.ContextSize - n_left / 2 - _embeds.Count));
+_embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip((int)Context.ContextSize - n_left / 2 - _embeds.Count));
 // stop saving session if we run out of context
 _pathSession = string.Empty;
@@ -179,7 +179,7 @@ namespace LLama
 if (_embed_inps.Count <= _consumedTokensCount && !args.WaitForInput)
 {
-var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? Context.ContextSize : inferenceParams.RepeatLastTokensCount;
+var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? (int)Context.ContextSize : inferenceParams.RepeatLastTokensCount;
 // optionally save the session on first sample (for faster prompt loading next time)
 if (!string.IsNullOrEmpty(_pathSession) && args.NeedToSaveSession)
@@ -0,0 +1,23 @@
+namespace LLama.Native;
+/// <summary>
+/// How to split the model across multiple GPUs
+/// </summary>
+/// <remarks>llama_split_mode</remarks>
+public enum GPUSplitMode
+{
+/// <summary>
+/// Single GPU
+/// </summary>
+None = 0,
+/// <summary>
+/// Split layers and KV across GPUs
+/// </summary>
+Layer = 1,
+/// <summary>
+/// Split rows across GPUs
+/// </summary>
+Row = 2,
+}
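To show how the new enum combines with the MainGpu semantics documented in IModelParams above, a minimal sketch using the ModelParams properties shown in this diff (the model path is a placeholder):

```csharp
using LLama.Common;
using LLama.Native;

// Split weight rows across the available GPUs; GPU 0 keeps the small tensors
// and intermediate results (see the main_gpu notes in IModelParams).
var parameters = new ModelParams("model.gguf")
{
    SplitMode = GPUSplitMode.Row,
    MainGpu = 0,
    GpuLayerCount = -1,   // negative values are mapped to int.MaxValue (offload everything)
};
```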
@@ -8,7 +8,8 @@ namespace LLama.Native
 /// </summary>
 /// <param name="progress"></param>
 /// <param name="ctx"></param>
-public delegate void LlamaProgressCallback(float progress, IntPtr ctx);
+/// <remarks>llama_progress_callback</remarks>
+public delegate bool LlamaProgressCallback(float progress, IntPtr ctx);
 /// <summary>
 /// A C# representation of the llama.cpp `llama_context_params` struct
@@ -46,37 +47,46 @@ namespace LLama.Native
 /// </summary>
 public RopeScalingType rope_scaling_type;
 /// <summary>
 /// RoPE base frequency, 0 = from model
 /// </summary>
 public float rope_freq_base;
 /// <summary>
 /// RoPE frequency scaling factor, 0 = from model
 /// </summary>
 public float rope_freq_scale;
 /// <summary>
 /// YaRN extrapolation mix factor, negative = from model
 /// </summary>
 public float yarn_ext_factor;
 /// <summary>
 /// YaRN magnitude scaling factor
 /// </summary>
 public float yarn_attn_factor;
 /// <summary>
 /// YaRN low correction dim
 /// </summary>
 public float yarn_beta_fast;
 /// <summary>
 /// YaRN high correction dim
 /// </summary>
 public float yarn_beta_slow;
 /// <summary>
 /// YaRN original context size
 /// </summary>
 public uint yarn_orig_ctx;
+/// <summary>
+/// ggml_backend_sched_eval_callback
+/// </summary>
+public IntPtr cb_eval;
+/// <summary>
+/// User data passed into cb_eval
+/// </summary>
+public IntPtr cb_eval_user_data;
 /// <summary>
 /// data type for K cache
 /// </summary>
@@ -106,6 +106,31 @@
 /// <remarks>Benchmark@7B: 5.15GB, +0.0044 ppl</remarks>
 LLAMA_FTYPE_MOSTLY_Q6_K = 18,
+/// <summary>
+/// except 1d tensors
+/// </summary>
+LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19,
+/// <summary>
+/// except 1d tensors
+/// </summary>
+LLAMA_FTYPE_MOSTLY_IQ2_XS = 20,
+/// <summary>
+/// except 1d tensors
+/// </summary>
+LLAMA_FTYPE_MOSTLY_Q2_K_S = 21,
+/// <summary>
+/// except 1d tensors
+/// </summary>
+LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22,
+/// <summary>
+/// except 1d tensors
+/// </summary>
+LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23,
 /// <summary>
 /// File type was not specified
 /// </summary>
@@ -14,6 +14,11 @@ namespace LLama.Native
 /// </summary>
 public int n_gpu_layers;
+/// <summary>
+/// how to split the model across multiple GPUs
+/// </summary>
+public GPUSplitMode split_mode;
 /// <summary>
 /// the GPU that is used for scratch and small tensors
 /// </summary>
@@ -25,7 +30,8 @@ namespace LLama.Native
 public float* tensor_split;
 /// <summary>
-/// called with a progress value between 0 and 1, pass NULL to disable
+/// called with a progress value between 0 and 1, pass NULL to disable. If the provided progress_callback
+/// returns true, model loading continues. If it returns false, model loading is immediately aborted.
 /// </summary>
 public LlamaProgressCallback progress_callback;
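A small sketch of a callback honouring the new bool contract, populating the native struct directly; how a higher-level API would surface this is not part of this diff:

```csharp
using System;
using System.Threading;
using LLama.Native;

var cts = new CancellationTokenSource();

// Return true to continue loading, false to abort immediately.
LlamaProgressCallback onProgress = (progress, ctx) =>
{
    Console.WriteLine($"Loading: {progress:P0}");
    return !cts.IsCancellationRequested;
};

var modelParams = NativeApi.llama_model_default_params();
modelParams.progress_callback = onProgress;
// Keep `onProgress` reachable for as long as native code may call it,
// otherwise the GC may collect the delegate behind the function pointer.
```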
@@ -6,6 +6,7 @@ namespace LLama.Native
 /// <summary>
 /// Quantizer parameters used in the native API
 /// </summary>
+/// <remarks>llama_model_quantize_params</remarks>
 [StructLayout(LayoutKind.Sequential)]
 public struct LLamaModelQuantizeParams
 {
@@ -58,5 +59,10 @@ namespace LLama.Native
 set => _pure = Convert.ToSByte(value);
 }
 private sbyte _pure;
+/// <summary>
+/// pointer to importance matrix data
+/// </summary>
+public IntPtr imatrix;
 }
 }
@@ -10,9 +10,8 @@ namespace LLama.Native
 /// <param name="fname_inp"></param>
 /// <param name="fname_out"></param>
 /// <param name="param"></param>
-/// <remarks>not great API - very likely to change</remarks>
 /// <returns>Returns 0 on success</returns>
 [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-public static extern unsafe int llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param);
+public static extern unsafe uint llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param);
 }
 }
@@ -27,11 +27,12 @@ namespace LLama.Native
 /// Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
 /// </summary>
 /// <param name="ctx"></param>
-/// <param name="candidates">A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.</param>
-/// <param name="guidance_ctx">A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.</param>
+/// <param name="logits">Logits extracted from the original generation context.</param>
+/// <param name="logits_guidance">Logits extracted from a separate context from the same model.
+/// Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.</param>
 /// <param name="scale">Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.</param>
 [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-public static extern void llama_sample_classifier_free_guidance(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, SafeLLamaContextHandle guidance_ctx, float scale);
+public static extern unsafe void llama_sample_apply_guidance(SafeLLamaContextHandle ctx, float* logits, float* logits_guidance, float scale);
 /// <summary>
 /// Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
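A rough sketch of calling the reworked guidance entry point; it assumes the caller has already copied the raw logits out of the main context and the guidance context into float arrays (obtaining them is outside this diff):

```csharp
using LLama.Native;

static unsafe void ApplyGuidance(SafeLLamaContextHandle ctx, float[] logits, float[] guidanceLogits, float scale)
{
    // Both arrays are expected to be vocabulary-sized logit vectors.
    fixed (float* logitsPtr = logits)
    fixed (float* guidancePtr = guidanceLogits)
    {
        // scale = 1.0f means no guidance; larger values pull harder towards
        // the guided distribution, as described in the doc comment above.
        NativeApi.llama_sample_apply_guidance(ctx, logitsPtr, guidancePtr, scale);
    }
}
```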
@@ -92,6 +93,17 @@ namespace LLama.Native
 [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
 public static extern void llama_sample_typical(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float p, ulong min_keep);
+/// <summary>
+/// Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
+/// </summary>
+/// <param name="ctx"></param>
+/// <param name="candidates">Pointer to LLamaTokenDataArray</param>
+/// <param name="min_temp"></param>
+/// <param name="max_temp"></param>
+/// <param name="exponent_val"></param>
+[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+public static extern void llama_sample_entropy(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float min_temp, float max_temp, float exponent_val);
 /// <summary>
 /// Modify logits by temperature
 /// </summary>
@@ -23,7 +23,7 @@ namespace LLama.Native
 /// <returns></returns>
 public static void llama_empty_call()
 {
-llama_mmap_supported();
+llama_max_devices();
 }
 /// <summary>
@@ -31,7 +31,7 @@ namespace LLama.Native
 /// </summary>
 /// <returns></returns>
 [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-public static extern int llama_max_devices();
+public static extern long llama_max_devices();
 /// <summary>
 /// Create a LLamaModelParams with default values
@@ -59,14 +59,21 @@ namespace LLama.Native
 /// </summary>
 /// <returns></returns>
 [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-public static extern bool llama_mmap_supported();
+public static extern bool llama_supports_mmap();
 /// <summary>
-/// Check if memory lockingis supported
+/// Check if memory locking is supported
 /// </summary>
 /// <returns></returns>
 [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-public static extern bool llama_mlock_supported();
+public static extern bool llama_supports_mlock();
+/// <summary>
+/// Check if GPU offload is supported
+/// </summary>
+/// <returns></returns>
+[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+public static extern bool llama_supports_gpu_offload();
 /// <summary>
 /// Initialize the llama + ggml backend
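For reference, a minimal sketch of gating configuration on the renamed capability checks (mirroring the guards in IModelParamsExtensions earlier in this change); the UseMemoryLock setter on ModelParams is assumed:

```csharp
using System;
using LLama.Common;
using LLama.Native;

var parameters = new ModelParams("model.gguf");   // path is a placeholder

// Only request GPU offload / memory locking when the loaded native library supports them.
if (!NativeApi.llama_supports_gpu_offload())
    parameters.GpuLayerCount = 0;

if (NativeApi.llama_supports_mlock())
    parameters.UseMemoryLock = true;

Console.WriteLine($"mmap supported: {NativeApi.llama_supports_mmap()}");
Console.WriteLine($"max devices: {NativeApi.llama_max_devices()}");
```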
@@ -163,7 +170,10 @@ namespace LLama.Native
 /// <param name="ctx"></param>
 /// <returns></returns>
 [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-public static extern int llama_n_ctx(SafeLLamaContextHandle ctx);
+public static extern uint llama_n_ctx(SafeLLamaContextHandle ctx);
+[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+public static extern uint llama_n_batch(SafeLLamaContextHandle ctx);
 /// <summary>
 /// Token logits obtained from the last call to llama_eval()
@@ -380,6 +390,20 @@ namespace LLama.Native
 [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
 public static extern void llama_kv_cache_seq_shift(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, LLamaPos delta);
+/// <summary>
+/// Integer division of the positions by factor of `d > 1`
+/// If the KV cache is RoPEd, the KV data is updated accordingly
+/// p0 < 0 : [0, p1]
+/// p1 < 0 : [p0, inf)
+/// </summary>
+/// <param name="ctx"></param>
+/// <param name="seq"></param>
+/// <param name="p0"></param>
+/// <param name="p1"></param>
+/// <param name="d"></param>
+[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+public static extern void llama_kv_cache_seq_div(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int d);
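A sketch of what a caller of the new division routine could look like, e.g. for self-extend style position compression; it assumes LLamaSeqId and LLamaPos can be constructed from plain ints, which is not shown in this diff:

```csharp
using LLama.Native;

// Divide the positions of the first `n` cached tokens of sequence 0 by 2,
// halving their effective positions (the RoPEd KV data is updated accordingly).
static void CompressPositions(SafeLLamaContextHandle ctx, int n)
{
    var seq = new LLamaSeqId(0);   // assumed int constructor
    var p0 = new LLamaPos(0);      // assumed int constructor
    var p1 = new LLamaPos(n);
    NativeApi.llama_kv_cache_seq_div(ctx, seq, p0, p1, 2);
}
```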
 /// <summary>
 /// Allocates a batch of tokens on the heap
 /// Each token can be assigned up to n_seq_max sequence ids
@@ -21,7 +21,7 @@ namespace LLama.Native
 /// <summary>
 /// Total number of tokens in the context
 /// </summary>
-public int ContextSize => NativeApi.llama_n_ctx(this);
+public uint ContextSize => NativeApi.llama_n_ctx(this);
 /// <summary>
 /// Dimension of embedding vectors
@@ -222,6 +222,7 @@ If you want to compile llama.cpp yourself you **must** use the exact commit ID l
 | v0.7.0, v0.8.0 | [Thespis-13B](https://huggingface.co/TheBloke/Thespis-13B-v0.5-GGUF/tree/main?not-for-all-audiences=true), [LLaMA2-7B](https://huggingface.co/TheBloke/llama-2-7B-Guanaco-QLoRA-GGUF) | [`207b519`](https://github.com/ggerganov/llama.cpp/commit/207b51900e15cc7f89763a3bb1c565fe11cbb45d) |
 | v0.8.1 | | [`e937066`](https://github.com/ggerganov/llama.cpp/commit/e937066420b79a757bf80e9836eb12b88420a218) |
 | v0.9.0, v0.9.1 | [Mixtral-8x7B](https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF) | [`9fb13f9`](https://github.com/ggerganov/llama.cpp/blob/9fb13f95840c722ad419f390dc8a9c86080a3700) |
+| v0.10.0 | [Phi2](https://huggingface.co/TheBloke/phi-2-GGUF) | [`d71ac90`](https://github.com/ggerganov/llama.cpp/tree/d71ac90985854b0905e1abba778e407e17f9f887) |
 ## License