From 15a98b36d85810cc98be2d621d83c84b69499448 Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Thu, 1 Feb 2024 16:35:05 +0000 Subject: [PATCH] Updated everything to work with llama.cpp ce32060198b7e2d6a13a9b8e1e1369e3c295ae2a --- LLama.Unittest/LLamaContextTests.cs | 2 +- LLama.Web/Common/ModelOptions.cs | 3 ++ LLama/Abstractions/IModelParams.cs | 23 +++++++++++-- LLama/Common/ModelParams.cs | 3 ++ LLama/Extensions/IContextParamsExtensions.cs | 3 ++ LLama/Extensions/IModelParamsExtensions.cs | 9 ++--- LLama/LLamaContext.cs | 4 +-- LLama/LLamaExecutorBase.cs | 4 +-- LLama/LLamaInteractExecutor.cs | 2 +- LLama/Native/GPUSplitMode.cs | 23 +++++++++++++ LLama/Native/LLamaContextParams.cs | 26 +++++++++----- LLama/Native/LLamaFtype.cs | 25 ++++++++++++++ LLama/Native/LLamaModelParams.cs | 8 ++++- LLama/Native/LLamaModelQuantizeParams.cs | 6 ++++ LLama/Native/NativeApi.Quantize.cs | 3 +- LLama/Native/NativeApi.Sampling.cs | 18 ++++++++-- LLama/Native/NativeApi.cs | 36 ++++++++++++++++---- LLama/Native/SafeLLamaContextHandle.cs | 2 +- README.md | 1 + 19 files changed, 168 insertions(+), 33 deletions(-) create mode 100644 LLama/Native/GPUSplitMode.cs diff --git a/LLama.Unittest/LLamaContextTests.cs b/LLama.Unittest/LLamaContextTests.cs index 345e518d..ab27d988 100644 --- a/LLama.Unittest/LLamaContextTests.cs +++ b/LLama.Unittest/LLamaContextTests.cs @@ -28,7 +28,7 @@ namespace LLama.Unittest [Fact] public void CheckProperties() { - Assert.Equal(768, _context.ContextSize); + Assert.Equal(768u, _context.ContextSize); Assert.Equal(4096, _context.EmbeddingSize); Assert.Equal(32000, _context.VocabCount); } diff --git a/LLama.Web/Common/ModelOptions.cs b/LLama.Web/Common/ModelOptions.cs index 7b770b38..e462401a 100644 --- a/LLama.Web/Common/ModelOptions.cs +++ b/LLama.Web/Common/ModelOptions.cs @@ -23,6 +23,9 @@ namespace LLama.Web.Common /// public int MainGpu { get; set; } = 0; + /// + public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None; + /// public int GpuLayerCount { get; set; } = 20; diff --git a/LLama/Abstractions/IModelParams.cs b/LLama/Abstractions/IModelParams.cs index 3ef41bec..73e03b99 100644 --- a/LLama/Abstractions/IModelParams.cs +++ b/LLama/Abstractions/IModelParams.cs @@ -16,9 +16,28 @@ namespace LLama.Abstractions public interface IModelParams { /// - /// the GPU that is used for scratch and small tensors + /// main_gpu interpretation depends on split_mode: + /// + /// + /// None + /// The GPU that is used for the entire mode. + /// + /// + /// Row + /// The GPU that is used for small tensors and intermediate results. + /// + /// + /// Layer + /// Ignored. 
+ /// + /// /// - int MainGpu { get; } + int MainGpu { get; set; } + + /// + /// How to split the model across multiple GPUs + /// + GPUSplitMode SplitMode { get; } /// /// Number of layers to run in VRAM / GPU memory (n_gpu_layers) diff --git a/LLama/Common/ModelParams.cs b/LLama/Common/ModelParams.cs index b124b84d..3afee9cb 100644 --- a/LLama/Common/ModelParams.cs +++ b/LLama/Common/ModelParams.cs @@ -18,6 +18,9 @@ namespace LLama.Common /// public int MainGpu { get; set; } = 0; + /// + public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None; + /// public int GpuLayerCount { get; set; } = 20; diff --git a/LLama/Extensions/IContextParamsExtensions.cs b/LLama/Extensions/IContextParamsExtensions.cs index 21273617..cd3075ab 100644 --- a/LLama/Extensions/IContextParamsExtensions.cs +++ b/LLama/Extensions/IContextParamsExtensions.cs @@ -36,6 +36,9 @@ namespace LLama.Extensions result.yarn_orig_ctx = @params.YarnOriginalContext ?? 0; result.rope_scaling_type = @params.YarnScalingType ?? RopeScalingType.LLAMA_ROPE_SCALING_UNSPECIFIED; + result.cb_eval = IntPtr.Zero; + result.cb_eval_user_data = IntPtr.Zero; + result.type_k = @params.TypeK ?? GGMLType.GGML_TYPE_F16; result.type_k = @params.TypeV ?? GGMLType.GGML_TYPE_F16; result.offload_kqv = !@params.NoKqvOffload; diff --git a/LLama/Extensions/IModelParamsExtensions.cs b/LLama/Extensions/IModelParamsExtensions.cs index f7fadece..69b9e288 100644 --- a/LLama/Extensions/IModelParamsExtensions.cs +++ b/LLama/Extensions/IModelParamsExtensions.cs @@ -21,15 +21,16 @@ public static class IModelParamsExtensions /// public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLamaModelParams result) { - if (@params.UseMemoryLock && !NativeApi.llama_mlock_supported()) - throw new NotSupportedException("'UseMemoryLock' is not supported (llama_mlock_supported() == false)"); - if (@params.UseMemorymap && !NativeApi.llama_mmap_supported()) - throw new NotSupportedException("'UseMemorymap' is not supported (llama_mmap_supported() == false)"); + if (@params.UseMemoryLock && !NativeApi.llama_supports_mlock()) + throw new NotSupportedException("'UseMemoryLock' is not supported (llama_supports_mlock() == false)"); + if (@params.UseMemorymap && !NativeApi.llama_supports_mmap()) + throw new NotSupportedException("'UseMemorymap' is not supported (llama_supports_mmap() == false)"); var disposer = new GroupDisposable(); result = NativeApi.llama_model_default_params(); result.main_gpu = @params.MainGpu; + result.split_mode = @params.SplitMode; result.n_gpu_layers = @params.GpuLayerCount < 0 ? 
int.MaxValue : @params.GpuLayerCount; result.use_mlock = @params.UseMemoryLock; result.use_mmap = @params.UseMemorymap; diff --git a/LLama/LLamaContext.cs b/LLama/LLamaContext.cs index 6d39a8f9..5d026b67 100644 --- a/LLama/LLamaContext.cs +++ b/LLama/LLamaContext.cs @@ -33,7 +33,7 @@ namespace LLama /// /// Total number of tokens in the context /// - public int ContextSize => NativeHandle.ContextSize; + public uint ContextSize => NativeHandle.ContextSize; /// /// Dimension of embedding vectors @@ -323,7 +323,7 @@ namespace LLama var candidates_p = LLamaTokenDataArray.Create(logits); // Extract most recently returned tokens - var last_n_repeat = Math.Min(ContextSize, repeatLastTokensCount); + var last_n_repeat = Math.Min((int)ContextSize, repeatLastTokensCount); var last_n_array = lastTokens.TakeLast(last_n_repeat).ToArray(); // Apply penalties to candidates diff --git a/LLama/LLamaExecutorBase.cs b/LLama/LLamaExecutorBase.cs index 4713166e..3a697507 100644 --- a/LLama/LLamaExecutorBase.cs +++ b/LLama/LLamaExecutorBase.cs @@ -83,7 +83,7 @@ namespace LLama _pastTokensCount = 0; _consumedTokensCount = 0; _n_session_consumed = 0; - _last_n_tokens = new FixedSizeQueue(Context.ContextSize); + _last_n_tokens = new FixedSizeQueue((int)Context.ContextSize); _decoder = new StreamingTokenDecoder(context); } @@ -170,7 +170,7 @@ namespace LLama _pastTokensCount = Math.Max(1, tokensToKeep); // insert n_left/2 tokens at the start of embed from last_n_tokens - _embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip(Context.ContextSize - n_left / 2 - _embeds.Count)); + _embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip((int)Context.ContextSize - n_left / 2 - _embeds.Count)); // stop saving session if we run out of context _pathSession = string.Empty; diff --git a/LLama/LLamaInteractExecutor.cs b/LLama/LLamaInteractExecutor.cs index 2e72c7ae..7d742c81 100644 --- a/LLama/LLamaInteractExecutor.cs +++ b/LLama/LLamaInteractExecutor.cs @@ -179,7 +179,7 @@ namespace LLama if (_embed_inps.Count <= _consumedTokensCount && !args.WaitForInput) { - var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? Context.ContextSize : inferenceParams.RepeatLastTokensCount; + var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? 
(int)Context.ContextSize : inferenceParams.RepeatLastTokensCount; // optionally save the session on first sample (for faster prompt loading next time) if (!string.IsNullOrEmpty(_pathSession) && args.NeedToSaveSession) diff --git a/LLama/Native/GPUSplitMode.cs b/LLama/Native/GPUSplitMode.cs new file mode 100644 index 00000000..96957d0f --- /dev/null +++ b/LLama/Native/GPUSplitMode.cs @@ -0,0 +1,23 @@ +namespace LLama.Native; + +/// +/// +/// +/// llama_split_mode +public enum GPUSplitMode +{ + /// + /// Single GPU + /// + None = 0, + + /// + /// Split layers and KV across GPUs + /// + Layer = 1, + + /// + /// split rows across GPUs + /// + Row = 2, +} \ No newline at end of file diff --git a/LLama/Native/LLamaContextParams.cs b/LLama/Native/LLamaContextParams.cs index bfd39ea4..118dd540 100644 --- a/LLama/Native/LLamaContextParams.cs +++ b/LLama/Native/LLamaContextParams.cs @@ -8,7 +8,8 @@ namespace LLama.Native /// /// /// - public delegate void LlamaProgressCallback(float progress, IntPtr ctx); + /// llama_progress_callback + public delegate bool LlamaProgressCallback(float progress, IntPtr ctx); /// /// A C# representation of the llama.cpp `llama_context_params` struct @@ -46,37 +47,46 @@ namespace LLama.Native /// public RopeScalingType rope_scaling_type; - /// /// RoPE base frequency, 0 = from model /// - public float rope_freq_base; + public float rope_freq_base; /// /// RoPE frequency scaling factor, 0 = from model /// - public float rope_freq_scale; + public float rope_freq_scale; /// /// YaRN extrapolation mix factor, negative = from model /// - public float yarn_ext_factor; + public float yarn_ext_factor; /// /// YaRN magnitude scaling factor /// - public float yarn_attn_factor; + public float yarn_attn_factor; /// /// YaRN low correction dim /// - public float yarn_beta_fast; + public float yarn_beta_fast; /// /// YaRN high correction dim /// - public float yarn_beta_slow; + public float yarn_beta_slow; /// /// YaRN original context size /// public uint yarn_orig_ctx; + /// + /// ggml_backend_sched_eval_callback + /// + public IntPtr cb_eval; + + /// + /// User data passed into cb_eval + /// + public IntPtr cb_eval_user_data; + /// /// data type for K cache /// diff --git a/LLama/Native/LLamaFtype.cs b/LLama/Native/LLamaFtype.cs index 0fa0fbe9..8eb0a8b9 100644 --- a/LLama/Native/LLamaFtype.cs +++ b/LLama/Native/LLamaFtype.cs @@ -106,6 +106,31 @@ /// Benchmark@7B: 5.15GB, +0.0044 ppl LLAMA_FTYPE_MOSTLY_Q6_K = 18, + /// + /// except 1d tensors + /// + LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, + + /// + /// except 1d tensors + /// + LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, + + /// + /// except 1d tensors + /// + LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, + + /// + /// except 1d tensors + /// + LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, + + /// + /// except 1d tensors + /// + LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, + /// /// File type was not specified /// diff --git a/LLama/Native/LLamaModelParams.cs b/LLama/Native/LLamaModelParams.cs index ed7b6043..a7cdd1a2 100644 --- a/LLama/Native/LLamaModelParams.cs +++ b/LLama/Native/LLamaModelParams.cs @@ -14,6 +14,11 @@ namespace LLama.Native /// public int n_gpu_layers; + /// + /// how to split the model across multiple GPUs + /// + public GPUSplitMode split_mode; + /// /// the GPU that is used for scratch and small tensors /// @@ -25,7 +30,8 @@ namespace LLama.Native public float* tensor_split; /// - /// called with a progress value between 0 and 1, pass NULL to disable + /// called with a progress value between 0 and 1, pass NULL to disable. 
If the provided progress_callback + /// returns true, model loading continues. If it returns false, model loading is immediately aborted. /// public LlamaProgressCallback progress_callback; diff --git a/LLama/Native/LLamaModelQuantizeParams.cs b/LLama/Native/LLamaModelQuantizeParams.cs index 39702b5a..34c1a974 100644 --- a/LLama/Native/LLamaModelQuantizeParams.cs +++ b/LLama/Native/LLamaModelQuantizeParams.cs @@ -6,6 +6,7 @@ namespace LLama.Native /// /// Quantizer parameters used in the native API /// + /// llama_model_quantize_params [StructLayout(LayoutKind.Sequential)] public struct LLamaModelQuantizeParams { @@ -58,5 +59,10 @@ namespace LLama.Native set => _pure = Convert.ToSByte(value); } private sbyte _pure; + + /// + /// pointer to importance matrix data + /// + public IntPtr imatrix; } } diff --git a/LLama/Native/NativeApi.Quantize.cs b/LLama/Native/NativeApi.Quantize.cs index b849e38d..1c4909bf 100644 --- a/LLama/Native/NativeApi.Quantize.cs +++ b/LLama/Native/NativeApi.Quantize.cs @@ -10,9 +10,8 @@ namespace LLama.Native /// /// /// - /// not great API - very likely to change /// Returns 0 on success [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern unsafe int llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param); + public static extern unsafe uint llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param); } } diff --git a/LLama/Native/NativeApi.Sampling.cs b/LLama/Native/NativeApi.Sampling.cs index 7128441e..a52edc66 100644 --- a/LLama/Native/NativeApi.Sampling.cs +++ b/LLama/Native/NativeApi.Sampling.cs @@ -27,11 +27,12 @@ namespace LLama.Native /// Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 /// /// - /// A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. - /// A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. + /// Logits extracted from the original generation context. + /// Logits extracted from a separate context from the same model. + /// Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. /// Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern void llama_sample_classifier_free_guidance(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, SafeLLamaContextHandle guidance_ctx, float scale); + public static extern unsafe void llama_sample_apply_guidance(SafeLLamaContextHandle ctx, float* logits, float* logits_guidance, float scale); /// /// Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. @@ -92,6 +93,17 @@ namespace LLama.Native [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern void llama_sample_typical(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float p, ulong min_keep); + /// + /// Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772. 
+ /// + /// + /// Pointer to LLamaTokenDataArray + /// + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern void llama_sample_typical(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float min_temp, float max_temp, float exponent_val); + /// /// Modify logits by temperature /// diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs index bb28e7ab..c953cb23 100644 --- a/LLama/Native/NativeApi.cs +++ b/LLama/Native/NativeApi.cs @@ -23,7 +23,7 @@ namespace LLama.Native /// public static void llama_empty_call() { - llama_mmap_supported(); + llama_max_devices(); } /// @@ -31,7 +31,7 @@ namespace LLama.Native /// /// [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern int llama_max_devices(); + public static extern long llama_max_devices(); /// /// Create a LLamaModelParams with default values @@ -59,14 +59,21 @@ namespace LLama.Native /// /// [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern bool llama_mmap_supported(); + public static extern bool llama_supports_mmap(); /// - /// Check if memory lockingis supported + /// Check if memory locking is supported /// /// [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern bool llama_mlock_supported(); + public static extern bool llama_supports_mlock(); + + /// + /// Check if GPU offload is supported + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern bool llama_supports_gpu_offload(); /// /// Initialize the llama + ggml backend @@ -163,7 +170,10 @@ namespace LLama.Native /// /// [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern int llama_n_ctx(SafeLLamaContextHandle ctx); + public static extern uint llama_n_ctx(SafeLLamaContextHandle ctx); + + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern uint llama_n_batch(SafeLLamaContextHandle ctx); /// /// Token logits obtained from the last call to llama_eval() @@ -380,6 +390,20 @@ namespace LLama.Native [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern void llama_kv_cache_seq_shift(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, LLamaPos delta); + /// + /// Integer division of the positions by factor of `d > 1` + /// If the KV cache is RoPEd, the KV data is updated accordingly + /// p0 < 0 : [0, p1] + /// p1 < 0 : [p0, inf) + /// + /// + /// + /// + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern void llama_kv_cache_seq_div(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int d); + /// /// Allocates a batch of tokens on the heap /// Each token can be assigned up to n_seq_max sequence ids diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs index 2c5d8288..d90d46d5 100644 --- a/LLama/Native/SafeLLamaContextHandle.cs +++ b/LLama/Native/SafeLLamaContextHandle.cs @@ -21,7 +21,7 @@ namespace LLama.Native /// /// Total number of tokens in the context /// - public int ContextSize => NativeApi.llama_n_ctx(this); + public uint ContextSize => NativeApi.llama_n_ctx(this); /// /// Dimension of embedding vectors diff --git a/README.md b/README.md index a73fb3c7..c20a523e 100644 --- a/README.md +++ b/README.md @@ -222,6 +222,7 @@ If you want to compile llama.cpp yourself you **must** use the exact 
commit ID l
 | v0.7.0, v0.8.0 | [Thespis-13B](https://huggingface.co/TheBloke/Thespis-13B-v0.5-GGUF/tree/main?not-for-all-audiences=true), [LLaMA2-7B](https://huggingface.co/TheBloke/llama-2-7B-Guanaco-QLoRA-GGUF) | [`207b519`](https://github.com/ggerganov/llama.cpp/commit/207b51900e15cc7f89763a3bb1c565fe11cbb45d) |
 | v0.8.1 | | [`e937066`](https://github.com/ggerganov/llama.cpp/commit/e937066420b79a757bf80e9836eb12b88420a218) |
 | v0.9.0, v0.9.1 | [Mixtral-8x7B](https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF) | [`9fb13f9`](https://github.com/ggerganov/llama.cpp/blob/9fb13f95840c722ad419f390dc8a9c86080a3700) |
+| v0.10.0 | [Phi2](https://huggingface.co/TheBloke/phi-2-GGUF) | [`d71ac90`](https://github.com/ggerganov/llama.cpp/tree/d71ac90985854b0905e1abba778e407e17f9f887) |
 
 ## License
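
For downstream code, a minimal sketch of how the new `SplitMode` option and the `ContextSize` type change surface when configuring a model through `ModelParams`; the model path and the chosen values are illustrative placeholders:

```csharp
using LLama;
using LLama.Common;
using LLama.Native;

// Illustrative only: the path and values are placeholders.
var parameters = new ModelParams("path/to/model.gguf")
{
    // Distribute layers (and KV) across the available GPUs.
    SplitMode = GPUSplitMode.Layer,
    // MainGpu is ignored when SplitMode is Layer (see the IModelParams docs above).
    MainGpu = 0,
    GpuLayerCount = 20,
};

using var weights = LLamaWeights.LoadFromFile(parameters);
using var context = weights.CreateContext(parameters);

// ContextSize is now a uint; cast where an int is still required.
var contextSize = (int)context.ContextSize;
```

`GPUSplitMode.None` keeps the whole model on `MainGpu`, while `Layer` and `Row` spread the work across devices as described in `GPUSplitMode.cs`.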
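
The progress callback is another breaking change worth illustrating: `LlamaProgressCallback` now returns a `bool`, and returning `false` aborts the load. A hedged sketch of a compatible callback for code that fills in `LLamaModelParams.progress_callback` directly (the logging is illustrative, and the delegate must be kept alive for the duration of the native call):

```csharp
using System;
using LLama.Native;

static class ProgressCallbackSketch
{
    // Matches the updated LlamaProgressCallback signature: return true to keep
    // loading, or false to abort model loading immediately.
    private static bool OnProgress(float progress, IntPtr ctx)
    {
        Console.WriteLine($"Model loading: {progress:P0}");
        return true;
    }

    public static void Attach(ref LLamaModelParams modelParams)
    {
        modelParams.progress_callback = OnProgress;
    }
}
```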
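
Finally, the renamed capability checks (`llama_supports_mmap`, `llama_supports_mlock`) and the new `llama_supports_gpu_offload` can be probed before configuring `UseMemorymap`, `UseMemoryLock` or `GpuLayerCount`; a small sketch using only the entry points declared in `NativeApi.cs`:

```csharp
using System;
using LLama.Native;

// The old llama_mmap_supported()/llama_mlock_supported() names no longer exist.
if (!NativeApi.llama_supports_gpu_offload())
    Console.WriteLine("This llama.cpp build cannot offload layers to the GPU.");

if (!NativeApi.llama_supports_mmap())
    Console.WriteLine("Memory-mapped model loading (use_mmap) is not available.");

if (!NativeApi.llama_supports_mlock())
    Console.WriteLine("Memory locking (use_mlock) is not available.");
```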