diff --git a/LLama.Unittest/LLamaContextTests.cs b/LLama.Unittest/LLamaContextTests.cs
index 345e518d..ab27d988 100644
--- a/LLama.Unittest/LLamaContextTests.cs
+++ b/LLama.Unittest/LLamaContextTests.cs
@@ -28,7 +28,7 @@ namespace LLama.Unittest
[Fact]
public void CheckProperties()
{
- Assert.Equal(768, _context.ContextSize);
+ Assert.Equal(768u, _context.ContextSize);
Assert.Equal(4096, _context.EmbeddingSize);
Assert.Equal(32000, _context.VocabCount);
}
diff --git a/LLama.Web/Common/ModelOptions.cs b/LLama.Web/Common/ModelOptions.cs
index 7b770b38..e462401a 100644
--- a/LLama.Web/Common/ModelOptions.cs
+++ b/LLama.Web/Common/ModelOptions.cs
@@ -23,6 +23,9 @@ namespace LLama.Web.Common
///
public int MainGpu { get; set; } = 0;
+ ///
+ public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;
+
///
public int GpuLayerCount { get; set; } = 20;
diff --git a/LLama/Abstractions/IModelParams.cs b/LLama/Abstractions/IModelParams.cs
index 3ef41bec..73e03b99 100644
--- a/LLama/Abstractions/IModelParams.cs
+++ b/LLama/Abstractions/IModelParams.cs
@@ -16,9 +16,28 @@ namespace LLama.Abstractions
public interface IModelParams
{
///
- /// the GPU that is used for scratch and small tensors
+ /// main_gpu interpretation depends on split_mode:
+ ///
+ /// -
+ /// None
+ /// The GPU that is used for the entire model.
+ ///
+ /// -
+ /// Row
+ /// The GPU that is used for small tensors and intermediate results.
+ ///
+ /// -
+ /// Layer
+ /// Ignored.
+ ///
+ ///
///
- int MainGpu { get; }
+ int MainGpu { get; set; }
+
+ ///
+ /// How to split the model across multiple GPUs
+ ///
+ GPUSplitMode SplitMode { get; }
///
/// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
diff --git a/LLama/Common/ModelParams.cs b/LLama/Common/ModelParams.cs
index b124b84d..3afee9cb 100644
--- a/LLama/Common/ModelParams.cs
+++ b/LLama/Common/ModelParams.cs
@@ -18,6 +18,9 @@ namespace LLama.Common
///
public int MainGpu { get; set; } = 0;
+ ///
+ public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;
+
///
public int GpuLayerCount { get; set; } = 20;
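
A minimal sketch of how the new SplitMode option might be configured through the high-level ModelParams class; the model path and layer count are placeholders, and LLamaWeights.LoadFromFile is assumed to accept these parameters as it does elsewhere in the library:

    using LLama;
    using LLama.Common;
    using LLama.Native;

    // Hypothetical model path; the values below only illustrate the new options.
    var parameters = new ModelParams("models/example-7b.Q4_K_M.gguf")
    {
        GpuLayerCount = 20,             // layers offloaded to VRAM
        SplitMode = GPUSplitMode.Layer, // split layers and KV across GPUs
        MainGpu = 0                     // ignored when SplitMode is Layer
    };

    using var weights = LLamaWeights.LoadFromFile(parameters);
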
diff --git a/LLama/Extensions/IContextParamsExtensions.cs b/LLama/Extensions/IContextParamsExtensions.cs
index 21273617..cd3075ab 100644
--- a/LLama/Extensions/IContextParamsExtensions.cs
+++ b/LLama/Extensions/IContextParamsExtensions.cs
@@ -36,6 +36,9 @@ namespace LLama.Extensions
result.yarn_orig_ctx = @params.YarnOriginalContext ?? 0;
result.rope_scaling_type = @params.YarnScalingType ?? RopeScalingType.LLAMA_ROPE_SCALING_UNSPECIFIED;
+ result.cb_eval = IntPtr.Zero;
+ result.cb_eval_user_data = IntPtr.Zero;
+
result.type_k = @params.TypeK ?? GGMLType.GGML_TYPE_F16;
result.type_v = @params.TypeV ?? GGMLType.GGML_TYPE_F16;
result.offload_kqv = !@params.NoKqvOffload;
diff --git a/LLama/Extensions/IModelParamsExtensions.cs b/LLama/Extensions/IModelParamsExtensions.cs
index f7fadece..69b9e288 100644
--- a/LLama/Extensions/IModelParamsExtensions.cs
+++ b/LLama/Extensions/IModelParamsExtensions.cs
@@ -21,15 +21,16 @@ public static class IModelParamsExtensions
///
public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLamaModelParams result)
{
- if (@params.UseMemoryLock && !NativeApi.llama_mlock_supported())
- throw new NotSupportedException("'UseMemoryLock' is not supported (llama_mlock_supported() == false)");
- if (@params.UseMemorymap && !NativeApi.llama_mmap_supported())
- throw new NotSupportedException("'UseMemorymap' is not supported (llama_mmap_supported() == false)");
+ if (@params.UseMemoryLock && !NativeApi.llama_supports_mlock())
+ throw new NotSupportedException("'UseMemoryLock' is not supported (llama_supports_mlock() == false)");
+ if (@params.UseMemorymap && !NativeApi.llama_supports_mmap())
+ throw new NotSupportedException("'UseMemorymap' is not supported (llama_supports_mmap() == false)");
var disposer = new GroupDisposable();
result = NativeApi.llama_model_default_params();
result.main_gpu = @params.MainGpu;
+ result.split_mode = @params.SplitMode;
result.n_gpu_layers = @params.GpuLayerCount < 0 ? int.MaxValue : @params.GpuLayerCount;
result.use_mlock = @params.UseMemoryLock;
result.use_mmap = @params.UseMemorymap;
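
The renamed capability checks used above can also be queried directly before deciding which options to enable; a small sketch that uses only the bindings declared in this changeset:

    using System;
    using LLama.Native;

    // Probe the renamed capability checks before opting into mmap, mlock or GPU offload.
    bool canMmap = NativeApi.llama_supports_mmap();
    bool canMlock = NativeApi.llama_supports_mlock();
    int gpuLayers = NativeApi.llama_supports_gpu_offload() ? 20 : 0;

    Console.WriteLine($"mmap={canMmap}, mlock={canMlock}, gpu layers={gpuLayers}");
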
diff --git a/LLama/LLamaContext.cs b/LLama/LLamaContext.cs
index 6d39a8f9..5d026b67 100644
--- a/LLama/LLamaContext.cs
+++ b/LLama/LLamaContext.cs
@@ -33,7 +33,7 @@ namespace LLama
///
/// Total number of tokens in the context
///
- public int ContextSize => NativeHandle.ContextSize;
+ public uint ContextSize => NativeHandle.ContextSize;
///
/// Dimension of embedding vectors
@@ -323,7 +323,7 @@ namespace LLama
var candidates_p = LLamaTokenDataArray.Create(logits);
// Extract most recently returned tokens
- var last_n_repeat = Math.Min(ContextSize, repeatLastTokensCount);
+ var last_n_repeat = Math.Min((int)ContextSize, repeatLastTokensCount);
var last_n_array = lastTokens.TakeLast(last_n_repeat).ToArray();
// Apply penalties to candidates
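
Because ContextSize is now a uint, callers that mix it with signed counts need an explicit cast, as the hunks above show; a one-line sketch, assuming `context` is a LLamaContext and `repeatLastTokensCount` is an int:

    // Clamp the unsigned context size back to int before mixing it with signed values.
    var lastNRepeat = Math.Min((int)context.ContextSize, repeatLastTokensCount);
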
diff --git a/LLama/LLamaExecutorBase.cs b/LLama/LLamaExecutorBase.cs
index 4713166e..3a697507 100644
--- a/LLama/LLamaExecutorBase.cs
+++ b/LLama/LLamaExecutorBase.cs
@@ -83,7 +83,7 @@ namespace LLama
_pastTokensCount = 0;
_consumedTokensCount = 0;
_n_session_consumed = 0;
- _last_n_tokens = new FixedSizeQueue(Context.ContextSize);
+ _last_n_tokens = new FixedSizeQueue((int)Context.ContextSize);
_decoder = new StreamingTokenDecoder(context);
}
@@ -170,7 +170,7 @@ namespace LLama
_pastTokensCount = Math.Max(1, tokensToKeep);
// insert n_left/2 tokens at the start of embed from last_n_tokens
- _embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip(Context.ContextSize - n_left / 2 - _embeds.Count));
+ _embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip((int)Context.ContextSize - n_left / 2 - _embeds.Count));
// stop saving session if we run out of context
_pathSession = string.Empty;
diff --git a/LLama/LLamaInteractExecutor.cs b/LLama/LLamaInteractExecutor.cs
index 2e72c7ae..7d742c81 100644
--- a/LLama/LLamaInteractExecutor.cs
+++ b/LLama/LLamaInteractExecutor.cs
@@ -179,7 +179,7 @@ namespace LLama
if (_embed_inps.Count <= _consumedTokensCount && !args.WaitForInput)
{
- var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? Context.ContextSize : inferenceParams.RepeatLastTokensCount;
+ var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? (int)Context.ContextSize : inferenceParams.RepeatLastTokensCount;
// optionally save the session on first sample (for faster prompt loading next time)
if (!string.IsNullOrEmpty(_pathSession) && args.NeedToSaveSession)
diff --git a/LLama/Native/GPUSplitMode.cs b/LLama/Native/GPUSplitMode.cs
new file mode 100644
index 00000000..96957d0f
--- /dev/null
+++ b/LLama/Native/GPUSplitMode.cs
@@ -0,0 +1,23 @@
+namespace LLama.Native;
+
+///
+/// How to split the model across multiple GPUs
+///
+/// llama_split_mode
+public enum GPUSplitMode
+{
+ ///
+ /// Single GPU
+ ///
+ None = 0,
+
+ ///
+ /// Split layers and KV across GPUs
+ ///
+ Layer = 1,
+
+ ///
+ /// Split rows across GPUs
+ ///
+ Row = 2,
+}
\ No newline at end of file
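
At the native layer the same enum maps onto llama_split_mode; a sketch of filling the model parameters in by hand, assuming the llama_model_default_params binding used elsewhere in this diff:

    using LLama.Native;

    var modelParams = NativeApi.llama_model_default_params();
    modelParams.split_mode = GPUSplitMode.Row;  // split rows across GPUs
    modelParams.main_gpu = 0;                   // with Row, used for small tensors and intermediate results
    modelParams.n_gpu_layers = int.MaxValue;    // offload as many layers as possible
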
diff --git a/LLama/Native/LLamaContextParams.cs b/LLama/Native/LLamaContextParams.cs
index bfd39ea4..118dd540 100644
--- a/LLama/Native/LLamaContextParams.cs
+++ b/LLama/Native/LLamaContextParams.cs
@@ -8,7 +8,8 @@ namespace LLama.Native
///
///
///
- public delegate void LlamaProgressCallback(float progress, IntPtr ctx);
+ /// llama_progress_callback
+ public delegate bool LlamaProgressCallback(float progress, IntPtr ctx);
///
/// A C# representation of the llama.cpp `llama_context_params` struct
@@ -46,37 +47,46 @@ namespace LLama.Native
///
public RopeScalingType rope_scaling_type;
-
///
/// RoPE base frequency, 0 = from model
///
- public float rope_freq_base;
+ public float rope_freq_base;
///
/// RoPE frequency scaling factor, 0 = from model
///
- public float rope_freq_scale;
+ public float rope_freq_scale;
///
/// YaRN extrapolation mix factor, negative = from model
///
- public float yarn_ext_factor;
+ public float yarn_ext_factor;
///
/// YaRN magnitude scaling factor
///
- public float yarn_attn_factor;
+ public float yarn_attn_factor;
///
/// YaRN low correction dim
///
- public float yarn_beta_fast;
+ public float yarn_beta_fast;
///
/// YaRN high correction dim
///
- public float yarn_beta_slow;
+ public float yarn_beta_slow;
///
/// YaRN original context size
///
public uint yarn_orig_ctx;
+ ///
+ /// ggml_backend_sched_eval_callback
+ ///
+ public IntPtr cb_eval;
+
+ ///
+ /// User data passed into cb_eval
+ ///
+ public IntPtr cb_eval_user_data;
+
///
/// data type for K cache
///
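
The two new callback fields default to null pointers, matching what IContextParamsExtensions now writes; a minimal sketch, assuming the llama_context_default_params binding used elsewhere in the library:

    using System;
    using LLama.Native;

    var contextParams = NativeApi.llama_context_default_params();
    contextParams.cb_eval = IntPtr.Zero;           // no ggml_backend_sched_eval_callback installed
    contextParams.cb_eval_user_data = IntPtr.Zero; // nothing to hand to that callback
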
diff --git a/LLama/Native/LLamaFtype.cs b/LLama/Native/LLamaFtype.cs
index 0fa0fbe9..8eb0a8b9 100644
--- a/LLama/Native/LLamaFtype.cs
+++ b/LLama/Native/LLamaFtype.cs
@@ -106,6 +106,31 @@
/// Benchmark@7B: 5.15GB, +0.0044 ppl
LLAMA_FTYPE_MOSTLY_Q6_K = 18,
+ ///
+ /// except 1d tensors
+ ///
+ LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19,
+
+ ///
+ /// except 1d tensors
+ ///
+ LLAMA_FTYPE_MOSTLY_IQ2_XS = 20,
+
+ ///
+ /// except 1d tensors
+ ///
+ LLAMA_FTYPE_MOSTLY_Q2_K_S = 21,
+
+ ///
+ /// except 1d tensors
+ ///
+ LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22,
+
+ ///
+ /// except 1d tensors
+ ///
+ LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23,
+
///
/// File type was not specified
///
diff --git a/LLama/Native/LLamaModelParams.cs b/LLama/Native/LLamaModelParams.cs
index ed7b6043..a7cdd1a2 100644
--- a/LLama/Native/LLamaModelParams.cs
+++ b/LLama/Native/LLamaModelParams.cs
@@ -14,6 +14,11 @@ namespace LLama.Native
///
public int n_gpu_layers;
+ ///
+ /// how to split the model across multiple GPUs
+ ///
+ public GPUSplitMode split_mode;
+
///
/// the GPU that is used for scratch and small tensors
///
@@ -25,7 +30,8 @@ namespace LLama.Native
public float* tensor_split;
///
- /// called with a progress value between 0 and 1, pass NULL to disable
+ /// called with a progress value between 0 and 1, pass NULL to disable. If the provided progress_callback
+ /// returns true, model loading continues. If it returns false, model loading is immediately aborted.
///
public LlamaProgressCallback progress_callback;
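
With the delegate now returning bool, a loading callback can cancel the load by returning false; a hedged sketch (the logging is illustrative only):

    using System;
    using LLama.Native;

    var modelParams = NativeApi.llama_model_default_params();
    modelParams.progress_callback = (progress, userData) =>
    {
        Console.WriteLine($"Loading model: {progress:P0}");
        return true; // returning false aborts the load immediately
    };
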
diff --git a/LLama/Native/LLamaModelQuantizeParams.cs b/LLama/Native/LLamaModelQuantizeParams.cs
index 39702b5a..34c1a974 100644
--- a/LLama/Native/LLamaModelQuantizeParams.cs
+++ b/LLama/Native/LLamaModelQuantizeParams.cs
@@ -6,6 +6,7 @@ namespace LLama.Native
///
/// Quantizer parameters used in the native API
///
+ /// llama_model_quantize_params
[StructLayout(LayoutKind.Sequential)]
public struct LLamaModelQuantizeParams
{
@@ -58,5 +59,10 @@ namespace LLama.Native
set => _pure = Convert.ToSByte(value);
}
private sbyte _pure;
+
+ ///
+ /// pointer to importance matrix data
+ ///
+ public IntPtr imatrix;
}
}
diff --git a/LLama/Native/NativeApi.Quantize.cs b/LLama/Native/NativeApi.Quantize.cs
index b849e38d..1c4909bf 100644
--- a/LLama/Native/NativeApi.Quantize.cs
+++ b/LLama/Native/NativeApi.Quantize.cs
@@ -10,9 +10,8 @@ namespace LLama.Native
///
///
///
- /// not great API - very likely to change
/// Returns 0 on success
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- public static extern unsafe int llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param);
+ public static extern unsafe uint llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param);
}
}
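
Putting the quantizer changes together (the imatrix pointer, one of the new IQ file types and the uint return code), a rough sketch; the field names are assumed to mirror the native struct, the paths are placeholders, and a null imatrix simply means no importance matrix is supplied:

    using System;
    using LLama.Native;

    var quantizeParams = new LLamaModelQuantizeParams
    {
        ftype = LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ2_XS, // one of the newly added formats
        nthread = 4,
        imatrix = IntPtr.Zero                         // no importance matrix data
    };

    unsafe
    {
        uint status = NativeApi.llama_model_quantize("input.gguf", "output-iq2_xs.gguf", &quantizeParams);
        Console.WriteLine(status == 0 ? "quantized" : $"quantization failed with code {status}");
    }
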
diff --git a/LLama/Native/NativeApi.Sampling.cs b/LLama/Native/NativeApi.Sampling.cs
index 7128441e..a52edc66 100644
--- a/LLama/Native/NativeApi.Sampling.cs
+++ b/LLama/Native/NativeApi.Sampling.cs
@@ -27,11 +27,12 @@ namespace LLama.Native
/// Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
///
///
- /// A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
- /// A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+ /// Logits extracted from the original generation context.
+ /// Logits extracted from a separate context from the same model.
+ /// Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
/// Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- public static extern void llama_sample_classifier_free_guidance(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, SafeLLamaContextHandle guidance_ctx, float scale);
+ public static extern unsafe void llama_sample_apply_guidance(SafeLLamaContextHandle ctx, float* logits, float* logits_guidance, float scale);
///
/// Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
@@ -92,6 +93,17 @@ namespace LLama.Native
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern void llama_sample_typical(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float p, ulong min_keep);
+ ///
+ /// Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
+ ///
+ ///
+ /// Pointer to LLamaTokenDataArray
+ ///
+ ///
+ ///
+ [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+ public static extern void llama_sample_entropy(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float min_temp, float max_temp, float exponent_val);
+
///
/// Modify logits by temperature
///
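
The guidance API now works on raw logit buffers instead of a candidates array; a hedged sketch, where `ctx` is assumed to be a valid SafeLLamaContextHandle and both arrays (one entry per vocabulary token, filled by the caller) are placeholders:

    // `logits` come from the main context, `guidanceLogits` from the guidance context.
    float[] logits = new float[vocabSize];         // vocabSize is a placeholder
    float[] guidanceLogits = new float[vocabSize];

    unsafe
    {
        fixed (float* l = logits)
        fixed (float* g = guidanceLogits)
        {
            // a scale of 1.0f means no guidance; larger values mean stronger guidance
            NativeApi.llama_sample_apply_guidance(ctx, l, g, 1.5f);
        }
    }
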
diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs
index bb28e7ab..c953cb23 100644
--- a/LLama/Native/NativeApi.cs
+++ b/LLama/Native/NativeApi.cs
@@ -23,7 +23,7 @@ namespace LLama.Native
///
public static void llama_empty_call()
{
- llama_mmap_supported();
+ llama_max_devices();
}
///
@@ -31,7 +31,7 @@ namespace LLama.Native
///
///
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- public static extern int llama_max_devices();
+ public static extern long llama_max_devices();
///
/// Create a LLamaModelParams with default values
@@ -59,14 +59,21 @@ namespace LLama.Native
///
///
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- public static extern bool llama_mmap_supported();
+ public static extern bool llama_supports_mmap();
///
- /// Check if memory lockingis supported
+ /// Check if memory locking is supported
///
///
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- public static extern bool llama_mlock_supported();
+ public static extern bool llama_supports_mlock();
+
+ ///
+ /// Check if GPU offload is supported
+ ///
+ ///
+ [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+ public static extern bool llama_supports_gpu_offload();
///
/// Initialize the llama + ggml backend
@@ -163,7 +170,10 @@ namespace LLama.Native
///
///
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- public static extern int llama_n_ctx(SafeLLamaContextHandle ctx);
+ public static extern uint llama_n_ctx(SafeLLamaContextHandle ctx);
+
+ [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+ public static extern uint llama_n_batch(SafeLLamaContextHandle ctx);
///
/// Token logits obtained from the last call to llama_eval()
@@ -380,6 +390,20 @@ namespace LLama.Native
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern void llama_kv_cache_seq_shift(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, LLamaPos delta);
+ ///
+ /// Integer division of the positions by factor of `d > 1`
+ /// If the KV cache is RoPEd, the KV data is updated accordingly
+ /// p0 < 0 : [0, p1]
+ /// p1 < 0 : [p0, inf)
+ ///
+ ///
+ ///
+ ///
+ ///
+ ///
+ [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+ public static extern void llama_kv_cache_seq_div(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int d);
+
///
/// Allocates a batch of tokens on the heap
/// Each token can be assigned up to n_seq_max sequence ids
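
The device query can now be made before any model is loaded (llama_empty_call routes through it above) and returns a 64-bit count; a tiny sketch:

    using System;
    using LLama.Native;

    long maxDevices = NativeApi.llama_max_devices();
    Console.WriteLine($"this llama.cpp build supports up to {maxDevices} devices");
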
diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs
index 2c5d8288..d90d46d5 100644
--- a/LLama/Native/SafeLLamaContextHandle.cs
+++ b/LLama/Native/SafeLLamaContextHandle.cs
@@ -21,7 +21,7 @@ namespace LLama.Native
///
/// Total number of tokens in the context
///
- public int ContextSize => NativeApi.llama_n_ctx(this);
+ public uint ContextSize => NativeApi.llama_n_ctx(this);
///
/// Dimension of embedding vectors
diff --git a/README.md b/README.md
index a73fb3c7..c20a523e 100644
--- a/README.md
+++ b/README.md
@@ -222,6 +222,7 @@ If you want to compile llama.cpp yourself you **must** use the exact commit ID l
| v0.7.0, v0.8.0 | [Thespis-13B](https://huggingface.co/TheBloke/Thespis-13B-v0.5-GGUF/tree/main?not-for-all-audiences=true), [LLaMA2-7B](https://huggingface.co/TheBloke/llama-2-7B-Guanaco-QLoRA-GGUF) | [`207b519`](https://github.com/ggerganov/llama.cpp/commit/207b51900e15cc7f89763a3bb1c565fe11cbb45d) |
| v0.8.1 | | [`e937066`](https://github.com/ggerganov/llama.cpp/commit/e937066420b79a757bf80e9836eb12b88420a218) |
| v0.9.0, v0.9.1 | [Mixtral-8x7B](https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF) | [`9fb13f9`](https://github.com/ggerganov/llama.cpp/blob/9fb13f95840c722ad419f390dc8a9c86080a3700) |
+| v0.10.0 | [Phi2](https://huggingface.co/TheBloke/phi-2-GGUF) | [`d71ac90`](https://github.com/ggerganov/llama.cpp/tree/d71ac90985854b0905e1abba778e407e17f9f887) |
## License