Update binaries, Feb 2024 (tag: v0.10.0)
@@ -28,7 +28,7 @@ namespace LLama.Unittest
     [Fact]
     public void CheckProperties()
     {
-        Assert.Equal(768, _context.ContextSize);
+        Assert.Equal(768u, _context.ContextSize);
         Assert.Equal(4096, _context.EmbeddingSize);
         Assert.Equal(32000, _context.VocabCount);
     }
@@ -23,6 +23,9 @@ namespace LLama.Web.Common
     /// <inheritdoc />
     public int MainGpu { get; set; } = 0;

+    /// <inheritdoc />
+    public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;
+
     /// <inheritdoc />
     public int GpuLayerCount { get; set; } = 20;
@@ -16,9 +16,28 @@ namespace LLama.Abstractions
 public interface IModelParams
 {
     /// <summary>
-    /// the GPU that is used for scratch and small tensors
+    /// main_gpu interpretation depends on split_mode:
+    /// <list type="bullet">
+    ///     <item>
+    ///         <term>None</term>
+    ///         <description>The GPU that is used for the entire model.</description>
+    ///     </item>
+    ///     <item>
+    ///         <term>Row</term>
+    ///         <description>The GPU that is used for small tensors and intermediate results.</description>
+    ///     </item>
+    ///     <item>
+    ///         <term>Layer</term>
+    ///         <description>Ignored.</description>
+    ///     </item>
+    /// </list>
     /// </summary>
-    int MainGpu { get; }
+    int MainGpu { get; set; }
+
+    /// <summary>
+    /// How to split the model across multiple GPUs
+    /// </summary>
+    GPUSplitMode SplitMode { get; }

     /// <summary>
     /// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
@@ -18,6 +18,9 @@ namespace LLama.Common
     /// <inheritdoc />
     public int MainGpu { get; set; } = 0;

+    /// <inheritdoc />
+    public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;
+
     /// <inheritdoc />
     public int GpuLayerCount { get; set; } = 20;
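For context, a minimal sketch of how the new `SplitMode` setting sits next to the existing GPU options on the high-level `ModelParams` class; the model path and specific values are placeholders, and the constructor usage is an assumption about the existing API rather than something introduced by this change.

```csharp
using LLama.Common;
using LLama.Native;

// Hypothetical configuration: "model.gguf" and the values below are placeholders.
var parameters = new ModelParams("model.gguf")
{
    GpuLayerCount = 20,               // number of layers to offload to VRAM
    SplitMode = GPUSplitMode.Layer,   // split layers (and KV) across GPUs
    MainGpu = 0,                      // ignored when SplitMode == Layer (see the IModelParams docs above)
};
```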
@@ -36,6 +36,9 @@ namespace LLama.Extensions
     result.yarn_orig_ctx = @params.YarnOriginalContext ?? 0;
     result.rope_scaling_type = @params.YarnScalingType ?? RopeScalingType.LLAMA_ROPE_SCALING_UNSPECIFIED;

+    result.cb_eval = IntPtr.Zero;
+    result.cb_eval_user_data = IntPtr.Zero;
+
     result.type_k = @params.TypeK ?? GGMLType.GGML_TYPE_F16;
     result.type_k = @params.TypeV ?? GGMLType.GGML_TYPE_F16;
     result.offload_kqv = !@params.NoKqvOffload;
@@ -21,15 +21,16 @@ public static class IModelParamsExtensions
     /// <exception cref="ArgumentException"></exception>
     public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLamaModelParams result)
     {
-        if (@params.UseMemoryLock && !NativeApi.llama_mlock_supported())
-            throw new NotSupportedException("'UseMemoryLock' is not supported (llama_mlock_supported() == false)");
-        if (@params.UseMemorymap && !NativeApi.llama_mmap_supported())
-            throw new NotSupportedException("'UseMemorymap' is not supported (llama_mmap_supported() == false)");
+        if (@params.UseMemoryLock && !NativeApi.llama_supports_mlock())
+            throw new NotSupportedException("'UseMemoryLock' is not supported (llama_supports_mlock() == false)");
+        if (@params.UseMemorymap && !NativeApi.llama_supports_mmap())
+            throw new NotSupportedException("'UseMemorymap' is not supported (llama_supports_mmap() == false)");

         var disposer = new GroupDisposable();

         result = NativeApi.llama_model_default_params();
         result.main_gpu = @params.MainGpu;
+        result.split_mode = @params.SplitMode;
         result.n_gpu_layers = @params.GpuLayerCount < 0 ? int.MaxValue : @params.GpuLayerCount;
         result.use_mlock = @params.UseMemoryLock;
         result.use_mmap = @params.UseMemorymap;
@@ -33,7 +33,7 @@ namespace LLama
     /// <summary>
     /// Total number of tokens in the context
     /// </summary>
-    public int ContextSize => NativeHandle.ContextSize;
+    public uint ContextSize => NativeHandle.ContextSize;

     /// <summary>
     /// Dimension of embedding vectors
@@ -323,7 +323,7 @@ namespace LLama
     var candidates_p = LLamaTokenDataArray.Create(logits);

     // Extract most recently returned tokens
-    var last_n_repeat = Math.Min(ContextSize, repeatLastTokensCount);
+    var last_n_repeat = Math.Min((int)ContextSize, repeatLastTokensCount);
     var last_n_array = lastTokens.TakeLast(last_n_repeat).ToArray();

     // Apply penalties to candidates
@@ -83,7 +83,7 @@ namespace LLama
     _pastTokensCount = 0;
     _consumedTokensCount = 0;
     _n_session_consumed = 0;
-    _last_n_tokens = new FixedSizeQueue<LLamaToken>(Context.ContextSize);
+    _last_n_tokens = new FixedSizeQueue<LLamaToken>((int)Context.ContextSize);
     _decoder = new StreamingTokenDecoder(context);
 }
@@ -170,7 +170,7 @@ namespace LLama
     _pastTokensCount = Math.Max(1, tokensToKeep);

     // insert n_left/2 tokens at the start of embed from last_n_tokens
-    _embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip(Context.ContextSize - n_left / 2 - _embeds.Count));
+    _embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip((int)Context.ContextSize - n_left / 2 - _embeds.Count));

     // stop saving session if we run out of context
     _pathSession = string.Empty;
@@ -200,7 +200,7 @@ namespace LLama
     if (_embed_inps.Count <= _consumedTokensCount && !args.WaitForInput)
     {
-        var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? Context.ContextSize : inferenceParams.RepeatLastTokensCount;
+        var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? (int)Context.ContextSize : inferenceParams.RepeatLastTokensCount;

         // optionally save the session on first sample (for faster prompt loading next time)
         if (!string.IsNullOrEmpty(_pathSession) && args.NeedToSaveSession)
@@ -179,7 +179,7 @@ namespace LLama
     if (_embed_inps.Count <= _consumedTokensCount && !args.WaitForInput)
     {
-        var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? Context.ContextSize : inferenceParams.RepeatLastTokensCount;
+        var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? (int)Context.ContextSize : inferenceParams.RepeatLastTokensCount;

         // optionally save the session on first sample (for faster prompt loading next time)
         if (!string.IsNullOrEmpty(_pathSession) && args.NeedToSaveSession)
@@ -0,0 +1,23 @@
+namespace LLama.Native;
+
+/// <summary>
+/// How to split the model across multiple GPUs
+/// </summary>
+/// <remarks>llama_split_mode</remarks>
+public enum GPUSplitMode
+{
+    /// <summary>
+    /// Single GPU
+    /// </summary>
+    None = 0,
+
+    /// <summary>
+    /// Split layers and KV across GPUs
+    /// </summary>
+    Layer = 1,
+
+    /// <summary>
+    /// Split rows across GPUs
+    /// </summary>
+    Row = 2,
+}
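As a quick reference, the sketch below restates how `main_gpu` is interpreted for each member of this enum, mirroring the `IModelParams` documentation earlier in this change; the helper itself is illustrative only and not part of the library.

```csharp
using LLama.Native;

// Illustrative helper only (not part of this change): restates the main_gpu
// semantics documented on IModelParams for each split mode.
static class SplitModeNotes
{
    public static string DescribeMainGpu(GPUSplitMode mode) => mode switch
    {
        GPUSplitMode.None  => "the GPU used for the entire model",
        GPUSplitMode.Layer => "ignored (layers and KV are split across GPUs)",
        GPUSplitMode.Row   => "the GPU used for small tensors and intermediate results",
        _                  => "unknown split mode",
    };
}
```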
@@ -8,7 +8,8 @@ namespace LLama.Native
     /// </summary>
     /// <param name="progress"></param>
     /// <param name="ctx"></param>
-    public delegate void LlamaProgressCallback(float progress, IntPtr ctx);
+    /// <remarks>llama_progress_callback</remarks>
+    public delegate bool LlamaProgressCallback(float progress, IntPtr ctx);

     /// <summary>
     /// A C# representation of the llama.cpp `llama_context_params` struct
@@ -46,37 +47,46 @@ namespace LLama.Native
     /// </summary>
     public RopeScalingType rope_scaling_type;

     /// <summary>
     /// RoPE base frequency, 0 = from model
     /// </summary>
     public float rope_freq_base;

     /// <summary>
     /// RoPE frequency scaling factor, 0 = from model
     /// </summary>
     public float rope_freq_scale;

     /// <summary>
     /// YaRN extrapolation mix factor, negative = from model
     /// </summary>
     public float yarn_ext_factor;

     /// <summary>
     /// YaRN magnitude scaling factor
     /// </summary>
     public float yarn_attn_factor;

     /// <summary>
     /// YaRN low correction dim
     /// </summary>
     public float yarn_beta_fast;

     /// <summary>
     /// YaRN high correction dim
     /// </summary>
     public float yarn_beta_slow;

     /// <summary>
     /// YaRN original context size
     /// </summary>
     public uint yarn_orig_ctx;

+    /// <summary>
+    /// ggml_backend_sched_eval_callback
+    /// </summary>
+    public IntPtr cb_eval;
+
+    /// <summary>
+    /// User data passed into cb_eval
+    /// </summary>
+    public IntPtr cb_eval_user_data;
+
     /// <summary>
     /// data type for K cache
     /// </summary>
@@ -106,6 +106,31 @@
     /// <remarks>Benchmark@7B: 5.15GB, +0.0044 ppl</remarks>
     LLAMA_FTYPE_MOSTLY_Q6_K = 18,

+    /// <summary>
+    /// except 1d tensors
+    /// </summary>
+    LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19,
+
+    /// <summary>
+    /// except 1d tensors
+    /// </summary>
+    LLAMA_FTYPE_MOSTLY_IQ2_XS = 20,
+
+    /// <summary>
+    /// except 1d tensors
+    /// </summary>
+    LLAMA_FTYPE_MOSTLY_Q2_K_S = 21,
+
+    /// <summary>
+    /// except 1d tensors
+    /// </summary>
+    LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22,
+
+    /// <summary>
+    /// except 1d tensors
+    /// </summary>
+    LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23,
+
     /// <summary>
     /// File type was not specified
     /// </summary>
@@ -14,6 +14,11 @@ namespace LLama.Native
     /// </summary>
     public int n_gpu_layers;

+    /// <summary>
+    /// how to split the model across multiple GPUs
+    /// </summary>
+    public GPUSplitMode split_mode;
+
     /// <summary>
     /// the GPU that is used for scratch and small tensors
     /// </summary>
@@ -25,7 +30,8 @@ namespace LLama.Native
     public float* tensor_split;

     /// <summary>
-    /// called with a progress value between 0 and 1, pass NULL to disable
+    /// called with a progress value between 0 and 1, pass NULL to disable. If the provided progress_callback
+    /// returns true, model loading continues. If it returns false, model loading is immediately aborted.
     /// </summary>
     public LlamaProgressCallback progress_callback;
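A minimal sketch of a callback under the new contract is shown below, assuming a caller that starts from `llama_model_default_params`; returning `false` aborts the load, per the documentation above.

```csharp
using System;
using LLama.Native;

static class ProgressCallbackExample
{
    // Under the new contract the callback returns bool: true to continue
    // loading, false to abort immediately.
    private static bool OnProgress(float progress, IntPtr ctx)
    {
        Console.WriteLine($"Loading model: {progress * 100:0.0}%");
        return true;
    }

    public static LLamaModelParams BuildParams()
    {
        var p = NativeApi.llama_model_default_params();
        p.progress_callback = OnProgress;
        return p;
    }
}
```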
@@ -6,6 +6,7 @@ namespace LLama.Native
     /// <summary>
     /// Quantizer parameters used in the native API
     /// </summary>
+    /// <remarks>llama_model_quantize_params</remarks>
     [StructLayout(LayoutKind.Sequential)]
     public struct LLamaModelQuantizeParams
     {
@@ -58,5 +59,10 @@ namespace LLama.Native
             set => _pure = Convert.ToSByte(value);
         }
         private sbyte _pure;
+
+        /// <summary>
+        /// pointer to importance matrix data
+        /// </summary>
+        public IntPtr imatrix;
     }
 }
@@ -10,9 +10,8 @@ namespace LLama.Native
     /// <param name="fname_inp"></param>
     /// <param name="fname_out"></param>
     /// <param name="param"></param>
-    /// <remarks>not great API - very likely to change</remarks>
     /// <returns>Returns 0 on success</returns>
     [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-    public static extern unsafe int llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param);
+    public static extern unsafe uint llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param);
 }
}
@@ -27,11 +27,12 @@ namespace LLama.Native
     /// Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
     /// </summary>
     /// <param name="ctx"></param>
-    /// <param name="candidates">A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.</param>
-    /// <param name="guidance_ctx">A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.</param>
+    /// <param name="logits">Logits extracted from the original generation context.</param>
+    /// <param name="logits_guidance">Logits extracted from a separate context from the same model.
+    /// Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.</param>
     /// <param name="scale">Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.</param>
     [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-    public static extern void llama_sample_classifier_free_guidance(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, SafeLLamaContextHandle guidance_ctx, float scale);
+    public static extern unsafe void llama_sample_apply_guidance(SafeLLamaContextHandle ctx, float* logits, float* logits_guidance, float scale);

     /// <summary>
     /// Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
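The call pattern changes from passing a guidance context to passing two raw logits buffers. A minimal sketch, assuming the caller already has two vocab-sized logits arrays from the main and guidance contexts (how they are obtained is outside the scope of this diff):

```csharp
using LLama.Native;

static class GuidanceExample
{
    // Blend guidance logits into the main logits in place via the new
    // llama_sample_apply_guidance entry point. Both arrays are assumed to be
    // vocab-sized and to come from the main and guidance contexts respectively.
    public static unsafe void ApplyGuidance(SafeLLamaContextHandle ctx,
                                            float[] logits,
                                            float[] logitsGuidance,
                                            float scale)
    {
        fixed (float* l = logits)
        fixed (float* g = logitsGuidance)
        {
            // scale = 1.0f means no guidance; higher values mean stronger guidance
            NativeApi.llama_sample_apply_guidance(ctx, l, g, scale);
        }
    }
}
```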
@@ -92,6 +93,17 @@ namespace LLama.Native
     [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
     public static extern void llama_sample_typical(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float p, ulong min_keep);

+    /// <summary>
+    /// Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
+    /// </summary>
+    /// <param name="ctx"></param>
+    /// <param name="candidates">Pointer to LLamaTokenDataArray</param>
+    /// <param name="min_temp"></param>
+    /// <param name="max_temp"></param>
+    /// <param name="exponent_val"></param>
+    [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+    public static extern void llama_sample_entropy(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float min_temp, float max_temp, float exponent_val);
+
     /// <summary>
     /// Modify logits by temperature
     /// </summary>
@@ -23,7 +23,7 @@ namespace LLama.Native
     /// <returns></returns>
     public static void llama_empty_call()
     {
-        llama_mmap_supported();
+        llama_max_devices();
     }

     /// <summary>
@@ -31,7 +31,7 @@ namespace LLama.Native
     /// </summary>
     /// <returns></returns>
     [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-    public static extern int llama_max_devices();
+    public static extern long llama_max_devices();

     /// <summary>
     /// Create a LLamaModelParams with default values
@@ -59,14 +59,21 @@ namespace LLama.Native
     /// </summary>
     /// <returns></returns>
     [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-    public static extern bool llama_mmap_supported();
+    public static extern bool llama_supports_mmap();

     /// <summary>
-    /// Check if memory lockingis supported
+    /// Check if memory locking is supported
     /// </summary>
     /// <returns></returns>
     [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-    public static extern bool llama_mlock_supported();
+    public static extern bool llama_supports_mlock();
+
+    /// <summary>
+    /// Check if GPU offload is supported
+    /// </summary>
+    /// <returns></returns>
+    [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+    public static extern bool llama_supports_gpu_offload();

     /// <summary>
     /// Initialize the llama + ggml backend
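The renamed capability queries (plus the new GPU-offload check and the widened `llama_max_devices` return type) can be probed up front. A small sketch, not required by the library, mirroring the checks `ToLlamaModelParams` performs above:

```csharp
using System;
using LLama.Native;

static class BackendCapabilities
{
    // Print what the loaded native backend supports before configuring a model.
    public static void Print()
    {
        Console.WriteLine($"mmap supported:        {NativeApi.llama_supports_mmap()}");
        Console.WriteLine($"mlock supported:       {NativeApi.llama_supports_mlock()}");
        Console.WriteLine($"GPU offload supported: {NativeApi.llama_supports_gpu_offload()}");
        Console.WriteLine($"max devices:           {NativeApi.llama_max_devices()}"); // now returns long
    }
}
```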
@@ -163,7 +170,10 @@ namespace LLama.Native
     /// <param name="ctx"></param>
     /// <returns></returns>
     [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-    public static extern int llama_n_ctx(SafeLLamaContextHandle ctx);
+    public static extern uint llama_n_ctx(SafeLLamaContextHandle ctx);
+
+    [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+    public static extern uint llama_n_batch(SafeLLamaContextHandle ctx);

     /// <summary>
     /// Token logits obtained from the last call to llama_eval()
@@ -380,6 +390,20 @@ namespace LLama.Native
     [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
     public static extern void llama_kv_cache_seq_shift(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, LLamaPos delta);

+    /// <summary>
+    /// Integer division of the positions by factor of `d > 1`
+    /// If the KV cache is RoPEd, the KV data is updated accordingly
+    /// p0 < 0 : [0, p1]
+    /// p1 < 0 : [p0, inf)
+    /// </summary>
+    /// <param name="ctx"></param>
+    /// <param name="seq"></param>
+    /// <param name="p0"></param>
+    /// <param name="p1"></param>
+    /// <param name="d"></param>
+    [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+    public static extern void llama_kv_cache_seq_div(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int d);
+
     /// <summary>
     /// Allocates a batch of tokens on the heap
     /// Each token can be assigned up to n_seq_max sequence ids
@@ -21,7 +21,7 @@ namespace LLama.Native
     /// <summary>
     /// Total number of tokens in the context
     /// </summary>
-    public int ContextSize => NativeApi.llama_n_ctx(this);
+    public uint ContextSize => NativeApi.llama_n_ctx(this);

     /// <summary>
     /// Dimension of embedding vectors
@@ -222,6 +222,7 @@ If you want to compile llama.cpp yourself you **must** use the exact commit ID l
 | v0.7.0, v0.8.0 | [Thespis-13B](https://huggingface.co/TheBloke/Thespis-13B-v0.5-GGUF/tree/main?not-for-all-audiences=true), [LLaMA2-7B](https://huggingface.co/TheBloke/llama-2-7B-Guanaco-QLoRA-GGUF) | [`207b519`](https://github.com/ggerganov/llama.cpp/commit/207b51900e15cc7f89763a3bb1c565fe11cbb45d) |
 | v0.8.1 | | [`e937066`](https://github.com/ggerganov/llama.cpp/commit/e937066420b79a757bf80e9836eb12b88420a218) |
 | v0.9.0, v0.9.1 | [Mixtral-8x7B](https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF) | [`9fb13f9`](https://github.com/ggerganov/llama.cpp/blob/9fb13f95840c722ad419f390dc8a9c86080a3700) |
+| v0.10.0 | [Phi2](https://huggingface.co/TheBloke/phi-2-GGUF) | [`d71ac90`](https://github.com/ggerganov/llama.cpp/tree/d71ac90985854b0905e1abba778e407e17f9f887) |

 ## License