
Merge pull request #479 from martindevans/update_binaries_feb_2024

Update binaries feb 2024
tags/v0.10.0
Martin Evans (GitHub) · 1 year ago
commit 17385e12b6
38 changed files with 1973 additions and 421 deletions
1. +1 -1  LLama.Unittest/LLamaContextTests.cs
2. +3 -0  LLama.Web/Common/ModelOptions.cs
3. +21 -2  LLama/Abstractions/IModelParams.cs
4. +3 -0  LLama/Common/ModelParams.cs
5. +3 -0  LLama/Extensions/IContextParamsExtensions.cs
6. +5 -4  LLama/Extensions/IModelParamsExtensions.cs
7. +2 -2  LLama/LLamaContext.cs
8. +2 -2  LLama/LLamaExecutorBase.cs
9. +1 -1  LLama/LLamaInstructExecutor.cs
10. +1 -1  LLama/LLamaInteractExecutor.cs
11. +23 -0  LLama/Native/GPUSplitMode.cs
12. +18 -8  LLama/Native/LLamaContextParams.cs
13. +25 -0  LLama/Native/LLamaFtype.cs
14. +7 -1  LLama/Native/LLamaModelParams.cs
15. +6 -0  LLama/Native/LLamaModelQuantizeParams.cs
16. +1 -2  LLama/Native/NativeApi.Quantize.cs
17. +15 -3  LLama/Native/NativeApi.Sampling.cs
18. +30 -6  LLama/Native/NativeApi.cs
19. +1 -1  LLama/Native/SafeLLamaContextHandle.cs
20. BIN  LLama/runtimes/deps/avx/libllama.so
21. BIN  LLama/runtimes/deps/avx/llama.dll
22. BIN  LLama/runtimes/deps/avx2/libllama.so
23. BIN  LLama/runtimes/deps/avx2/llama.dll
24. BIN  LLama/runtimes/deps/avx512/libllama.so
25. BIN  LLama/runtimes/deps/avx512/llama.dll
26. BIN  LLama/runtimes/deps/clblast/clblast.dll
27. BIN  LLama/runtimes/deps/clblast/libllama.so
28. BIN  LLama/runtimes/deps/clblast/llama.dll
29. BIN  LLama/runtimes/deps/cu11.7.1/libllama.so
30. BIN  LLama/runtimes/deps/cu11.7.1/llama.dll
31. BIN  LLama/runtimes/deps/cu12.1.0/libllama.so
32. BIN  LLama/runtimes/deps/cu12.1.0/llama.dll
33. BIN  LLama/runtimes/deps/libllama.so
34. BIN  LLama/runtimes/deps/llama.dll
35. +1804 -387  LLama/runtimes/deps/osx-arm64/ggml-metal.metal
36. BIN  LLama/runtimes/deps/osx-arm64/libllama.dylib
37. BIN  LLama/runtimes/deps/osx-x64/libllama.dylib
38. +1 -0  README.md

+1 -1  LLama.Unittest/LLamaContextTests.cs

@@ -28,7 +28,7 @@ namespace LLama.Unittest
[Fact]
public void CheckProperties()
{
- Assert.Equal(768, _context.ContextSize);
+ Assert.Equal(768u, _context.ContextSize);
Assert.Equal(4096, _context.EmbeddingSize);
Assert.Equal(32000, _context.VocabCount);
}


+3 -0  LLama.Web/Common/ModelOptions.cs

@@ -23,6 +23,9 @@ namespace LLama.Web.Common
/// <inheritdoc />
public int MainGpu { get; set; } = 0;

/// <inheritdoc />
public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;

/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;



+21 -2  LLama/Abstractions/IModelParams.cs

@@ -16,9 +16,28 @@ namespace LLama.Abstractions
public interface IModelParams
{
/// <summary>
- /// the GPU that is used for scratch and small tensors
+ /// main_gpu interpretation depends on split_mode:
+ /// <list type="bullet">
+ /// <item>
+ /// <term>None</term>
+ /// <description>The GPU that is used for the entire model.</description>
+ /// </item>
+ /// <item>
+ /// <term>Row</term>
+ /// <description>The GPU that is used for small tensors and intermediate results.</description>
+ /// </item>
+ /// <item>
+ /// <term>Layer</term>
+ /// <description>Ignored.</description>
+ /// </item>
+ /// </list>
/// </summary>
- int MainGpu { get; }
+ int MainGpu { get; set; }

/// <summary>
/// How to split the model across multiple GPUs
/// </summary>
GPUSplitMode SplitMode { get; }

/// <summary>
/// Number of layers to run in VRAM / GPU memory (n_gpu_layers)


+3 -0  LLama/Common/ModelParams.cs

@@ -18,6 +18,9 @@ namespace LLama.Common
/// <inheritdoc />
public int MainGpu { get; set; } = 0;

/// <inheritdoc />
public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;

/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;



+3 -0  LLama/Extensions/IContextParamsExtensions.cs

@@ -36,6 +36,9 @@ namespace LLama.Extensions
result.yarn_orig_ctx = @params.YarnOriginalContext ?? 0;
result.rope_scaling_type = @params.YarnScalingType ?? RopeScalingType.LLAMA_ROPE_SCALING_UNSPECIFIED;

result.cb_eval = IntPtr.Zero;
result.cb_eval_user_data = IntPtr.Zero;

result.type_k = @params.TypeK ?? GGMLType.GGML_TYPE_F16;
result.type_k = @params.TypeV ?? GGMLType.GGML_TYPE_F16;
result.offload_kqv = !@params.NoKqvOffload;


+5 -4  LLama/Extensions/IModelParamsExtensions.cs

@@ -21,15 +21,16 @@ public static class IModelParamsExtensions
/// <exception cref="ArgumentException"></exception>
public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLamaModelParams result)
{
- if (@params.UseMemoryLock && !NativeApi.llama_mlock_supported())
-     throw new NotSupportedException("'UseMemoryLock' is not supported (llama_mlock_supported() == false)");
- if (@params.UseMemorymap && !NativeApi.llama_mmap_supported())
-     throw new NotSupportedException("'UseMemorymap' is not supported (llama_mmap_supported() == false)");
+ if (@params.UseMemoryLock && !NativeApi.llama_supports_mlock())
+     throw new NotSupportedException("'UseMemoryLock' is not supported (llama_supports_mlock() == false)");
+ if (@params.UseMemorymap && !NativeApi.llama_supports_mmap())
+     throw new NotSupportedException("'UseMemorymap' is not supported (llama_supports_mmap() == false)");

var disposer = new GroupDisposable();

result = NativeApi.llama_model_default_params();
result.main_gpu = @params.MainGpu;
result.split_mode = @params.SplitMode;
result.n_gpu_layers = @params.GpuLayerCount < 0 ? int.MaxValue : @params.GpuLayerCount;
result.use_mlock = @params.UseMemoryLock;
result.use_mmap = @params.UseMemorymap;


+2 -2  LLama/LLamaContext.cs

@@ -33,7 +33,7 @@ namespace LLama
/// <summary>
/// Total number of tokens in the context
/// </summary>
- public int ContextSize => NativeHandle.ContextSize;
+ public uint ContextSize => NativeHandle.ContextSize;

/// <summary>
/// Dimension of embedding vectors
@@ -323,7 +323,7 @@ namespace LLama
var candidates_p = LLamaTokenDataArray.Create(logits);

// Extract most recently returned tokens
- var last_n_repeat = Math.Min(ContextSize, repeatLastTokensCount);
+ var last_n_repeat = Math.Min((int)ContextSize, repeatLastTokensCount);
var last_n_array = lastTokens.TakeLast(last_n_repeat).ToArray();

// Apply penalties to candidates
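Because `ContextSize` is now a `uint`, call sites that previously mixed it with `int` arithmetic need an explicit cast, as the executor changes below show. A hypothetical helper (not part of this commit) capturing that pattern:

```csharp
using System;
using LLama;

// Hypothetical helper, not part of this commit: clamps the repeat-penalty window
// to the context size now that ContextSize is a uint.
static int ClampRepeatWindow(LLamaContext context, int repeatLastTokensCount)
{
    return repeatLastTokensCount < 0
        ? (int)context.ContextSize
        : Math.Min((int)context.ContextSize, repeatLastTokensCount);
}
```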


+2 -2  LLama/LLamaExecutorBase.cs

@@ -83,7 +83,7 @@ namespace LLama
_pastTokensCount = 0;
_consumedTokensCount = 0;
_n_session_consumed = 0;
- _last_n_tokens = new FixedSizeQueue<LLamaToken>(Context.ContextSize);
+ _last_n_tokens = new FixedSizeQueue<LLamaToken>((int)Context.ContextSize);
_decoder = new StreamingTokenDecoder(context);
}

@@ -170,7 +170,7 @@ namespace LLama
_pastTokensCount = Math.Max(1, tokensToKeep);

// insert n_left/2 tokens at the start of embed from last_n_tokens
- _embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip(Context.ContextSize - n_left / 2 - _embeds.Count));
+ _embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip((int)Context.ContextSize - n_left / 2 - _embeds.Count));

// stop saving session if we run out of context
_pathSession = string.Empty;


+1 -1  LLama/LLamaInstructExecutor.cs

@@ -200,7 +200,7 @@ namespace LLama

if (_embed_inps.Count <= _consumedTokensCount && !args.WaitForInput)
{
- var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? Context.ContextSize : inferenceParams.RepeatLastTokensCount;
+ var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? (int)Context.ContextSize : inferenceParams.RepeatLastTokensCount;

// optionally save the session on first sample (for faster prompt loading next time)
if (!string.IsNullOrEmpty(_pathSession) && args.NeedToSaveSession)


+1 -1  LLama/LLamaInteractExecutor.cs

@@ -179,7 +179,7 @@ namespace LLama

if (_embed_inps.Count <= _consumedTokensCount && !args.WaitForInput)
{
- var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? Context.ContextSize : inferenceParams.RepeatLastTokensCount;
+ var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? (int)Context.ContextSize : inferenceParams.RepeatLastTokensCount;

// optionally save the session on first sample (for faster prompt loading next time)
if (!string.IsNullOrEmpty(_pathSession) && args.NeedToSaveSession)


+23 -0  LLama/Native/GPUSplitMode.cs

@@ -0,0 +1,23 @@
namespace LLama.Native;

/// <summary>
///
/// </summary>
/// <remarks>llama_split_mode</remarks>
public enum GPUSplitMode
{
/// <summary>
/// Single GPU
/// </summary>
None = 0,

/// <summary>
/// Split layers and KV across GPUs
/// </summary>
Layer = 1,

/// <summary>
/// split rows across GPUs
/// </summary>
Row = 2,
}
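The new `GPUSplitMode` enum mirrors llama.cpp's `llama_split_mode` and is surfaced through `SplitMode` on `IModelParams`. A minimal sketch of configuring it via the high-level `ModelParams` (the model path and layer count are placeholders, not values from this commit):

```csharp
using LLama;
using LLama.Common;
using LLama.Native;

// Minimal sketch of the multi-GPU options added in this PR.
// "models/model.gguf" and the layer count are placeholders.
var parameters = new ModelParams("models/model.gguf")
{
    GpuLayerCount = 20,              // layers to offload to VRAM
    SplitMode = GPUSplitMode.Layer,  // split layers and KV cache across GPUs
    MainGpu = 0                      // with GPUSplitMode.Row this GPU holds small tensors/intermediate results
};

using var weights = LLamaWeights.LoadFromFile(parameters);
using var context = weights.CreateContext(parameters);
```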

+18 -8  LLama/Native/LLamaContextParams.cs

@@ -8,7 +8,8 @@ namespace LLama.Native
/// </summary>
/// <param name="progress"></param>
/// <param name="ctx"></param>
- public delegate void LlamaProgressCallback(float progress, IntPtr ctx);
+ /// <remarks>llama_progress_callback</remarks>
+ public delegate bool LlamaProgressCallback(float progress, IntPtr ctx);

/// <summary>
/// A C# representation of the llama.cpp `llama_context_params` struct
@@ -46,37 +47,46 @@ namespace LLama.Native
/// </summary>
public RopeScalingType rope_scaling_type;

/// <summary>
/// RoPE base frequency, 0 = from model
/// </summary>
public float rope_freq_base;
/// <summary>
/// RoPE frequency scaling factor, 0 = from model
/// </summary>
public float rope_freq_scale;
/// <summary>
/// YaRN extrapolation mix factor, negative = from model
/// </summary>
public float yarn_ext_factor;
/// <summary>
/// YaRN magnitude scaling factor
/// </summary>
public float yarn_attn_factor;
/// <summary>
/// YaRN low correction dim
/// </summary>
public float yarn_beta_fast;
/// <summary>
/// YaRN high correction dim
/// </summary>
public float yarn_beta_slow;
/// <summary>
/// YaRN original context size
/// </summary>
public uint yarn_orig_ctx;

/// <summary>
/// ggml_backend_sched_eval_callback
/// </summary>
public IntPtr cb_eval;

/// <summary>
/// User data passed into cb_eval
/// </summary>
public IntPtr cb_eval_user_data;

/// <summary>
/// data type for K cache
/// </summary>


+25 -0  LLama/Native/LLamaFtype.cs

@@ -106,6 +106,31 @@
/// <remarks>Benchmark@7B: 5.15GB, +0.0044 ppl</remarks>
LLAMA_FTYPE_MOSTLY_Q6_K = 18,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_IQ2_XS = 20,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23,

/// <summary>
/// File type was not specified
/// </summary>


+7 -1  LLama/Native/LLamaModelParams.cs

@@ -14,6 +14,11 @@ namespace LLama.Native
/// </summary>
public int n_gpu_layers;

/// <summary>
/// how to split the model across multiple GPUs
/// </summary>
public GPUSplitMode split_mode;

/// <summary>
/// the GPU that is used for scratch and small tensors
/// </summary>
@@ -25,7 +30,8 @@ namespace LLama.Native
public float* tensor_split;

/// <summary>
- /// called with a progress value between 0 and 1, pass NULL to disable
+ /// called with a progress value between 0 and 1, pass NULL to disable. If the provided progress_callback
+ /// returns true, model loading continues. If it returns false, model loading is immediately aborted.
/// </summary>
public LlamaProgressCallback progress_callback;
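`LlamaProgressCallback` now returns `bool`: returning `true` continues loading, `false` aborts it. A hedged sketch of wiring a cancellable callback into the native parameter struct (the delegate must stay referenced while native code can still call it):

```csharp
using System;
using LLama.Native;

// Sketch only: progress values arrive in [0, 1]; returning false aborts model loading.
LlamaProgressCallback progress = (float fraction, IntPtr userData) =>
{
    Console.WriteLine($"Loading model: {fraction * 100:F1}%");
    return true; // return false here to abort loading early
};

var modelParams = NativeApi.llama_model_default_params();
modelParams.progress_callback = progress;
// modelParams would then be passed to the native model-loading call;
// keep `progress` alive (e.g. in a field) until that call returns.
```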



+6 -0  LLama/Native/LLamaModelQuantizeParams.cs

@@ -6,6 +6,7 @@ namespace LLama.Native
/// <summary>
/// Quantizer parameters used in the native API
/// </summary>
/// <remarks>llama_model_quantize_params</remarks>
[StructLayout(LayoutKind.Sequential)]
public struct LLamaModelQuantizeParams
{
@@ -58,5 +59,10 @@ namespace LLama.Native
set => _pure = Convert.ToSByte(value);
}
private sbyte _pure;

/// <summary>
/// pointer to importance matrix data
/// </summary>
public IntPtr imatrix;
}
}

+1 -2  LLama/Native/NativeApi.Quantize.cs

@@ -10,9 +10,8 @@ namespace LLama.Native
/// <param name="fname_inp"></param>
/// <param name="fname_out"></param>
/// <param name="param"></param>
- /// <remarks>not great API - very likely to change</remarks>
/// <returns>Returns 0 on success</returns>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- public static extern unsafe int llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param);
+ public static extern unsafe uint llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param);
}
}
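`llama_model_quantize` now returns an unsigned status code, `LLamaModelQuantizeParams` gained an `imatrix` pointer, and several IQ/K ftypes were added. A hedged sketch of calling the quantizer directly (file names are placeholders, and the `ftype`/`nthread` fields are assumed to mirror llama.cpp's `llama_model_quantize_params`):

```csharp
using System;
using LLama.Native;

// Illustrative only: quantize an f16 GGUF to one of the newly added IQ formats.
unsafe
{
    var qparams = new LLamaModelQuantizeParams
    {
        ftype = LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ3_XXS, // one of the ftypes added in this PR
        nthread = 0,                                   // assumed: 0 lets the backend pick a thread count
        imatrix = IntPtr.Zero                          // new field: no importance matrix data supplied
    };

    uint status = NativeApi.llama_model_quantize("model-f16.gguf", "model-iq3_xxs.gguf", &qparams);
    if (status != 0)
        throw new InvalidOperationException($"llama_model_quantize failed with status {status}");
}
```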

+15 -3  LLama/Native/NativeApi.Sampling.cs

@@ -27,11 +27,12 @@ namespace LLama.Native
/// Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
/// </summary>
/// <param name="ctx"></param>
/// <param name="candidates">A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.</param>
/// <param name="guidance_ctx">A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.</param>
/// <param name="logits">Logits extracted from the original generation context.</param>
/// <param name="logits_guidance">Logits extracted from a separate context from the same model.
/// Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.</param>
/// <param name="scale">Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.</param>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern void llama_sample_classifier_free_guidance(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, SafeLLamaContextHandle guidance_ctx, float scale);
public static extern unsafe void llama_sample_apply_guidance(SafeLLamaContextHandle ctx, float* logits, float* logits_guidance, float scale);

/// <summary>
/// Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
@@ -92,6 +93,17 @@ namespace LLama.Native
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern void llama_sample_typical(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float p, ulong min_keep);

/// <summary>
/// Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
/// </summary>
/// <param name="ctx"></param>
/// <param name="candidates">Pointer to LLamaTokenDataArray</param>
/// <param name="min_temp"></param>
/// <param name="max_temp"></param>
/// <param name="exponent_val"></param>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern void llama_sample_typical(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float min_temp, float max_temp, float exponent_val);

/// <summary>
/// Modify logits by temperature
/// </summary>
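The classifier-free guidance entry point was replaced by `llama_sample_apply_guidance`, which takes the two logits buffers directly instead of a second context handle. A hypothetical wrapper (not part of this commit) showing how the buffers might be pinned and passed:

```csharp
using System;
using LLama.Native;

// Hypothetical wrapper, not part of this commit: pins managed logit buffers and
// forwards them to the reworked guidance API.
static unsafe void ApplyGuidance(SafeLLamaContextHandle ctx, Span<float> logits, Span<float> guidanceLogits, float scale)
{
    if (logits.Length != guidanceLogits.Length)
        throw new ArgumentException("logits and guidanceLogits must have the same length");

    fixed (float* logitsPtr = logits)
    fixed (float* guidancePtr = guidanceLogits)
    {
        NativeApi.llama_sample_apply_guidance(ctx, logitsPtr, guidancePtr, scale);
    }
}
```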


+30 -6  LLama/Native/NativeApi.cs

@@ -23,7 +23,7 @@ namespace LLama.Native
/// <returns></returns>
public static void llama_empty_call()
{
- llama_mmap_supported();
+ llama_max_devices();
}

/// <summary>
@@ -31,7 +31,7 @@ namespace LLama.Native
/// </summary>
/// <returns></returns>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- public static extern int llama_max_devices();
+ public static extern long llama_max_devices();

/// <summary>
/// Create a LLamaModelParams with default values
@@ -59,14 +59,21 @@ namespace LLama.Native
/// </summary>
/// <returns></returns>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- public static extern bool llama_mmap_supported();
+ public static extern bool llama_supports_mmap();

/// <summary>
- /// Check if memory lockingis supported
+ /// Check if memory locking is supported
/// </summary>
/// <returns></returns>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- public static extern bool llama_mlock_supported();
+ public static extern bool llama_supports_mlock();

/// <summary>
/// Check if GPU offload is supported
/// </summary>
/// <returns></returns>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern bool llama_supports_gpu_offload();

/// <summary>
/// Initialize the llama + ggml backend
@@ -163,7 +170,10 @@ namespace LLama.Native
/// <param name="ctx"></param>
/// <returns></returns>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- public static extern int llama_n_ctx(SafeLLamaContextHandle ctx);
+ public static extern uint llama_n_ctx(SafeLLamaContextHandle ctx);

[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern uint llama_n_batch(SafeLLamaContextHandle ctx);

/// <summary>
/// Token logits obtained from the last call to llama_eval()
@@ -380,6 +390,20 @@ namespace LLama.Native
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern void llama_kv_cache_seq_shift(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, LLamaPos delta);

/// <summary>
/// Integer division of the positions by factor of `d > 1`
/// If the KV cache is RoPEd, the KV data is updated accordingly
/// p0 &lt; 0 : [0, p1]
/// p1 &lt; 0 : [p0, inf)
/// </summary>
/// <param name="ctx"></param>
/// <param name="seq"></param>
/// <param name="p0"></param>
/// <param name="p1"></param>
/// <param name="d"></param>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern void llama_kv_cache_seq_div(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int d);

/// <summary>
/// Allocates a batch of tokens on the heap
/// Each token can be assigned up to n_seq_max sequence ids


+1 -1  LLama/Native/SafeLLamaContextHandle.cs

@@ -21,7 +21,7 @@ namespace LLama.Native
/// <summary>
/// Total number of tokens in the context
/// </summary>
- public int ContextSize => NativeApi.llama_n_ctx(this);
+ public uint ContextSize => NativeApi.llama_n_ctx(this);

/// <summary>
/// Dimension of embedding vectors


BIN  LLama/runtimes/deps/avx/libllama.so

BIN  LLama/runtimes/deps/avx/llama.dll

BIN  LLama/runtimes/deps/avx2/libllama.so

BIN  LLama/runtimes/deps/avx2/llama.dll

BIN  LLama/runtimes/deps/avx512/libllama.so

BIN  LLama/runtimes/deps/avx512/llama.dll

BIN  LLama/runtimes/deps/clblast/clblast.dll

BIN  LLama/runtimes/deps/clblast/libllama.so

BIN  LLama/runtimes/deps/clblast/llama.dll

BIN  LLama/runtimes/deps/cu11.7.1/libllama.so

BIN  LLama/runtimes/deps/cu11.7.1/llama.dll

BIN  LLama/runtimes/deps/cu12.1.0/libllama.so

BIN  LLama/runtimes/deps/cu12.1.0/llama.dll

BIN  LLama/runtimes/deps/libllama.so

BIN  LLama/runtimes/deps/llama.dll

+1804 -387  LLama/runtimes/deps/osx-arm64/ggml-metal.metal (diff suppressed because it is too large)

BIN  LLama/runtimes/deps/osx-arm64/libllama.dylib

BIN  LLama/runtimes/deps/osx-x64/libllama.dylib

+1 -0  README.md

@@ -222,6 +222,7 @@ If you want to compile llama.cpp yourself you **must** use the exact commit ID l
| v0.7.0, v0.8.0 | [Thespis-13B](https://huggingface.co/TheBloke/Thespis-13B-v0.5-GGUF/tree/main?not-for-all-audiences=true), [LLaMA2-7B](https://huggingface.co/TheBloke/llama-2-7B-Guanaco-QLoRA-GGUF) | [`207b519`](https://github.com/ggerganov/llama.cpp/commit/207b51900e15cc7f89763a3bb1c565fe11cbb45d) |
| v0.8.1 | | [`e937066`](https://github.com/ggerganov/llama.cpp/commit/e937066420b79a757bf80e9836eb12b88420a218) |
| v0.9.0, v0.9.1 | [Mixtral-8x7B](https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF) | [`9fb13f9`](https://github.com/ggerganov/llama.cpp/blob/9fb13f95840c722ad419f390dc8a9c86080a3700) |
| v0.10.0 | [Phi2](https://huggingface.co/TheBloke/phi-2-GGUF) | [`d71ac90`](https://github.com/ggerganov/llama.cpp/tree/d71ac90985854b0905e1abba778e407e17f9f887) |

## License


