
Initial changes required for GGUF support

tags/v0.5.1
Martin Evans · 2 years ago
parent commit 2056078aef
19 changed files with 76 additions and 2043 deletions
 1. +24 -0     LLama.Examples/LLama.Examples.csproj
 2. +24 -0     LLama.Unittest/LLama.Unittest.csproj
 3. +0  -2     LLama.Unittest/ModelsParamsTests.cs
 4. +0  -10    LLama.Web/Common/ModelOptions.cs
 5. +0  -10    LLama/Abstractions/IModelParams.cs
 6. +1  -15    LLama/Common/ModelParams.cs
 7. +0  -2     LLama/Extensions/IModelParamsExtensions.cs
 8. +16 -26    LLama/Native/LLamaContextParams.cs
 9. +5  -0     LLama/Native/LLamaFtype.cs
10. +3  -3     LLama/Native/NativeApi.cs
11. +3  -3     LLama/Native/SafeLlamaModelHandle.cs
12. +0  -1972  LLama/runtimes/ggml-metal.metal
13. BIN        LLama/runtimes/libllama-cuda11.dll
14. BIN        LLama/runtimes/libllama-cuda11.so
15. BIN        LLama/runtimes/libllama-cuda12.dll
16. BIN        LLama/runtimes/libllama-cuda12.so
17. BIN        LLama/runtimes/libllama-metal.dylib
18. BIN        LLama/runtimes/libllama.dylib
19. BIN        LLama/runtimes/libllama.so

+24 -0  LLama.Examples/LLama.Examples.csproj

@@ -52,6 +52,30 @@
     <None Update="Assets\reason-act.txt">
       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
     </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/ggml-metal.metal">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda11.dll">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda11.so">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda12.dll">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda12.so">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-metal.dylib">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama.dylib">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama.so">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
   </ItemGroup>
 
 </Project>

+24 -0  LLama.Unittest/LLama.Unittest.csproj

@@ -37,6 +37,30 @@
   </ItemGroup>
 
   <ItemGroup>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/ggml-metal.metal">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda11.dll">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda11.so">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda12.dll">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda12.so">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-metal.dylib">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama.dylib">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama.so">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
     <None Update="Models\llama-2-7b-chat.ggmlv3.q3_K_S.bin">
       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
     </None>


+0 -2  LLama.Unittest/ModelsParamsTests.cs

@@ -14,7 +14,6 @@ namespace LLama.Unittest
     BatchSize = 17,
     ContextSize = 42,
     LoraAdapter = "adapter",
-    GroupedQueryAttention = 7,
     Seed = 42,
     GpuLayerCount = 111
 };
@@ -33,7 +32,6 @@ namespace LLama.Unittest
     BatchSize = 17,
     ContextSize = 42,
     LoraAdapter = "adapter",
-    GroupedQueryAttention = 7,
     Seed = 42,
     GpuLayerCount = 111
 };


+0 -10  LLama.Web/Common/ModelOptions.cs

@@ -88,16 +88,6 @@ namespace LLama.Web.Common
     /// </summary>
     public float[] TensorSplits { get; set; }
 
-    /// <summary>
-    /// Grouped-Query Attention
-    /// </summary>
-    public int GroupedQueryAttention { get; set; } = 1;
-
-    /// <summary>
-    /// RMS Norm Epsilon
-    /// </summary>
-    public float RmsNormEpsilon { get; set; } = 5e-6f;
-
     /// <summary>
     /// RoPE base frequency
     /// </summary>


+0 -10  LLama/Abstractions/IModelParams.cs

@@ -98,16 +98,6 @@ namespace LLama.Abstractions
     /// </summary>
     float[]? TensorSplits { get; set; }
 
-    /// <summary>
-    /// Grouped-Query Attention
-    /// </summary>
-    int GroupedQueryAttention { get; set; }
-
-    /// <summary>
-    /// RMS Norm Epsilon
-    /// </summary>
-    float RmsNormEpsilon { get; set; }
-
     /// <summary>
     /// RoPE base frequency
     /// </summary>


+1 -15  LLama/Common/ModelParams.cs

@@ -89,16 +89,6 @@ namespace LLama.Common
     /// </summary>
     public float[]? TensorSplits { get; set; }
 
-    /// <summary>
-    /// Grouped-Query Attention
-    /// </summary>
-    public int GroupedQueryAttention { get; set; } = 1;
-
-    /// <summary>
-    /// RMS Norm Epsilon
-    /// </summary>
-    public float RmsNormEpsilon { get; set; } = 5e-6f;
-
     /// <summary>
     /// RoPE base frequency
     /// </summary>
@@ -153,8 +143,6 @@ namespace LLama.Common
     /// <param name="batchSize">Batch size for prompt processing (must be >=32 to use BLAS) (n_batch)</param>
     /// <param name="convertEosToNewLine">Whether to convert eos to newline during the inference.</param>
     /// <param name="embeddingMode">Whether to use embedding mode. (embedding) Note that if this is set to true, The LLamaModel won't produce text response anymore.</param>
-    /// <param name="groupedQueryAttention">Grouped-Query Attention</param>
-    /// <param name="rmsNormEpsilon">RMS Norm Epsilon</param>
     /// <param name="ropeFrequencyBase">RoPE base frequency.</param>
     /// <param name="ropeFrequencyScale">RoPE frequency scaling factor</param>
     /// <param name="mulMatQ">Use experimental mul_mat_q kernels</param>
@@ -165,7 +153,7 @@ namespace LLama.Common
     bool useMemorymap = true, bool useMemoryLock = false, bool perplexity = false,
     string loraAdapter = "", string loraBase = "", int threads = -1, int batchSize = 512,
     bool convertEosToNewLine = false, bool embeddingMode = false,
-    int groupedQueryAttention = 1, float rmsNormEpsilon = 5e-6f, float ropeFrequencyBase = 10000.0f, float ropeFrequencyScale = 1f, bool mulMatQ = false,
+    float ropeFrequencyBase = 10000.0f, float ropeFrequencyScale = 1f, bool mulMatQ = false,
     string encoding = "UTF-8")
 {
     ContextSize = contextSize;
@@ -182,8 +170,6 @@ namespace LLama.Common
     BatchSize = batchSize;
     ConvertEosToNewLine = convertEosToNewLine;
     EmbeddingMode = embeddingMode;
-    GroupedQueryAttention = groupedQueryAttention;
-    RmsNormEpsilon = rmsNormEpsilon;
     RopeFrequencyBase = ropeFrequencyBase;
     RopeFrequencyScale = ropeFrequencyScale;
     MulMatQ = mulMatQ;
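
A note on what this removes: grouped-query attention and the RMS-norm epsilon are no longer caller-supplied knobs, because GGUF model files record them in their own hyperparameters. A hedged sketch of constructing ModelParams after this change (model path and argument values hypothetical; named arguments taken from the signature above):

    // Hypothetical usage after this commit: groupedQueryAttention and
    // rmsNormEpsilon are gone from the argument list; a GGUF model file
    // supplies those values itself.
    var parameters = new ModelParams(
        "models/llama-2-7b.gguf",     // hypothetical model path
        batchSize: 512,
        ropeFrequencyBase: 10000.0f,
        ropeFrequencyScale: 1f,
        mulMatQ: false);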


+0 -2  LLama/Extensions/IModelParamsExtensions.cs

@@ -39,8 +39,6 @@ namespace LLama.Extensions
     result.logits_all = @params.Perplexity;
     result.embedding = @params.EmbeddingMode;
     result.low_vram = @params.LowVram;
-    result.n_gqa = @params.GroupedQueryAttention;
-    result.rms_norm_eps = @params.RmsNormEpsilon;
     result.rope_freq_base = @params.RopeFrequencyBase;
     result.rope_freq_scale = @params.RopeFrequencyScale;
     result.mul_mat_q = @params.MulMatQ;
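
For orientation, a minimal sketch of how such an extension now maps the high-level parameters onto the native struct (method name and shape assumed, not the library's exact signature):

    // Hypothetical mapping helper: n_gqa and rms_norm_eps no longer exist
    // on LLamaContextParams, so only the remaining knobs are copied across.
    public static void CopyContextParams(IModelParams @params, ref LLamaContextParams result)
    {
        result.logits_all = @params.Perplexity;
        result.embedding = @params.EmbeddingMode;
        result.low_vram = @params.LowVram;
        result.rope_freq_base = @params.RopeFrequencyBase;
        result.rope_freq_scale = @params.RopeFrequencyScale;
        result.mul_mat_q = @params.MulMatQ;
    }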


+16 -26  LLama/Native/LLamaContextParams.cs

@@ -31,16 +31,6 @@ namespace LLama.Native
     /// </summary>
     public int n_batch;
 
-    /// <summary>
-    /// grouped-query attention (TEMP - will be moved to model hparams)
-    /// </summary>
-    public int n_gqa;
-
-    /// <summary>
-    /// rms norm epsilon (TEMP - will be moved to model hparams)
-    /// </summary>
-    public float rms_norm_eps;
-
     /// <summary>
     /// number of layers to store in VRAM
     /// </summary>
@@ -82,8 +72,8 @@
     /// if true, reduce VRAM usage at the cost of performance
     /// </summary>
     public bool low_vram
-    {
-        get => Convert.ToBoolean(_low_vram);
+    {
+        readonly get => Convert.ToBoolean(_low_vram);
         set => _low_vram = Convert.ToSByte(value);
     }
     private sbyte _low_vram;
@@ -92,8 +82,8 @@
     /// if true, use experimental mul_mat_q kernels
     /// </summary>
     public bool mul_mat_q
-    {
-        get => Convert.ToBoolean(_mul_mat_q);
+    {
+        readonly get => Convert.ToBoolean(_mul_mat_q);
         set => _mul_mat_q = Convert.ToSByte(value);
     }
     private sbyte _mul_mat_q;
@@ -102,8 +92,8 @@
     /// use fp16 for KV cache
     /// </summary>
     public bool f16_kv
-    {
-        get => Convert.ToBoolean(_f16_kv);
+    {
+        readonly get => Convert.ToBoolean(_f16_kv);
         set => _f16_kv = Convert.ToSByte(value);
     }
     private sbyte _f16_kv;
@@ -112,8 +102,8 @@
     /// the llama_eval() call computes all logits, not just the last one
     /// </summary>
     public bool logits_all
-    {
-        get => Convert.ToBoolean(_logits_all);
+    {
+        readonly get => Convert.ToBoolean(_logits_all);
         set => _logits_all = Convert.ToSByte(value);
     }
     private sbyte _logits_all;
@@ -122,8 +112,8 @@
     /// only load the vocabulary, no weights
     /// </summary>
     public bool vocab_only
-    {
-        get => Convert.ToBoolean(_vocab_only);
+    {
+        readonly get => Convert.ToBoolean(_vocab_only);
         set => _vocab_only = Convert.ToSByte(value);
     }
     private sbyte _vocab_only;
@@ -132,8 +122,8 @@
     /// use mmap if possible
     /// </summary>
     public bool use_mmap
-    {
-        get => Convert.ToBoolean(_use_mmap);
+    {
+        readonly get => Convert.ToBoolean(_use_mmap);
         set => _use_mmap = Convert.ToSByte(value);
     }
     private sbyte _use_mmap;
@@ -142,8 +132,8 @@
     /// force system to keep model in RAM
     /// </summary>
     public bool use_mlock
-    {
-        get => Convert.ToBoolean(_use_mlock);
+    {
+        readonly get => Convert.ToBoolean(_use_mlock);
         set => _use_mlock = Convert.ToSByte(value);
     }
     private sbyte _use_mlock;
@@ -152,8 +142,8 @@
     /// embedding mode only
     /// </summary>
     public bool embedding
-    {
-        get => Convert.ToBoolean(_embedding);
+    {
+        readonly get => Convert.ToBoolean(_embedding);
         set => _embedding = Convert.ToSByte(value);
     }
     private sbyte _embedding;

+5 -0  LLama/Native/LLamaFtype.cs

@@ -105,5 +105,10 @@
     /// </summary>
     /// <remarks>Benchmark@7B: 5.15GB, +0.0044 ppl</remarks>
     LLAMA_FTYPE_MOSTLY_Q6_K = 18,
+
+    /// <summary>
+    /// File type was not specified
+    /// </summary>
+    LLAMA_FTYPE_GUESSED = 1024
     }
 }
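
LLAMA_FTYPE_GUESSED appears to mirror the upstream llama.cpp value introduced alongside GGUF: when a model file does not record its quantization type, the loader guesses one. A small hypothetical handler:

    // Hypothetical helper: surface a guessed file type distinctly rather
    // than presenting it as data the model actually recorded.
    static string DescribeFtype(LLamaFtype ftype)
    {
        return ftype == LLamaFtype.LLAMA_FTYPE_GUESSED
            ? "not stored in model file (guessed by loader)"
            : ftype.ToString();
    }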

+3 -3  LLama/Native/NativeApi.cs

@@ -377,7 +377,7 @@ namespace LLama.Native
     /// <param name="model"></param>
     /// <returns></returns>
     [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-    public static extern int llama_n_vocab_from_model(SafeLlamaModelHandle model);
+    public static extern int llama_model_n_vocab(SafeLlamaModelHandle model);
 
     /// <summary>
     /// Get the size of the context window for the model
@@ -385,7 +385,7 @@
     /// <param name="model"></param>
     /// <returns></returns>
     [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-    public static extern int llama_n_ctx_from_model(SafeLlamaModelHandle model);
+    public static extern int llama_model_n_ctx(SafeLlamaModelHandle model);
 
     /// <summary>
     /// Get the dimension of embedding vectors from this model
@@ -393,7 +393,7 @@
     /// <param name="model"></param>
     /// <returns></returns>
     [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-    public static extern int llama_n_embd_from_model(SafeLlamaModelHandle model);
+    public static extern int llama_model_n_embd(SafeLlamaModelHandle model);
 
     /// <summary>
     /// Convert a single token into text
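
These renames track the upstream llama.cpp API change that landed with GGUF: the *_from_model accessors became llama_model_* functions. Only the export names change; the signatures stay the same. A sketch of calling them (acquisition of a valid handle elided):

    // Hedged sketch: 'model' is assumed to be an already-loaded, valid
    // SafeLlamaModelHandle.
    int vocabSize     = NativeApi.llama_model_n_vocab(model);
    int contextLength = NativeApi.llama_model_n_ctx(model);
    int embeddingDim  = NativeApi.llama_model_n_embd(model);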


+3 -3  LLama/Native/SafeLlamaModelHandle.cs

@@ -28,9 +28,9 @@ namespace LLama.Native
     internal SafeLlamaModelHandle(IntPtr handle)
         : base(handle)
     {
-        VocabCount = NativeApi.llama_n_vocab_from_model(this);
-        ContextSize = NativeApi.llama_n_ctx_from_model(this);
-        EmbeddingSize = NativeApi.llama_n_embd_from_model(this);
+        VocabCount = NativeApi.llama_model_n_vocab(this);
+        ContextSize = NativeApi.llama_model_n_ctx(this);
+        EmbeddingSize = NativeApi.llama_model_n_embd(this);
     }
 
     /// <inheritdoc />
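
Because the constructor captures all three values eagerly, downstream code can read them without touching native code again. A hypothetical consumer:

    // Hypothetical consumer (model loading elided): these properties were
    // populated once in the constructor above, so the reads are free.
    static void PrintModelInfo(SafeLlamaModelHandle model)
    {
        Console.WriteLine($"vocab size:     {model.VocabCount}");
        Console.WriteLine($"context window: {model.ContextSize}");
        Console.WriteLine($"embedding dim:  {model.EmbeddingSize}");
    }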


+0 -1972  LLama/runtimes/ggml-metal.metal
(file diff suppressed because it is too large)


BIN  LLama/runtimes/libllama-cuda11.dll
BIN  LLama/runtimes/libllama-cuda11.so
BIN  LLama/runtimes/libllama-cuda12.dll
BIN  LLama/runtimes/libllama-cuda12.so
BIN  LLama/runtimes/libllama-metal.dylib
BIN  LLama/runtimes/libllama.dylib
BIN  LLama/runtimes/libllama.so

