
Initial changes required for GGUF support

tags/v0.5.1
Martin Evans · 2 years ago
parent commit 2056078aef
19 changed files with 76 additions and 2043 deletions
 1. +24 -0     LLama.Examples/LLama.Examples.csproj
 2. +24 -0     LLama.Unittest/LLama.Unittest.csproj
 3. +0  -2     LLama.Unittest/ModelsParamsTests.cs
 4. +0  -10    LLama.Web/Common/ModelOptions.cs
 5. +0  -10    LLama/Abstractions/IModelParams.cs
 6. +1  -15    LLama/Common/ModelParams.cs
 7. +0  -2     LLama/Extensions/IModelParamsExtensions.cs
 8. +16 -26    LLama/Native/LLamaContextParams.cs
 9. +5  -0     LLama/Native/LLamaFtype.cs
10. +3  -3     LLama/Native/NativeApi.cs
11. +3  -3     LLama/Native/SafeLlamaModelHandle.cs
12. +0  -1972  LLama/runtimes/ggml-metal.metal
13. BIN        LLama/runtimes/libllama-cuda11.dll
14. BIN        LLama/runtimes/libllama-cuda11.so
15. BIN        LLama/runtimes/libllama-cuda12.dll
16. BIN        LLama/runtimes/libllama-cuda12.so
17. BIN        LLama/runtimes/libllama-metal.dylib
18. BIN        LLama/runtimes/libllama.dylib
19. BIN        LLama/runtimes/libllama.so

+24 -0  LLama.Examples/LLama.Examples.csproj

@@ -52,6 +52,30 @@
     <None Update="Assets\reason-act.txt">
       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
     </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/ggml-metal.metal">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda11.dll">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda11.so">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda12.dll">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda12.so">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-metal.dylib">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama.dylib">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama.so">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
   </ItemGroup>
 
 </Project>

+24 -0  LLama.Unittest/LLama.Unittest.csproj

@@ -37,6 +37,30 @@
   </ItemGroup>
 
   <ItemGroup>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/ggml-metal.metal">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda11.dll">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda11.so">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda12.dll">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda12.so">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-metal.dylib">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama.dylib">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
+    <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama.so">
+      <CopyToOutputDirectory>Never</CopyToOutputDirectory>
+    </None>
     <None Update="Models\llama-2-7b-chat.ggmlv3.q3_K_S.bin">
       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
     </None>


+0 -2  LLama.Unittest/ModelsParamsTests.cs

@@ -14,7 +14,6 @@ namespace LLama.Unittest
     BatchSize = 17,
     ContextSize = 42,
     LoraAdapter = "adapter",
-    GroupedQueryAttention = 7,
     Seed = 42,
     GpuLayerCount = 111
 };
@@ -33,7 +32,6 @@ namespace LLama.Unittest
     BatchSize = 17,
     ContextSize = 42,
     LoraAdapter = "adapter",
-    GroupedQueryAttention = 7,
     Seed = 42,
     GpuLayerCount = 111
 };


+0 -10  LLama.Web/Common/ModelOptions.cs

@@ -88,16 +88,6 @@ namespace LLama.Web.Common
     /// </summary>
     public float[] TensorSplits { get; set; }
 
-    /// <summary>
-    /// Grouped-Query Attention
-    /// </summary>
-    public int GroupedQueryAttention { get; set; } = 1;
-
-    /// <summary>
-    /// RMS Norm Epsilon
-    /// </summary>
-    public float RmsNormEpsilon { get; set; } = 5e-6f;
-
     /// <summary>
     /// RoPE base frequency
     /// </summary>


+0 -10  LLama/Abstractions/IModelParams.cs

@@ -98,16 +98,6 @@ namespace LLama.Abstractions
     /// </summary>
     float[]? TensorSplits { get; set; }
 
-    /// <summary>
-    /// Grouped-Query Attention
-    /// </summary>
-    int GroupedQueryAttention { get; set; }
-
-    /// <summary>
-    /// RMS Norm Epsilon
-    /// </summary>
-    float RmsNormEpsilon { get; set; }
-
     /// <summary>
     /// RoPE base frequency
     /// </summary>


+1 -15  LLama/Common/ModelParams.cs

@@ -89,16 +89,6 @@ namespace LLama.Common
     /// </summary>
     public float[]? TensorSplits { get; set; }
 
-    /// <summary>
-    /// Grouped-Query Attention
-    /// </summary>
-    public int GroupedQueryAttention { get; set; } = 1;
-
-    /// <summary>
-    /// RMS Norm Epsilon
-    /// </summary>
-    public float RmsNormEpsilon { get; set; } = 5e-6f;
-
     /// <summary>
     /// RoPE base frequency
     /// </summary>
@@ -153,8 +143,6 @@ namespace LLama.Common
     /// <param name="batchSize">Batch size for prompt processing (must be >=32 to use BLAS) (n_batch)</param>
     /// <param name="convertEosToNewLine">Whether to convert eos to newline during the inference.</param>
     /// <param name="embeddingMode">Whether to use embedding mode. (embedding) Note that if this is set to true, The LLamaModel won't produce text response anymore.</param>
-    /// <param name="groupedQueryAttention">Grouped-Query Attention</param>
-    /// <param name="rmsNormEpsilon">RMS Norm Epsilon</param>
     /// <param name="ropeFrequencyBase">RoPE base frequency.</param>
     /// <param name="ropeFrequencyScale">RoPE frequency scaling factor</param>
     /// <param name="mulMatQ">Use experimental mul_mat_q kernels</param>
@@ -165,7 +153,7 @@ namespace LLama.Common
     bool useMemorymap = true, bool useMemoryLock = false, bool perplexity = false,
     string loraAdapter = "", string loraBase = "", int threads = -1, int batchSize = 512,
     bool convertEosToNewLine = false, bool embeddingMode = false,
-    int groupedQueryAttention = 1, float rmsNormEpsilon = 5e-6f, float ropeFrequencyBase = 10000.0f, float ropeFrequencyScale = 1f, bool mulMatQ = false,
+    float ropeFrequencyBase = 10000.0f, float ropeFrequencyScale = 1f, bool mulMatQ = false,
     string encoding = "UTF-8")
 {
     ContextSize = contextSize;
@@ -182,8 +170,6 @@ namespace LLama.Common
     BatchSize = batchSize;
     ConvertEosToNewLine = convertEosToNewLine;
     EmbeddingMode = embeddingMode;
-    GroupedQueryAttention = groupedQueryAttention;
-    RmsNormEpsilon = rmsNormEpsilon;
     RopeFrequencyBase = ropeFrequencyBase;
     RopeFrequencyScale = ropeFrequencyScale;
     MulMatQ = mulMatQ;
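
A note on what this removes: grouped-query attention and the RMS-norm epsilon are no longer caller-supplied knobs, because GGUF model files record them in their own hyperparameters. A hedged sketch of constructing ModelParams after this change (model path and argument values hypothetical; named arguments taken from the signature above):

    // Hypothetical usage after this commit: groupedQueryAttention and
    // rmsNormEpsilon are gone from the argument list; a GGUF model file
    // supplies those values itself.
    var parameters = new ModelParams(
        "models/llama-2-7b.gguf",     // hypothetical model path
        batchSize: 512,
        ropeFrequencyBase: 10000.0f,
        ropeFrequencyScale: 1f,
        mulMatQ: false);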


+0 -2  LLama/Extensions/IModelParamsExtensions.cs

@@ -39,8 +39,6 @@ namespace LLama.Extensions
     result.logits_all = @params.Perplexity;
     result.embedding = @params.EmbeddingMode;
     result.low_vram = @params.LowVram;
-    result.n_gqa = @params.GroupedQueryAttention;
-    result.rms_norm_eps = @params.RmsNormEpsilon;
     result.rope_freq_base = @params.RopeFrequencyBase;
     result.rope_freq_scale = @params.RopeFrequencyScale;
     result.mul_mat_q = @params.MulMatQ;
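
For orientation, a minimal sketch of how such an extension now maps the high-level parameters onto the native struct (method name and shape assumed, not the library's exact signature):

    // Hypothetical mapping helper: n_gqa and rms_norm_eps no longer exist
    // on LLamaContextParams, so only the remaining knobs are copied across.
    public static void CopyContextParams(IModelParams @params, ref LLamaContextParams result)
    {
        result.logits_all = @params.Perplexity;
        result.embedding = @params.EmbeddingMode;
        result.low_vram = @params.LowVram;
        result.rope_freq_base = @params.RopeFrequencyBase;
        result.rope_freq_scale = @params.RopeFrequencyScale;
        result.mul_mat_q = @params.MulMatQ;
    }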


+16 -26  LLama/Native/LLamaContextParams.cs

@@ -31,16 +31,6 @@ namespace LLama.Native
     /// </summary>
     public int n_batch;
 
-    /// <summary>
-    /// grouped-query attention (TEMP - will be moved to model hparams)
-    /// </summary>
-    public int n_gqa;
-
-    /// <summary>
-    /// rms norm epsilon (TEMP - will be moved to model hparams)
-    /// </summary>
-    public float rms_norm_eps;
-
     /// <summary>
     /// number of layers to store in VRAM
     /// </summary>
@@ -82,8 +72,8 @@
     /// if true, reduce VRAM usage at the cost of performance
     /// </summary>
     public bool low_vram
-    {
-        get => Convert.ToBoolean(_low_vram);
+    {
+        readonly get => Convert.ToBoolean(_low_vram);
         set => _low_vram = Convert.ToSByte(value);
     }
     private sbyte _low_vram;
@@ -92,8 +82,8 @@
     /// if true, use experimental mul_mat_q kernels
     /// </summary>
     public bool mul_mat_q
-    {
-        get => Convert.ToBoolean(_mul_mat_q);
+    {
+        readonly get => Convert.ToBoolean(_mul_mat_q);
         set => _mul_mat_q = Convert.ToSByte(value);
     }
     private sbyte _mul_mat_q;
@@ -102,8 +92,8 @@
     /// use fp16 for KV cache
     /// </summary>
     public bool f16_kv
-    {
-        get => Convert.ToBoolean(_f16_kv);
+    {
+        readonly get => Convert.ToBoolean(_f16_kv);
         set => _f16_kv = Convert.ToSByte(value);
     }
     private sbyte _f16_kv;
@@ -112,8 +102,8 @@
     /// the llama_eval() call computes all logits, not just the last one
     /// </summary>
     public bool logits_all
-    {
-        get => Convert.ToBoolean(_logits_all);
+    {
+        readonly get => Convert.ToBoolean(_logits_all);
         set => _logits_all = Convert.ToSByte(value);
     }
     private sbyte _logits_all;
@@ -122,8 +112,8 @@
     /// only load the vocabulary, no weights
     /// </summary>
     public bool vocab_only
-    {
-        get => Convert.ToBoolean(_vocab_only);
+    {
+        readonly get => Convert.ToBoolean(_vocab_only);
         set => _vocab_only = Convert.ToSByte(value);
     }
     private sbyte _vocab_only;
@@ -132,8 +122,8 @@
     /// use mmap if possible
     /// </summary>
     public bool use_mmap
-    {
-        get => Convert.ToBoolean(_use_mmap);
+    {
+        readonly get => Convert.ToBoolean(_use_mmap);
         set => _use_mmap = Convert.ToSByte(value);
     }
     private sbyte _use_mmap;
@@ -142,8 +132,8 @@
     /// force system to keep model in RAM
     /// </summary>
     public bool use_mlock
-    {
-        get => Convert.ToBoolean(_use_mlock);
+    {
+        readonly get => Convert.ToBoolean(_use_mlock);
         set => _use_mlock = Convert.ToSByte(value);
     }
     private sbyte _use_mlock;
@@ -152,8 +142,8 @@
     /// embedding mode only
     /// </summary>
     public bool embedding
-    {
-        get => Convert.ToBoolean(_embedding);
+    {
+        readonly get => Convert.ToBoolean(_embedding);
         set => _embedding = Convert.ToSByte(value);
     }
     private sbyte _embedding;

+5 -0  LLama/Native/LLamaFtype.cs

@@ -105,5 +105,10 @@
     /// </summary>
     /// <remarks>Benchmark@7B: 5.15GB, +0.0044 ppl</remarks>
     LLAMA_FTYPE_MOSTLY_Q6_K = 18,
+
+    /// <summary>
+    /// File type was not specified
+    /// </summary>
+    LLAMA_FTYPE_GUESSED = 1024
     }
 }
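
LLAMA_FTYPE_GUESSED appears to mirror the upstream llama.cpp value introduced alongside GGUF: when a model file does not record its quantization type, the loader guesses one. A small hypothetical handler:

    // Hypothetical helper: surface a guessed file type distinctly rather
    // than presenting it as data the model actually recorded.
    static string DescribeFtype(LLamaFtype ftype)
    {
        return ftype == LLamaFtype.LLAMA_FTYPE_GUESSED
            ? "not stored in model file (guessed by loader)"
            : ftype.ToString();
    }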

+3 -3  LLama/Native/NativeApi.cs

@@ -377,7 +377,7 @@ namespace LLama.Native
     /// <param name="model"></param>
     /// <returns></returns>
     [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-    public static extern int llama_n_vocab_from_model(SafeLlamaModelHandle model);
+    public static extern int llama_model_n_vocab(SafeLlamaModelHandle model);
 
     /// <summary>
     /// Get the size of the context window for the model
@@ -385,7 +385,7 @@
     /// <param name="model"></param>
     /// <returns></returns>
     [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-    public static extern int llama_n_ctx_from_model(SafeLlamaModelHandle model);
+    public static extern int llama_model_n_ctx(SafeLlamaModelHandle model);
 
     /// <summary>
     /// Get the dimension of embedding vectors from this model
@@ -393,7 +393,7 @@
     /// <param name="model"></param>
     /// <returns></returns>
     [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-    public static extern int llama_n_embd_from_model(SafeLlamaModelHandle model);
+    public static extern int llama_model_n_embd(SafeLlamaModelHandle model);
 
     /// <summary>
     /// Convert a single token into text
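
These renames track the upstream llama.cpp API change that landed with GGUF: the *_from_model accessors became llama_model_* functions. Only the export names change; the signatures stay the same. A sketch of calling them (acquisition of a valid handle elided):

    // Hedged sketch: 'model' is assumed to be an already-loaded, valid
    // SafeLlamaModelHandle.
    int vocabSize     = NativeApi.llama_model_n_vocab(model);
    int contextLength = NativeApi.llama_model_n_ctx(model);
    int embeddingDim  = NativeApi.llama_model_n_embd(model);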


+3 -3  LLama/Native/SafeLlamaModelHandle.cs

@@ -28,9 +28,9 @@ namespace LLama.Native
     internal SafeLlamaModelHandle(IntPtr handle)
         : base(handle)
     {
-        VocabCount = NativeApi.llama_n_vocab_from_model(this);
-        ContextSize = NativeApi.llama_n_ctx_from_model(this);
-        EmbeddingSize = NativeApi.llama_n_embd_from_model(this);
+        VocabCount = NativeApi.llama_model_n_vocab(this);
+        ContextSize = NativeApi.llama_model_n_ctx(this);
+        EmbeddingSize = NativeApi.llama_model_n_embd(this);
     }
 
     /// <inheritdoc />
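
Because the constructor captures all three values eagerly, downstream code can read them without touching native code again. A hypothetical consumer:

    // Hypothetical consumer (model loading elided): these properties were
    // populated once in the constructor above, so the reads are free.
    static void PrintModelInfo(SafeLlamaModelHandle model)
    {
        Console.WriteLine($"vocab size:     {model.VocabCount}");
        Console.WriteLine($"context window: {model.ContextSize}");
        Console.WriteLine($"embedding dim:  {model.EmbeddingSize}");
    }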


+0 -1972  LLama/runtimes/ggml-metal.metal
(file diff suppressed because it is too large)


BIN  LLama/runtimes/libllama-cuda11.dll
BIN  LLama/runtimes/libllama-cuda11.so
BIN  LLama/runtimes/libllama-cuda12.dll
BIN  LLama/runtimes/libllama-cuda12.so
BIN  LLama/runtimes/libllama-metal.dylib
BIN  LLama/runtimes/libllama.dylib
BIN  LLama/runtimes/libllama.so

