| @@ -1,15 +1,115 @@ | |||
| using LLama.Common; | |||
| using LLama.Abstractions; | |||
| namespace LLama.Web.Common | |||
| { | |||
| public class ModelOptions : ModelParams | |||
| public class ModelOptions : IModelParams | |||
| { | |||
| public ModelOptions() : base("", 512, 20, 1337, true, true, false, false, "", "", -1, 512, false, false) | |||
| { | |||
| } | |||
| public string Name { get; set; } | |||
| public int MaxInstances { get; set; } | |||
| } | |||
| /// <summary> | |||
| /// Model context size (n_ctx) | |||
| /// </summary> | |||
| public int ContextSize { get; set; } = 512; | |||
| /// <summary> | |||
| /// the GPU that is used for scratch and small tensors | |||
| /// </summary> | |||
| public int MainGpu { get; set; } = 0; | |||
| /// <summary> | |||
| /// if true, reduce VRAM usage at the cost of performance | |||
| /// </summary> | |||
| public bool LowVram { get; set; } = false; | |||
| /// <summary> | |||
| /// Number of layers to run in VRAM / GPU memory (n_gpu_layers) | |||
| /// </summary> | |||
| public int GpuLayerCount { get; set; } = 20; | |||
| /// <summary> | |||
| /// Seed for the random number generator (seed) | |||
| /// </summary> | |||
| public int Seed { get; set; } = 1686349486; | |||
| /// <summary> | |||
| /// Use f16 instead of f32 for memory kv (memory_f16) | |||
| /// </summary> | |||
| public bool UseFp16Memory { get; set; } = true; | |||
| /// <summary> | |||
| /// Use mmap for faster loads (use_mmap) | |||
| /// </summary> | |||
| public bool UseMemorymap { get; set; } = true; | |||
| /// <summary> | |||
| /// Use mlock to keep model in memory (use_mlock) | |||
| /// </summary> | |||
| public bool UseMemoryLock { get; set; } = false; | |||
| /// <summary> | |||
| /// Compute perplexity over the prompt (perplexity) | |||
| /// </summary> | |||
| public bool Perplexity { get; set; } = false; | |||
| /// <summary> | |||
| /// Model path (model) | |||
| /// </summary> | |||
| public string ModelPath { get; set; } | |||
| /// <summary> | |||
| /// model alias | |||
| /// </summary> | |||
| public string ModelAlias { get; set; } = "unknown"; | |||
| /// <summary> | |||
| /// lora adapter path (lora_adapter) | |||
| /// </summary> | |||
| public string LoraAdapter { get; set; } = string.Empty; | |||
| /// <summary> | |||
| /// base model path for the lora adapter (lora_base) | |||
| /// </summary> | |||
| public string LoraBase { get; set; } = string.Empty; | |||
| /// <summary> | |||
| /// Number of threads (-1 = autodetect) (n_threads) | |||
| /// </summary> | |||
| public int Threads { get; set; } = Math.Max(Environment.ProcessorCount / 2, 1); | |||
| /// <summary> | |||
| /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch) | |||
| /// </summary> | |||
| public int BatchSize { get; set; } = 512; | |||
| /// <summary> | |||
| /// Whether to convert eos to newline during the inference. | |||
| /// </summary> | |||
| public bool ConvertEosToNewLine { get; set; } = false; | |||
| /// <summary> | |||
| /// Whether to use embedding mode. (embedding) Note that if this is set to true, | |||
| /// The LLamaModel won't produce text response anymore. | |||
| /// </summary> | |||
| public bool EmbeddingMode { get; set; } = false; | |||
| /// <summary> | |||
| /// how split tensors should be distributed across GPUs | |||
| /// </summary> | |||
| public nint TensorSplits { get; set; } | |||
| /// <summary> | |||
| /// Grouped-Query Attention | |||
| /// </summary> | |||
| public int GroupedQueryAttention { get; set; } = 1; | |||
| /// <summary> | |||
| /// RMS Norm Epsilon | |||
| /// </summary> | |||
| public float RmsNormEpsilon { get; set; } = 5e-6f; | |||
| /// <summary> | |||
| /// RoPE base frequency | |||
| /// </summary> | |||
| public float RopeFrequencyBase { get; set; } = 10000.0f; | |||
| /// <summary> | |||
| /// RoPE frequency scaling factor | |||
| /// </summary> | |||
| public float RopeFrequencyScale { get; set; } = 1.0f; | |||
| /// <summary> | |||
| /// Use experimental mul_mat_q kernels | |||
| /// </summary> | |||
| public bool MulMatQ { get; set; } | |||
| } | |||
| } | |||
| @@ -0,0 +1,123 @@ | |||
| using System; | |||
| namespace LLama.Abstractions | |||
| { | |||
| public interface IModelParams | |||
| { | |||
| /// <summary> | |||
| /// Model context size (n_ctx) | |||
| /// </summary> | |||
| int ContextSize { get; set; } | |||
| /// <summary> | |||
| /// the GPU that is used for scratch and small tensors | |||
| /// </summary> | |||
| int MainGpu { get; set; } | |||
| /// <summary> | |||
| /// if true, reduce VRAM usage at the cost of performance | |||
| /// </summary> | |||
| bool LowVram { get; set; } | |||
| /// <summary> | |||
| /// Number of layers to run in VRAM / GPU memory (n_gpu_layers) | |||
| /// </summary> | |||
| int GpuLayerCount { get; set; } | |||
| /// <summary> | |||
| /// Seed for the random number generator (seed) | |||
| /// </summary> | |||
| int Seed { get; set; } | |||
| /// <summary> | |||
| /// Use f16 instead of f32 for memory kv (memory_f16) | |||
| /// </summary> | |||
| bool UseFp16Memory { get; set; } | |||
| /// <summary> | |||
| /// Use mmap for faster loads (use_mmap) | |||
| /// </summary> | |||
| bool UseMemorymap { get; set; } | |||
| /// <summary> | |||
| /// Use mlock to keep model in memory (use_mlock) | |||
| /// </summary> | |||
| bool UseMemoryLock { get; set; } | |||
| /// <summary> | |||
| /// Compute perplexity over the prompt (perplexity) | |||
| /// </summary> | |||
| bool Perplexity { get; set; } | |||
| /// <summary> | |||
| /// Model path (model) | |||
| /// </summary> | |||
| string ModelPath { get; set; } | |||
| /// <summary> | |||
| /// model alias | |||
| /// </summary> | |||
| string ModelAlias { get; set; } | |||
| /// <summary> | |||
| /// lora adapter path (lora_adapter) | |||
| /// </summary> | |||
| string LoraAdapter { get; set; } | |||
| /// <summary> | |||
| /// base model path for the lora adapter (lora_base) | |||
| /// </summary> | |||
| string LoraBase { get; set; } | |||
| /// <summary> | |||
| /// Number of threads (-1 = autodetect) (n_threads) | |||
| /// </summary> | |||
| int Threads { get; set; } | |||
| /// <summary> | |||
| /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch) | |||
| /// </summary> | |||
| int BatchSize { get; set; } | |||
| /// <summary> | |||
| /// Whether to convert eos to newline during the inference. | |||
| /// </summary> | |||
| bool ConvertEosToNewLine { get; set; } | |||
| /// <summary> | |||
| /// Whether to use embedding mode. (embedding) Note that if this is set to true, | |||
| /// The LLamaModel won't produce text response anymore. | |||
| /// </summary> | |||
| bool EmbeddingMode { get; set; } | |||
| /// <summary> | |||
| /// how split tensors should be distributed across GPUs | |||
| /// </summary> | |||
| nint TensorSplits { get; set; } | |||
| /// <summary> | |||
| /// Grouped-Query Attention | |||
| /// </summary> | |||
| int GroupedQueryAttention { get; set; } | |||
| /// <summary> | |||
| /// RMS Norm Epsilon | |||
| /// </summary> | |||
| float RmsNormEpsilon { get; set; } | |||
| /// <summary> | |||
| /// RoPE base frequency | |||
| /// </summary> | |||
| float RopeFrequencyBase { get; set; } | |||
| /// <summary> | |||
| /// RoPE frequency scaling factor | |||
| /// </summary> | |||
| float RopeFrequencyScale { get; set; } | |||
| /// <summary> | |||
| /// Use experimental mul_mat_q kernels | |||
| /// </summary> | |||
| bool MulMatQ { get; set; } | |||
| } | |||
| } | |||
| @@ -1,4 +1,5 @@ | |||
| using System; | |||
| using LLama.Abstractions; | |||
| using System; | |||
| using System.Collections.Generic; | |||
| using System.Text; | |||
| @@ -7,7 +8,7 @@ namespace LLama.Common | |||
| /// <summary> | |||
| /// The parameters for initializing a LLama model. | |||
| /// </summary> | |||
| public class ModelParams | |||
| public class ModelParams : IModelParams | |||
| { | |||
| /// <summary> | |||
| /// Model context size (n_ctx) | |||
| @@ -86,28 +87,59 @@ namespace LLama.Common | |||
| /// </summary> | |||
| public nint TensorSplits { get; set; } | |||
| /// <summary> | |||
| /// | |||
| /// </summary> | |||
| /// <param name="modelPath">The model path.</param> | |||
| /// <param name="contextSize">Model context size (n_ctx)</param> | |||
| /// <param name="gpuLayerCount">Number of layers to run in VRAM / GPU memory (n_gpu_layers)</param> | |||
| /// <param name="seed">Seed for the random number generator (seed)</param> | |||
| /// <param name="useFp16Memory">Whether to use f16 instead of f32 for memory kv (memory_f16)</param> | |||
| /// <param name="useMemorymap">Whether to use mmap for faster loads (use_mmap)</param> | |||
| /// <param name="useMemoryLock">Whether to use mlock to keep model in memory (use_mlock)</param> | |||
| /// <param name="perplexity">Whether to compute perplexity over the prompt (perplexity)</param> | |||
| /// <param name="loraAdapter">Lora adapter path (lora_adapter)</param> | |||
| /// <param name="loraBase">Base model path for the lora adapter (lora_base)</param> | |||
| /// <param name="threads">Number of threads (-1 = autodetect) (n_threads)</param> | |||
| /// <param name="batchSize">Batch size for prompt processing (must be >=32 to use BLAS) (n_batch)</param> | |||
| /// <param name="convertEosToNewLine">Whether to convert eos to newline during the inference.</param> | |||
| /// <param name="embeddingMode">Whether to use embedding mode. (embedding) Note that if this is set to true, The LLamaModel won't produce text response anymore.</param> | |||
| public ModelParams(string modelPath, int contextSize = 512, int gpuLayerCount = 20, | |||
| /// <summary> | |||
| /// Grouped-Query Attention | |||
| /// </summary> | |||
| public int GroupedQueryAttention { get; set; } = 1; | |||
| /// <summary> | |||
| /// RMS Norm Epsilon | |||
| /// </summary> | |||
| public float RmsNormEpsilon { get; set; } = 5e-6f; | |||
| /// <summary> | |||
| /// RoPE base frequency | |||
| /// </summary> | |||
| public float RopeFrequencyBase { get; set; } = 10000.0f; | |||
| /// <summary> | |||
| /// RoPE frequency scaling factor | |||
| /// </summary> | |||
| public float RopeFrequencyScale { get; set; } = 1.0f; | |||
| /// <summary> | |||
| /// Use experimental mul_mat_q kernels | |||
| /// </summary> | |||
| public bool MulMatQ { get; set; } | |||
| /// <summary> | |||
| /// | |||
| /// </summary> | |||
| /// <param name="modelPath">The model path.</param> | |||
| /// <param name="contextSize">Model context size (n_ctx)</param> | |||
| /// <param name="gpuLayerCount">Number of layers to run in VRAM / GPU memory (n_gpu_layers)</param> | |||
| /// <param name="seed">Seed for the random number generator (seed)</param> | |||
| /// <param name="useFp16Memory">Whether to use f16 instead of f32 for memory kv (memory_f16)</param> | |||
| /// <param name="useMemorymap">Whether to use mmap for faster loads (use_mmap)</param> | |||
| /// <param name="useMemoryLock">Whether to use mlock to keep model in memory (use_mlock)</param> | |||
| /// <param name="perplexity">Whether to compute perplexity over the prompt (perplexity)</param> | |||
| /// <param name="loraAdapter">Lora adapter path (lora_adapter)</param> | |||
| /// <param name="loraBase">Base model path for the lora adapter (lora_base)</param> | |||
| /// <param name="threads">Number of threads (-1 = autodetect) (n_threads)</param> | |||
| /// <param name="batchSize">Batch size for prompt processing (must be >=32 to use BLAS) (n_batch)</param> | |||
| /// <param name="convertEosToNewLine">Whether to convert eos to newline during the inference.</param> | |||
| /// <param name="embeddingMode">Whether to use embedding mode. (embedding) Note that if this is set to true, The LLamaModel won't produce text response anymore.</param> | |||
| /// <param name="gqa">Grouped-Query Attention</param> | |||
| /// <param name="rmsNormEps">RMS Norm Epsilon</param> | |||
| /// <param name="rope_freq_base">RoPE base frequency.</param> | |||
| /// <param name="rope_freq_scale">RoPE frequency scaling factor</param> | |||
| /// <param name="muMatQ">Use experimental mul_mat_q kernels</param> | |||
| public ModelParams(string modelPath, int contextSize = 512, int gpuLayerCount = 20, | |||
| int seed = 1337, bool useFp16Memory = true, | |||
| bool useMemorymap = true, bool useMemoryLock = false, bool perplexity = false, | |||
| string loraAdapter = "", string loraBase = "", int threads = -1, int batchSize = 512, | |||
| bool convertEosToNewLine = false, bool embeddingMode = false) | |||
| bool convertEosToNewLine = false, bool embeddingMode = false, | |||
| int gqa = 1, float rmsNormEps = 5e-6f, float rope_freq_base = 10000.0f, float rope_freq_scale = 1f, bool muMatQ = false) | |||
| { | |||
| ContextSize = contextSize; | |||
| GpuLayerCount = gpuLayerCount; | |||
| @@ -123,6 +155,11 @@ namespace LLama.Common | |||
| BatchSize = batchSize; | |||
| ConvertEosToNewLine = convertEosToNewLine; | |||
| EmbeddingMode = embeddingMode; | |||
| } | |||
| GroupedQueryAttention = gqa; | |||
| RmsNormEpsilon = rmsNormEps; | |||
| RopeFrequencyBase = rope_freq_base; | |||
| RopeFrequencyScale = rope_freq_scale; | |||
| MulMatQ = muMatQ; | |||
| } | |||
| } | |||
| } | |||
| @@ -4,7 +4,7 @@ using System.Collections.Generic; | |||
| using System.Text; | |||
| using LLama.Exceptions; | |||
| using System.Linq; | |||
| using LLama.Common; | |||
| using LLama.Abstractions; | |||
| namespace LLama | |||
| { | |||
| @@ -28,7 +28,7 @@ namespace LLama | |||
| /// | |||
| /// </summary> | |||
| /// <param name="params"></param> | |||
| public LLamaEmbedder(ModelParams @params) | |||
| public LLamaEmbedder(IModelParams @params) | |||
| { | |||
| @params.EmbeddingMode = true; | |||
| _ctx = Utils.InitLLamaContextFromModelParams(@params); | |||
| @@ -10,6 +10,7 @@ using LLama.Common; | |||
| using System.Runtime.InteropServices; | |||
| using LLama.Extensions; | |||
| using Microsoft.Win32.SafeHandles; | |||
| using LLama.Abstractions; | |||
| namespace LLama | |||
| { | |||
| @@ -30,7 +31,7 @@ namespace LLama | |||
| /// <summary> | |||
| /// The model params set for this model. | |||
| /// </summary> | |||
| public ModelParams Params { get; set; } | |||
| public IModelParams Params { get; set; } | |||
| /// <summary> | |||
| /// The native handle, which is used to be passed to the native APIs. Please avoid using it | |||
| /// unless you know what is the usage of the Native API. | |||
| @@ -47,7 +48,7 @@ namespace LLama | |||
| /// <param name="Params">Model params.</param> | |||
| /// <param name="encoding">Encoding to deal with text input.</param> | |||
| /// <param name="logger">The logger.</param> | |||
| public LLamaModel(ModelParams Params, string encoding = "UTF-8", ILLamaLogger? logger = null) | |||
| public LLamaModel(IModelParams Params, string encoding = "UTF-8", ILLamaLogger? logger = null) | |||
| { | |||
| _logger = logger; | |||
| this.Params = Params; | |||
| @@ -1,4 +1,4 @@ | |||
| using LLama.Common; | |||
| using LLama.Abstractions; | |||
| using System; | |||
| using System.Collections.Generic; | |||
| using System.Text; | |||
| @@ -19,7 +19,7 @@ namespace LLama | |||
| /// </summary> | |||
| /// <param name="Params"></param> | |||
| /// <param name="encoding"></param> | |||
| public ResettableLLamaModel(ModelParams Params, string encoding = "UTF-8") : base(Params, encoding) | |||
| public ResettableLLamaModel(IModelParams Params, string encoding = "UTF-8") : base(Params, encoding) | |||
| { | |||
| OriginalState = GetState(); | |||
| } | |||
| @@ -1,4 +1,4 @@ | |||
| using LLama.Common; | |||
| using LLama.Abstractions; | |||
| using LLama.Exceptions; | |||
| using LLama.Native; | |||
| using System; | |||
| @@ -13,7 +13,7 @@ namespace LLama | |||
| using llama_token = Int32; | |||
| internal static class Utils | |||
| { | |||
| public static SafeLLamaContextHandle InitLLamaContextFromModelParams(ModelParams @params) | |||
| public static SafeLLamaContextHandle InitLLamaContextFromModelParams(IModelParams @params) | |||
| { | |||
| var lparams = NativeApi.llama_context_default_params(); | |||
| @@ -28,6 +28,11 @@ namespace LLama | |||
| lparams.logits_all = @params.Perplexity; | |||
| lparams.embedding = @params.EmbeddingMode; | |||
| lparams.low_vram = @params.LowVram; | |||
| lparams.n_gqa = @params.GroupedQueryAttention; | |||
| lparams.rms_norm_eps = @params.RmsNormEpsilon; | |||
| lparams.rope_freq_base = @params.RopeFrequencyBase; | |||
| lparams.rope_freq_scale = @params.RopeFrequencyScale; | |||
| lparams.mul_mat_q = @params.MulMatQ; | |||
| /* | |||
| if (@params.TensorSplits.Length != 1) | |||