diff --git a/LLama.Web/Common/ModelOptions.cs b/LLama.Web/Common/ModelOptions.cs
index 4f8f97e6..3d88d6b3 100644
--- a/LLama.Web/Common/ModelOptions.cs
+++ b/LLama.Web/Common/ModelOptions.cs
@@ -1,15 +1,115 @@
-using LLama.Common;
+using LLama.Abstractions;
 
 namespace LLama.Web.Common
 {
-    public class ModelOptions : ModelParams
+    public class ModelOptions : IModelParams
     {
-        public ModelOptions() : base("", 512, 20, 1337, true, true, false, false, "", "", -1, 512, false, false)
-        {
-        }
-
+        public string Name { get; set; }
         public int MaxInstances { get; set; }
-    }
+
+        /// <summary>
+        /// Model context size (n_ctx)
+        /// </summary>
+        public int ContextSize { get; set; } = 512;
+        /// <summary>
+        /// the GPU that is used for scratch and small tensors
+        /// </summary>
+        public int MainGpu { get; set; } = 0;
+        /// <summary>
+        /// if true, reduce VRAM usage at the cost of performance
+        /// </summary>
+        public bool LowVram { get; set; } = false;
+        /// <summary>
+        /// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
+        /// </summary>
+        public int GpuLayerCount { get; set; } = 20;
+        /// <summary>
+        /// Seed for the random number generator (seed)
+        /// </summary>
+        public int Seed { get; set; } = 1686349486;
+        /// <summary>
+        /// Use f16 instead of f32 for memory kv (memory_f16)
+        /// </summary>
+        public bool UseFp16Memory { get; set; } = true;
+        /// <summary>
+        /// Use mmap for faster loads (use_mmap)
+        /// </summary>
+        public bool UseMemorymap { get; set; } = true;
+        /// <summary>
+        /// Use mlock to keep model in memory (use_mlock)
+        /// </summary>
+        public bool UseMemoryLock { get; set; } = false;
+        /// <summary>
+        /// Compute perplexity over the prompt (perplexity)
+        /// </summary>
+        public bool Perplexity { get; set; } = false;
+        /// <summary>
+        /// Model path (model)
+        /// </summary>
+        public string ModelPath { get; set; }
+        /// <summary>
+        /// model alias
+        /// </summary>
+        public string ModelAlias { get; set; } = "unknown";
+        /// <summary>
+        /// lora adapter path (lora_adapter)
+        /// </summary>
+        public string LoraAdapter { get; set; } = string.Empty;
+        /// <summary>
+        /// base model path for the lora adapter (lora_base)
+        /// </summary>
+        public string LoraBase { get; set; } = string.Empty;
+        /// <summary>
+        /// Number of threads (-1 = autodetect) (n_threads)
+        /// </summary>
+        public int Threads { get; set; } = Math.Max(Environment.ProcessorCount / 2, 1);
+        /// <summary>
+        /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
+        /// </summary>
+        public int BatchSize { get; set; } = 512;
+
+        /// <summary>
+        /// Whether to convert eos to newline during the inference.
+        /// </summary>
+        public bool ConvertEosToNewLine { get; set; } = false;
+
+        /// <summary>
+        /// Whether to use embedding mode. (embedding) Note that if this is set to true,
+        /// The LLamaModel won't produce text response anymore.
+        /// </summary>
+        public bool EmbeddingMode { get; set; } = false;
+
+        /// <summary>
+        /// how split tensors should be distributed across GPUs
+        /// </summary>
+        public nint TensorSplits { get; set; }
+
+        /// <summary>
+        /// Grouped-Query Attention
+        /// </summary>
+        public int GroupedQueryAttention { get; set; } = 1;
+
+        /// <summary>
+        /// RMS Norm Epsilon
+        /// </summary>
+        public float RmsNormEpsilon { get; set; } = 5e-6f;
+
+        /// <summary>
+        /// RoPE base frequency
+        /// </summary>
+        public float RopeFrequencyBase { get; set; } = 10000.0f;
+
+        /// <summary>
+        /// RoPE frequency scaling factor
+        /// </summary>
+        public float RopeFrequencyScale { get; set; } = 1.0f;
+
+        /// <summary>
+        /// Use experimental mul_mat_q kernels
+        /// </summary>
+        public bool MulMatQ { get; set; }
+
+    }
 }
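Since `ModelOptions` now implements `IModelParams` directly instead of inheriting from `ModelParams`, a web-layer options object bound from configuration can be handed straight to the model types that accept the interface (see the `LLamaModel` change below). A minimal sketch; the name, path, and sizes are placeholders, not values prescribed by this PR:

```csharp
using LLama;
using LLama.Web.Common;

// Hypothetical usage; the alias and model path are illustrative.
var options = new ModelOptions
{
    Name = "wizard-7b",
    ModelPath = "/models/wizard-7b.bin",
    ContextSize = 1024,
    GpuLayerCount = 20,
    MaxInstances = 2
};

// LLamaModel now accepts any IModelParams implementation.
var model = new LLamaModel(options);
```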
diff --git a/LLama/Abstractions/IModelParams.cs b/LLama/Abstractions/IModelParams.cs
new file mode 100644
index 00000000..40c5432b
--- /dev/null
+++ b/LLama/Abstractions/IModelParams.cs
@@ -0,0 +1,123 @@
+using System;
+
+namespace LLama.Abstractions
+{
+    public interface IModelParams
+    {
+        /// <summary>
+        /// Model context size (n_ctx)
+        /// </summary>
+        int ContextSize { get; set; }
+
+        /// <summary>
+        /// the GPU that is used for scratch and small tensors
+        /// </summary>
+        int MainGpu { get; set; }
+
+        /// <summary>
+        /// if true, reduce VRAM usage at the cost of performance
+        /// </summary>
+        bool LowVram { get; set; }
+
+        /// <summary>
+        /// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
+        /// </summary>
+        int GpuLayerCount { get; set; }
+
+        /// <summary>
+        /// Seed for the random number generator (seed)
+        /// </summary>
+        int Seed { get; set; }
+
+        /// <summary>
+        /// Use f16 instead of f32 for memory kv (memory_f16)
+        /// </summary>
+        bool UseFp16Memory { get; set; }
+
+        /// <summary>
+        /// Use mmap for faster loads (use_mmap)
+        /// </summary>
+        bool UseMemorymap { get; set; }
+
+        /// <summary>
+        /// Use mlock to keep model in memory (use_mlock)
+        /// </summary>
+        bool UseMemoryLock { get; set; }
+
+        /// <summary>
+        /// Compute perplexity over the prompt (perplexity)
+        /// </summary>
+        bool Perplexity { get; set; }
+
+        /// <summary>
+        /// Model path (model)
+        /// </summary>
+        string ModelPath { get; set; }
+
+        /// <summary>
+        /// model alias
+        /// </summary>
+        string ModelAlias { get; set; }
+
+        /// <summary>
+        /// lora adapter path (lora_adapter)
+        /// </summary>
+        string LoraAdapter { get; set; }
+
+        /// <summary>
+        /// base model path for the lora adapter (lora_base)
+        /// </summary>
+        string LoraBase { get; set; }
+
+        /// <summary>
+        /// Number of threads (-1 = autodetect) (n_threads)
+        /// </summary>
+        int Threads { get; set; }
+
+        /// <summary>
+        /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
+        /// </summary>
+        int BatchSize { get; set; }
+
+        /// <summary>
+        /// Whether to convert eos to newline during the inference.
+        /// </summary>
+        bool ConvertEosToNewLine { get; set; }
+
+        /// <summary>
+        /// Whether to use embedding mode. (embedding) Note that if this is set to true,
+        /// The LLamaModel won't produce text response anymore.
+        /// </summary>
+        bool EmbeddingMode { get; set; }
+
+        /// <summary>
+        /// how split tensors should be distributed across GPUs
+        /// </summary>
+        nint TensorSplits { get; set; }
+
+        /// <summary>
+        /// Grouped-Query Attention
+        /// </summary>
+        int GroupedQueryAttention { get; set; }
+
+        /// <summary>
+        /// RMS Norm Epsilon
+        /// </summary>
+        float RmsNormEpsilon { get; set; }
+
+        /// <summary>
+        /// RoPE base frequency
+        /// </summary>
+        float RopeFrequencyBase { get; set; }
+
+        /// <summary>
+        /// RoPE frequency scaling factor
+        /// </summary>
+        float RopeFrequencyScale { get; set; }
+
+        /// <summary>
+        /// Use experimental mul_mat_q kernels
+        /// </summary>
+        bool MulMatQ { get; set; }
+    }
+}
\ No newline at end of file
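The interface is deliberately plain: every member is a read/write property, so implementing it outside of `LLama.Common` is mechanical. A sketch of a standalone implementation; the class name is made up, and the defaults simply mirror `ModelParams` rather than being required by the interface:

```csharp
using System;
using LLama.Abstractions;

// Hypothetical implementation; defaults mirror ModelParams and are optional.
public class CustomModelParams : IModelParams
{
    public int ContextSize { get; set; } = 512;
    public int MainGpu { get; set; }
    public bool LowVram { get; set; }
    public int GpuLayerCount { get; set; } = 20;
    public int Seed { get; set; } = 1337;
    public bool UseFp16Memory { get; set; } = true;
    public bool UseMemorymap { get; set; } = true;
    public bool UseMemoryLock { get; set; }
    public bool Perplexity { get; set; }
    public string ModelPath { get; set; } = string.Empty;
    public string ModelAlias { get; set; } = "unknown";
    public string LoraAdapter { get; set; } = string.Empty;
    public string LoraBase { get; set; } = string.Empty;
    public int Threads { get; set; } = -1;
    public int BatchSize { get; set; } = 512;
    public bool ConvertEosToNewLine { get; set; }
    public bool EmbeddingMode { get; set; }
    public nint TensorSplits { get; set; }
    public int GroupedQueryAttention { get; set; } = 1;
    public float RmsNormEpsilon { get; set; } = 5e-6f;
    public float RopeFrequencyBase { get; set; } = 10000.0f;
    public float RopeFrequencyScale { get; set; } = 1.0f;
    public bool MulMatQ { get; set; }
}
```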
diff --git a/LLama/Common/ModelParams.cs b/LLama/Common/ModelParams.cs
index 4f72eff3..72c77937 100644
--- a/LLama/Common/ModelParams.cs
+++ b/LLama/Common/ModelParams.cs
@@ -1,4 +1,5 @@
-using System;
+using LLama.Abstractions;
+using System;
 using System.Collections.Generic;
 using System.Text;
 
@@ -7,7 +8,7 @@ namespace LLama.Common
     /// <summary>
     /// The parameters for initializing a LLama model.
     /// </summary>
-    public class ModelParams
+    public class ModelParams : IModelParams
     {
         /// <summary>
         /// Model context size (n_ctx)
@@ -86,28 +87,59 @@ namespace LLama.Common
         /// </summary>
         public nint TensorSplits { get; set; }
 
-        /// <summary>
-        /// 
-        /// </summary>
-        /// <param name="modelPath">The model path.</param>
-        /// <param name="contextSize">Model context size (n_ctx)</param>
-        /// <param name="gpuLayerCount">Number of layers to run in VRAM / GPU memory (n_gpu_layers)</param>
-        /// <param name="seed">Seed for the random number generator (seed)</param>
-        /// <param name="useFp16Memory">Whether to use f16 instead of f32 for memory kv (memory_f16)</param>
-        /// <param name="useMemorymap">Whether to use mmap for faster loads (use_mmap)</param>
-        /// <param name="useMemoryLock">Whether to use mlock to keep model in memory (use_mlock)</param>
-        /// <param name="perplexity">Thether to compute perplexity over the prompt (perplexity)</param>
-        /// <param name="loraAdapter">Lora adapter path (lora_adapter)</param>
-        /// <param name="loraBase">Base model path for the lora adapter (lora_base)</param>
-        /// <param name="threads">Number of threads (-1 = autodetect) (n_threads)</param>
-        /// <param name="batchSize">Batch size for prompt processing (must be >=32 to use BLAS) (n_batch)</param>
-        /// <param name="convertEosToNewLine">Whether to convert eos to newline during the inference.</param>
-        /// <param name="embeddingMode">Whether to use embedding mode. (embedding) Note that if this is set to true, The LLamaModel won't produce text response anymore.</param>
-        public ModelParams(string modelPath, int contextSize = 512, int gpuLayerCount = 20,
+        /// <summary>
+        /// Grouped-Query Attention
+        /// </summary>
+        public int GroupedQueryAttention { get; set; } = 1;
+
+        /// <summary>
+        /// RMS Norm Epsilon
+        /// </summary>
+        public float RmsNormEpsilon { get; set; } = 5e-6f;
+
+        /// <summary>
+        /// RoPE base frequency
+        /// </summary>
+        public float RopeFrequencyBase { get; set; } = 10000.0f;
+
+        /// <summary>
+        /// RoPE frequency scaling factor
+        /// </summary>
+        public float RopeFrequencyScale { get; set; } = 1.0f;
+
+        /// <summary>
+        /// Use experimental mul_mat_q kernels
+        /// </summary>
+        public bool MulMatQ { get; set; }
+
+        /// <summary>
+        /// 
+        /// </summary>
+        /// <param name="modelPath">The model path.</param>
+        /// <param name="contextSize">Model context size (n_ctx)</param>
+        /// <param name="gpuLayerCount">Number of layers to run in VRAM / GPU memory (n_gpu_layers)</param>
+        /// <param name="seed">Seed for the random number generator (seed)</param>
+        /// <param name="useFp16Memory">Whether to use f16 instead of f32 for memory kv (memory_f16)</param>
+        /// <param name="useMemorymap">Whether to use mmap for faster loads (use_mmap)</param>
+        /// <param name="useMemoryLock">Whether to use mlock to keep model in memory (use_mlock)</param>
+        /// <param name="perplexity">Whether to compute perplexity over the prompt (perplexity)</param>
+        /// <param name="loraAdapter">Lora adapter path (lora_adapter)</param>
+        /// <param name="loraBase">Base model path for the lora adapter (lora_base)</param>
+        /// <param name="threads">Number of threads (-1 = autodetect) (n_threads)</param>
+        /// <param name="batchSize">Batch size for prompt processing (must be >=32 to use BLAS) (n_batch)</param>
+        /// <param name="convertEosToNewLine">Whether to convert eos to newline during the inference.</param>
+        /// <param name="embeddingMode">Whether to use embedding mode. (embedding) Note that if this is set to true, The LLamaModel won't produce text response anymore.</param>
+        /// <param name="gqa">Grouped-Query Attention</param>
+        /// <param name="rmsNormEps">RMS Norm Epsilon</param>
+        /// <param name="rope_freq_base">RoPE base frequency</param>
+        /// <param name="rope_freq_scale">RoPE frequency scaling factor</param>
+        /// <param name="mulMatQ">Use experimental mul_mat_q kernels</param>
+        public ModelParams(string modelPath, int contextSize = 512, int gpuLayerCount = 20,
             int seed = 1337, bool useFp16Memory = true,
             bool useMemorymap = true, bool useMemoryLock = false, bool perplexity = false,
             string loraAdapter = "", string loraBase = "", int threads = -1, int batchSize = 512,
-            bool convertEosToNewLine = false, bool embeddingMode = false)
+            bool convertEosToNewLine = false, bool embeddingMode = false,
+            int gqa = 1, float rmsNormEps = 5e-6f, float rope_freq_base = 10000.0f, float rope_freq_scale = 1f, bool mulMatQ = false)
         {
             ContextSize = contextSize;
             GpuLayerCount = gpuLayerCount;
@@ -123,6 +155,11 @@ namespace LLama.Common
             BatchSize = batchSize;
             ConvertEosToNewLine = convertEosToNewLine;
             EmbeddingMode = embeddingMode;
-        }
+            GroupedQueryAttention = gqa;
+            RmsNormEpsilon = rmsNormEps;
+            RopeFrequencyBase = rope_freq_base;
+            RopeFrequencyScale = rope_freq_scale;
+            MulMatQ = mulMatQ;
+        }
     }
 }
diff --git a/LLama/LLamaEmbedder.cs b/LLama/LLamaEmbedder.cs
index 4bbb61d2..24b6ee80 100644
--- a/LLama/LLamaEmbedder.cs
+++ b/LLama/LLamaEmbedder.cs
@@ -4,7 +4,7 @@ using System.Collections.Generic;
 using System.Text;
 using LLama.Exceptions;
 using System.Linq;
-using LLama.Common;
+using LLama.Abstractions;
 
 namespace LLama
 {
@@ -28,7 +28,7 @@ namespace LLama
         /// <summary>
         /// </summary>
         /// <param name="params"></param>
-        public LLamaEmbedder(ModelParams @params)
+        public LLamaEmbedder(IModelParams @params)
         {
             @params.EmbeddingMode = true;
             _ctx = Utils.InitLLamaContextFromModelParams(@params);
diff --git a/LLama/LLamaModel.cs b/LLama/LLamaModel.cs
index d82e2f43..2bd31199 100644
--- a/LLama/LLamaModel.cs
+++ b/LLama/LLamaModel.cs
@@ -10,6 +10,7 @@ using LLama.Common;
 using System.Runtime.InteropServices;
 using LLama.Extensions;
 using Microsoft.Win32.SafeHandles;
+using LLama.Abstractions;
 
 namespace LLama
 {
@@ -30,7 +31,7 @@ namespace LLama
         /// <summary>
         /// The model params set for this model.
         /// </summary>
-        public ModelParams Params { get; set; }
+        public IModelParams Params { get; set; }
         /// <summary>
         /// The native handle, which is used to be passed to the native APIs. Please avoid using it
         /// unless you know what is the usage of the Native API.
@@ -47,7 +48,7 @@ namespace LLama
         /// <param name="Params">Model params.</param>
         /// <param name="encoding">Encoding to deal with text input.</param>
         /// <param name="logger">The logger.</param>
-        public LLamaModel(ModelParams Params, string encoding = "UTF-8", ILLamaLogger? logger = null)
+        public LLamaModel(IModelParams Params, string encoding = "UTF-8", ILLamaLogger? logger = null)
         {
             _logger = logger;
             this.Params = Params;
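`LLamaEmbedder` takes the interface too, and note that its constructor mutates the instance it is given (`@params.EmbeddingMode = true;`). A short sketch, assuming `GetEmbeddings` keeps its existing shape:

```csharp
using LLama;
using LLama.Common;

var embedderParams = new ModelParams("/models/wizard-7b.bin");  // placeholder path

var embedder = new LLamaEmbedder(embedderParams);  // flips EmbeddingMode to true
float[] embedding = embedder.GetEmbeddings("Hello, world!");
```

Because of that mutation, reusing the same `IModelParams` instance to construct a `LLamaModel` afterwards would create it in embedding mode; giving the embedder its own copy avoids the surprise.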
diff --git a/LLama/ResettableLLamaModel.cs b/LLama/ResettableLLamaModel.cs
index f2862dc7..d9b4e822 100644
--- a/LLama/ResettableLLamaModel.cs
+++ b/LLama/ResettableLLamaModel.cs
@@ -1,4 +1,4 @@
-using LLama.Common;
+using LLama.Abstractions;
 using System;
 using System.Collections.Generic;
 using System.Text;
@@ -19,7 +19,7 @@ namespace LLama
         /// </summary>
         /// <param name="Params"></param>
         /// <param name="encoding"></param>
-        public ResettableLLamaModel(ModelParams Params, string encoding = "UTF-8") : base(Params, encoding)
+        public ResettableLLamaModel(IModelParams Params, string encoding = "UTF-8") : base(Params, encoding)
         {
             OriginalState = GetState();
         }
diff --git a/LLama/Utils.cs b/LLama/Utils.cs
index e99e6b29..0371718a 100644
--- a/LLama/Utils.cs
+++ b/LLama/Utils.cs
@@ -1,4 +1,4 @@
-using LLama.Common;
+using LLama.Abstractions;
 using LLama.Exceptions;
 using LLama.Native;
 using System;
@@ -13,7 +13,7 @@ namespace LLama
     using llama_token = Int32;
     internal static class Utils
     {
-        public static SafeLLamaContextHandle InitLLamaContextFromModelParams(ModelParams @params)
+        public static SafeLLamaContextHandle InitLLamaContextFromModelParams(IModelParams @params)
         {
             var lparams = NativeApi.llama_context_default_params();
 
@@ -28,6 +28,11 @@ namespace LLama
             lparams.logits_all = @params.Perplexity;
             lparams.embedding = @params.EmbeddingMode;
             lparams.low_vram = @params.LowVram;
+            lparams.n_gqa = @params.GroupedQueryAttention;
+            lparams.rms_norm_eps = @params.RmsNormEpsilon;
+            lparams.rope_freq_base = @params.RopeFrequencyBase;
+            lparams.rope_freq_scale = @params.RopeFrequencyScale;
+            lparams.mul_mat_q = @params.MulMatQ;
 
             /*
             if (@params.TensorSplits.Length != 1)