| @@ -1,15 +1,115 @@ | |||
| using LLama.Common; | |||
| using LLama.Abstractions; | |||
| namespace LLama.Web.Common | |||
| { | |||
| public class ModelOptions : ModelParams | |||
| public class ModelOptions : IModelParams | |||
| { | |||
| public ModelOptions() : base("", 512, 20, 1337, true, true, false, false, "", "", -1, 512, false, false) | |||
| { | |||
| } | |||
| public string Name { get; set; } | |||
| public int MaxInstances { get; set; } | |||
| } | |||
| /// <summary> | |||
| /// Model context size (n_ctx) | |||
| /// </summary> | |||
| public int ContextSize { get; set; } = 512; | |||
| /// <summary> | |||
| /// the GPU that is used for scratch and small tensors | |||
| /// </summary> | |||
| public int MainGpu { get; set; } = 0; | |||
| /// <summary> | |||
| /// if true, reduce VRAM usage at the cost of performance | |||
| /// </summary> | |||
| public bool LowVram { get; set; } = false; | |||
| /// <summary> | |||
| /// Number of layers to run in VRAM / GPU memory (n_gpu_layers) | |||
| /// </summary> | |||
| public int GpuLayerCount { get; set; } = 20; | |||
| /// <summary> | |||
| /// Seed for the random number generator (seed) | |||
| /// </summary> | |||
| public int Seed { get; set; } = 1686349486; | |||
| /// <summary> | |||
| /// Use f16 instead of f32 for memory kv (memory_f16) | |||
| /// </summary> | |||
| public bool UseFp16Memory { get; set; } = true; | |||
| /// <summary> | |||
| /// Use mmap for faster loads (use_mmap) | |||
| /// </summary> | |||
| public bool UseMemorymap { get; set; } = true; | |||
| /// <summary> | |||
| /// Use mlock to keep model in memory (use_mlock) | |||
| /// </summary> | |||
| public bool UseMemoryLock { get; set; } = false; | |||
| /// <summary> | |||
| /// Compute perplexity over the prompt (perplexity) | |||
| /// </summary> | |||
| public bool Perplexity { get; set; } = false; | |||
| /// <summary> | |||
| /// Model path (model) | |||
| /// </summary> | |||
| public string ModelPath { get; set; } | |||
| /// <summary> | |||
| /// model alias | |||
| /// </summary> | |||
| public string ModelAlias { get; set; } = "unknown"; | |||
| /// <summary> | |||
| /// lora adapter path (lora_adapter) | |||
| /// </summary> | |||
| public string LoraAdapter { get; set; } = string.Empty; | |||
| /// <summary> | |||
| /// base model path for the lora adapter (lora_base) | |||
| /// </summary> | |||
| public string LoraBase { get; set; } = string.Empty; | |||
| /// <summary> | |||
| /// Number of threads (-1 = autodetect) (n_threads) | |||
| /// </summary> | |||
| public int Threads { get; set; } = Math.Max(Environment.ProcessorCount / 2, 1); | |||
| /// <summary> | |||
| /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch) | |||
| /// </summary> | |||
| public int BatchSize { get; set; } = 512; | |||
| /// <summary> | |||
| /// Whether to convert eos to newline during the inference. | |||
| /// </summary> | |||
| public bool ConvertEosToNewLine { get; set; } = false; | |||
| /// <summary> | |||
| /// Whether to use embedding mode. (embedding) Note that if this is set to true, | |||
| /// The LLamaModel won't produce text response anymore. | |||
| /// </summary> | |||
| public bool EmbeddingMode { get; set; } = false; | |||
| /// <summary> | |||
| /// how split tensors should be distributed across GPUs | |||
| /// </summary> | |||
| public nint TensorSplits { get; set; } | |||
| /// <summary> | |||
| /// Grouped-Query Attention | |||
| /// </summary> | |||
| public int GroupedQueryAttention { get; set; } = 1; | |||
| /// <summary> | |||
| /// RMS Norm Epsilon | |||
| /// </summary> | |||
| public float RmsNormEpsilon { get; set; } = 5e-6f; | |||
| /// <summary> | |||
| /// RoPE base frequency | |||
| /// </summary> | |||
| public float RopeFrequencyBase { get; set; } = 10000.0f; | |||
| /// <summary> | |||
| /// RoPE frequency scaling factor | |||
| /// </summary> | |||
| public float RopeFrequencyScale { get; set; } = 1.0f; | |||
| /// <summary> | |||
| /// Use experimental mul_mat_q kernels | |||
| /// </summary> | |||
| public bool MulMatQ { get; set; } | |||
| } | |||
| } | |||
| @@ -0,0 +1,123 @@ | |||
| using System; | |||
| namespace LLama.Abstractions | |||
| { | |||
| public interface IModelParams | |||
| { | |||
| /// <summary> | |||
| /// Model context size (n_ctx) | |||
| /// </summary> | |||
| int ContextSize { get; set; } | |||
| /// <summary> | |||
| /// the GPU that is used for scratch and small tensors | |||
| /// </summary> | |||
| int MainGpu { get; set; } | |||
| /// <summary> | |||
| /// if true, reduce VRAM usage at the cost of performance | |||
| /// </summary> | |||
| bool LowVram { get; set; } | |||
| /// <summary> | |||
| /// Number of layers to run in VRAM / GPU memory (n_gpu_layers) | |||
| /// </summary> | |||
| int GpuLayerCount { get; set; } | |||
| /// <summary> | |||
| /// Seed for the random number generator (seed) | |||
| /// </summary> | |||
| int Seed { get; set; } | |||
| /// <summary> | |||
| /// Use f16 instead of f32 for memory kv (memory_f16) | |||
| /// </summary> | |||
| bool UseFp16Memory { get; set; } | |||
| /// <summary> | |||
| /// Use mmap for faster loads (use_mmap) | |||
| /// </summary> | |||
| bool UseMemorymap { get; set; } | |||
| /// <summary> | |||
| /// Use mlock to keep model in memory (use_mlock) | |||
| /// </summary> | |||
| bool UseMemoryLock { get; set; } | |||
| /// <summary> | |||
| /// Compute perplexity over the prompt (perplexity) | |||
| /// </summary> | |||
| bool Perplexity { get; set; } | |||
| /// <summary> | |||
| /// Model path (model) | |||
| /// </summary> | |||
| string ModelPath { get; set; } | |||
| /// <summary> | |||
| /// model alias | |||
| /// </summary> | |||
| string ModelAlias { get; set; } | |||
| /// <summary> | |||
| /// lora adapter path (lora_adapter) | |||
| /// </summary> | |||
| string LoraAdapter { get; set; } | |||
| /// <summary> | |||
| /// base model path for the lora adapter (lora_base) | |||
| /// </summary> | |||
| string LoraBase { get; set; } | |||
| /// <summary> | |||
| /// Number of threads (-1 = autodetect) (n_threads) | |||
| /// </summary> | |||
| int Threads { get; set; } | |||
| /// <summary> | |||
| /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch) | |||
| /// </summary> | |||
| int BatchSize { get; set; } | |||
| /// <summary> | |||
| /// Whether to convert eos to newline during the inference. | |||
| /// </summary> | |||
| bool ConvertEosToNewLine { get; set; } | |||
| /// <summary> | |||
| /// Whether to use embedding mode. (embedding) Note that if this is set to true, | |||
| /// The LLamaModel won't produce text response anymore. | |||
| /// </summary> | |||
| bool EmbeddingMode { get; set; } | |||
| /// <summary> | |||
| /// how split tensors should be distributed across GPUs | |||
| /// </summary> | |||
| nint TensorSplits { get; set; } | |||
| /// <summary> | |||
| /// Grouped-Query Attention | |||
| /// </summary> | |||
| int GroupedQueryAttention { get; set; } | |||
| /// <summary> | |||
| /// RMS Norm Epsilon | |||
| /// </summary> | |||
| float RmsNormEpsilon { get; set; } | |||
| /// <summary> | |||
| /// RoPE base frequency | |||
| /// </summary> | |||
| float RopeFrequencyBase { get; set; } | |||
| /// <summary> | |||
| /// RoPE frequency scaling factor | |||
| /// </summary> | |||
| float RopeFrequencyScale { get; set; } | |||
| /// <summary> | |||
| /// Use experimental mul_mat_q kernels | |||
| /// </summary> | |||
| bool MulMatQ { get; set; } | |||
| } | |||
| } | |||
| @@ -1,4 +1,5 @@ | |||
| using System; | |||
| using LLama.Abstractions; | |||
| using System; | |||
| using System.Collections.Generic; | |||
| using System.Text; | |||
| @@ -7,7 +8,7 @@ namespace LLama.Common | |||
| /// <summary> | |||
| /// The parameters for initializing a LLama model. | |||
| /// </summary> | |||
| public class ModelParams | |||
| public class ModelParams : IModelParams | |||
| { | |||
| /// <summary> | |||
| /// Model context size (n_ctx) | |||
| @@ -86,28 +87,59 @@ namespace LLama.Common | |||
| /// </summary> | |||
| public nint TensorSplits { get; set; } | |||
| /// <summary> | |||
| /// | |||
| /// </summary> | |||
| /// <param name="modelPath">The model path.</param> | |||
| /// <param name="contextSize">Model context size (n_ctx)</param> | |||
| /// <param name="gpuLayerCount">Number of layers to run in VRAM / GPU memory (n_gpu_layers)</param> | |||
| /// <param name="seed">Seed for the random number generator (seed)</param> | |||
| /// <param name="useFp16Memory">Whether to use f16 instead of f32 for memory kv (memory_f16)</param> | |||
| /// <param name="useMemorymap">Whether to use mmap for faster loads (use_mmap)</param> | |||
| /// <param name="useMemoryLock">Whether to use mlock to keep model in memory (use_mlock)</param> | |||
| /// <param name="perplexity">Whether to compute perplexity over the prompt (perplexity)</param> | |||
| /// <param name="loraAdapter">Lora adapter path (lora_adapter)</param> | |||
| /// <param name="loraBase">Base model path for the lora adapter (lora_base)</param> | |||
| /// <param name="threads">Number of threads (-1 = autodetect) (n_threads)</param> | |||
| /// <param name="batchSize">Batch size for prompt processing (must be >=32 to use BLAS) (n_batch)</param> | |||
| /// <param name="convertEosToNewLine">Whether to convert eos to newline during the inference.</param> | |||
| /// <param name="embeddingMode">Whether to use embedding mode. (embedding) Note that if this is set to true, The LLamaModel won't produce text response anymore.</param> | |||
| public ModelParams(string modelPath, int contextSize = 512, int gpuLayerCount = 20, | |||
| /// <summary> | |||
| /// Grouped-Query Attention | |||
| /// </summary> | |||
| public int GroupedQueryAttention { get; set; } = 1; | |||
| /// <summary> | |||
| /// RMS Norm Epsilon | |||
| /// </summary> | |||
| public float RmsNormEpsilon { get; set; } = 5e-6f; | |||
| /// <summary> | |||
| /// RoPE base frequency | |||
| /// </summary> | |||
| public float RopeFrequencyBase { get; set; } = 10000.0f; | |||
| /// <summary> | |||
| /// RoPE frequency scaling factor | |||
| /// </summary> | |||
| public float RopeFrequencyScale { get; set; } = 1.0f; | |||
| /// <summary> | |||
| /// Use experimental mul_mat_q kernels | |||
| /// </summary> | |||
| public bool MulMatQ { get; set; } | |||
| /// <summary> | |||
| /// | |||
| /// </summary> | |||
| /// <param name="modelPath">The model path.</param> | |||
| /// <param name="contextSize">Model context size (n_ctx)</param> | |||
| /// <param name="gpuLayerCount">Number of layers to run in VRAM / GPU memory (n_gpu_layers)</param> | |||
| /// <param name="seed">Seed for the random number generator (seed)</param> | |||
| /// <param name="useFp16Memory">Whether to use f16 instead of f32 for memory kv (memory_f16)</param> | |||
| /// <param name="useMemorymap">Whether to use mmap for faster loads (use_mmap)</param> | |||
| /// <param name="useMemoryLock">Whether to use mlock to keep model in memory (use_mlock)</param> | |||
| /// <param name="perplexity">Whether to compute perplexity over the prompt (perplexity)</param> | |||
| /// <param name="loraAdapter">Lora adapter path (lora_adapter)</param> | |||
| /// <param name="loraBase">Base model path for the lora adapter (lora_base)</param> | |||
| /// <param name="threads">Number of threads (-1 = autodetect) (n_threads)</param> | |||
| /// <param name="batchSize">Batch size for prompt processing (must be >=32 to use BLAS) (n_batch)</param> | |||
| /// <param name="convertEosToNewLine">Whether to convert eos to newline during the inference.</param> | |||
| /// <param name="embeddingMode">Whether to use embedding mode. (embedding) Note that if this is set to true, The LLamaModel won't produce text response anymore.</param> | |||
| /// <param name="gqa">Grouped-Query Attention</param> | |||
| /// <param name="rmsNormEps">RMS Norm Epsilon</param> | |||
| /// <param name="rope_freq_base">RoPE base frequency.</param> | |||
| /// <param name="rope_freq_scale">RoPE frequency scaling factor</param> | |||
| /// <param name="muMatQ">Use experimental mul_mat_q kernels</param> | |||
| public ModelParams(string modelPath, int contextSize = 512, int gpuLayerCount = 20, | |||
| int seed = 1337, bool useFp16Memory = true, | |||
| bool useMemorymap = true, bool useMemoryLock = false, bool perplexity = false, | |||
| string loraAdapter = "", string loraBase = "", int threads = -1, int batchSize = 512, | |||
| bool convertEosToNewLine = false, bool embeddingMode = false) | |||
| bool convertEosToNewLine = false, bool embeddingMode = false, | |||
| int gqa = 1, float rmsNormEps = 5e-6f, float rope_freq_base = 10000.0f, float rope_freq_scale = 1f, bool muMatQ = false) | |||
| { | |||
| ContextSize = contextSize; | |||
| GpuLayerCount = gpuLayerCount; | |||
| @@ -123,6 +155,11 @@ namespace LLama.Common | |||
| BatchSize = batchSize; | |||
| ConvertEosToNewLine = convertEosToNewLine; | |||
| EmbeddingMode = embeddingMode; | |||
| } | |||
| GroupedQueryAttention = gqa; | |||
| RmsNormEpsilon = rmsNormEps; | |||
| RopeFrequencyBase = rope_freq_base; | |||
| RopeFrequencyScale = rope_freq_scale; | |||
| MulMatQ = muMatQ; | |||
| } | |||
| } | |||
| } | |||
| @@ -4,7 +4,7 @@ using System.Collections.Generic; | |||
| using System.Text; | |||
| using LLama.Exceptions; | |||
| using System.Linq; | |||
| using LLama.Common; | |||
| using LLama.Abstractions; | |||
| namespace LLama | |||
| { | |||
| @@ -28,7 +28,7 @@ namespace LLama | |||
| /// | |||
| /// </summary> | |||
| /// <param name="params"></param> | |||
| public LLamaEmbedder(ModelParams @params) | |||
| public LLamaEmbedder(IModelParams @params) | |||
| { | |||
| @params.EmbeddingMode = true; | |||
| _ctx = Utils.InitLLamaContextFromModelParams(@params); | |||
| @@ -10,6 +10,7 @@ using LLama.Common; | |||
| using System.Runtime.InteropServices; | |||
| using LLama.Extensions; | |||
| using Microsoft.Win32.SafeHandles; | |||
| using LLama.Abstractions; | |||
| namespace LLama | |||
| { | |||
| @@ -30,7 +31,7 @@ namespace LLama | |||
| /// <summary> | |||
| /// The model params set for this model. | |||
| /// </summary> | |||
| public ModelParams Params { get; set; } | |||
| public IModelParams Params { get; set; } | |||
| /// <summary> | |||
| /// The native handle, which is used to be passed to the native APIs. Please avoid using it | |||
| /// unless you know what is the usage of the Native API. | |||
| @@ -47,7 +48,7 @@ namespace LLama | |||
| /// <param name="Params">Model params.</param> | |||
| /// <param name="encoding">Encoding to deal with text input.</param> | |||
| /// <param name="logger">The logger.</param> | |||
| public LLamaModel(ModelParams Params, string encoding = "UTF-8", ILLamaLogger? logger = null) | |||
| public LLamaModel(IModelParams Params, string encoding = "UTF-8", ILLamaLogger? logger = null) | |||
| { | |||
| _logger = logger; | |||
| this.Params = Params; | |||
| @@ -1,4 +1,4 @@ | |||
| using LLama.Common; | |||
| using LLama.Abstractions; | |||
| using System; | |||
| using System.Collections.Generic; | |||
| using System.Text; | |||
| @@ -19,7 +19,7 @@ namespace LLama | |||
| /// </summary> | |||
| /// <param name="Params"></param> | |||
| /// <param name="encoding"></param> | |||
| public ResettableLLamaModel(ModelParams Params, string encoding = "UTF-8") : base(Params, encoding) | |||
| public ResettableLLamaModel(IModelParams Params, string encoding = "UTF-8") : base(Params, encoding) | |||
| { | |||
| OriginalState = GetState(); | |||
| } | |||
| @@ -1,4 +1,4 @@ | |||
| using LLama.Common; | |||
| using LLama.Abstractions; | |||
| using LLama.Exceptions; | |||
| using LLama.Native; | |||
| using System; | |||
| @@ -13,7 +13,7 @@ namespace LLama | |||
| using llama_token = Int32; | |||
| internal static class Utils | |||
| { | |||
| public static SafeLLamaContextHandle InitLLamaContextFromModelParams(ModelParams @params) | |||
| public static SafeLLamaContextHandle InitLLamaContextFromModelParams(IModelParams @params) | |||
| { | |||
| var lparams = NativeApi.llama_context_default_params(); | |||
| @@ -28,6 +28,11 @@ namespace LLama | |||
| lparams.logits_all = @params.Perplexity; | |||
| lparams.embedding = @params.EmbeddingMode; | |||
| lparams.low_vram = @params.LowVram; | |||
| lparams.n_gqa = @params.GroupedQueryAttention; | |||
| lparams.rms_norm_eps = @params.RmsNormEpsilon; | |||
| lparams.rope_freq_base = @params.RopeFrequencyBase; | |||
| lparams.rope_freq_scale = @params.RopeFrequencyScale; | |||
| lparams.mul_mat_q = @params.MulMatQ; | |||
| /* | |||
| if (@params.TensorSplits.Length != 1) | |||