
ModelParams abstraction

tags/v0.5.1
sa_ddam213, 2 years ago
commit 2a04e31b7d
7 changed files with 303 additions and 37 deletions
  1. LLama.Web/Common/ModelOptions.cs (+107, -7)
  2. LLama/Abstractions/IModelParams.cs (+123, -0)
  3. LLama/Common/ModelParams.cs (+59, -22)
  4. LLama/LLamaEmbedder.cs (+2, -2)
  5. LLama/LLamaModel.cs (+3, -2)
  6. LLama/ResettableLLamaModel.cs (+2, -2)
  7. LLama/Utils.cs (+7, -2)

LLama.Web/Common/ModelOptions.cs (+107, -7)

@@ -1,15 +1,115 @@
using LLama.Common;
using LLama.Abstractions;

namespace LLama.Web.Common
{
public class ModelOptions : ModelParams
public class ModelOptions : IModelParams
{
public ModelOptions() : base("", 512, 20, 1337, true, true, false, false, "", "", -1, 512, false, false)
{
}

public string Name { get; set; }
public int MaxInstances { get; set; }

}

/// <summary>
/// Model context size (n_ctx)
/// </summary>
public int ContextSize { get; set; } = 512;
/// <summary>
/// the GPU that is used for scratch and small tensors
/// </summary>
public int MainGpu { get; set; } = 0;
/// <summary>
/// if true, reduce VRAM usage at the cost of performance
/// </summary>
public bool LowVram { get; set; } = false;
/// <summary>
/// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
/// </summary>
public int GpuLayerCount { get; set; } = 20;
/// <summary>
/// Seed for the random number generator (seed)
/// </summary>
public int Seed { get; set; } = 1686349486;
/// <summary>
/// Use f16 instead of f32 for memory kv (memory_f16)
/// </summary>
public bool UseFp16Memory { get; set; } = true;
/// <summary>
/// Use mmap for faster loads (use_mmap)
/// </summary>
public bool UseMemorymap { get; set; } = true;
/// <summary>
/// Use mlock to keep model in memory (use_mlock)
/// </summary>
public bool UseMemoryLock { get; set; } = false;
/// <summary>
/// Compute perplexity over the prompt (perplexity)
/// </summary>
public bool Perplexity { get; set; } = false;
/// <summary>
/// Model path (model)
/// </summary>
public string ModelPath { get; set; }
/// <summary>
/// model alias
/// </summary>
public string ModelAlias { get; set; } = "unknown";
/// <summary>
/// lora adapter path (lora_adapter)
/// </summary>
public string LoraAdapter { get; set; } = string.Empty;
/// <summary>
/// base model path for the lora adapter (lora_base)
/// </summary>
public string LoraBase { get; set; } = string.Empty;
/// <summary>
/// Number of threads (-1 = autodetect) (n_threads)
/// </summary>
public int Threads { get; set; } = Math.Max(Environment.ProcessorCount / 2, 1);
/// <summary>
/// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
/// </summary>
public int BatchSize { get; set; } = 512;

/// <summary>
/// Whether to convert eos to newline during the inference.
/// </summary>
public bool ConvertEosToNewLine { get; set; } = false;

/// <summary>
/// Whether to use embedding mode. (embedding) Note that if this is set to true,
/// The LLamaModel won't produce text response anymore.
/// </summary>
public bool EmbeddingMode { get; set; } = false;

/// <summary>
/// how split tensors should be distributed across GPUs
/// </summary>
public nint TensorSplits { get; set; }

/// <summary>
/// Grouped-Query Attention
/// </summary>
public int GroupedQueryAttention { get; set; } = 1;

/// <summary>
/// RMS Norm Epsilon
/// </summary>
public float RmsNormEpsilon { get; set; } = 5e-6f;

/// <summary>
/// RoPE base frequency
/// </summary>
public float RopeFrequencyBase { get; set; } = 10000.0f;

/// <summary>
/// RoPE frequency scaling factor
/// </summary>
public float RopeFrequencyScale { get; set; } = 1.0f;

/// <summary>
/// Use experimental mul_mat_q kernels
/// </summary>
public bool MulMatQ { get; set; }

}
}
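
With this change ModelOptions implements IModelParams directly rather than inheriting ModelParams, so the web-specific settings (Name, MaxInstances) sit alongside a full set of model parameters and the chained base(...) constructor call is gone. A minimal usage sketch, assuming the wiring shown above (all concrete values are hypothetical):

```csharp
var options = new ModelOptions
{
    Name = "wizard-7b",                      // web display name (hypothetical)
    MaxInstances = 2,                        // web instance cap (hypothetical)
    ModelPath = "models/wizard-7b.ggml.bin", // hypothetical path
    ContextSize = 1024,
    GpuLayerCount = 20,
};

// No conversion step: ModelOptions is an IModelParams itself now.
var model = new LLamaModel(options);
```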

LLama/Abstractions/IModelParams.cs (+123, -0)

@@ -0,0 +1,123 @@
using System;

namespace LLama.Abstractions
{
public interface IModelParams
{
/// <summary>
/// Model context size (n_ctx)
/// </summary>
int ContextSize { get; set; }

/// <summary>
/// the GPU that is used for scratch and small tensors
/// </summary>
int MainGpu { get; set; }

/// <summary>
/// if true, reduce VRAM usage at the cost of performance
/// </summary>
bool LowVram { get; set; }

/// <summary>
/// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
/// </summary>
int GpuLayerCount { get; set; }

/// <summary>
/// Seed for the random number generator (seed)
/// </summary>
int Seed { get; set; }

/// <summary>
/// Use f16 instead of f32 for memory kv (memory_f16)
/// </summary>
bool UseFp16Memory { get; set; }

/// <summary>
/// Use mmap for faster loads (use_mmap)
/// </summary>
bool UseMemorymap { get; set; }

/// <summary>
/// Use mlock to keep model in memory (use_mlock)
/// </summary>
bool UseMemoryLock { get; set; }

/// <summary>
/// Compute perplexity over the prompt (perplexity)
/// </summary>
bool Perplexity { get; set; }

/// <summary>
/// Model path (model)
/// </summary>
string ModelPath { get; set; }

/// <summary>
/// model alias
/// </summary>
string ModelAlias { get; set; }

/// <summary>
/// lora adapter path (lora_adapter)
/// </summary>
string LoraAdapter { get; set; }

/// <summary>
/// base model path for the lora adapter (lora_base)
/// </summary>
string LoraBase { get; set; }

/// <summary>
/// Number of threads (-1 = autodetect) (n_threads)
/// </summary>
int Threads { get; set; }

/// <summary>
/// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
/// </summary>
int BatchSize { get; set; }

/// <summary>
/// Whether to convert eos to newline during the inference.
/// </summary>
bool ConvertEosToNewLine { get; set; }

/// <summary>
/// Whether to use embedding mode. (embedding) Note that if this is set to true,
/// The LLamaModel won't produce text response anymore.
/// </summary>
bool EmbeddingMode { get; set; }

/// <summary>
/// how split tensors should be distributed across GPUs
/// </summary>
nint TensorSplits { get; set; }

/// <summary>
/// Grouped-Query Attention
/// </summary>
int GroupedQueryAttention { get; set; }

/// <summary>
/// RMS Norm Epsilon
/// </summary>
float RmsNormEpsilon { get; set; }

/// <summary>
/// RoPE base frequency
/// </summary>
float RopeFrequencyBase { get; set; }

/// <summary>
/// RoPE frequency scaling factor
/// </summary>
float RopeFrequencyScale { get; set; }

/// <summary>
/// Use experimental mul_mat_q kernels
/// </summary>
bool MulMatQ { get; set; }
}
}
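
Since IModelParams is a plain property bag, any type can implement it and be passed wherever the library previously required the concrete ModelParams. A sketch of a custom implementation (the class name and environment variable are hypothetical; the defaults mirror ModelParams):

```csharp
using System;
using LLama.Abstractions;

// Hypothetical custom parameter source. Any type implementing IModelParams
// can now be consumed by LLamaModel, LLamaEmbedder and
// Utils.InitLLamaContextFromModelParams.
public class EnvironmentModelParams : IModelParams
{
    public int ContextSize { get; set; } = 512;
    public int MainGpu { get; set; } = 0;
    public bool LowVram { get; set; } = false;
    public int GpuLayerCount { get; set; } = 20;
    public int Seed { get; set; } = 1337;
    public bool UseFp16Memory { get; set; } = true;
    public bool UseMemorymap { get; set; } = true;
    public bool UseMemoryLock { get; set; } = false;
    public bool Perplexity { get; set; } = false;

    // Resolve the model path from the environment (hypothetical variable name).
    public string ModelPath { get; set; } =
        Environment.GetEnvironmentVariable("LLAMA_MODEL_PATH") ?? "";

    public string ModelAlias { get; set; } = "unknown";
    public string LoraAdapter { get; set; } = string.Empty;
    public string LoraBase { get; set; } = string.Empty;
    public int Threads { get; set; } = -1;
    public int BatchSize { get; set; } = 512;
    public bool ConvertEosToNewLine { get; set; } = false;
    public bool EmbeddingMode { get; set; } = false;
    public nint TensorSplits { get; set; }
    public int GroupedQueryAttention { get; set; } = 1;
    public float RmsNormEpsilon { get; set; } = 5e-6f;
    public float RopeFrequencyBase { get; set; } = 10000.0f;
    public float RopeFrequencyScale { get; set; } = 1.0f;
    public bool MulMatQ { get; set; } = false;
}
```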

LLama/Common/ModelParams.cs (+59, -22)

@@ -1,4 +1,5 @@
using System;
using LLama.Abstractions;
using System;
using System.Collections.Generic;
using System.Text;

@@ -7,7 +8,7 @@ namespace LLama.Common
/// <summary>
/// The parameters for initializing a LLama model.
/// </summary>
public class ModelParams
public class ModelParams : IModelParams
{
/// <summary>
/// Model context size (n_ctx)
@@ -86,28 +87,59 @@ namespace LLama.Common
/// </summary>
public nint TensorSplits { get; set; }

/// <summary>
///
/// </summary>
/// <param name="modelPath">The model path.</param>
/// <param name="contextSize">Model context size (n_ctx)</param>
/// <param name="gpuLayerCount">Number of layers to run in VRAM / GPU memory (n_gpu_layers)</param>
/// <param name="seed">Seed for the random number generator (seed)</param>
/// <param name="useFp16Memory">Whether to use f16 instead of f32 for memory kv (memory_f16)</param>
/// <param name="useMemorymap">Whether to use mmap for faster loads (use_mmap)</param>
/// <param name="useMemoryLock">Whether to use mlock to keep model in memory (use_mlock)</param>
/// <param name="perplexity">Thether to compute perplexity over the prompt (perplexity)</param>
/// <param name="loraAdapter">Lora adapter path (lora_adapter)</param>
/// <param name="loraBase">Base model path for the lora adapter (lora_base)</param>
/// <param name="threads">Number of threads (-1 = autodetect) (n_threads)</param>
/// <param name="batchSize">Batch size for prompt processing (must be >=32 to use BLAS) (n_batch)</param>
/// <param name="convertEosToNewLine">Whether to convert eos to newline during the inference.</param>
/// <param name="embeddingMode">Whether to use embedding mode. (embedding) Note that if this is set to true, The LLamaModel won't produce text response anymore.</param>
public ModelParams(string modelPath, int contextSize = 512, int gpuLayerCount = 20,
/// <summary>
/// Grouped-Query Attention
/// </summary>
public int GroupedQueryAttention { get; set; } = 1;

/// <summary>
/// RMS Norm Epsilon
/// </summary>
public float RmsNormEpsilon { get; set; } = 5e-6f;

/// <summary>
/// RoPE base frequency
/// </summary>
public float RopeFrequencyBase { get; set; } = 10000.0f;

/// <summary>
/// RoPE frequency scaling factor
/// </summary>
public float RopeFrequencyScale { get; set; } = 1.0f;

/// <summary>
/// Use experimental mul_mat_q kernels
/// </summary>
public bool MulMatQ { get; set; }

/// <summary>
///
/// </summary>
/// <param name="modelPath">The model path.</param>
/// <param name="contextSize">Model context size (n_ctx)</param>
/// <param name="gpuLayerCount">Number of layers to run in VRAM / GPU memory (n_gpu_layers)</param>
/// <param name="seed">Seed for the random number generator (seed)</param>
/// <param name="useFp16Memory">Whether to use f16 instead of f32 for memory kv (memory_f16)</param>
/// <param name="useMemorymap">Whether to use mmap for faster loads (use_mmap)</param>
/// <param name="useMemoryLock">Whether to use mlock to keep model in memory (use_mlock)</param>
/// <param name="perplexity">Thether to compute perplexity over the prompt (perplexity)</param>
/// <param name="loraAdapter">Lora adapter path (lora_adapter)</param>
/// <param name="loraBase">Base model path for the lora adapter (lora_base)</param>
/// <param name="threads">Number of threads (-1 = autodetect) (n_threads)</param>
/// <param name="batchSize">Batch size for prompt processing (must be >=32 to use BLAS) (n_batch)</param>
/// <param name="convertEosToNewLine">Whether to convert eos to newline during the inference.</param>
/// <param name="embeddingMode">Whether to use embedding mode. (embedding) Note that if this is set to true, The LLamaModel won't produce text response anymore.</param>
/// <param name="gqa">Grouped-Query Attention</param>
/// <param name="rmsNormEps">RMS Norm Epsilon</param>
/// <param name="rope_freq_base">RoPE base frequency.</param>
/// <param name="rope_freq_scale">RoPE frequency scaling factor</param>
/// <param name="muMatQ">Use experimental mul_mat_q kernels</param>
public ModelParams(string modelPath, int contextSize = 512, int gpuLayerCount = 20,
int seed = 1337, bool useFp16Memory = true,
bool useMemorymap = true, bool useMemoryLock = false, bool perplexity = false,
string loraAdapter = "", string loraBase = "", int threads = -1, int batchSize = 512,
bool convertEosToNewLine = false, bool embeddingMode = false)
bool convertEosToNewLine = false, bool embeddingMode = false,
int gqa = 1, float rmsNormEps = 5e-6f, float rope_freq_base = 10000.0f, float rope_freq_scale = 1f, bool muMatQ = false)
{
ContextSize = contextSize;
GpuLayerCount = gpuLayerCount;
@@ -123,6 +155,11 @@ namespace LLama.Common
BatchSize = batchSize;
ConvertEosToNewLine = convertEosToNewLine;
EmbeddingMode = embeddingMode;
}
GroupedQueryAttention = gqa;
RmsNormEpsilon = rmsNormEps;
RopeFrequencyBase = rope_freq_base;
RopeFrequencyScale = rope_freq_scale;
MulMatQ = muMatQ;
}
}
}
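
The five new parameters (gqa, rmsNormEps, rope_freq_base, rope_freq_scale, muMatQ) are appended after the existing fourteen, all with defaults, so existing call sites compile unchanged. A sketch of setting them explicitly (the path is hypothetical; gqa = 8 matches what llama.cpp expected for LLaMA-2 70B models around this release):

```csharp
var parameters = new ModelParams(
    "models/llama-2-70b.ggml.bin", // hypothetical path
    contextSize: 4096,
    gpuLayerCount: 40,
    gqa: 8,                        // Grouped-Query Attention factor
    rmsNormEps: 5e-6f,             // RMS Norm Epsilon
    rope_freq_base: 10000.0f,      // RoPE base frequency
    rope_freq_scale: 1.0f,         // RoPE frequency scaling factor
    muMatQ: false);                // experimental mul_mat_q kernels off
```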

LLama/LLamaEmbedder.cs (+2, -2)

@@ -4,7 +4,7 @@ using System.Collections.Generic;
using System.Text;
using LLama.Exceptions;
using System.Linq;
using LLama.Common;
using LLama.Abstractions;

namespace LLama
{
@@ -28,7 +28,7 @@ namespace LLama
///
/// </summary>
/// <param name="params"></param>
public LLamaEmbedder(ModelParams @params)
public LLamaEmbedder(IModelParams @params)
{
@params.EmbeddingMode = true;
_ctx = Utils.InitLLamaContextFromModelParams(@params);
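
Note that the constructor mutates the parameters it is given: it sets EmbeddingMode to true before creating the context, whatever the caller passed in. A short usage sketch (hypothetical path; GetEmbeddings is assumed to be the embedder's existing query method):

```csharp
IModelParams p = new ModelParams("models/llama-7b.ggml.bin"); // hypothetical path
var embedder = new LLamaEmbedder(p);  // side effect: p.EmbeddingMode is now true
float[] vector = embedder.GetEmbeddings("Hello, world."); // assumed existing API
```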


LLama/LLamaModel.cs (+3, -2)

@@ -10,6 +10,7 @@ using LLama.Common;
using System.Runtime.InteropServices;
using LLama.Extensions;
using Microsoft.Win32.SafeHandles;
using LLama.Abstractions;

namespace LLama
{
@@ -30,7 +31,7 @@ namespace LLama
/// <summary>
/// The model params set for this model.
/// </summary>
public ModelParams Params { get; set; }
public IModelParams Params { get; set; }
/// <summary>
/// The native handle, which is used to be passed to the native APIs. Please avoid using it
/// unless you know what is the usage of the Native API.
@@ -47,7 +48,7 @@ namespace LLama
/// <param name="Params">Model params.</param>
/// <param name="encoding">Encoding to deal with text input.</param>
/// <param name="logger">The logger.</param>
public LLamaModel(ModelParams Params, string encoding = "UTF-8", ILLamaLogger? logger = null)
public LLamaModel(IModelParams Params, string encoding = "UTF-8", ILLamaLogger? logger = null)
{
_logger = logger;
this.Params = Params;
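
With Params typed as IModelParams, the model can be backed by any implementation, including the web project's ModelOptions. A sketch (hypothetical paths; using ModelOptions assumes the LLama.Web project is referenced):

```csharp
// Both concrete types now satisfy IModelParams.
IModelParams fromLibrary = new ModelParams("models/llama-7b.ggml.bin");
IModelParams fromWeb = new ModelOptions { ModelPath = "models/llama-7b.ggml.bin" };

var model = new LLamaModel(fromWeb, encoding: "UTF-8");
```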


LLama/ResettableLLamaModel.cs (+2, -2)

@@ -1,4 +1,4 @@
using LLama.Common;
using LLama.Abstractions;
using System;
using System.Collections.Generic;
using System.Text;
@@ -19,7 +19,7 @@ namespace LLama
/// </summary>
/// <param name="Params"></param>
/// <param name="encoding"></param>
public ResettableLLamaModel(ModelParams Params, string encoding = "UTF-8") : base(Params, encoding)
public ResettableLLamaModel(IModelParams Params, string encoding = "UTF-8") : base(Params, encoding)
{
OriginalState = GetState();
}
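
The same widening applies here; the reset behaviour itself is untouched. A usage sketch (hypothetical path; Reset is assumed to be the class's existing helper that reloads OriginalState):

```csharp
var model = new ResettableLLamaModel(new ModelParams("models/llama-7b.ggml.bin"));
// ... run a chat or inference session ...
model.Reset(); // assumed existing helper: restores the state captured at construction
```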


LLama/Utils.cs (+7, -2)

@@ -1,4 +1,4 @@
using LLama.Common;
using LLama.Abstractions;
using LLama.Exceptions;
using LLama.Native;
using System;
@@ -13,7 +13,7 @@ namespace LLama
using llama_token = Int32;
internal static class Utils
{
public static SafeLLamaContextHandle InitLLamaContextFromModelParams(ModelParams @params)
public static SafeLLamaContextHandle InitLLamaContextFromModelParams(IModelParams @params)
{
var lparams = NativeApi.llama_context_default_params();

@@ -28,6 +28,11 @@ namespace LLama
lparams.logits_all = @params.Perplexity;
lparams.embedding = @params.EmbeddingMode;
lparams.low_vram = @params.LowVram;
lparams.n_gqa = @params.GroupedQueryAttention;
lparams.rms_norm_eps = @params.RmsNormEpsilon;
lparams.rope_freq_base = @params.RopeFrequencyBase;
lparams.rope_freq_scale = @params.RopeFrequencyScale;
lparams.mul_mat_q = @params.MulMatQ;

/*
if (@params.TensorSplits.Length != 1)

