| @@ -1,5 +1,6 @@ | |||||
| using System.Text; | using System.Text; | ||||
| using LLama.Abstractions; | using LLama.Abstractions; | ||||
| using LLama.Native; | |||||
| namespace LLama.Web.Common | namespace LLama.Web.Common | ||||
| { | { | ||||
| @@ -118,6 +119,24 @@ namespace LLama.Web.Common | |||||
| /// </summary> | /// </summary> | ||||
| public float? RopeFrequencyScale { get; set; } | public float? RopeFrequencyScale { get; set; } | ||||
| /// <inheritdoc /> | |||||
| public float? YarnExtrapolationFactor { get; set; } | |||||
| /// <inheritdoc /> | |||||
| public float? YarnAttentionFactor { get; set; } | |||||
| /// <inheritdoc /> | |||||
| public float? YarnBetaFast { get; set; } | |||||
| /// <inheritdoc /> | |||||
| public float? YarnBetaSlow { get; set; } | |||||
| /// <inheritdoc /> | |||||
| public uint? YarnOriginalContext { get; set; } | |||||
| /// <inheritdoc /> | |||||
| public RopeScalingType? YarnScalingType { get; set; } | |||||
| /// <summary> | /// <summary> | ||||
| /// Use experimental mul_mat_q kernels | /// Use experimental mul_mat_q kernels | ||||
| /// </summary> | /// </summary> | ||||
| @@ -1,4 +1,5 @@ | |||||
| using System.Text; | using System.Text; | ||||
| using LLama.Native; | |||||
| namespace LLama.Abstractions; | namespace LLama.Abstractions; | ||||
| @@ -67,4 +68,34 @@ public interface IContextParams | |||||
| /// Number of threads to use for batch processing (null = autodetect) (n_threads_batch) | /// Number of threads to use for batch processing (null = autodetect) (n_threads_batch) | ||||
| /// </summary> | /// </summary> | ||||
| uint? BatchThreads { get; set; } | uint? BatchThreads { get; set; } | ||||
| /// <summary> | |||||
| /// YaRN extrapolation mix factor (yarn_ext_factor). | |||||
| /// When null, the native parameter conversion in LLama.Extensions substitutes -1. | |||||
| /// </summary> | |||||
| float? YarnExtrapolationFactor { get; set; } | |||||
| /// <summary> | |||||
| /// YaRN magnitude scaling factor (yarn_attn_factor). | |||||
| /// When null, the native parameter conversion in LLama.Extensions substitutes 1. | |||||
| /// </summary> | |||||
| float? YarnAttentionFactor { get; set; } | |||||
| /// <summary> | |||||
| /// YaRN low correction dim (yarn_beta_fast). | |||||
| /// When null, the native parameter conversion in LLama.Extensions substitutes 32. | |||||
| /// </summary> | |||||
| float? YarnBetaFast { get; set; } | |||||
| /// <summary> | |||||
| /// YaRN high correction dim (yarn_beta_slow). | |||||
| /// When null, the native parameter conversion in LLama.Extensions substitutes 1. | |||||
| /// </summary> | |||||
| float? YarnBetaSlow { get; set; } | |||||
| /// <summary> | |||||
| /// YaRN original context length (yarn_orig_ctx). | |||||
| /// When null, the native parameter conversion in LLama.Extensions substitutes 0. | |||||
| /// </summary> | |||||
| uint? YarnOriginalContext { get; set; } | |||||
| /// <summary> | |||||
| /// YaRN scaling method to use (rope_scaling_type). | |||||
| /// When null, the native parameter conversion in LLama.Extensions substitutes | |||||
| /// <see cref="RopeScalingType.LLAMA_ROPE_SCALING_UNSPECIFIED"/>. | |||||
| /// </summary> | |||||
| RopeScalingType? YarnScalingType { get; set; } | |||||
| } | } | ||||
| @@ -3,6 +3,7 @@ using System; | |||||
| using System.Text; | using System.Text; | ||||
| using System.Text.Json; | using System.Text.Json; | ||||
| using System.Text.Json.Serialization; | using System.Text.Json.Serialization; | ||||
| using LLama.Native; | |||||
| namespace LLama.Common | namespace LLama.Common | ||||
| { | { | ||||
| @@ -70,6 +71,7 @@ namespace LLama.Common | |||||
| /// </summary> | /// </summary> | ||||
| public uint? BatchThreads { get; set; } | public uint? BatchThreads { get; set; } | ||||
| /// <summary> | /// <summary> | ||||
| /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch) | /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch) | ||||
| /// </summary> | /// </summary> | ||||
| @@ -98,10 +100,28 @@ namespace LLama.Common | |||||
| /// </summary> | /// </summary> | ||||
| public float? RopeFrequencyScale { get; set; } | public float? RopeFrequencyScale { get; set; } | ||||
| /// <summary> | |||||
| /// Use experimental mul_mat_q kernels | |||||
| /// </summary> | |||||
| public bool MulMatQ { get; set; } | |||||
| /// <inheritdoc /> | |||||
| public float? YarnExtrapolationFactor { get; set; } | |||||
| /// <inheritdoc /> | |||||
| public float? YarnAttentionFactor { get; set; } | |||||
| /// <inheritdoc /> | |||||
| public float? YarnBetaFast { get; set; } | |||||
| /// <inheritdoc /> | |||||
| public float? YarnBetaSlow { get; set; } | |||||
| /// <inheritdoc /> | |||||
| public uint? YarnOriginalContext { get; set; } | |||||
| /// <inheritdoc /> | |||||
| public RopeScalingType? YarnScalingType { get; set; } | |||||
| /// <summary> | |||||
| /// Use experimental mul_mat_q kernels | |||||
| /// </summary> | |||||
| public bool MulMatQ { get; set; } | |||||
| /// <summary> | /// <summary> | ||||
| /// Load vocab only (no weights) | /// Load vocab only (no weights) | ||||
| @@ -29,6 +29,15 @@ namespace LLama.Extensions | |||||
| result.embedding = @params.EmbeddingMode; | result.embedding = @params.EmbeddingMode; | ||||
| result.rope_freq_base = @params.RopeFrequencyBase ?? 0; | result.rope_freq_base = @params.RopeFrequencyBase ?? 0; | ||||
| result.rope_freq_scale = @params.RopeFrequencyScale ?? 0; | result.rope_freq_scale = @params.RopeFrequencyScale ?? 0; | ||||
| // Default YaRN values copied from here: https://github.com/ggerganov/llama.cpp/blob/381efbf480959bb6d1e247a8b0c2328f22e350f8/common/common.h#L67 | |||||
| result.yarn_ext_factor = @params.YarnExtrapolationFactor ?? -1f; | |||||
| result.yarn_attn_factor = @params.YarnAttentionFactor ?? 1f; | |||||
| result.yarn_beta_fast = @params.YarnBetaFast ?? 32f; | |||||
| result.yarn_beta_slow = @params.YarnBetaSlow ?? 1f; | |||||
| result.yarn_orig_ctx = @params.YarnOriginalContext ?? 0; | |||||
| result.rope_scaling_type = @params.YarnScalingType ?? RopeScalingType.LLAMA_ROPE_SCALING_UNSPECIFIED; | |||||
| result.mul_mat_q = @params.MulMatQ; | result.mul_mat_q = @params.MulMatQ; | ||||
| result.n_threads = Threads(@params.Threads); | result.n_threads = Threads(@params.Threads); | ||||
| @@ -44,13 +44,13 @@ namespace LLama.Native | |||||
| /// <summary> | /// <summary> | ||||
| /// RoPE scaling type, from `enum llama_rope_scaling_type` | /// RoPE scaling type, from `enum llama_rope_scaling_type` | ||||
| /// </summary> | /// </summary> | ||||
| public sbyte rope_scaling_type; | |||||
| public RopeScalingType rope_scaling_type; | |||||
| /// <summary> | /// <summary> | ||||
| /// RoPE base frequency, 0 = from model | /// RoPE base frequency, 0 = from model | ||||
| /// </summary> | /// </summary> | ||||
| public float rope_freq_base; | |||||
| public float rope_freq_base; | |||||
| /// <summary> | /// <summary> | ||||
| /// RoPE frequency scaling factor, 0 = from model | /// RoPE frequency scaling factor, 0 = from model | ||||
| /// </summary> | /// </summary> | ||||
| @@ -0,0 +1,17 @@ | |||||
| namespace LLama.Native | |||||
| { | |||||
| /// <summary> | |||||
| /// RoPE scaling type. C# equivalent of llama_rope_scaling_type. | |||||
| /// </summary> | |||||
| /// <remarks> | |||||
| /// Backed by sbyte: the native rope_scaling_type field this enum is used for was previously declared as a raw sbyte. | |||||
| /// Member names mirror the native enum naming, so they are not PascalCase. | |||||
| /// </remarks> | |||||
| public enum RopeScalingType | |||||
| : sbyte | |||||
| { | |||||
| /// <summary> | |||||
| /// No scaling type specified (-1). Used as the fallback when YarnScalingType is null. | |||||
| /// </summary> | |||||
| LLAMA_ROPE_SCALING_UNSPECIFIED = -1, | |||||
| /// <summary> | |||||
| /// Value 0. Presumably disables RoPE scaling - confirm against llama.cpp. | |||||
| /// </summary> | |||||
| LLAMA_ROPE_SCALING_NONE = 0, | |||||
| /// <summary> | |||||
| /// Value 1. Presumably selects linear RoPE scaling - confirm against llama.cpp. | |||||
| /// </summary> | |||||
| LLAMA_ROPE_SCALING_LINEAR = 1, | |||||
| /// <summary> | |||||
| /// Value 2. Presumably selects YaRN RoPE scaling - confirm against llama.cpp. | |||||
| /// </summary> | |||||
| LLAMA_ROPE_SCALING_YARN = 2, | |||||
| } | |||||
| } | |||||