| @@ -52,6 +52,30 @@ | |||||
| <None Update="Assets\reason-act.txt"> | <None Update="Assets\reason-act.txt"> | ||||
| <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> | <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> | ||||
| </None> | </None> | ||||
| <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/ggml-metal.metal"> | |||||
| <CopyToOutputDirectory>Never</CopyToOutputDirectory> | |||||
| </None> | |||||
| <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda11.dll"> | |||||
| <CopyToOutputDirectory>Never</CopyToOutputDirectory> | |||||
| </None> | |||||
| <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda11.so"> | |||||
| <CopyToOutputDirectory>Never</CopyToOutputDirectory> | |||||
| </None> | |||||
| <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda12.dll"> | |||||
| <CopyToOutputDirectory>Never</CopyToOutputDirectory> | |||||
| </None> | |||||
| <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda12.so"> | |||||
| <CopyToOutputDirectory>Never</CopyToOutputDirectory> | |||||
| </None> | |||||
| <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-metal.dylib"> | |||||
| <CopyToOutputDirectory>Never</CopyToOutputDirectory> | |||||
| </None> | |||||
| <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama.dylib"> | |||||
| <CopyToOutputDirectory>Never</CopyToOutputDirectory> | |||||
| </None> | |||||
| <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama.so"> | |||||
| <CopyToOutputDirectory>Never</CopyToOutputDirectory> | |||||
| </None> | |||||
| </ItemGroup> | </ItemGroup> | ||||
| </Project> | </Project> | ||||
| @@ -37,6 +37,30 @@ | |||||
| </ItemGroup> | </ItemGroup> | ||||
| <ItemGroup> | <ItemGroup> | ||||
| <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/ggml-metal.metal"> | |||||
| <CopyToOutputDirectory>Never</CopyToOutputDirectory> | |||||
| </None> | |||||
| <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda11.dll"> | |||||
| <CopyToOutputDirectory>Never</CopyToOutputDirectory> | |||||
| </None> | |||||
| <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda11.so"> | |||||
| <CopyToOutputDirectory>Never</CopyToOutputDirectory> | |||||
| </None> | |||||
| <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda12.dll"> | |||||
| <CopyToOutputDirectory>Never</CopyToOutputDirectory> | |||||
| </None> | |||||
| <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-cuda12.so"> | |||||
| <CopyToOutputDirectory>Never</CopyToOutputDirectory> | |||||
| </None> | |||||
| <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama-metal.dylib"> | |||||
| <CopyToOutputDirectory>Never</CopyToOutputDirectory> | |||||
| </None> | |||||
| <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama.dylib"> | |||||
| <CopyToOutputDirectory>Never</CopyToOutputDirectory> | |||||
| </None> | |||||
| <None Update="C:\Users\Martin\Documents\dotnet\LLamaSharp\LLama\runtimes/libllama.so"> | |||||
| <CopyToOutputDirectory>Never</CopyToOutputDirectory> | |||||
| </None> | |||||
| <None Update="Models\llama-2-7b-chat.ggmlv3.q3_K_S.bin"> | <None Update="Models\llama-2-7b-chat.ggmlv3.q3_K_S.bin"> | ||||
| <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> | <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> | ||||
| </None> | </None> | ||||
| @@ -14,7 +14,6 @@ namespace LLama.Unittest | |||||
| BatchSize = 17, | BatchSize = 17, | ||||
| ContextSize = 42, | ContextSize = 42, | ||||
| LoraAdapter = "adapter", | LoraAdapter = "adapter", | ||||
| GroupedQueryAttention = 7, | |||||
| Seed = 42, | Seed = 42, | ||||
| GpuLayerCount = 111 | GpuLayerCount = 111 | ||||
| }; | }; | ||||
| @@ -33,7 +32,6 @@ namespace LLama.Unittest | |||||
| BatchSize = 17, | BatchSize = 17, | ||||
| ContextSize = 42, | ContextSize = 42, | ||||
| LoraAdapter = "adapter", | LoraAdapter = "adapter", | ||||
| GroupedQueryAttention = 7, | |||||
| Seed = 42, | Seed = 42, | ||||
| GpuLayerCount = 111 | GpuLayerCount = 111 | ||||
| }; | }; | ||||
| @@ -88,16 +88,6 @@ namespace LLama.Web.Common | |||||
| /// </summary> | /// </summary> | ||||
| public float[] TensorSplits { get; set; } | public float[] TensorSplits { get; set; } | ||||
| /// <summary> | |||||
| /// Grouped-Query Attention | |||||
| /// </summary> | |||||
| public int GroupedQueryAttention { get; set; } = 1; | |||||
| /// <summary> | |||||
| /// RMS Norm Epsilon | |||||
| /// </summary> | |||||
| public float RmsNormEpsilon { get; set; } = 5e-6f; | |||||
| /// <summary> | /// <summary> | ||||
| /// RoPE base frequency | /// RoPE base frequency | ||||
| /// </summary> | /// </summary> | ||||
| @@ -98,16 +98,6 @@ namespace LLama.Abstractions | |||||
| /// </summary> | /// </summary> | ||||
| float[]? TensorSplits { get; set; } | float[]? TensorSplits { get; set; } | ||||
| /// <summary> | |||||
| /// Grouped-Query Attention | |||||
| /// </summary> | |||||
| int GroupedQueryAttention { get; set; } | |||||
| /// <summary> | |||||
| /// RMS Norm Epsilon | |||||
| /// </summary> | |||||
| float RmsNormEpsilon { get; set; } | |||||
| /// <summary> | /// <summary> | ||||
| /// RoPE base frequency | /// RoPE base frequency | ||||
| /// </summary> | /// </summary> | ||||
| @@ -89,16 +89,6 @@ namespace LLama.Common | |||||
| /// </summary> | /// </summary> | ||||
| public float[]? TensorSplits { get; set; } | public float[]? TensorSplits { get; set; } | ||||
| /// <summary> | |||||
| /// Grouped-Query Attention | |||||
| /// </summary> | |||||
| public int GroupedQueryAttention { get; set; } = 1; | |||||
| /// <summary> | |||||
| /// RMS Norm Epsilon | |||||
| /// </summary> | |||||
| public float RmsNormEpsilon { get; set; } = 5e-6f; | |||||
| /// <summary> | /// <summary> | ||||
| /// RoPE base frequency | /// RoPE base frequency | ||||
| /// </summary> | /// </summary> | ||||
| @@ -153,8 +143,6 @@ namespace LLama.Common | |||||
| /// <param name="batchSize">Batch size for prompt processing (must be >=32 to use BLAS) (n_batch)</param> | /// <param name="batchSize">Batch size for prompt processing (must be >=32 to use BLAS) (n_batch)</param> | ||||
| /// <param name="convertEosToNewLine">Whether to convert eos to newline during the inference.</param> | /// <param name="convertEosToNewLine">Whether to convert eos to newline during the inference.</param> | ||||
| /// <param name="embeddingMode">Whether to use embedding mode. (embedding) Note that if this is set to true, The LLamaModel won't produce text response anymore.</param> | /// <param name="embeddingMode">Whether to use embedding mode. (embedding) Note that if this is set to true, The LLamaModel won't produce text response anymore.</param> | ||||
| /// <param name="groupedQueryAttention">Grouped-Query Attention</param> | |||||
| /// <param name="rmsNormEpsilon">RMS Norm Epsilon</param> | |||||
| /// <param name="ropeFrequencyBase">RoPE base frequency.</param> | /// <param name="ropeFrequencyBase">RoPE base frequency.</param> | ||||
| /// <param name="ropeFrequencyScale">RoPE frequency scaling factor</param> | /// <param name="ropeFrequencyScale">RoPE frequency scaling factor</param> | ||||
| /// <param name="mulMatQ">Use experimental mul_mat_q kernels</param> | /// <param name="mulMatQ">Use experimental mul_mat_q kernels</param> | ||||
| @@ -165,7 +153,7 @@ namespace LLama.Common | |||||
| bool useMemorymap = true, bool useMemoryLock = false, bool perplexity = false, | bool useMemorymap = true, bool useMemoryLock = false, bool perplexity = false, | ||||
| string loraAdapter = "", string loraBase = "", int threads = -1, int batchSize = 512, | string loraAdapter = "", string loraBase = "", int threads = -1, int batchSize = 512, | ||||
| bool convertEosToNewLine = false, bool embeddingMode = false, | bool convertEosToNewLine = false, bool embeddingMode = false, | ||||
| int groupedQueryAttention = 1, float rmsNormEpsilon = 5e-6f, float ropeFrequencyBase = 10000.0f, float ropeFrequencyScale = 1f, bool mulMatQ = false, | |||||
| float ropeFrequencyBase = 10000.0f, float ropeFrequencyScale = 1f, bool mulMatQ = false, | |||||
| string encoding = "UTF-8") | string encoding = "UTF-8") | ||||
| { | { | ||||
| ContextSize = contextSize; | ContextSize = contextSize; | ||||
| @@ -182,8 +170,6 @@ namespace LLama.Common | |||||
| BatchSize = batchSize; | BatchSize = batchSize; | ||||
| ConvertEosToNewLine = convertEosToNewLine; | ConvertEosToNewLine = convertEosToNewLine; | ||||
| EmbeddingMode = embeddingMode; | EmbeddingMode = embeddingMode; | ||||
| GroupedQueryAttention = groupedQueryAttention; | |||||
| RmsNormEpsilon = rmsNormEpsilon; | |||||
| RopeFrequencyBase = ropeFrequencyBase; | RopeFrequencyBase = ropeFrequencyBase; | ||||
| RopeFrequencyScale = ropeFrequencyScale; | RopeFrequencyScale = ropeFrequencyScale; | ||||
| MulMatQ = mulMatQ; | MulMatQ = mulMatQ; | ||||
| @@ -39,8 +39,6 @@ namespace LLama.Extensions | |||||
| result.logits_all = @params.Perplexity; | result.logits_all = @params.Perplexity; | ||||
| result.embedding = @params.EmbeddingMode; | result.embedding = @params.EmbeddingMode; | ||||
| result.low_vram = @params.LowVram; | result.low_vram = @params.LowVram; | ||||
| result.n_gqa = @params.GroupedQueryAttention; | |||||
| result.rms_norm_eps = @params.RmsNormEpsilon; | |||||
| result.rope_freq_base = @params.RopeFrequencyBase; | result.rope_freq_base = @params.RopeFrequencyBase; | ||||
| result.rope_freq_scale = @params.RopeFrequencyScale; | result.rope_freq_scale = @params.RopeFrequencyScale; | ||||
| result.mul_mat_q = @params.MulMatQ; | result.mul_mat_q = @params.MulMatQ; | ||||
| @@ -31,16 +31,6 @@ namespace LLama.Native | |||||
| /// </summary> | /// </summary> | ||||
| public int n_batch; | public int n_batch; | ||||
| /// <summary> | |||||
| /// grouped-query attention (TEMP - will be moved to model hparams) | |||||
| /// </summary> | |||||
| public int n_gqa; | |||||
| /// <summary> | |||||
| /// rms norm epsilon (TEMP - will be moved to model hparams) | |||||
| /// </summary> | |||||
| public float rms_norm_eps; | |||||
| /// <summary> | /// <summary> | ||||
| /// number of layers to store in VRAM | /// number of layers to store in VRAM | ||||
| /// </summary> | /// </summary> | ||||
| @@ -82,8 +72,8 @@ namespace LLama.Native | |||||
| /// if true, reduce VRAM usage at the cost of performance | /// if true, reduce VRAM usage at the cost of performance | ||||
| /// </summary> | /// </summary> | ||||
| public bool low_vram | public bool low_vram | ||||
| { | |||||
| get => Convert.ToBoolean(_low_vram); | |||||
| { | |||||
| readonly get => Convert.ToBoolean(_low_vram); | |||||
| set => _low_vram = Convert.ToSByte(value); | set => _low_vram = Convert.ToSByte(value); | ||||
| } | } | ||||
| private sbyte _low_vram; | private sbyte _low_vram; | ||||
| @@ -92,8 +82,8 @@ namespace LLama.Native | |||||
| /// if true, use experimental mul_mat_q kernels | /// if true, use experimental mul_mat_q kernels | ||||
| /// </summary> | /// </summary> | ||||
| public bool mul_mat_q | public bool mul_mat_q | ||||
| { | |||||
| get => Convert.ToBoolean(_mul_mat_q); | |||||
| { | |||||
| readonly get => Convert.ToBoolean(_mul_mat_q); | |||||
| set => _mul_mat_q = Convert.ToSByte(value); | set => _mul_mat_q = Convert.ToSByte(value); | ||||
| } | } | ||||
| private sbyte _mul_mat_q; | private sbyte _mul_mat_q; | ||||
| @@ -102,8 +92,8 @@ namespace LLama.Native | |||||
| /// use fp16 for KV cache | /// use fp16 for KV cache | ||||
| /// </summary> | /// </summary> | ||||
| public bool f16_kv | public bool f16_kv | ||||
| { | |||||
| get => Convert.ToBoolean(_f16_kv); | |||||
| { | |||||
| readonly get => Convert.ToBoolean(_f16_kv); | |||||
| set => _f16_kv = Convert.ToSByte(value); | set => _f16_kv = Convert.ToSByte(value); | ||||
| } | } | ||||
| private sbyte _f16_kv; | private sbyte _f16_kv; | ||||
| @@ -112,8 +102,8 @@ namespace LLama.Native | |||||
| /// the llama_eval() call computes all logits, not just the last one | /// the llama_eval() call computes all logits, not just the last one | ||||
| /// </summary> | /// </summary> | ||||
| public bool logits_all | public bool logits_all | ||||
| { | |||||
| get => Convert.ToBoolean(_logits_all); | |||||
| { | |||||
| readonly get => Convert.ToBoolean(_logits_all); | |||||
| set => _logits_all = Convert.ToSByte(value); | set => _logits_all = Convert.ToSByte(value); | ||||
| } | } | ||||
| private sbyte _logits_all; | private sbyte _logits_all; | ||||
| @@ -122,8 +112,8 @@ namespace LLama.Native | |||||
| /// only load the vocabulary, no weights | /// only load the vocabulary, no weights | ||||
| /// </summary> | /// </summary> | ||||
| public bool vocab_only | public bool vocab_only | ||||
| { | |||||
| get => Convert.ToBoolean(_vocab_only); | |||||
| { | |||||
| readonly get => Convert.ToBoolean(_vocab_only); | |||||
| set => _vocab_only = Convert.ToSByte(value); | set => _vocab_only = Convert.ToSByte(value); | ||||
| } | } | ||||
| private sbyte _vocab_only; | private sbyte _vocab_only; | ||||
| @@ -132,8 +122,8 @@ namespace LLama.Native | |||||
| /// use mmap if possible | /// use mmap if possible | ||||
| /// </summary> | /// </summary> | ||||
| public bool use_mmap | public bool use_mmap | ||||
| { | |||||
| get => Convert.ToBoolean(_use_mmap); | |||||
| { | |||||
| readonly get => Convert.ToBoolean(_use_mmap); | |||||
| set => _use_mmap = Convert.ToSByte(value); | set => _use_mmap = Convert.ToSByte(value); | ||||
| } | } | ||||
| private sbyte _use_mmap; | private sbyte _use_mmap; | ||||
| @@ -142,8 +132,8 @@ namespace LLama.Native | |||||
| /// force system to keep model in RAM | /// force system to keep model in RAM | ||||
| /// </summary> | /// </summary> | ||||
| public bool use_mlock | public bool use_mlock | ||||
| { | |||||
| get => Convert.ToBoolean(_use_mlock); | |||||
| { | |||||
| readonly get => Convert.ToBoolean(_use_mlock); | |||||
| set => _use_mlock = Convert.ToSByte(value); | set => _use_mlock = Convert.ToSByte(value); | ||||
| } | } | ||||
| private sbyte _use_mlock; | private sbyte _use_mlock; | ||||
| @@ -152,8 +142,8 @@ namespace LLama.Native | |||||
| /// embedding mode only | /// embedding mode only | ||||
| /// </summary> | /// </summary> | ||||
| public bool embedding | public bool embedding | ||||
| { | |||||
| get => Convert.ToBoolean(_embedding); | |||||
| { | |||||
| readonly get => Convert.ToBoolean(_embedding); | |||||
| set => _embedding = Convert.ToSByte(value); | set => _embedding = Convert.ToSByte(value); | ||||
| } | } | ||||
| private sbyte _embedding; | private sbyte _embedding; | ||||
| @@ -105,5 +105,10 @@ | |||||
| /// </summary> | /// </summary> | ||||
| /// <remarks>Benchmark@7B: 5.15GB, +0.0044 ppl</remarks> | /// <remarks>Benchmark@7B: 5.15GB, +0.0044 ppl</remarks> | ||||
| LLAMA_FTYPE_MOSTLY_Q6_K = 18, | LLAMA_FTYPE_MOSTLY_Q6_K = 18, | ||||
| /// <summary> | |||||
| /// File type was not specified | |||||
| /// </summary> | |||||
| LLAMA_FTYPE_GUESSED = 1024 | |||||
| } | } | ||||
| } | } | ||||
| @@ -377,7 +377,7 @@ namespace LLama.Native | |||||
| /// <param name="model"></param> | /// <param name="model"></param> | ||||
| /// <returns></returns> | /// <returns></returns> | ||||
| [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] | [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] | ||||
| public static extern int llama_n_vocab_from_model(SafeLlamaModelHandle model); | |||||
| public static extern int llama_model_n_vocab(SafeLlamaModelHandle model); | |||||
| /// <summary> | /// <summary> | ||||
| /// Get the size of the context window for the model | /// Get the size of the context window for the model | ||||
| @@ -385,7 +385,7 @@ namespace LLama.Native | |||||
| /// <param name="model"></param> | /// <param name="model"></param> | ||||
| /// <returns></returns> | /// <returns></returns> | ||||
| [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] | [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] | ||||
| public static extern int llama_n_ctx_from_model(SafeLlamaModelHandle model); | |||||
| public static extern int llama_model_n_ctx(SafeLlamaModelHandle model); | |||||
| /// <summary> | /// <summary> | ||||
| /// Get the dimension of embedding vectors from this model | /// Get the dimension of embedding vectors from this model | ||||
| @@ -393,7 +393,7 @@ namespace LLama.Native | |||||
| /// <param name="model"></param> | /// <param name="model"></param> | ||||
| /// <returns></returns> | /// <returns></returns> | ||||
| [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] | [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] | ||||
| public static extern int llama_n_embd_from_model(SafeLlamaModelHandle model); | |||||
| public static extern int llama_model_n_embd(SafeLlamaModelHandle model); | |||||
| /// <summary> | /// <summary> | ||||
| /// Convert a single token into text | /// Convert a single token into text | ||||
| @@ -28,9 +28,9 @@ namespace LLama.Native | |||||
| internal SafeLlamaModelHandle(IntPtr handle) | internal SafeLlamaModelHandle(IntPtr handle) | ||||
| : base(handle) | : base(handle) | ||||
| { | { | ||||
| VocabCount = NativeApi.llama_n_vocab_from_model(this); | |||||
| ContextSize = NativeApi.llama_n_ctx_from_model(this); | |||||
| EmbeddingSize = NativeApi.llama_n_embd_from_model(this); | |||||
| VocabCount = NativeApi.llama_model_n_vocab(this); | |||||
| ContextSize = NativeApi.llama_model_n_ctx(this); | |||||
| EmbeddingSize = NativeApi.llama_model_n_embd(this); | |||||
| } | } | ||||
| /// <inheritdoc /> | /// <inheritdoc /> | ||||