From acd91341e6fd3733b41a59818b9e5ed078a2c5ac Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Thu, 10 Aug 2023 02:14:21 +0100 Subject: [PATCH 1/3] Added lots of comments to all the LLamaFtype variants --- LLama/Native/LLamaFtype.cs | 126 ++++++++++++++++++----- LLama/Native/LLamaModelQuantizeParams.cs | 11 +- 2 files changed, 110 insertions(+), 27 deletions(-) diff --git a/LLama/Native/LLamaFtype.cs b/LLama/Native/LLamaFtype.cs index 41159ee2..b7b74bf5 100644 --- a/LLama/Native/LLamaFtype.cs +++ b/LLama/Native/LLamaFtype.cs @@ -1,29 +1,109 @@ -using System; -using System.Collections.Generic; -using System.Text; - -namespace LLama.Native +namespace LLama.Native { + /// + /// Supported model file types + /// public enum LLamaFtype { + /// + /// All f32 + /// + /// Benchmark: 26GB @ 7B parameters LLAMA_FTYPE_ALL_F32 = 0, - LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 - // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed - // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed - LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors + + /// + /// Mostly f16 + /// + /// Benchmark: 13GB @ 7B parameters + LLAMA_FTYPE_MOSTLY_F16 = 1, + + /// + /// Mostly 8 bit + /// + /// Benchmark: 6.7GB @ 7B 
parameters, +0.0004ppl + LLAMA_FTYPE_MOSTLY_Q8_0 = 7, + + /// + /// Mostly 4 bit + /// + /// Benchmark: 3.50GB @ 7B parameters, +0.2499 ppl + LLAMA_FTYPE_MOSTLY_Q4_0 = 2, + + /// + /// Mostly 4 bit + /// + /// Benchmark: 3.90GB @ 7B parameters, +0.1846 ppl + LLAMA_FTYPE_MOSTLY_Q4_1 = 3, + + /// + /// Mostly 4 bit, tok_embeddings.weight and output.weight are f16 + /// + LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, + + /// + /// Mostly 5 bit + /// + /// Benchmark: 4.30GB @ 7B tokens, +0.0796 ppl + LLAMA_FTYPE_MOSTLY_Q5_0 = 8, + + /// + /// Mostly 5 bit + /// + /// Benchmark: 4.70GB @ 7B parameters, +0.0415 ppl + LLAMA_FTYPE_MOSTLY_Q5_1 = 9, + + /// + /// K-Quant 2 bit + /// + /// Benchmark: 2.67GB @ 7N parameters, +0.8698 ppl + LLAMA_FTYPE_MOSTLY_Q2_K = 10, + + /// + /// K-Quant 3 bit (Small) + /// + /// Benchmark: 2.75GB @ 7B parameters, +0.5505 ppl + LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, + + /// + /// K-Quant 3 bit (Medium) + /// + /// Benchmark: 3.06GB @ 7B parameters, +0.2437 ppl + LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, + + /// + /// K-Quant 3 bit (Large) + /// + /// Benchmark: 3.35GB @ 7B parameters, +0.1803 ppl + LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, + + /// + /// K-Quant 4 bit (Small) + /// + /// Benchmark: 3.56GB @ 7B parameters, +0.1149 ppl + LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, + + /// + /// K-Quant 4 bit (Medium) + /// + /// Benchmark: 3.80GB @ 7B parameters, +0.0535 ppl + LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, + + /// + /// K-Quant 5 bit (Small) + /// + /// Benchmark: 4.33GB @ 7B parameters, +0.0353 ppl + LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, + + /// + /// K-Quant 5 bit (Medium) + /// + /// Benchmark: 4.45GB @ 7B parameters, +0.0142 ppl + LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, + + /// + /// K-Quant 6 bit + /// + /// Benchmark: 5.15GB @ 7B parameters, +0.0044 ppl + LLAMA_FTYPE_MOSTLY_Q6_K = 18, } } diff --git a/LLama/Native/LLamaModelQuantizeParams.cs b/LLama/Native/LLamaModelQuantizeParams.cs index 17ec035a..f23c1d2e 100644 --- a/LLama/Native/LLamaModelQuantizeParams.cs +++ 
b/LLama/Native/LLamaModelQuantizeParams.cs @@ -1,25 +1,28 @@ -using System; -using System.Collections.Generic; -using System.Runtime.InteropServices; -using System.Text; +using System.Runtime.InteropServices; namespace LLama.Native { + /// + /// Quantizer parameters used in the native API + /// public struct LLamaModelQuantizeParams { /// /// number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() /// public int nthread; + /// /// quantize to this llama_ftype /// public LLamaFtype ftype; + /// /// allow quantizing non-f32/f16 tensors /// [MarshalAs(UnmanagedType.I1)] public bool allow_requantize; + /// /// quantize output.weight /// From b69f4bc40e19d78e0beb4a5093d4cea5bfd5284a Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Thu, 10 Aug 2023 16:58:00 +0100 Subject: [PATCH 2/3] - Expanded range of supported types in quantizer to match llama.cpp - Rewritten `LLamaFtype` parsing to support any substring which uniquely matches a single enum variant --- LLama/LLamaQuantizer.cs | 88 ++++++++++++++++++++++++++--------------- 1 file changed, 57 insertions(+), 31 deletions(-) diff --git a/LLama/LLamaQuantizer.cs b/LLama/LLamaQuantizer.cs index c3ff5613..f1d89586 100644 --- a/LLama/LLamaQuantizer.cs +++ b/LLama/LLamaQuantizer.cs @@ -1,8 +1,6 @@ using LLama.Native; using System; using System.Collections.Generic; -using System.Linq; -using System.Text; namespace LLama { @@ -36,8 +34,7 @@ namespace LLama quantizeParams.nthread = nthread; quantizeParams.allow_requantize = allowRequantize; quantizeParams.quantize_output_tensor = quantizeOutputTensor; - LLamaModelQuantizeParams* p = &quantizeParams; - return NativeApi.llama_model_quantize(srcFileName, dstFilename, p) == 0; + return NativeApi.llama_model_quantize(srcFileName, dstFilename, &quantizeParams) == 0; } /// @@ -57,42 +54,71 @@ namespace LLama return Quantize(srcFileName, dstFilename, StringToFtype(ftype), nthread, allowRequantize, quantizeOutputTensor); } - private static bool 
ValidateFtype(string ftype) - { - return new string[] { "q4_0", "q4_1", "q5_0", "q5_1", "q8_0" }.Contains(ftype); - } - private static bool ValidateFtype(LLamaFtype ftype) { - return ftype is LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_0 or LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1 - or LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_0 or LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_1 or LLamaFtype.LLAMA_FTYPE_MOSTLY_Q8_0; - } + // Validation copies from here: + // https://github.com/ggerganov/llama.cpp/blob/e59fcb2bc129881f4a269fee748fb38bce0a64de/llama.cpp#L2960 - private static string FtypeToString(LLamaFtype ftype) - { - return ftype switch + switch (ftype) { - LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_0 => "q4_0", - LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1 => "q4_1", - LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_0 => "q5_0", - LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_1 => "q5_1", - LLamaFtype.LLAMA_FTYPE_MOSTLY_Q8_0 => "q8_0", - _ => throw new ArgumentException($"The type {Enum.GetName(typeof(LLamaFtype), ftype)} is not a valid type " + - $"to perform quantization.") - }; + case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_0: + case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1: + case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_0: + case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_1: + case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q8_0: + case LLamaFtype.LLAMA_FTYPE_MOSTLY_F16: + case LLamaFtype.LLAMA_FTYPE_ALL_F32: + case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q2_K: + case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_S: + case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_M: + case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_L: + case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_K_S: + case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_K_M: + case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_K_S: + case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_K_M: + case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q6_K: + return true; + + case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: + default: + return false; + } } + /// + /// Parse a string into a LLamaFtype. This is a "relaxed" parsing, which allows any string which is contained within + /// the enum name to be used. 
+ /// + /// For example "Q5_K_M" will convert to "LLAMA_FTYPE_MOSTLY_Q5_K_M" + /// + /// + /// + /// private static LLamaFtype StringToFtype(string str) { - return str switch + // Find all variants which contain the input string + var matches = new List(); + foreach (LLamaFtype ftype in Enum.GetValues(typeof(LLamaFtype))) { - "q4_0" => LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_0, - "q4_1" => LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1, - "q5_0" => LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_0, - "q5_1" => LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_1, - "q8_0" => LLamaFtype.LLAMA_FTYPE_MOSTLY_Q8_0, - _ => throw new ArgumentException($"Invalid ftype {str} to quantize.") - }; + var name = Enum.GetName(typeof(LLamaFtype), ftype); + + // Note: this is using "IndexOf" instead of "Contains" to be compatible with netstandard2.0 +#pragma warning disable CA2249 + if (name != null && name.IndexOf(str, StringComparison.OrdinalIgnoreCase) >= 0) + matches.Add(ftype); +#pragma warning restore CA2249 + } + + // If there was just one match, success! 
if (matches.Count == 1) + return matches[0]; + + // If none matched throw a generic error + if (matches.Count == 0) + throw new ArgumentException($"Unknown ftype \"{str}\" for quantization."); + + // There were several matches, throw an error asking the user to be more specific + throw new ArgumentException($"\"{str}\" matches multiple potential ftypes: {string.Join(",", matches)}"); } } } From ce325b49c711f76b38971f86e8da6196ff8e1193 Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Thu, 10 Aug 2023 17:00:54 +0100 Subject: [PATCH 3/3] Rewritten comments --- LLama/Native/LLamaFtype.cs | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/LLama/Native/LLamaFtype.cs b/LLama/Native/LLamaFtype.cs index b7b74bf5..79fdf854 100644 --- a/LLama/Native/LLamaFtype.cs +++ b/LLama/Native/LLamaFtype.cs @@ -8,31 +8,31 @@ /// /// All f32 /// - /// Benchmark: 26GB @ 7B parameters + /// Benchmark@7B: 26GB LLAMA_FTYPE_ALL_F32 = 0, /// /// Mostly f16 /// - /// Benchmark: 13GB @ 7B parameters + /// Benchmark@7B: 13GB LLAMA_FTYPE_MOSTLY_F16 = 1, /// /// Mostly 8 bit /// - /// Benchmark: 6.7GB @ 7B parameters, +0.0004ppl + /// Benchmark@7B: 6.7GB, +0.0004 ppl LLAMA_FTYPE_MOSTLY_Q8_0 = 7, /// /// Mostly 4 bit /// - /// Benchmark: 3.50GB @ 7B parameters, +0.2499 ppl + /// Benchmark@7B: 3.50GB, +0.2499 ppl LLAMA_FTYPE_MOSTLY_Q4_0 = 2, /// /// Mostly 4 bit /// - /// Benchmark: 3.90GB @ 7B parameters, +0.1846 ppl + /// Benchmark@7B: 3.90GB, +0.1846 ppl LLAMA_FTYPE_MOSTLY_Q4_1 = 3, /// @@ -43,67 +43,67 @@ /// /// Mostly 5 bit /// - /// Benchmark: 4.30GB @ 7B tokens, +0.0796 ppl + /// Benchmark@7B: 4.30GB, +0.0796 ppl LLAMA_FTYPE_MOSTLY_Q5_0 = 8, /// /// Mostly 5 bit /// - /// Benchmark: 4.70GB @ 7B parameters, +0.0415 ppl + /// Benchmark@7B: 4.70GB, +0.0415 ppl LLAMA_FTYPE_MOSTLY_Q5_1 = 9, /// /// K-Quant 2 bit /// - /// Benchmark: 2.67GB @ 7N parameters, +0.8698 ppl + /// Benchmark@7B: 2.67GB, +0.8698 ppl 
LLAMA_FTYPE_MOSTLY_Q2_K = 10, /// /// K-Quant 3 bit (Small) /// - /// Benchmark: 2.75GB @ 7B parameters, +0.5505 ppl + /// Benchmark@7B: 2.75GB, +0.5505 ppl LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, /// /// K-Quant 3 bit (Medium) /// - /// Benchmark: 3.06GB @ 7B parameters, +0.2437 ppl + /// Benchmark@7B: 3.06GB, +0.2437 ppl LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, /// /// K-Quant 3 bit (Large) /// - /// Benchmark: 3.35GB @ 7B parameters, +0.1803 ppl + /// Benchmark@7B: 3.35GB, +0.1803 ppl LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, /// /// K-Quant 4 bit (Small) /// - /// Benchmark: 3.56GB @ 7B parameters, +0.1149 ppl + /// Benchmark@7B: 3.56GB, +0.1149 ppl LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, /// /// K-Quant 4 bit (Medium) /// - /// Benchmark: 3.80GB @ 7B parameters, +0.0535 ppl + /// Benchmark@7B: 3.80GB, +0.0535 ppl LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, /// /// K-Quant 5 bit (Small) /// - /// Benchmark: 4.33GB @ 7B parameters, +0.0353 ppl + /// Benchmark@7B: 4.33GB, +0.0353 ppl LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, /// /// K-Quant 5 bit (Medium) /// - /// Benchmark: 4.45GB @ 7B parameters, +0.0142 ppl + /// Benchmark@7B: 4.45GB, +0.0142 ppl LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, /// /// K-Quant 6 bit /// - /// Benchmark: 5.15GB @ 7B parameters, +0.0044 ppl + /// Benchmark@7B: 5.15GB, +0.0044 ppl LLAMA_FTYPE_MOSTLY_Q6_K = 18, } }