Minor quantizer improvements (tags/v0.5.1)
| @@ -1,8 +1,6 @@ | |||||
| using LLama.Native; | using LLama.Native; | ||||
| using System; | using System; | ||||
| using System.Collections.Generic; | using System.Collections.Generic; | ||||
| using System.Linq; | |||||
| using System.Text; | |||||
| namespace LLama | namespace LLama | ||||
| { | { | ||||
| @@ -36,8 +34,7 @@ namespace LLama | |||||
| quantizeParams.nthread = nthread; | quantizeParams.nthread = nthread; | ||||
| quantizeParams.allow_requantize = allowRequantize; | quantizeParams.allow_requantize = allowRequantize; | ||||
| quantizeParams.quantize_output_tensor = quantizeOutputTensor; | quantizeParams.quantize_output_tensor = quantizeOutputTensor; | ||||
| LLamaModelQuantizeParams* p = &quantizeParams; | |||||
| return NativeApi.llama_model_quantize(srcFileName, dstFilename, p) == 0; | |||||
| return NativeApi.llama_model_quantize(srcFileName, dstFilename, &quantizeParams) == 0; | |||||
| } | } | ||||
| /// <summary> | /// <summary> | ||||
| @@ -57,42 +54,71 @@ namespace LLama | |||||
| return Quantize(srcFileName, dstFilename, StringToFtype(ftype), nthread, allowRequantize, quantizeOutputTensor); | return Quantize(srcFileName, dstFilename, StringToFtype(ftype), nthread, allowRequantize, quantizeOutputTensor); | ||||
| } | } | ||||
| private static bool ValidateFtype(string ftype) | |||||
| { | |||||
| return new string[] { "q4_0", "q4_1", "q5_0", "q5_1", "q8_0" }.Contains(ftype); | |||||
| } | |||||
| private static bool ValidateFtype(LLamaFtype ftype) | private static bool ValidateFtype(LLamaFtype ftype) | ||||
| { | { | ||||
| return ftype is LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_0 or LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1 | |||||
| or LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_0 or LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_1 or LLamaFtype.LLAMA_FTYPE_MOSTLY_Q8_0; | |||||
| } | |||||
| // Validation copied from here: | |||||
| // https://github.com/ggerganov/llama.cpp/blob/e59fcb2bc129881f4a269fee748fb38bce0a64de/llama.cpp#L2960 | |||||
| private static string FtypeToString(LLamaFtype ftype) | |||||
| { | |||||
| return ftype switch | |||||
| switch (ftype) | |||||
| { | { | ||||
| LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_0 => "q4_0", | |||||
| LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1 => "q4_1", | |||||
| LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_0 => "q5_0", | |||||
| LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_1 => "q5_1", | |||||
| LLamaFtype.LLAMA_FTYPE_MOSTLY_Q8_0 => "q8_0", | |||||
| _ => throw new ArgumentException($"The type {Enum.GetName(typeof(LLamaFtype), ftype)} is not a valid type " + | |||||
| $"to perform quantization.") | |||||
| }; | |||||
| case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_0: | |||||
| case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1: | |||||
| case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_0: | |||||
| case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_1: | |||||
| case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q8_0: | |||||
| case LLamaFtype.LLAMA_FTYPE_MOSTLY_F16: | |||||
| case LLamaFtype.LLAMA_FTYPE_ALL_F32: | |||||
| case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q2_K: | |||||
| case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_S: | |||||
| case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_M: | |||||
| case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_L: | |||||
| case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_K_S: | |||||
| case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_K_M: | |||||
| case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_K_S: | |||||
| case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_K_M: | |||||
| case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q6_K: | |||||
| return true; | |||||
| case LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: | |||||
| default: | |||||
| return false; | |||||
| } | |||||
| } | } | ||||
| /// <summary> | |||||
| /// Parse a string into a LLamaFtype. This is a "relaxed" parsing, which allows any string which is contained within | |||||
| /// the enum name to be used. | |||||
| /// | |||||
| /// For example "Q5_K_M" will convert to "LLAMA_FTYPE_MOSTLY_Q5_K_M" | |||||
| /// </summary> | |||||
| /// <param name="str"></param> | |||||
| /// <returns></returns> | |||||
| /// <exception cref="ArgumentException"></exception> | |||||
| private static LLamaFtype StringToFtype(string str) | private static LLamaFtype StringToFtype(string str) | ||||
| { | { | ||||
| return str switch | |||||
| // Find all variants which contain the input string | |||||
| var matches = new List<LLamaFtype>(); | |||||
| foreach (LLamaFtype ftype in Enum.GetValues(typeof(LLamaFtype))) | |||||
| { | { | ||||
| "q4_0" => LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_0, | |||||
| "q4_1" => LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_1, | |||||
| "q5_0" => LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_0, | |||||
| "q5_1" => LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_1, | |||||
| "q8_0" => LLamaFtype.LLAMA_FTYPE_MOSTLY_Q8_0, | |||||
| _ => throw new ArgumentException($"Invalid ftype {str} to quantize.") | |||||
| }; | |||||
| var name = Enum.GetName(typeof(LLamaFtype), ftype); | |||||
| // Note: this is using "IndexOf" instead of "Contains" to be compatible with netstandard2.0 | |||||
| #pragma warning disable CA2249 | |||||
| if (name != null && name.IndexOf(str, StringComparison.OrdinalIgnoreCase) >= 0) | |||||
| matches.Add(ftype); | |||||
| #pragma warning restore CA2249 | |||||
| } | |||||
| // If there was just one match, success! | |||||
| if (matches.Count == 1) | |||||
| return matches[0]; | |||||
| // If none matched throw a generic error | |||||
| if (matches.Count == 0) | |||||
| throw new ArgumentException($"Unknown ftype \"{str}\" for quantization."); | |||||
| // There were several matches, throw an error asking the user to be more specific | |||||
| throw new ArgumentException($"\"{str}\" matches multiple potential ftypes: {string.Join(",", matches)}"); | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| @@ -1,29 +1,109 @@ | |||||
| using System; | |||||
| using System.Collections.Generic; | |||||
| using System.Text; | |||||
| namespace LLama.Native | |||||
| namespace LLama.Native | |||||
| { | { | ||||
| /// <summary> | |||||
| /// Supported model file types | |||||
| /// </summary> | |||||
| public enum LLamaFtype | public enum LLamaFtype | ||||
| { | { | ||||
| /// <summary> | |||||
| /// All f32 | |||||
| /// </summary> | |||||
| /// <remarks>Benchmark@7B: 26GB</remarks> | |||||
| LLAMA_FTYPE_ALL_F32 = 0, | LLAMA_FTYPE_ALL_F32 = 0, | ||||
| LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors | |||||
| LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors | |||||
| LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors | |||||
| LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 | |||||
| // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed | |||||
| // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed | |||||
| LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors | |||||
| LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors | |||||
| LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors | |||||
| LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors | |||||
| LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors | |||||
| LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors | |||||
| LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors | |||||
| LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors | |||||
| LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors | |||||
| LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors | |||||
| LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors | |||||
| LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors | |||||
| /// <summary> | |||||
| /// Mostly f16 | |||||
| /// </summary> | |||||
| /// <remarks>Benchmark@7B: 13GB</remarks> | |||||
| LLAMA_FTYPE_MOSTLY_F16 = 1, | |||||
| /// <summary> | |||||
| /// Mostly 8 bit | |||||
| /// </summary> | |||||
| /// <remarks>Benchmark@7B: 6.7GB, +0.0004ppl</remarks> | |||||
| LLAMA_FTYPE_MOSTLY_Q8_0 = 7, | |||||
| /// <summary> | |||||
| /// Mostly 4 bit | |||||
| /// </summary> | |||||
| /// <remarks>Benchmark@7B: 3.50GB, +0.2499 ppl</remarks> | |||||
| LLAMA_FTYPE_MOSTLY_Q4_0 = 2, | |||||
| /// <summary> | |||||
| /// Mostly 4 bit | |||||
| /// </summary> | |||||
| /// <remarks>Benchmark@7B: 3.90GB, +0.1846 ppl</remarks> | |||||
| LLAMA_FTYPE_MOSTLY_Q4_1 = 3, | |||||
| /// <summary> | |||||
| /// Mostly 4 bit, tok_embeddings.weight and output.weight are f16 | |||||
| /// </summary> | |||||
| LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, | |||||
| /// <summary> | |||||
| /// Mostly 5 bit | |||||
| /// </summary> | |||||
| /// <remarks>Benchmark@7B: 4.30GB @ 7B parameters, +0.0796 ppl</remarks> | |||||
| LLAMA_FTYPE_MOSTLY_Q5_0 = 8, | |||||
| /// <summary> | |||||
| /// Mostly 5 bit | |||||
| /// </summary> | |||||
| /// <remarks>Benchmark@7B: 4.70GB, +0.0415 ppl</remarks> | |||||
| LLAMA_FTYPE_MOSTLY_Q5_1 = 9, | |||||
| /// <summary> | |||||
| /// K-Quant 2 bit | |||||
| /// </summary> | |||||
| /// <remarks>Benchmark@7B: 2.67GB @ 7B parameters, +0.8698 ppl</remarks> | |||||
| LLAMA_FTYPE_MOSTLY_Q2_K = 10, | |||||
| /// <summary> | |||||
| /// K-Quant 3 bit (Small) | |||||
| /// </summary> | |||||
| /// <remarks>Benchmark@7B: 2.75GB, +0.5505 ppl</remarks> | |||||
| LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, | |||||
| /// <summary> | |||||
| /// K-Quant 3 bit (Medium) | |||||
| /// </summary> | |||||
| /// <remarks>Benchmark@7B: 3.06GB, +0.2437 ppl</remarks> | |||||
| LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, | |||||
| /// <summary> | |||||
| /// K-Quant 3 bit (Large) | |||||
| /// </summary> | |||||
| /// <remarks>Benchmark@7B: 3.35GB, +0.1803 ppl</remarks> | |||||
| LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, | |||||
| /// <summary> | |||||
| /// K-Quant 4 bit (Small) | |||||
| /// </summary> | |||||
| /// <remarks>Benchmark@7B: 3.56GB, +0.1149 ppl</remarks> | |||||
| LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, | |||||
| /// <summary> | |||||
| /// K-Quant 4 bit (Medium) | |||||
| /// </summary> | |||||
| /// <remarks>Benchmark@7B: 3.80GB, +0.0535 ppl</remarks> | |||||
| LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, | |||||
| /// <summary> | |||||
| /// K-Quant 5 bit (Small) | |||||
| /// </summary> | |||||
| /// <remarks>Benchmark@7B: 4.33GB, +0.0353 ppl</remarks> | |||||
| LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, | |||||
| /// <summary> | |||||
| /// K-Quant 5 bit (Medium) | |||||
| /// </summary> | |||||
| /// <remarks>Benchmark@7B: 4.45GB, +0.0142 ppl</remarks> | |||||
| LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, | |||||
| /// <summary> | |||||
| /// K-Quant 6 bit | |||||
| /// </summary> | |||||
| /// <remarks>Benchmark@7B: 5.15GB, +0.0044 ppl</remarks> | |||||
| LLAMA_FTYPE_MOSTLY_Q6_K = 18, | |||||
| } | } | ||||
| } | } | ||||
| @@ -1,25 +1,28 @@ | |||||
| using System; | |||||
| using System.Collections.Generic; | |||||
| using System.Runtime.InteropServices; | |||||
| using System.Text; | |||||
| using System.Runtime.InteropServices; | |||||
| namespace LLama.Native | namespace LLama.Native | ||||
| { | { | ||||
| /// <summary> | |||||
| /// Quantizer parameters used in the native API | |||||
| /// </summary> | |||||
| public struct LLamaModelQuantizeParams | public struct LLamaModelQuantizeParams | ||||
| { | { | ||||
| /// <summary> | /// <summary> | ||||
| /// number of threads to use for quantizing, if &lt;=0 will use std::thread::hardware_concurrency() | /// number of threads to use for quantizing, if &lt;=0 will use std::thread::hardware_concurrency() | ||||
| /// </summary> | /// </summary> | ||||
| public int nthread; | public int nthread; | ||||
| /// <summary> | /// <summary> | ||||
| /// quantize to this llama_ftype | /// quantize to this llama_ftype | ||||
| /// </summary> | /// </summary> | ||||
| public LLamaFtype ftype; | public LLamaFtype ftype; | ||||
| /// <summary> | /// <summary> | ||||
| /// allow quantizing non-f32/f16 tensors | /// allow quantizing non-f32/f16 tensors | ||||
| /// </summary> | /// </summary> | ||||
| [MarshalAs(UnmanagedType.I1)] | [MarshalAs(UnmanagedType.I1)] | ||||
| public bool allow_requantize; | public bool allow_requantize; | ||||
| /// <summary> | /// <summary> | ||||
| /// quantize output.weight | /// quantize output.weight | ||||
| /// </summary> | /// </summary> | ||||