diff --git a/LLama/Native/LLamaFtype.cs b/LLama/Native/LLamaFtype.cs index 41159ee2..b7b74bf5 100644 --- a/LLama/Native/LLamaFtype.cs +++ b/LLama/Native/LLamaFtype.cs @@ -1,29 +1,109 @@ -using System; -using System.Collections.Generic; -using System.Text; - -namespace LLama.Native +namespace LLama.Native { + /// + /// Supported model file types + /// public enum LLamaFtype { + /// + /// All f32 + /// + /// Benchmark: 26GB @ 7B parameters LLAMA_FTYPE_ALL_F32 = 0, - LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 - // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed - // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed - LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors + + /// + /// Mostly f16 + /// + /// Benchmark: 13GB @ 7B parameters + LLAMA_FTYPE_MOSTLY_F16 = 1, + + /// + /// Mostly 8 bit + /// + /// Benchmark: 6.7GB @ 7B parameters, +0.0004ppl + LLAMA_FTYPE_MOSTLY_Q8_0 = 7, + + /// + /// Mostly 4 bit + /// + /// Benchmark: 3.50GB @ 7B parameters, +0.2499 ppl + LLAMA_FTYPE_MOSTLY_Q4_0 = 2, + + /// + /// Mostly 4 bit + /// + /// Benchmark: 3.90GB @ 7B parameters, +0.1846 ppl + LLAMA_FTYPE_MOSTLY_Q4_1 = 3, + + /// + /// Mostly 4 bit, tok_embeddings.weight and output.weight are f16 + 
/// + LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, + + /// + /// Mostly 5 bit + /// + /// Benchmark: 4.30GB @ 7B parameters, +0.0796 ppl + LLAMA_FTYPE_MOSTLY_Q5_0 = 8, + + /// + /// Mostly 5 bit + /// + /// Benchmark: 4.70GB @ 7B parameters, +0.0415 ppl + LLAMA_FTYPE_MOSTLY_Q5_1 = 9, + + /// + /// K-Quant 2 bit + /// + /// Benchmark: 2.67GB @ 7B parameters, +0.8698 ppl + LLAMA_FTYPE_MOSTLY_Q2_K = 10, + + /// + /// K-Quant 3 bit (Small) + /// + /// Benchmark: 2.75GB @ 7B parameters, +0.5505 ppl + LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, + + /// + /// K-Quant 3 bit (Medium) + /// + /// Benchmark: 3.06GB @ 7B parameters, +0.2437 ppl + LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, + + /// + /// K-Quant 3 bit (Large) + /// + /// Benchmark: 3.35GB @ 7B parameters, +0.1803 ppl + LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, + + /// + /// K-Quant 4 bit (Small) + /// + /// Benchmark: 3.56GB @ 7B parameters, +0.1149 ppl + LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, + + /// + /// K-Quant 4 bit (Medium) + /// + /// Benchmark: 3.80GB @ 7B parameters, +0.0535 ppl + LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, + + /// + /// K-Quant 5 bit (Small) + /// + /// Benchmark: 4.33GB @ 7B parameters, +0.0353 ppl + LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, + + /// + /// K-Quant 5 bit (Medium) + /// + /// Benchmark: 4.45GB @ 7B parameters, +0.0142 ppl + LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, + + /// + /// K-Quant 6 bit + /// + /// Benchmark: 5.15GB @ 7B parameters, +0.0044 ppl + LLAMA_FTYPE_MOSTLY_Q6_K = 18, } } diff --git a/LLama/Native/LLamaModelQuantizeParams.cs b/LLama/Native/LLamaModelQuantizeParams.cs index 17ec035a..f23c1d2e 100644 --- a/LLama/Native/LLamaModelQuantizeParams.cs +++ b/LLama/Native/LLamaModelQuantizeParams.cs @@ -1,25 +1,28 @@ -using System; -using System.Collections.Generic; -using System.Runtime.InteropServices; -using System.Text; +using System.Runtime.InteropServices; namespace LLama.Native { + /// + /// Quantizer parameters used in the native API + /// public struct LLamaModelQuantizeParams { /// /// number of threads to use for quantizing, if <=0 will 
use std::thread::hardware_concurrency() /// public int nthread; + /// /// quantize to this llama_ftype /// public LLamaFtype ftype; + /// /// allow quantizing non-f32/f16 tensors /// [MarshalAs(UnmanagedType.I1)] public bool allow_requantize; + /// /// quantize output.weight ///