Browse Source

Added lots of comments to all the LLamaFtype variants

tags/v0.5.1
Martin Evans 2 years ago
parent
commit
acd91341e6
2 changed files with 110 additions and 27 deletions
  1. +103
    -23
      LLama/Native/LLamaFtype.cs
  2. +7
    -4
      LLama/Native/LLamaModelQuantizeParams.cs

+ 103
- 23
LLama/Native/LLamaFtype.cs View File

@@ -1,29 +1,109 @@
using System;
using System.Collections.Generic;
using System.Text;

namespace LLama.Native
{
    /// <summary>
    /// Supported model file types
    /// </summary>
    /// <remarks>
    /// Values mirror the native <c>llama_ftype</c> enum in llama.cpp and must not be
    /// renumbered. Members are grouped by quantization family rather than by value.
    /// Benchmarks quoted below are for a 7B parameter model; "ppl" is the perplexity
    /// increase relative to full f16 precision.
    /// </remarks>
    public enum LLamaFtype
    {
        /// <summary>
        /// All f32
        /// </summary>
        /// <remarks>Benchmark: 26GB @ 7B parameters</remarks>
        LLAMA_FTYPE_ALL_F32 = 0,

        /// <summary>
        /// Mostly f16
        /// </summary>
        /// <remarks>Benchmark: 13GB @ 7B parameters</remarks>
        LLAMA_FTYPE_MOSTLY_F16 = 1,

        /// <summary>
        /// Mostly 8 bit
        /// </summary>
        /// <remarks>Benchmark: 6.7GB @ 7B parameters, +0.0004ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q8_0 = 7,

        /// <summary>
        /// Mostly 4 bit
        /// </summary>
        /// <remarks>Benchmark: 3.50GB @ 7B parameters, +0.2499 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q4_0 = 2,

        /// <summary>
        /// Mostly 4 bit
        /// </summary>
        /// <remarks>Benchmark: 3.90GB @ 7B parameters, +0.1846 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q4_1 = 3,

        /// <summary>
        /// Mostly 4 bit, tok_embeddings.weight and output.weight are f16
        /// </summary>
        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4,

        // Values 5 (Q4_2) and 6 (Q4_3) are intentionally unused:
        // support for those formats was removed from llama.cpp.

        /// <summary>
        /// Mostly 5 bit
        /// </summary>
        /// <remarks>Benchmark: 4.30GB @ 7B parameters, +0.0796 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q5_0 = 8,

        /// <summary>
        /// Mostly 5 bit
        /// </summary>
        /// <remarks>Benchmark: 4.70GB @ 7B parameters, +0.0415 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q5_1 = 9,

        /// <summary>
        /// K-Quant 2 bit
        /// </summary>
        /// <remarks>Benchmark: 2.67GB @ 7B parameters, +0.8698 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q2_K = 10,

        /// <summary>
        /// K-Quant 3 bit (Small)
        /// </summary>
        /// <remarks>Benchmark: 2.75GB @ 7B parameters, +0.5505 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,

        /// <summary>
        /// K-Quant 3 bit (Medium)
        /// </summary>
        /// <remarks>Benchmark: 3.06GB @ 7B parameters, +0.2437 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,

        /// <summary>
        /// K-Quant 3 bit (Large)
        /// </summary>
        /// <remarks>Benchmark: 3.35GB @ 7B parameters, +0.1803 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,

        /// <summary>
        /// K-Quant 4 bit (Small)
        /// </summary>
        /// <remarks>Benchmark: 3.56GB @ 7B parameters, +0.1149 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,

        /// <summary>
        /// K-Quant 4 bit (Medium)
        /// </summary>
        /// <remarks>Benchmark: 3.80GB @ 7B parameters, +0.0535 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,

        /// <summary>
        /// K-Quant 5 bit (Small)
        /// </summary>
        /// <remarks>Benchmark: 4.33GB @ 7B parameters, +0.0353 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,

        /// <summary>
        /// K-Quant 5 bit (Medium)
        /// </summary>
        /// <remarks>Benchmark: 4.45GB @ 7B parameters, +0.0142 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,

        /// <summary>
        /// K-Quant 6 bit
        /// </summary>
        /// <remarks>Benchmark: 5.15GB @ 7B parameters, +0.0044 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q6_K = 18,
    }
}

+ 7
- 4
LLama/Native/LLamaModelQuantizeParams.cs View File

@@ -1,25 +1,28 @@
using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text;
using System.Runtime.InteropServices;


namespace LLama.Native namespace LLama.Native
{ {
/// <summary>
/// Quantizer parameters used in the native API
/// </summary>
public struct LLamaModelQuantizeParams public struct LLamaModelQuantizeParams
{ {
/// <summary> /// <summary>
/// number of threads to use for quantizing, if &lt;=0 will use std::thread::hardware_concurrency() /// number of threads to use for quantizing, if &lt;=0 will use std::thread::hardware_concurrency()
/// </summary> /// </summary>
public int nthread; public int nthread;

/// <summary> /// <summary>
/// quantize to this llama_ftype /// quantize to this llama_ftype
/// </summary> /// </summary>
public LLamaFtype ftype; public LLamaFtype ftype;

/// <summary> /// <summary>
/// allow quantizing non-f32/f16 tensors /// allow quantizing non-f32/f16 tensors
/// </summary> /// </summary>
[MarshalAs(UnmanagedType.I1)] [MarshalAs(UnmanagedType.I1)]
public bool allow_requantize; public bool allow_requantize;

/// <summary> /// <summary>
/// quantize output.weight /// quantize output.weight
/// </summary> /// </summary>


Loading…
Cancel
Save