@@ -20,14 +20,22 @@ namespace LLama
         /// <param name="nthread">Number of threads to use during quantization. Defaults to the number of physical cores.</param>
         /// <returns>Whether the quantization succeeded.</returns>
         /// <exception cref="ArgumentException"></exception>
-        public static bool Quantize(string srcFileName, string dstFilename, LLamaFtype ftype, int nthread = -1)
+        public static unsafe bool Quantize(string srcFileName, string dstFilename, LLamaFtype ftype, int nthread = -1, bool allowRequantize = true,
+            bool quantizeOutputTensor = false)
         {
             if (!ValidateFtype(ftype))
             {
                 throw new ArgumentException($"The type {Enum.GetName(typeof(LLamaFtype), ftype)} is not a valid type " +
                                             $"to perform quantization.");
             }
-            return NativeApi.llama_model_quantize(srcFileName, dstFilename, ftype, nthread) == 0;
+            var quantizeParams = NativeApi.llama_model_quantize_default_params();
+            quantizeParams.ftype = ftype;
+            quantizeParams.nthread = nthread;
+            quantizeParams.allow_requantize = allowRequantize;
+            quantizeParams.quantize_output_tensor = quantizeOutputTensor;
+            LLamaModelQuantizeParams* p = &quantizeParams;
+            return NativeApi.llama_model_quantize(srcFileName, dstFilename, p) == 0;
         }

         /// <summary>
@@ -39,9 +47,10 @@ namespace LLama
         /// <param name="nthread">Number of threads to use during quantization. Defaults to the number of physical cores.</param>
         /// <returns>Whether the quantization succeeded.</returns>
         /// <exception cref="ArgumentException"></exception>
-        public static bool Quantize(string srcFileName, string dstFilename, string ftype, int nthread = -1)
+        public static bool Quantize(string srcFileName, string dstFilename, string ftype, int nthread = -1, bool allowRequantize = true,
+            bool quantizeOutputTensor = false)
         {
-            return Quantize(srcFileName, dstFilename, StringToFtype(ftype), nthread);
+            return Quantize(srcFileName, dstFilename, StringToFtype(ftype), nthread, allowRequantize, quantizeOutputTensor);
         }

         private static bool ValidateFtype(string ftype)
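For orientation, a minimal usage sketch of the extended overload above. The containing class is assumed here to be LLamaQuantizer (the class name is not visible in these hunks), and the file paths are placeholders:

    // Hypothetical paths; LLamaQuantizer is an assumed class name for the
    // static Quantize methods shown in the hunks above.
    bool ok = LLamaQuantizer.Quantize(
        srcFileName: "model-f16.bin",
        dstFilename: "model-q4_K_M.bin",
        ftype: LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_K_M,
        nthread: -1,                  // <= 0 lets llama.cpp pick std::thread::hardware_concurrency()
        allowRequantize: true,        // permit re-quantizing tensors that are not f32/f16
        quantizeOutputTensor: false); // keep output.weight at higher precision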
@@ -9,19 +9,44 @@ namespace LLama.Native
     [StructLayout(LayoutKind.Sequential)]
     public struct LLamaContextParams
     {
+        /// <summary>
+        /// RNG seed, -1 for random
+        /// </summary>
+        public int seed;
         /// <summary>
         /// text context
         /// </summary>
         public int n_ctx;
         /// <summary>
         /// prompt processing batch size
         /// </summary>
         public int n_batch;
         /// <summary>
         /// number of layers to store in VRAM
         /// </summary>
         public int n_gpu_layers;
         /// <summary>
-        /// RNG seed, -1 for random
+        /// the GPU that is used for scratch and small tensors
         /// </summary>
-        public int seed;
+        public int main_gpu;
+        /// <summary>
+        /// how to split layers across multiple GPUs
+        /// </summary>
+        public TensorSplits tensor_split;
+        /// <summary>
+        /// called with a progress value between 0 and 1, pass NULL to disable
+        /// </summary>
+        public IntPtr progress_callback;
+        /// <summary>
+        /// context pointer passed to the progress callback
+        /// </summary>
+        public IntPtr progress_callback_user_data;
+        /// <summary>
+        /// if true, reduce VRAM usage at the cost of performance
+        /// </summary>
+        [MarshalAs(UnmanagedType.I1)]
+        public bool low_vram;
         /// <summary>
         /// use fp16 for KV cache
@@ -52,14 +77,10 @@ namespace LLama.Native
         /// </summary>
         [MarshalAs(UnmanagedType.I1)]
         public bool embedding;
-        /// <summary>
-        /// called with a progress value between 0 and 1, pass NULL to disable
-        /// </summary>
-        public IntPtr progress_callback;
-        /// <summary>
-        /// context pointer passed to the progress callback
-        /// </summary>
-        public IntPtr progress_callback_user_data;
     }
+
+    public struct TensorSplits
+    {
+        public float Item1;
+    }
 }
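The single-field TensorSplits struct stands in for the native float[LLAMA_MAX_DEVICES] array; with LLAMA_MAX_DEVICES = 1 (declared in the NativeApi hunk further down) one float covers the only supported device. A hedged sketch of populating the new fields, with illustrative values only:

    // Illustrative values; real defaults come from llama_context_default_params().
    var p = NativeApi.llama_context_default_params();
    p.seed = -1;                                       // -1: random seed
    p.n_ctx = 2048;                                    // text context size
    p.n_gpu_layers = 32;                               // layers stored in VRAM
    p.main_gpu = 0;                                    // device for scratch and small tensors
    p.tensor_split = new TensorSplits { Item1 = 1f };  // the single device takes all layers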
@@ -16,5 +16,14 @@ namespace LLama.Native
         LLAMA_FTYPE_MOSTLY_Q8_0 = 7,    // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_0 = 8,    // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_1 = 9,    // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K = 10,   // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K = 18,   // except 1d tensors
     }
 }
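The string overload of Quantize relies on a StringToFtype helper that is referenced above but not shown in this diff; a hypothetical sketch of such a mapping, extended to cover the new K-quant entries (the actual name strings are assumptions):

    // Hypothetical helper; the repository's actual StringToFtype is not shown here.
    private static LLamaFtype StringToFtype(string str)
    {
        switch (str)
        {
            case "q8_0":   return LLamaFtype.LLAMA_FTYPE_MOSTLY_Q8_0;
            case "q5_0":   return LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_0;
            case "q5_1":   return LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_1;
            case "q2_K":   return LLamaFtype.LLAMA_FTYPE_MOSTLY_Q2_K;
            case "q3_K_S": return LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_S;
            case "q3_K_M": return LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_M;
            case "q3_K_L": return LLamaFtype.LLAMA_FTYPE_MOSTLY_Q3_K_L;
            case "q4_K_S": return LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_K_S;
            case "q4_K_M": return LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_K_M;
            case "q5_K_S": return LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_K_S;
            case "q5_K_M": return LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_K_M;
            case "q6_K":   return LLamaFtype.LLAMA_FTYPE_MOSTLY_Q6_K;
            default: throw new ArgumentException($"Unknown ftype name: {str}");
        }
    }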
@@ -0,0 +1,29 @@
+using System;
+using System.Collections.Generic;
+using System.Runtime.InteropServices;
+using System.Text;
+
+namespace LLama.Native
+{
+    public struct LLamaModelQuantizeParams
+    {
+        /// <summary>
+        /// number of threads to use for quantizing; if &lt;= 0, std::thread::hardware_concurrency() is used
+        /// </summary>
+        public int nthread;
+        /// <summary>
+        /// quantize to this llama_ftype
+        /// </summary>
+        public LLamaFtype ftype;
+        /// <summary>
+        /// allow quantizing non-f32/f16 tensors
+        /// </summary>
+        [MarshalAs(UnmanagedType.I1)]
+        public bool allow_requantize;
+        /// <summary>
+        /// quantize output.weight
+        /// </summary>
+        [MarshalAs(UnmanagedType.I1)]
+        public bool quantize_output_tensor;
+    }
+}
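Mirroring what the managed Quantize wrapper does internally, a minimal unsafe sketch of driving the native entry point with this struct directly (paths are placeholders):

    // Sketch only; mirrors the body of Quantize shown in the first hunk.
    unsafe
    {
        var qp = NativeApi.llama_model_quantize_default_params();
        qp.ftype = LLamaFtype.LLAMA_FTYPE_MOSTLY_Q5_K_M;
        qp.nthread = 0;                   // <= 0: std::thread::hardware_concurrency()
        qp.allow_requantize = false;      // reject inputs that are already quantized
        qp.quantize_output_tensor = true; // also quantize output.weight
        int rc = NativeApi.llama_model_quantize("model-f16.bin", "model-q5_K_M.bin", &qp);
        bool ok = rc == 0;                // 0 indicates success
    }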
@@ -17,6 +17,6 @@ namespace LLama.Native
         /// <remarks>not great API - very likely to change</remarks>
         /// <returns>Returns 0 on success</returns>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-        public static extern int llama_model_quantize(string fname_inp, string fname_out, LLamaFtype ftype, int nthread);
+        public static extern unsafe int llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param);
     }
 }
@@ -10,6 +10,7 @@ namespace LLama.Native
     using llama_token = Int32;

     public unsafe partial class NativeApi
     {
+        public static readonly int LLAMA_MAX_DEVICES = 1;
         static NativeApi()
         {
             try
@@ -34,6 +35,9 @@ namespace LLama.Native
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
         public static extern LLamaContextParams llama_context_default_params();

+        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+        public static extern LLamaModelQuantizeParams llama_model_quantize_default_params();
+
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
         public static extern bool llama_mmap_supported();