using System;
using System.Runtime.InteropServices;

namespace LLama.Native
{
    /// <summary>
    /// Quantizer parameters used in the native API
    /// </summary>
    /// <remarks>llama_model_quantize_params</remarks>
    [StructLayout(LayoutKind.Sequential)]
    public struct LLamaModelQuantizeParams
    {
        /// <summary>
        /// number of threads to use for quantizing, if &lt;= 0 will use std::thread::hardware_concurrency()
        /// </summary>
        public int nthread;

        /// <summary>
        /// quantize to this llama_ftype
        /// </summary>
        public LLamaFtype ftype;

        /// <summary>
        /// output tensor type
        /// </summary>
        public GGMLType output_tensor_type;

        /// <summary>
        /// token embeddings tensor type
        /// </summary>
        public GGMLType token_embedding_type;

        /// <summary>
        /// allow quantizing non-f32/f16 tensors
        /// </summary>
        public bool allow_requantize
        {
            get => Convert.ToBoolean(_allow_requantize);
            set => _allow_requantize = Convert.ToSByte(value);
        }
        private sbyte _allow_requantize;

        /// <summary>
        /// quantize output.weight
        /// </summary>
        public bool quantize_output_tensor
        {
            get => Convert.ToBoolean(_quantize_output_tensor);
            set => _quantize_output_tensor = Convert.ToSByte(value);
        }
        private sbyte _quantize_output_tensor;

        /// <summary>
        /// only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
        /// </summary>
        public bool only_copy
        {
            get => Convert.ToBoolean(_only_copy);
            set => _only_copy = Convert.ToSByte(value);
        }
        private sbyte _only_copy;

        /// <summary>
        /// quantize all tensors to the default type
        /// </summary>
        public bool pure
        {
            get => Convert.ToBoolean(_pure);
            set => _pure = Convert.ToSByte(value);
        }
        private sbyte _pure;

        /// <summary>
        /// pointer to importance matrix data
        /// </summary>
        public IntPtr imatrix;

        /// <summary>
        /// pointer to vector containing overrides
        /// </summary>
        public IntPtr kv_overrides;

        /// <summary>
        /// Create a LLamaModelQuantizeParams with default values
        /// </summary>
        /// <returns>A LLamaModelQuantizeParams initialised by the native library</returns>
        public static LLamaModelQuantizeParams Default()
        {
            return llama_model_quantize_default_params();

            [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
            static extern LLamaModelQuantizeParams llama_model_quantize_default_params();
        }
    }
}
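
// Usage sketch (not part of the struct above): obtain the native defaults via
// Default(), then adjust individual fields before passing the struct to a
// quantize call. This is a minimal illustration; the LLamaFtype member name
// used below (LLAMA_FTYPE_MOSTLY_Q4_0) is assumed to exist in this library's
// LLamaFtype enum - substitute whichever target ftype you need.
//
//     var p = LLamaModelQuantizeParams.Default();
//     p.nthread = 0;                                 // <= 0: use all hardware threads
//     p.ftype = LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_0;  // target quantization format
//     p.quantize_output_tensor = true;               // also quantize output.weight
//     p.allow_requantize = false;                    // only quantize f32/f16 tensors
//
// Starting from Default() rather than new LLamaModelQuantizeParams() matters:
// the native library fills in its own defaults (including the boolean flags
// marshalled through the private sbyte fields), so only the fields you care
// about need to be set explicitly.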