using System;
using System.Runtime.InteropServices;

namespace LLama.Native
{
    /// <summary>
    /// Quantizer parameters used in the native API
    /// </summary>
    /// <remarks>llama_model_quantize_params</remarks>
    [StructLayout(LayoutKind.Sequential)]
    public struct LLamaModelQuantizeParams
    {
        /// <summary>
        /// number of threads to use for quantizing, if &lt;=0 will use std::thread::hardware_concurrency()
        /// </summary>
        public int nthread;

        /// <summary>
        /// quantize to this llama_ftype
        /// </summary>
        public LLamaFtype ftype;

        /// <summary>
        /// allow quantizing non-f32/f16 tensors
        /// </summary>
        public bool allow_requantize
        {
            get => Convert.ToBoolean(_allow_requantize);
            set => _allow_requantize = Convert.ToSByte(value);
        }
        // Backing field: the native struct uses a 1-byte bool, so an sbyte keeps this
        // struct blittable where a C# bool field would need explicit marshalling.
        // The same pattern is used for the other bool properties below.
        private sbyte _allow_requantize;

        /// <summary>
        /// quantize output.weight
        /// </summary>
        public bool quantize_output_tensor
        {
            get => Convert.ToBoolean(_quantize_output_tensor);
            set => _quantize_output_tensor = Convert.ToSByte(value);
        }
        private sbyte _quantize_output_tensor;

        /// <summary>
        /// only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
        /// </summary>
        public bool only_copy
        {
            get => Convert.ToBoolean(_only_copy);
            set => _only_copy = Convert.ToSByte(value);
        }
        private sbyte _only_copy;

        /// <summary>
        /// disable k-quant mixtures and quantize all tensors to the same type
        /// </summary>
        public bool pure
        {
            get => Convert.ToBoolean(_pure);
            set => _pure = Convert.ToSByte(value);
        }
        private sbyte _pure;

        /// <summary>
        /// pointer to importance matrix data
        /// </summary>
        public IntPtr imatrix;
    }
}
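
// Usage sketch (illustrative only, not part of this file's API). It assumes a
// NativeApi.llama_model_quantize(string, string, ref LLamaModelQuantizeParams)
// binding exists elsewhere in LLama.Native (mirroring llama_model_quantize in
// llama.cpp) and that LLamaFtype exposes a LLAMA_FTYPE_MOSTLY_Q4_0 member:
//
//     var p = new LLamaModelQuantizeParams
//     {
//         nthread = 0,                   // <= 0: native side picks hardware_concurrency()
//         ftype = LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_0,
//         allow_requantize = false,      // source tensors must be f32/f16
//         quantize_output_tensor = true, // also quantize output.weight
//         only_copy = false,
//         pure = false,
//         imatrix = IntPtr.Zero,         // no importance matrix data
//     };
//     var rc = NativeApi.llama_model_quantize("model-f16.gguf", "model-q4_0.gguf", ref p);
//     if (rc != 0)
//         throw new InvalidOperationException($"llama_model_quantize failed: {rc}");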