Merge pull request #122 from martindevans/gguf

Add GGUF support
2 years ago · 4e83e48ad1
--- a/.gitignore
+++ b/.gitignore
@@ -344,4 +344,5 @@ test/TensorFlowNET.Examples/mnist
 site/

 /LLama.Unittest/Models/*.bin
 /LLama.Unittest/Models/*.gguf

--- a/LLama.Unittest/BasicTest.cs
+++ b/LLama.Unittest/BasicTest.cs
@@ -10,7 +10,7 @@ namespace LLama.Unittest

        public BasicTest()
        {
            _params = new ModelParams("Models/llama-2-7b-chat.ggmlv3.q3_K_S.bin")
            _params = new ModelParams(Constants.ModelPath)
            {
                ContextSize = 2048
            };
--- a/LLama.Unittest/Constants.cs
+++ b/LLama.Unittest/Constants.cs
@@ -0,0 +1,7 @@
 namespace LLama.Unittest
 {
    internal static class Constants
    {
        public static string ModelPath = "Models/llama-2-7b.q4_0.gguf";
    }
 }
--- a/LLama.Unittest/GrammarTest.cs
+++ b/LLama.Unittest/GrammarTest.cs
@@ -12,7 +12,7 @@ namespace LLama.Unittest

        public GrammarTest()
        {
            _params = new ModelParams("Models/llama-2-7b-chat.ggmlv3.q3_K_S.bin")
            _params = new ModelParams(Constants.ModelPath)
            {
                ContextSize = 2048,
            };
--- a/LLama.Unittest/LLama.Unittest.csproj
+++ b/LLama.Unittest/LLama.Unittest.csproj
@@ -24,7 +24,7 @@
  </ItemGroup>

  <Target Name="DownloadContentFiles" BeforeTargets="Build">
      <DownloadFile SourceUrl="https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q3_K_S.bin" DestinationFolder="Models" DestinationFileName="llama-2-7b-chat.ggmlv3.q3_K_S.bin" SkipUnchangedFiles="true">
      <DownloadFile SourceUrl="https://huggingface.co/narrative-bi/Llama-2-7B-GGUF/resolve/main/llama-2-7b.q4_0.gguf" DestinationFolder="Models" DestinationFileName="llama-2-7b.q4_0.gguf" SkipUnchangedFiles="true">
    </DownloadFile>
  </Target>

@@ -37,7 +37,7 @@
  </ItemGroup>

  <ItemGroup>
    <None Update="Models\llama-2-7b-chat.ggmlv3.q3_K_S.bin">
    <None Update="Models\llama-2-7b.q4_0.gguf">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
    </None>
  </ItemGroup>
--- a/LLama.Unittest/LLamaContextTests.cs
+++ b/LLama.Unittest/LLamaContextTests.cs
@@ -10,7 +10,7 @@ namespace LLama.Unittest

        public LLamaContextTests()
        {
            var @params = new ModelParams("Models/llama-2-7b-chat.ggmlv3.q3_K_S.bin")
            var @params = new ModelParams(Constants.ModelPath)
            {
                ContextSize = 768,
            };
--- a/LLama.Unittest/LLamaEmbedderTests.cs
+++ b/LLama.Unittest/LLamaEmbedderTests.cs
@@ -5,7 +5,7 @@ namespace LLama.Unittest;
 public class LLamaEmbedderTests
    : IDisposable
 {
    private readonly LLamaEmbedder _embedder = new(new ModelParams("Models/llama-2-7b-chat.ggmlv3.q3_K_S.bin"));
    private readonly LLamaEmbedder _embedder = new(new ModelParams(Constants.ModelPath));

    public void Dispose()
    {
@@ -36,18 +36,19 @@ public class LLamaEmbedderTests
            Assert.Equal(expected[i], actual[i], epsilon);
    }

    [Fact]
    public void EmbedBasic()
    {
        var cat = _embedder.GetEmbeddings("cat");
    // todo: enable this one llama2 7B gguf is available
    //[Fact]
    //public void EmbedBasic()
    //{
    //    var cat = _embedder.GetEmbeddings("cat");

        Assert.NotNull(cat);
        Assert.NotEmpty(cat);
    //    Assert.NotNull(cat);
    //    Assert.NotEmpty(cat);

        // Expected value generate with llama.cpp embedding.exe
        var expected = new float[] { -0.127304f, -0.678057f, -0.085244f, -0.956915f, -0.638633f };
        AssertApproxStartsWith(expected, cat);
    }
    //    // Expected value generate with llama.cpp embedding.exe
    //    var expected = new float[] { -0.127304f, -0.678057f, -0.085244f, -0.956915f, -0.638633f };
    //    AssertApproxStartsWith(expected, cat);
    //}

    [Fact]
    public void EmbedCompare()
--- a/LLama.Unittest/ModelsParamsTests.cs
+++ b/LLama.Unittest/ModelsParamsTests.cs
@@ -14,7 +14,6 @@ namespace LLama.Unittest
                BatchSize = 17,
                ContextSize = 42,
                LoraAdapter = "adapter",
                GroupedQueryAttention = 7,
                Seed = 42,
                GpuLayerCount = 111
            };
@@ -33,7 +32,6 @@ namespace LLama.Unittest
                BatchSize = 17,
                ContextSize = 42,
                LoraAdapter = "adapter",
                GroupedQueryAttention = 7,
                Seed = 42,
                GpuLayerCount = 111
            };
@@ -47,21 +45,26 @@ namespace LLama.Unittest
            Assert.Equal(expected, actual);
        }

        private class NewtsonsoftEncodingConverter
            : Newtonsoft.Json.JsonConverter<Encoding>


        public class NewtsonsoftEncodingConverter : JsonConverter
        {
            public override void WriteJson(JsonWriter writer, Encoding? value, JsonSerializer serializer)
            public override bool CanConvert(Type objectType)
            {
                writer.WriteValue((string?)value?.WebName);
                return typeof(Encoding).IsAssignableFrom(objectType);
            }

            public override Encoding? ReadJson(JsonReader reader, Type objectType, Encoding? existingValue, bool hasExistingValue, JsonSerializer serializer)
            public override void WriteJson(JsonWriter writer, object value, JsonSerializer serializer)
            {
                var name = (string?)reader.Value;
                if (name == null)
                    return null;
                return Encoding.GetEncoding(name);
                writer.WriteValue(((Encoding)value).WebName);
            }

            public override object ReadJson(JsonReader reader, Type objectType, object existingValue, JsonSerializer serializer)
            {
                return Encoding.GetEncoding((string)reader.Value);
            }
        }


    }
 }
--- a/LLama.Unittest/StatelessExecutorTest.cs
+++ b/LLama.Unittest/StatelessExecutorTest.cs
@@ -13,7 +13,7 @@ namespace LLama.Unittest
        public StatelessExecutorTest(ITestOutputHelper testOutputHelper)
        {
            _testOutputHelper = testOutputHelper;
            _params = new ModelParams("Models/llama-2-7b-chat.ggmlv3.q3_K_S.bin")
            _params = new ModelParams(Constants.ModelPath)
            {
                ContextSize = 60,
                Seed = 1754
--- a/LLama.Web/Common/ModelOptions.cs
+++ b/LLama.Web/Common/ModelOptions.cs
@@ -88,16 +88,6 @@ namespace LLama.Web.Common
 		/// </summary>
 		public float[] TensorSplits { get; set; }

 		/// <summary>
 		/// Grouped-Query Attention
 		/// </summary>
 		public int GroupedQueryAttention { get; set; } = 1;

 		/// <summary>
 		/// RMS Norm Epsilon
 		/// </summary>
 		public float RmsNormEpsilon { get; set; } = 5e-6f;

 		/// <summary>
 		/// RoPE base frequency
 		/// </summary>
--- a/LLama/Abstractions/IModelParams.cs
+++ b/LLama/Abstractions/IModelParams.cs
@@ -98,16 +98,6 @@ namespace LLama.Abstractions
        /// </summary>
        float[]? TensorSplits { get; set; }

        /// <summary>
        /// Grouped-Query Attention
        /// </summary>
        int GroupedQueryAttention { get; set; }

        /// <summary>
        /// RMS Norm Epsilon
        /// </summary>
        float RmsNormEpsilon { get; set; }

        /// <summary>
        /// RoPE base frequency
        /// </summary>
--- a/LLama/Common/ModelParams.cs
+++ b/LLama/Common/ModelParams.cs
@@ -89,16 +89,6 @@ namespace LLama.Common
        /// </summary>
        public float[]? TensorSplits { get; set; }

 		/// <summary>
 		/// Grouped-Query Attention
 		/// </summary>
 		public int GroupedQueryAttention { get; set; } = 1;

 		/// <summary>
 		/// RMS Norm Epsilon
 		/// </summary>
 		public float RmsNormEpsilon { get; set; } = 5e-6f;

 		/// <summary>
 		/// RoPE base frequency
 		/// </summary>
@@ -153,8 +143,6 @@ namespace LLama.Common
        /// <param name="batchSize">Batch size for prompt processing (must be >=32 to use BLAS) (n_batch)</param>
        /// <param name="convertEosToNewLine">Whether to convert eos to newline during the inference.</param>
        /// <param name="embeddingMode">Whether to use embedding mode. (embedding) Note that if this is set to true, The LLamaModel won't produce text response anymore.</param>
        /// <param name="groupedQueryAttention">Grouped-Query Attention</param>
        /// <param name="rmsNormEpsilon">RMS Norm Epsilon</param>
        /// <param name="ropeFrequencyBase">RoPE base frequency.</param>
        /// <param name="ropeFrequencyScale">RoPE frequency scaling factor</param>
        /// <param name="mulMatQ">Use experimental mul_mat_q kernels</param>
@@ -165,7 +153,7 @@ namespace LLama.Common
                           bool useMemorymap = true, bool useMemoryLock = false, bool perplexity = false,
                           string loraAdapter = "", string loraBase = "", int threads = -1, int batchSize = 512,
                           bool convertEosToNewLine = false, bool embeddingMode = false,
                           int groupedQueryAttention = 1, float rmsNormEpsilon = 5e-6f, float ropeFrequencyBase = 10000.0f, float ropeFrequencyScale = 1f, bool mulMatQ = false,
                           float ropeFrequencyBase = 10000.0f, float ropeFrequencyScale = 1f, bool mulMatQ = false,
                           string encoding = "UTF-8")
        {
            ContextSize = contextSize;
@@ -182,8 +170,6 @@ namespace LLama.Common
            BatchSize = batchSize;
            ConvertEosToNewLine = convertEosToNewLine;
            EmbeddingMode = embeddingMode;
            GroupedQueryAttention = groupedQueryAttention;
            RmsNormEpsilon = rmsNormEpsilon;
            RopeFrequencyBase = ropeFrequencyBase;
            RopeFrequencyScale = ropeFrequencyScale;
            MulMatQ = mulMatQ;
--- a/LLama/Extensions/IModelParamsExtensions.cs
+++ b/LLama/Extensions/IModelParamsExtensions.cs
@@ -39,8 +39,6 @@ namespace LLama.Extensions
            result.logits_all = @params.Perplexity;
            result.embedding = @params.EmbeddingMode;
            result.low_vram = @params.LowVram;
            result.n_gqa = @params.GroupedQueryAttention;
            result.rms_norm_eps = @params.RmsNormEpsilon;
            result.rope_freq_base = @params.RopeFrequencyBase;
            result.rope_freq_scale = @params.RopeFrequencyScale;
            result.mul_mat_q = @params.MulMatQ;
--- a/LLama/LLamaContext.cs
+++ b/LLama/LLamaContext.cs
@@ -132,9 +132,10 @@ namespace LLama
        /// <returns></returns>
        public string DeTokenize(IEnumerable<llama_token> tokens)
        {
            StringBuilder sb = new();
            foreach(var token in tokens)
                sb.Append(_ctx.TokenToString(token, _encoding));
            var sb = new StringBuilder();
            foreach (var token in tokens)
                _ctx.TokenToString(token, _encoding, sb);

            return sb.ToString();
        }

@@ -365,7 +366,7 @@ namespace LLama
            }

            // Save the newline logit value
            var nl_token = NativeApi.llama_token_nl();
            var nl_token = NativeApi.llama_token_nl(_ctx);
            var nl_logit = logits[nl_token];

            // Convert logits into token candidates
--- a/LLama/LLamaEmbedder.cs
+++ b/LLama/LLamaEmbedder.cs
@@ -70,10 +70,6 @@ namespace LLama
        /// <exception cref="RuntimeError"></exception>
        public float[] GetEmbeddings(string text, bool addBos)
        {
            if (addBos)
            {
                text = text.Insert(0, " ");
            }

            var embed_inp_array = _ctx.Tokenize(text, addBos);

--- a/LLama/LLamaInstructExecutor.cs
+++ b/LLama/LLamaInstructExecutor.cs
@@ -5,6 +5,7 @@ using System;
 using System.Collections.Generic;
 using System.IO;
 using System.Linq;
 using System.Text;
 using System.Text.Json;
 using System.Text.Json.Serialization;

@@ -113,7 +114,6 @@ namespace LLama
            if (_is_prompt_run)
            {
                // When running the first input (prompt) in inteactive mode, we should specially process it.
                text = " " + text;
                _embed_inps = Context.Tokenize(text, true).ToList();
            }
            else
@@ -141,9 +141,10 @@ namespace LLama
            {
                if (args.Antiprompts is not null && args.Antiprompts.Count > 0)
                {
                    string last_output = "";
                    foreach (var id in _last_n_tokens)
                        last_output += Context.NativeHandle.TokenToString(id, Context.Encoding);
                    var last_output_builder = new StringBuilder();
                    foreach (var token in _last_n_tokens)
                        Context.NativeHandle.TokenToString(token, Context.Encoding, last_output_builder);
                    var last_output = last_output_builder.ToString();

                    foreach (var antiprompt in args.Antiprompts)
                    {
@@ -162,7 +163,7 @@ namespace LLama
                }
            }

            if (_embeds.Count > 0 && _embeds.Last() == NativeApi.llama_token_eos())
            if (_embeds.Count > 0 && _embeds.Last() == NativeApi.llama_token_eos(Context.NativeHandle))
            {
                args.WaitForInput = true;
            }
--- a/LLama/LLamaInteractExecutor.cs
+++ b/LLama/LLamaInteractExecutor.cs
@@ -7,6 +7,7 @@ using System.IO;
 using System.Linq;
 using System.Text.Json;
 using System.Text.Json.Serialization;
 using System.Text;

 namespace LLama
 {
@@ -25,7 +26,7 @@ namespace LLama
        /// <param name="context"></param>
        public InteractiveExecutor(LLamaContext context) : base(context)
        {
            _llama_token_newline = Context.NativeHandle.Tokenize("\n", false, Context.Encoding);
            _llama_token_newline = new [] { NativeApi.llama_token_nl(Context.NativeHandle) };
        }

        /// <inheritdoc />
@@ -103,7 +104,6 @@ namespace LLama
            if (_is_prompt_run)
            {
                // When running the first input (prompt) in inteactive mode, we should specially process it.
                text = " " + text;
                _embed_inps = Context.Tokenize(text, true).ToList();
            }
            else
@@ -132,11 +132,10 @@ namespace LLama
            {
                if (args.Antiprompts is not null && args.Antiprompts.Count > 0)
                {
                    string last_output = "";
                    foreach (var id in _last_n_tokens)
                    {
                        last_output += Context.NativeHandle.TokenToString(id, Context.Encoding);
                    }
                    var last_output_builder = new StringBuilder();
                    foreach (var token in _last_n_tokens)
                        Context.NativeHandle.TokenToString(token, Context.Encoding, last_output_builder);
                    var last_output = last_output_builder.ToString();

                    foreach (var antiprompt in args.Antiprompts)
                    {
@@ -154,7 +153,7 @@ namespace LLama
                }
            }

            if (_embeds.Count > 0 && _embeds.Last() == NativeApi.llama_token_eos())
            if (_embeds.Count > 0 && _embeds.Last() == NativeApi.llama_token_eos(Context.NativeHandle))
            {
                extraOutputs = new[] { " [end of text]\n" };
                return true;
@@ -215,7 +214,7 @@ namespace LLama

                _last_n_tokens.Enqueue(id);

                if (id == NativeApi.llama_token_eos())
                if (id == NativeApi.llama_token_eos(Context.NativeHandle))
                {
                    id = _llama_token_newline.First();
                    if (args.Antiprompts is not null && args.Antiprompts.Count > 0)
--- a/LLama/LLamaSharp.Runtime.targets
+++ b/LLama/LLamaSharp.Runtime.targets
@@ -32,11 +32,11 @@
            <Link>libllama.dylib</Link>
        </None>
        <None Include="$(MSBuildThisFileDirectory)runtimes/libllama-metal.dylib">
            <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
            <CopyToOutputDirectory>None</CopyToOutputDirectory>
            <Link>libllama-metal.dylib</Link>
        </None>
        <None Include="$(MSBuildThisFileDirectory)runtimes/ggml-metal.metal">
            <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
            <CopyToOutputDirectory>None</CopyToOutputDirectory>
            <Link>ggml-metal.metal</Link>
        </None>    
    </ItemGroup>
--- a/LLama/Native/LLamaContextParams.cs
+++ b/LLama/Native/LLamaContextParams.cs
@@ -31,16 +31,6 @@ namespace LLama.Native
        /// </summary>
        public int n_batch;

        /// <summary>
        /// grouped-query attention (TEMP - will be moved to model hparams)
        /// </summary>
        public int n_gqa;

        /// <summary>
        /// rms norm epsilon (TEMP - will be moved to model hparams)
        /// </summary>
        public float rms_norm_eps;

        /// <summary>
        /// number of layers to store in VRAM
        /// </summary>
@@ -82,8 +72,8 @@ namespace LLama.Native
        /// if true, reduce VRAM usage at the cost of performance
        /// </summary>
        public bool low_vram
        {
            get => Convert.ToBoolean(_low_vram);
        { 
            readonly get => Convert.ToBoolean(_low_vram);
            set => _low_vram = Convert.ToSByte(value);
        }
        private sbyte _low_vram;
@@ -92,8 +82,8 @@ namespace LLama.Native
        /// if true, use experimental mul_mat_q kernels
        /// </summary>
        public bool mul_mat_q
        {
            get => Convert.ToBoolean(_mul_mat_q);
        { 
            readonly get => Convert.ToBoolean(_mul_mat_q);
            set => _mul_mat_q = Convert.ToSByte(value);
        }
        private sbyte _mul_mat_q;
@@ -102,8 +92,8 @@ namespace LLama.Native
        /// use fp16 for KV cache
        /// </summary>
        public bool f16_kv
        {
            get => Convert.ToBoolean(_f16_kv);
        { 
            readonly get => Convert.ToBoolean(_f16_kv);
            set => _f16_kv = Convert.ToSByte(value);
        }
        private sbyte _f16_kv;
@@ -112,8 +102,8 @@ namespace LLama.Native
        /// the llama_eval() call computes all logits, not just the last one
        /// </summary>
        public bool logits_all
        {
            get => Convert.ToBoolean(_logits_all);
        { 
            readonly get => Convert.ToBoolean(_logits_all);
            set => _logits_all = Convert.ToSByte(value);
        }
        private sbyte _logits_all;
@@ -122,8 +112,8 @@ namespace LLama.Native
        /// only load the vocabulary, no weights
        /// </summary>
        public bool vocab_only
        {
            get => Convert.ToBoolean(_vocab_only);
        { 
            readonly get => Convert.ToBoolean(_vocab_only);
            set => _vocab_only = Convert.ToSByte(value);
        }
        private sbyte _vocab_only;
@@ -132,8 +122,8 @@ namespace LLama.Native
        /// use mmap if possible
        /// </summary>
        public bool use_mmap
        {
            get => Convert.ToBoolean(_use_mmap);
        { 
            readonly get => Convert.ToBoolean(_use_mmap);
            set => _use_mmap = Convert.ToSByte(value);
        }
        private sbyte _use_mmap;
@@ -142,8 +132,8 @@ namespace LLama.Native
        /// force system to keep model in RAM
        /// </summary>
        public bool use_mlock
        {
            get => Convert.ToBoolean(_use_mlock);
        { 
            readonly get => Convert.ToBoolean(_use_mlock);
            set => _use_mlock = Convert.ToSByte(value);
        }
        private sbyte _use_mlock;
@@ -152,8 +142,8 @@ namespace LLama.Native
        /// embedding mode only
        /// </summary>
        public bool embedding
        {
            get => Convert.ToBoolean(_embedding);
        { 
            readonly get => Convert.ToBoolean(_embedding);
            set => _embedding = Convert.ToSByte(value);
        }
        private sbyte _embedding;
--- a/LLama/Native/LLamaFtype.cs
+++ b/LLama/Native/LLamaFtype.cs
@@ -105,5 +105,10 @@
        /// </summary>
        /// <remarks>Benchmark@7B: 5.15GB, +0.0044 ppl</remarks>
        LLAMA_FTYPE_MOSTLY_Q6_K = 18,

        /// <summary>
        /// File type was not specified
        /// </summary>
        LLAMA_FTYPE_GUESSED = 1024
    }
 }
--- a/LLama/Native/NativeApi.cs
+++ b/LLama/Native/NativeApi.cs
@@ -11,12 +11,18 @@ namespace LLama.Native
 {
    using llama_token = Int32;

    /// <summary>
    /// Callback from llama.cpp with log messages
    /// </summary>
    /// <param name="level"></param>
    /// <param name="message"></param>
 	public delegate void LLamaLogCallback(ILLamaLogger.LogLevel level, string message);

    /// <summary>
    /// Direct translation of the llama.cpp API
    /// </summary>
 	public unsafe partial class NativeApi
    {
        public static readonly int LLAMA_MAX_DEVICES = 1;

        static NativeApi()
        {
            try
@@ -43,18 +49,43 @@ namespace LLama.Native
        [DllImport(libraryName, EntryPoint = "llama_mmap_supported", CallingConvention = CallingConvention.Cdecl)]
        public static extern bool llama_empty_call();

        /// <summary>
        /// Create a LLamaContextParams with default values
        /// </summary>
        /// <returns></returns>
        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
        public static extern LLamaContextParams llama_context_default_params();

        /// <summary>
        /// Create a LLamaModelQuantizeParams with default values
        /// </summary>
        /// <returns></returns>
        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
        public static extern LLamaModelQuantizeParams llama_model_quantize_default_params();

        /// <summary>
        /// Check if memory mapping is supported
        /// </summary>
        /// <returns></returns>
        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
        public static extern bool llama_mmap_supported();

        /// <summary>
        /// Check if memory lockingis supported
        /// </summary>
        /// <returns></returns>
        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
        public static extern bool llama_mlock_supported();

        /// <summary>
        /// Export a static computation graph for context of 511 and batch size of 1
        /// NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
        ///       parameters here to keep things simple
        /// IMPORTANT: do not use for anything else other than debugging and testing!
        /// </summary>
        /// <param name="ctx"></param>
        /// <param name="fname"></param>
        /// <returns></returns>
        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
        public static extern int llama_eval_export(SafeLLamaContextHandle ctx, string fname);

@@ -69,6 +100,13 @@ namespace LLama.Native
        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
        public static extern IntPtr llama_load_model_from_file(string path_model, LLamaContextParams @params);

        /// <summary>
        /// Create a new llama_context with the given model.
        /// Return value should always be wrapped in SafeLLamaContextHandle!
        /// </summary>
        /// <param name="model"></param>
        /// <param name="params"></param>
        /// <returns></returns>
        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
        public static extern IntPtr llama_new_context_with_model(SafeLlamaModelHandle model, LLamaContextParams @params);

@@ -81,7 +119,7 @@ namespace LLama.Native
        public static extern void llama_backend_init(bool numa);

        /// <summary>
        /// Frees all allocated memory
        /// Frees all allocated memory in the given llama_context
        /// </summary>
        /// <param name="ctx"></param>
        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
@@ -341,14 +379,26 @@ namespace LLama.Native
        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
        public static extern IntPtr llama_token_to_str(SafeLLamaContextHandle ctx, llama_token token);

        /// <summary>
        /// Get the "Beginning of sentence" token
        /// </summary>
        /// <returns></returns>
        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
        public static extern llama_token llama_token_bos();
        public static extern llama_token llama_token_bos(SafeLLamaContextHandle ctx);

        /// <summary>
        /// Get the "End of sentence" token
        /// </summary>
        /// <returns></returns>
        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
        public static extern llama_token llama_token_eos();
        public static extern llama_token llama_token_eos(SafeLLamaContextHandle ctx);

        /// <summary>
        /// Get the "new line" token
        /// </summary>
        /// <returns></returns>
        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
        public static extern llama_token llama_token_nl();
        public static extern llama_token llama_token_nl(SafeLLamaContextHandle ctx);

        /// <summary>
        /// Print out timing information for this context
@@ -377,7 +427,7 @@ namespace LLama.Native
        /// <param name="model"></param>
        /// <returns></returns>
        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
        public static extern int llama_n_vocab_from_model(SafeLlamaModelHandle model);
        public static extern int llama_model_n_vocab(SafeLlamaModelHandle model);

        /// <summary>
        /// Get the size of the context window for the model
@@ -385,7 +435,7 @@ namespace LLama.Native
        /// <param name="model"></param>
        /// <returns></returns>
        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
        public static extern int llama_n_ctx_from_model(SafeLlamaModelHandle model);
        public static extern int llama_model_n_ctx(SafeLlamaModelHandle model);

        /// <summary>
        /// Get the dimension of embedding vectors from this model
@@ -393,16 +443,18 @@ namespace LLama.Native
        /// <param name="model"></param>
        /// <returns></returns>
        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
        public static extern int llama_n_embd_from_model(SafeLlamaModelHandle model);
        public static extern int llama_model_n_embd(SafeLlamaModelHandle model);

        /// <summary>
        /// Convert a single token into text
        /// </summary>
        /// <param name="model"></param>
        /// <param name="llamaToken"></param>
        /// <returns></returns>
        /// <param name="buffer">buffer to write string into</param>
        /// <param name="length">size of the buffer</param>
        /// <returns>The length writte, or if the buffer is too small a negative that indicates the length required</returns>
        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
        public static extern byte* llama_token_to_str_with_model(SafeLlamaModelHandle model, int llamaToken);
        public static extern int llama_token_to_piece_with_model(SafeLlamaModelHandle model, int llamaToken, byte* buffer, int length);

        /// <summary>
        /// Convert text into tokens
--- a/LLama/Native/SafeLLamaContextHandle.cs
+++ b/LLama/Native/SafeLLamaContextHandle.cs
@@ -183,7 +183,7 @@ namespace LLama.Native
        /// <summary>
        /// Convert a token into a string
        /// </summary>
        /// <param name="token"></param>
        /// <param name="token">Token to decode into a string</param>
        /// <param name="encoding"></param>
        /// <returns></returns>
        public string TokenToString(int token, Encoding encoding)
@@ -192,13 +192,25 @@ namespace LLama.Native
        }

        /// <summary>
        /// Convert a token into a span of bytes that could be decoded into a string
        /// Append a single llama token to a string builder
        /// </summary>
        /// <param name="token"></param>
        /// <returns></returns>
        public ReadOnlySpan<byte> TokenToSpan(int token)
        /// <param name="token">Token to decode</param>
        /// <param name="encoding"></param>
        /// <param name="dest">string builder to append the result to</param>
        public void TokenToString(int token, Encoding encoding, StringBuilder dest)
        {
            ThrowIfDisposed().TokenToString(token, encoding, dest);
        }

        /// <summary>
        /// Convert a single llama token into bytes
        /// </summary>
        /// <param name="token">Token to decode</param>
        /// <param name="dest">A span to attempt to write into. If this is too small nothing will be written</param>
        /// <returns>The size of this token. **nothing will be written** if this is larger than `dest`</returns>
        public int TokenToSpan(int token, Span<byte> dest)
        {
            return ThrowIfDisposed().TokenToSpan(token);
            return ThrowIfDisposed().TokenToSpan(token, dest);
        }

        /// <summary>
--- a/LLama/Native/SafeLlamaModelHandle.cs
+++ b/LLama/Native/SafeLlamaModelHandle.cs
@@ -1,4 +1,5 @@
 using System;
 using System.Diagnostics;
 using System.Text;
 using LLama.Exceptions;

@@ -28,9 +29,9 @@ namespace LLama.Native
        internal SafeLlamaModelHandle(IntPtr handle)
            : base(handle)
        {
            VocabCount = NativeApi.llama_n_vocab_from_model(this);
            ContextSize = NativeApi.llama_n_ctx_from_model(this);
            EmbeddingSize = NativeApi.llama_n_embd_from_model(this);
            VocabCount = NativeApi.llama_model_n_vocab(this);
            ContextSize = NativeApi.llama_model_n_ctx(this);
            EmbeddingSize = NativeApi.llama_model_n_embd(this);
        }

        /// <inheritdoc />
@@ -82,17 +83,20 @@ namespace LLama.Native

        #region tokenize
        /// <summary>
        /// Convert a single llama token into string bytes
        /// Convert a single llama token into bytes
        /// </summary>
        /// <param name="llama_token"></param>
        /// <returns></returns>
        public ReadOnlySpan<byte> TokenToSpan(int llama_token)
        /// <param name="llama_token">Token to decode</param>
        /// <param name="dest">A span to attempt to write into. If this is too small nothing will be written</param>
        /// <returns>The size of this token. **nothing will be written** if this is larger than `dest`</returns>
        public int TokenToSpan(int llama_token, Span<byte> dest)
        {
            unsafe
            {
                var bytes = new ReadOnlySpan<byte>(NativeApi.llama_token_to_str_with_model(this, llama_token), int.MaxValue);
                var terminator = bytes.IndexOf((byte)0);
                return bytes.Slice(0, terminator);
                fixed (byte* destPtr = dest)
                {
                    var length = NativeApi.llama_token_to_piece_with_model(this, llama_token, destPtr, dest.Length);
                    return Math.Abs(length);
                }
            }
        }

@@ -104,16 +108,54 @@ namespace LLama.Native
        /// <returns></returns>
        public string TokenToString(int llama_token, Encoding encoding)
        {
            var span = TokenToSpan(llama_token);
            unsafe
            {
                var length = NativeApi.llama_token_to_piece_with_model(this, llama_token, null, 0);
                if (length == 0)
                    return "";

                Span<byte> bytes = stackalloc byte[-length];

            if (span.Length == 0)
                return "";
                fixed (byte* bytePtr = bytes)
                {
                    var written = NativeApi.llama_token_to_piece_with_model(this, llama_token, bytePtr, bytes.Length);
                    Debug.Assert(written == bytes.Length);

                    return encoding.GetString(bytePtr, bytes.Length);
                }
            }
        }

        /// <summary>
        /// Append a single llama token to a string builder
        /// </summary>
        /// <param name="llama_token">Token to decode</param>
        /// <param name="encoding"></param>
        /// <param name="dest">string builder to append the result to</param>
        public void TokenToString(int llama_token, Encoding encoding, StringBuilder dest)
        {
            unsafe
            {
                fixed (byte* ptr = &span[0])
                var length = NativeApi.llama_token_to_piece_with_model(this, llama_token, null, 0);
                if (length == 0)
                    return;

                Span<byte> bytes = stackalloc byte[-length];
                fixed (byte* bytePtr = bytes)
                {
                    return encoding.GetString(ptr, span.Length);
                    // Decode into bytes
                    var written = NativeApi.llama_token_to_piece_with_model(this, llama_token, bytePtr, bytes.Length);
                    Debug.Assert(written == bytes.Length);

                    // Decode into chars
                    var charCount = encoding.GetCharCount(bytePtr, bytes.Length);
                    Span<char> chars = stackalloc char[charCount];
                    fixed (char* charPtr = chars)
                        encoding.GetChars(bytePtr, bytes.Length, charPtr, chars.Length);

                    // Write it to the output
                    for (var i = 0; i < chars.Length; i++)
                        dest.Append(chars[i]);
                }
            }
        }
--- a/LLama/Native/SamplingApi.cs
+++ b/LLama/Native/SamplingApi.cs
@@ -1,8 +1,14 @@
 using System;

 #pragma warning disable IDE1006 // Naming Styles

 namespace LLama.Native
 {
    using llama_token = Int32;

    /// <summary>
    /// Direct translation of the llama.cpp sampling API
    /// </summary>
    public unsafe class SamplingApi
    {
        /// <summary>
@@ -140,6 +146,13 @@ namespace LLama.Native
            NativeApi.llama_sample_typical(ctx, ref st, p, min_keep);
        }

        /// <summary>
        /// Sample with temperature.
        /// As temperature increases, the prediction becomes diverse but also vulnerable to hallucinations -- generating tokens that are sensible but not factual
        /// </summary>
        /// <param name="ctx"></param>
        /// <param name="candidates"></param>
        /// <param name="temp"></param>
        public static void llama_sample_temperature(SafeLLamaContextHandle ctx, LLamaTokenDataArray candidates, float temp)
        {
            using var handle = LLamaTokenDataArrayNative.Create(candidates, out var st);
--- a/LLama/OldVersion/LLamaModel.cs
+++ b/LLama/OldVersion/LLamaModel.cs
@@ -634,7 +634,7 @@ namespace LLama.OldVersion
                        LLamaTokenDataArray candidates_p = new LLamaTokenDataArray(candidates);

                        // Apply penalties
                        float nl_logit = logits[NativeApi.llama_token_nl()];
                        float nl_logit = logits[NativeApi.llama_token_nl(_ctx)];
                        var last_n_repeat = Math.Min(Math.Min(_last_n_tokens.Count, repeat_last_n), _n_ctx);
                        SamplingApi.llama_sample_repetition_penalty(_ctx, candidates_p,
                            _last_n_tokens.Skip(_last_n_tokens.Count - last_n_repeat).ToArray(),
@@ -644,7 +644,7 @@ namespace LLama.OldVersion
                            (ulong)last_n_repeat, alpha_frequency, alpha_presence);
                        if (!penalize_nl)
                        {
                            logits[NativeApi.llama_token_nl()] = nl_logit;
                            logits[NativeApi.llama_token_nl(_ctx)] = nl_logit;
                        }

                        if (temp <= 0)
@@ -684,7 +684,7 @@ namespace LLama.OldVersion
                    }

                    // replace end of text token with newline token when in interactive mode
                    if (id == NativeApi.llama_token_eos() && _params.interactive && !_params.instruct)
                    if (id == NativeApi.llama_token_eos(_ctx) && _params.interactive && !_params.instruct)
                    {
                        id = _llama_token_newline[0];
                        if (_params.antiprompt.Count != 0)
@@ -760,7 +760,7 @@ namespace LLama.OldVersion
                        break;
                    }

                    if (_embed.Count > 0 && _embed.Last() == NativeApi.llama_token_eos())
                    if (_embed.Count > 0 && _embed.Last() == NativeApi.llama_token_eos(_ctx))
                    {
                        if (_params.instruct)
                        {
--- a/LLama/runtimes/ggml-metal.metal
+++ b/LLama/runtimes/ggml-metal.metal
--- a/LLama/runtimes/libllama-cuda11.dll
+++ b/LLama/runtimes/libllama-cuda11.dll
--- a/LLama/runtimes/libllama-cuda11.so
+++ b/LLama/runtimes/libllama-cuda11.so
--- a/LLama/runtimes/libllama-cuda12.dll
+++ b/LLama/runtimes/libllama-cuda12.dll
--- a/LLama/runtimes/libllama-cuda12.so
+++ b/LLama/runtimes/libllama-cuda12.so
--- a/LLama/runtimes/libllama-metal.dylib
+++ b/LLama/runtimes/libllama-metal.dylib
--- a/LLama/runtimes/libllama.dll
+++ b/LLama/runtimes/libllama.dll
--- a/LLama/runtimes/libllama.dylib
+++ b/LLama/runtimes/libllama.dylib
--- a/LLama/runtimes/libllama.so
+++ b/LLama/runtimes/libllama.so
--- a/docs/ContributingGuide.md
+++ b/docs/ContributingGuide.md
@@ -33,11 +33,11 @@ When adding the feature, please take care of the namespace and the naming conven

 ## Find the problem and fix the BUG

 If the issue is related to the LLM internal behaviors, such as endless generating the response, the best way to find the problem is to do comparison test between llama.cpp and LLamaSharp.
 If the issue is related to the LLM internal behaviour, such as endless generating the response, the best way to find the problem is to do comparison test between llama.cpp and LLamaSharp.

 You could use exactly the same prompt, the same model and the same parameters to run the inference in llama.cpp and LLamaSharp respectively to see if it's really a problem caused by the implementation in LLamaSharp.

 If the experiment showed that it worked well in llama.cpp but didn't in LLamaSharp, a the search for the problem could be started. While the reason of the problem could be various, the best way I think is to add log-print in the code of llama.cpp and use it in LLamaSharp after compilation. Thus, when running LLamaSharp, you could see what happened in the native library.
 If the experiment showed that it worked well in llama.cpp but didn't in LLamaSharp, a search for the problem could be started. While the reason of the problem could be various, the best way I think is to add log-print in the code of llama.cpp and use it in LLamaSharp after compilation. Thus, when running LLamaSharp, you could see what happened in the native library.

 After finding out the reason, a painful but happy process comes. When working on the BUG fix, there's only one rule to follow, that is keeping the examples working well. If the modification fixed the BUG but impact on other functions, it would not be a good fix.

--- a/docs/GetStarted.md
+++ b/docs/GetStarted.md
@@ -54,8 +54,16 @@ using LLama;
 string modelPath = "<Your model path>" // change it to your own model path
 var prompt = "Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\r\n\r\nUser: Hello, Bob.\r\nBob: Hello. How may I help you today?\r\nUser: Please tell me the largest city in Europe.\r\nBob: Sure. The largest city in Europe is Moscow, the capital of Russia.\r\nUser:"; // use the "chat-with-bob" prompt here.

 // Load model
 var parameters = new ModelParams(modelPath)
 {
    ContextSize = 1024
 };
 using var model = LLamaWeights.LoadFromFile(parameters);

 // Initialize a chat session
 var ex = new InteractiveExecutor(new LLamaModel(new ModelParams(modelPath, contextSize: 1024, seed: 1337, gpuLayerCount: 5)));
 using var context = model.CreateContext(parameters);
 var ex = new InteractiveExecutor(context);
 ChatSession session = new ChatSession(ex);

 // show the prompt
--- a/docs/Tricks.md
+++ b/docs/Tricks.md
@@ -1,11 +1,11 @@
 # Tricks for FAQ

 Sometimes, your application with LLM and LLamaSharp may have strange behaviors. Before opening an issue to report the BUG, the following tricks may worth a try.
 Sometimes, your application with LLM and LLamaSharp may have strange behaviours. Before opening an issue to report the BUG, the following tricks may worth a try.


 ## Carefully set the anti-prompts

 Anti-prompt can also be called as "Stop-keyword", which decides when to stop the response generation. Under interactive mode, the maximum tokens count is always not set, which makes the LLM generates responses infinitively. Therefore, setting anti-prompt correctly helps a lot to avoid the strange behaviors. For example, the prompt file `chat-with-bob.txt` has the following content:
 Anti-prompt can also be called as "Stop-keyword", which decides when to stop the response generation. Under interactive mode, the maximum tokens count is always not set, which makes the LLM generates responses infinitively. Therefore, setting anti-prompt correctly helps a lot to avoid the strange behaviours. For example, the prompt file `chat-with-bob.txt` has the following content:

 ```
 Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
@@ -19,7 +19,7 @@ User:

 Therefore, the anti-prompt should be set as "User:". If the last line of the prompt is removed, LLM will automatically generate a question (user) and a response (bob) for one time when running the chat session. Therefore, the antiprompt is suggested to be appended to the prompt when starting a chat session.

 What if an extra line is appended? The string "User:" in the prompt will be followed with a char "\n". Thus when running the model, the automatic generation of a pair of question and response may appear because the anti-prompt is "User:" but the last token is "User:\n". As for whether it will appear, it's an undefined behavior, which depends on the implementation inside the `LLamaExecutor`. Anyway, since it may leads to unexpected behaviors, it's recommended to trim your prompt or carefully keep consistent with your anti-prompt.
 What if an extra line is appended? The string "User:" in the prompt will be followed with a char "\n". Thus when running the model, the automatic generation of a pair of question and response may appear because the anti-prompt is "User:" but the last token is "User:\n". As for whether it will appear, it's an undefined behaviour, which depends on the implementation inside the `LLamaExecutor`. Anyway, since it may leads to unexpected behaviors, it's recommended to trim your prompt or carefully keep consistent with your anti-prompt.

 ## Pay attention to the length of prompt

@@ -37,7 +37,7 @@ If your chat bot has bad performance, trying different executor will possibly ma

 ## Choose models weight depending on you task

 The differences between modes may lead to much different behaviors under the same task. For example, if you're building a chat bot with non-English, a fine-tuned model specially for the language you want to use will have huge effect on the performance.
 The differences between modes may lead to much different behaviours under the same task. For example, if you're building a chat bot with non-English, a fine-tuned model specially for the language you want to use will have huge effect on the performance.

 ## Set the layer count you want to offload to GPU