Added the `special` parameter to the tokenizer (introduced in https://github.com/ggerganov/llama.cpp/pull/3538)

2 years ago · 1f8c94e386
--- a/LLama.Unittest/LLamaContextTests.cs
+++ b/LLama.Unittest/LLamaContextTests.cs
@@ -37,7 +37,7 @@ namespace LLama.Unittest
        {
            var tokens = _context.Tokenize("The quick brown fox", true);

            Assert.Equal(new[] { 1, 1576, 4996, 17354, 1701, 29916 }, tokens);
            Assert.Equal(new[] { 1, 450, 4996, 17354, 1701, 29916 }, tokens);
        }

        [Fact]
@@ -45,7 +45,7 @@ namespace LLama.Unittest
        {
            var tokens = _context.Tokenize("The quick brown fox", false);

            Assert.Equal(new[] { 1576, 4996, 17354, 1701, 29916 }, tokens);
            Assert.Equal(new[] { 450, 4996, 17354, 1701, 29916 }, tokens);
        }

        [Fact]
--- a/LLama.Unittest/StatelessExecutorTest.cs
+++ b/LLama.Unittest/StatelessExecutorTest.cs
@@ -54,7 +54,7 @@ namespace LLama.Unittest
            // with a modified context
            var @params = new InferenceParams()
            {
                MaxTokens = 70,
                MaxTokens = 65,
                TokensKeep = question.Length,
            };

--- a/LLama.Unittest/TokenTests.cs
+++ b/LLama.Unittest/TokenTests.cs
@@ -27,7 +27,7 @@ public sealed class TokenTests
    [Fact]
    public void TokensEndWith()
    {
        var tokens = _model.NativeHandle.Tokenize("The cat sat on the edge of the mat", false, Encoding.UTF8);
        var tokens = _model.NativeHandle.Tokenize("The cat sat on the edge of the mat", false, true, Encoding.UTF8);

        var result = tokens.TokensEndsWithAnyString(new[]
        {
@@ -41,7 +41,7 @@ public sealed class TokenTests
    [Fact]
    public void TokensEndSubstring()
    {
        var tokens = _model.NativeHandle.Tokenize("The cat sat on the edge of the mat", false, Encoding.UTF8);
        var tokens = _model.NativeHandle.Tokenize("The cat sat on the edge of the mat", false, true, Encoding.UTF8);

        var result = tokens.TokensEndsWithAnyString((IList<string>)new[]
        {
@@ -53,7 +53,7 @@ public sealed class TokenTests
    [Fact]
    public void TokensNotEndWith()
    {
        var tokens = _model.NativeHandle.Tokenize("The cat sat on the edge of the mat", false, Encoding.UTF8);
        var tokens = _model.NativeHandle.Tokenize("The cat sat on the edge of the mat", false, true, Encoding.UTF8);

        var result = tokens.TokensEndsWithAnyString((IList<string>)new[]
        {
@@ -67,7 +67,7 @@ public sealed class TokenTests
    [Fact]
    public void TokensNotEndWithNothing()
    {
        var tokens = _model.NativeHandle.Tokenize("The cat sat on the edge of the mat", false, Encoding.UTF8);
        var tokens = _model.NativeHandle.Tokenize("The cat sat on the edge of the mat", false, true, Encoding.UTF8);

        var result = tokens.TokensEndsWithAnyString((IList<string>)Array.Empty<string>(), _model.NativeHandle, Encoding.UTF8);
        Assert.False(result);
--- a/LLama/LLamaContext.cs
+++ b/LLama/LLamaContext.cs
@@ -92,10 +92,11 @@ namespace LLama
        /// </summary>
        /// <param name="text"></param>
        /// <param name="addBos">Whether to add a bos to the text.</param>
        /// <param name="special">Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.</param>
        /// <returns></returns>
        public llama_token[] Tokenize(string text, bool addBos = true)
        public llama_token[] Tokenize(string text, bool addBos = true, bool special = false)
        {
            return _ctx.Tokenize(text, addBos, _encoding);
            return _ctx.Tokenize(text, addBos, special, _encoding);
        }

        /// <summary>
--- a/LLama/Native/NativeApi.cs
+++ b/LLama/Native/NativeApi.cs
@@ -284,10 +284,11 @@ namespace LLama.Native
        /// <param name="tokens"></param>
        /// <param name="n_max_tokens"></param>
        /// <param name="add_bos"></param>
        /// <param name="special">Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. Does not insert a leading space.</param>
        /// <returns>Returns the number of tokens on success, no more than n_max_tokens.
        /// Returns a negative number on failure - the number of tokens that would have been returned
        /// </returns>
        public static int llama_tokenize(SafeLLamaContextHandle ctx, string text, Encoding encoding, llama_token[] tokens, int n_max_tokens, bool add_bos)
        public static int llama_tokenize(SafeLLamaContextHandle ctx, string text, Encoding encoding, llama_token[] tokens, int n_max_tokens, bool add_bos, bool special)
        {
            // Calculate number of bytes in text and borrow an array that large (+1 for nul byte)
            var byteCount = encoding.GetByteCount(text);
@@ -307,7 +308,7 @@ namespace LLama.Native
                // Do the actual tokenization
                fixed (byte* arrayPtr = array)
                fixed (llama_token* tokensPtr = tokens)
                    return llama_tokenize(ctx.ModelHandle, arrayPtr, byteCount, tokensPtr, n_max_tokens, add_bos);
                    return llama_tokenize(ctx.ModelHandle, arrayPtr, byteCount, tokensPtr, n_max_tokens, add_bos, special);
            }
            finally
            {
@@ -454,11 +455,12 @@ namespace LLama.Native
        /// <param name="tokens"></param>
        /// <param name="n_max_tokens"></param>
        /// <param name="add_bos"></param>
        /// <param name="special">Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. Does not insert a leading space.</param>
        /// <returns>Returns the number of tokens on success, no more than n_max_tokens.
        /// Returns a negative number on failure - the number of tokens that would have been returned
        /// </returns>
        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
        public static extern int llama_tokenize(SafeLlamaModelHandle model, byte* text, int text_len, int* tokens, int n_max_tokens, bool add_bos);
        public static extern int llama_tokenize(SafeLlamaModelHandle model, byte* text, int text_len, int* tokens, int n_max_tokens, bool add_bos, bool special);

        /// <summary>
        /// Register a callback to receive llama log messages
--- a/LLama/Native/SafeLLamaContextHandle.cs
+++ b/LLama/Native/SafeLLamaContextHandle.cs
@@ -122,9 +122,10 @@ namespace LLama.Native
        /// <param name="text">The text to tokenize</param>
        /// <param name="add_bos">Whether the "BOS" token should be added</param>
        /// <param name="encoding">Encoding to use for the text</param>
        /// <param name="special">Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.</param>
        /// <returns></returns>
        /// <exception cref="RuntimeError"></exception>
        public int[] Tokenize(string text, bool add_bos, Encoding encoding)
        public int[] Tokenize(string text, bool add_bos, bool special, Encoding encoding)
        {
            ThrowIfDisposed();

@@ -140,7 +141,7 @@ namespace LLama.Native
            try
            {
                // Do the actual conversion
                var n = NativeApi.llama_tokenize(this, text, encoding, temporaryArray, count, add_bos);
                var n = NativeApi.llama_tokenize(this, text, encoding, temporaryArray, count, add_bos, special);
                if (n < 0)
                {
                    throw new RuntimeError("Error happened during tokenization. It's possibly caused by wrong encoding. Please try to " +
--- a/LLama/Native/SafeLlamaModelHandle.cs
+++ b/LLama/Native/SafeLlamaModelHandle.cs
@@ -271,8 +271,9 @@ namespace LLama.Native
        /// <param name="text"></param>
        /// <param name="add_bos"></param>
        /// <param name="encoding"></param>
        /// <param name="special">Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.</param>
        /// <returns></returns>
        public int[] Tokenize(string text, bool add_bos, Encoding encoding)
        public int[] Tokenize(string text, bool add_bos, bool special, Encoding encoding)
        {
            // Convert string to bytes, adding one extra byte to the end (null terminator)
            var bytesCount = encoding.GetByteCount(text);
@@ -291,13 +292,13 @@ namespace LLama.Native
                fixed (byte* bytesPtr = &bytes[0])
                {
                    // Tokenize once with no output, to get the token count. Output will be negative (indicating that there was insufficient space)
                    var count = -NativeApi.llama_tokenize(this, bytesPtr, bytesCount, (int*)IntPtr.Zero, 0, add_bos);
                    var count = -NativeApi.llama_tokenize(this, bytesPtr, bytesCount, (int*)IntPtr.Zero, 0, add_bos, special);

                    // Tokenize again, this time outputting into an array of exactly the right size
                    var tokens = new int[count];
                    fixed (int* tokensPtr = &tokens[0])
                    {
                        NativeApi.llama_tokenize(this, bytesPtr, bytesCount, tokensPtr, count, add_bos);
                        NativeApi.llama_tokenize(this, bytesPtr, bytesCount, tokensPtr, count, add_bos, special);
                        return tokens;
                    }
                }