diff --git a/LLama.Unittest/LLamaContextTests.cs b/LLama.Unittest/LLamaContextTests.cs
index d2947624..198511f1 100644
--- a/LLama.Unittest/LLamaContextTests.cs
+++ b/LLama.Unittest/LLamaContextTests.cs
@@ -32,5 +32,32 @@ namespace LLama.Unittest
             Assert.Equal(32000, _context.VocabCount);
             Assert.Equal(0, _context.KVCacheTokenCount);
         }
+
+        // add_bos = true: the expected sequence carries a leading token id 1.
+        [Fact]
+        public void Tokenize()
+        {
+            var tokens = _context.Tokenize("The quick brown fox", true);
+
+            Assert.Equal(new[] { 1, 450, 4996, 17354, 1701, 29916 }, tokens);
+        }
+
+        // add_bos = false: same text, but the expected sequence has no leading token id 1.
+        [Fact]
+        public void TokenizeWithoutBOS()
+        {
+            var tokens = _context.Tokenize("The quick brown fox", false);
+
+            Assert.Equal(new[] { 450, 4996, 17354, 1701, 29916 }, tokens);
+        }
+
+        // Empty text with add_bos = false must produce an empty token array.
+        [Fact]
+        public void TokenizeEmpty()
+        {
+            var tokens = _context.Tokenize("", false);
+
+            Assert.Equal(Array.Empty<int>(), tokens);
+        }
     }
 }
diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs
index 228ccde3..26fd011b 100644
--- a/LLama/Native/SafeLLamaContextHandle.cs
+++ b/LLama/Native/SafeLLamaContextHandle.cs
@@ -146,6 +146,10 @@ namespace LLama.Native
         {
             ThrowIfDisposed();
 
+            // Null/empty text with no BOS requested yields no tokens, so return early and skip the work below.
+            if (string.IsNullOrEmpty(text) && !add_bos)
+                return Array.Empty<int>();
+
             // Calculate number of bytes in string, this is a pessimistic estimate of token count. It can't
             // possibly be more than this.
             var count = encoding.GetByteCount(text) + (add_bos ? 1 : 0);