Browse Source

Skipping tokenization of empty strings (saves allocating an empty array every time)

tags/v0.6.0
Martin Evans 2 years ago
parent
commit
daf09eae64
2 changed files with 27 additions and 0 deletions
  1. +24
    -0
      LLama.Unittest/LLamaContextTests.cs
  2. +3
    -0
      LLama/Native/SafeLLamaContextHandle.cs

+ 24
- 0
LLama.Unittest/LLamaContextTests.cs View File

@@ -32,5 +32,29 @@ namespace LLama.Unittest
Assert.Equal(32000, _context.VocabCount); Assert.Equal(32000, _context.VocabCount);
Assert.Equal(0, _context.KVCacheTokenCount); Assert.Equal(0, _context.KVCacheTokenCount);
} }

[Fact]
public void Tokenize()
{
    // Token ids the context is expected to produce for the sample sentence
    // when the second argument is true (presumably "add BOS": the result
    // carries an extra leading 1 compared with TokenizeWithoutBOS).
    int[] expected = { 1, 450, 4996, 17354, 1701, 29916 };

    var actual = _context.Tokenize("The quick brown fox", true);

    Assert.Equal(expected, actual);
}

[Fact]
public void TokenizeWithoutBOS()
{
    // Token ids the context is expected to produce for the sample sentence
    // when the second argument is false (no leading 1 in the result).
    int[] expected = { 450, 4996, 17354, 1701, 29916 };

    var actual = _context.Tokenize("The quick brown fox", false);

    Assert.Equal(expected, actual);
}

[Fact]
public void TokenizeEmpty()
{
    // Tokenizing an empty string with the second argument set to false
    // must yield a sequence equal to an empty int array.
    int[] expected = Array.Empty<int>();

    var actual = _context.Tokenize("", false);

    Assert.Equal(expected, actual);
}
} }
} }

+ 3
- 0
LLama/Native/SafeLLamaContextHandle.cs View File

@@ -146,6 +146,9 @@ namespace LLama.Native
{ {
ThrowIfDisposed(); ThrowIfDisposed();


if (string.IsNullOrEmpty(text) && !add_bos)
return Array.Empty<int>();

// Calculate number of bytes in string, this is a pessimistic estimate of token count. It can't // Calculate number of bytes in string, this is a pessimistic estimate of token count. It can't
// possibly be more than this. // possibly be more than this.
var count = encoding.GetByteCount(text) + (add_bos ? 1 : 0); var count = encoding.GetByteCount(text) + (add_bos ? 1 : 0);


Loading…
Cancel
Save