using System.Text;
using LLama.Common;
using LLama.Extensions;

namespace LLama.Unittest;

/// <summary>
/// Tests for matching the tail of a token sequence against query strings
/// (via <c>TokensEndsWithAnyString</c> and <c>AntipromptProcessor</c>) and for
/// tokenize/detokenize round-trips, using the model at <c>Constants.ModelPath</c>.
/// </summary>
public sealed class TokenTests
    : IDisposable
{
    /// <summary>Sentence tokenized by all of the "ends with" tests.</summary>
    private const string Sentence = "The cat sat on the edge of the mat";

    /// <summary>A query longer than <see cref="Sentence"/>, so it cannot be a suffix of it.</summary>
    private const string LongQuery = "this is an improbably long query to be using for this method";

    private readonly ModelParams _params;
    private readonly LLamaWeights _model;

    /// <summary>
    /// Loads the model weights once per test instance (xUnit creates a new instance per test).
    /// </summary>
    public TokenTests()
    {
        _params = new ModelParams(Constants.ModelPath)
        {
            ContextSize = 2048
        };
        _model = LLamaWeights.LoadFromFile(_params);
    }

    /// <summary>Disposes the model weights loaded in the constructor.</summary>
    public void Dispose()
    {
        _model.Dispose();
    }

    /// <summary>One of the queries ("the mat") is the end of the sentence, so a match is expected.</summary>
    [Fact]
    public void TokensEndWith()
    {
        var tokens = _model.NativeHandle.Tokenize(Sentence, false, true, Encoding.UTF8);

        var result = tokens.TokensEndsWithAnyString(new[]
        {
            "a fish",
            "the mat",
            LongQuery
        }, _model.NativeHandle, Encoding.UTF8);

        Assert.True(result);
    }

    /// <summary>The query "at" is only the tail of the final word "mat"; a match is still expected.</summary>
    [Fact]
    public void TokensEndSubstring()
    {
        var tokens = _model.NativeHandle.Tokenize(Sentence, false, true, Encoding.UTF8);

        // Cast selects the IList<string> overload rather than the generic one.
        var result = tokens.TokensEndsWithAnyString((IList<string>)new[]
        {
            "at",
        }, _model.NativeHandle, Encoding.UTF8);

        Assert.True(result);
    }

    /// <summary>
    /// None of the queries is a suffix of the sentence (one is the sentence minus its last
    /// character), so no match is expected.
    /// </summary>
    [Fact]
    public void TokensNotEndWith()
    {
        var tokens = _model.NativeHandle.Tokenize(Sentence, false, true, Encoding.UTF8);

        var result = tokens.TokensEndsWithAnyString((IList<string>)new[]
        {
            "a fish",
            "The cat sat on the edge of the ma",
            LongQuery
        }, _model.NativeHandle, Encoding.UTF8);

        Assert.False(result);
    }

    /// <summary>An empty query list never matches.</summary>
    [Fact]
    public void TokensNotEndWithNothing()
    {
        var tokens = _model.NativeHandle.Tokenize(Sentence, false, true, Encoding.UTF8);

        var result = tokens.TokensEndsWithAnyString((IList<string>)Array.Empty<string>(), _model.NativeHandle, Encoding.UTF8);

        Assert.False(result);
    }

    /// <summary>Same scenario as <see cref="TokensEndWith"/>, but through the decoder and <c>AntipromptProcessor</c>.</summary>
    [Fact]
    public void TokensEndWith2()
    {
        var tokens = _model.NativeHandle.Tokenize(Sentence, false, true, Encoding.UTF8);

        var decoder = new StreamingTokenDecoder(Encoding.UTF8, _model);
        decoder.AddRange(tokens);

        var processor = new AntipromptProcessor(new[]
        {
            "a fish",
            "the mat",
            LongQuery
        });
        var result = processor.Add(decoder.Read());

        Assert.True(result);
    }

    /// <summary>Same scenario as <see cref="TokensEndSubstring"/>, but through the decoder and <c>AntipromptProcessor</c>.</summary>
    [Fact]
    public void TokensEndSubstring2()
    {
        var tokens = _model.NativeHandle.Tokenize(Sentence, false, true, Encoding.UTF8);

        var decoder = new StreamingTokenDecoder(Encoding.UTF8, _model);
        decoder.AddRange(tokens);

        var processor = new AntipromptProcessor(new[] { "at" });
        var result = processor.Add(decoder.Read());

        Assert.True(result);
    }

    /// <summary>Same scenario as <see cref="TokensNotEndWith"/>, but through the decoder and <c>AntipromptProcessor</c>.</summary>
    [Fact]
    public void TokensNotEndWith2()
    {
        var tokens = _model.NativeHandle.Tokenize(Sentence, false, true, Encoding.UTF8);

        var decoder = new StreamingTokenDecoder(Encoding.UTF8, _model);
        decoder.AddRange(tokens);

        var processor = new AntipromptProcessor(new[]
        {
            "a fish",
            "The cat sat on the edge of the ma",
            LongQuery
        });
        var result = processor.Add(decoder.Read());

        Assert.False(result);
    }

    /// <summary>An <c>AntipromptProcessor</c> constructed without antiprompts never matches.</summary>
    [Fact]
    public void TokensNotEndWithNothing2()
    {
        var tokens = _model.NativeHandle.Tokenize(Sentence, false, true, Encoding.UTF8);

        var decoder = new StreamingTokenDecoder(Encoding.UTF8, _model);
        decoder.AddRange(tokens);

        var processor = new AntipromptProcessor();
        var result = processor.Add(decoder.Read());

        Assert.False(result);
    }

    /// <summary>
    /// Tokenizing and then converting back with <c>TokensToSpan</c> reproduces the input
    /// (ASCII, Hangul and emoji), apart from leading spaces.
    /// </summary>
    [Fact]
    public void RoundTrip()
    {
        var strings = new[]
        {
            "Hello world",
            "철수",
            "😀 😃 😄 😁 😆철수😅 😂 😊 😇 🙂 ",
        };

        // Scratch buffer reused for every input.
        var charsArr = new char[1024];

        foreach (var input in strings)
        {
            // Convert into llama tokens
            var tokens = _model.NativeHandle.Tokenize(input, false, false, Encoding.UTF8);

            // Convert tokens back into characters
            var chars = _model.NativeHandle.TokensToSpan(tokens, charsArr.AsSpan(), Encoding.UTF8);

            // llama.cpp adds a space to the start of strings, remove that
            var output = new string(chars).TrimStart(' ');

            // Check that the input equals the output
            Assert.Equal(input, output);
        }
    }

    /// <summary>
    /// Tokenizing and then feeding the tokens one at a time through a
    /// <c>StreamingTokenDecoder</c> reproduces the input, apart from leading spaces.
    /// </summary>
    [Fact]
    public void StreamingDecoderRoundTrip()
    {
        var decoder = new StreamingTokenDecoder(Encoding.UTF8, _model);

        var strings = new[]
        {
            "Hello world",
            "철수",
            "😀 😃 😄 😁 😆철수😅 😂 😊 😇 🙂 ",
        };

        foreach (var input in strings)
        {
            // The decoder is shared across inputs, so reset it before each one.
            decoder.Reset();

            // Convert into llama tokens
            var tokens = _model.NativeHandle.Tokenize(input, false, false, Encoding.UTF8);

            // Add tokens to decoder
            foreach (var token in tokens)
            {
                decoder.Add(token);
            }

            // llama.cpp adds a space to the start of strings, remove that
            var output = decoder.Read().TrimStart(' ');

            // Check that the input equals the output
            Assert.Equal(input, output);
        }
    }
}