|
|
|
@@ -1,4 +1,6 @@ |
|
|
|
using System; |
|
|
|
using System.Buffers; |
|
|
|
using System.Text; |
|
|
|
using LLama.Exceptions; |
|
|
|
|
|
|
|
namespace LLama.Native |
|
|
|
@@ -57,5 +59,43 @@ namespace LLama.Native |
|
|
|
|
|
|
|
return new(ctx_ptr, model); |
|
|
|
} |
|
|
|
|
|
|
|
/// <summary>
/// Tokenize a piece of text using the native tokenizer
/// </summary>
/// <param name="text">Text which will be split into tokens</param>
/// <param name="add_bos">When true, one extra slot is reserved and the native call is asked to add the "BOS" token</param>
/// <param name="encoding">Encoding used to measure the text; also forwarded to the native call</param>
/// <returns>A new array whose length is exactly the token count reported by the native call</returns>
/// <exception cref="RuntimeError">Thrown when the native call reports a negative token count</exception>
public int[] Tokenize(string text, bool add_bos, Encoding encoding)
{
    // Worst case sizing: assume every encoded byte becomes its own token, plus one
    // additional slot when a BOS token has been requested.
    var capacity = encoding.GetByteCount(text);
    if (add_bos)
    {
        capacity++;
    }

    // Borrow a scratch buffer from the shared pool rather than allocating a large throwaway array.
    var pool = ArrayPool<int>.Shared;
    var scratch = pool.Rent(capacity);
    try
    {
        // Let the native side fill the scratch buffer; a negative return value signals failure.
        var produced = NativeApi.llama_tokenize(this, text, encoding, scratch, capacity, add_bos);
        if (produced < 0)
        {
            throw new RuntimeError("Error happened during tokenization. It's possibly caused by wrong encoding. Please try to " +
                                   "specify the encoding.");
        }

        // The rented buffer may be larger than needed, so hand back a right-sized copy.
        var tokens = new int[produced];
        Array.Copy(scratch, tokens, produced);
        return tokens;
    }
    finally
    {
        // Always give the scratch buffer back to the pool, even when tokenization failed.
        pool.Return(scratch);
    }
}
|
|
|
} |
|
|
|
} |