You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

DefaultTokenizer.cs 1.7 kB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253
  1. using System;
  2. using System.Collections.Generic;
  3. using System.Text;
  4. namespace LLama.Transform
  5. {
  /// <summary>
  /// The default tokenizer of LLamaSharp. This class is sealed and cannot be inherited.
  /// <b>Note that this class has state: every instance owns a single
  /// <see cref="StreamingTokenDecoder"/> that is shared by all <c>Detokenize</c> calls,
  /// so tokens previously fed to it will affect its later output.</b>
  /// If you use it in a session, please don't reuse it for another session unless you intend to do so.
  /// </summary>
  public sealed class DefaultTokenizer : ITokenizer
  {
    // The one decoder instance used by both Detokenize overloads; created once in the constructor.
    private readonly StreamingTokenDecoder _tokenDecoder;

    /// <summary>
    /// Initialize a new tokenizer with the specified encoding.
    /// </summary>
    /// <param name="encoding">The text encoding passed to the underlying <see cref="StreamingTokenDecoder"/>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="encoding"/> is <c>null</c>.</exception>
    public DefaultTokenizer(Encoding encoding)
    {
      // Fail fast with a descriptive exception instead of letting a null reach the decoder.
      if (encoding == null)
      {
        throw new ArgumentNullException(nameof(encoding));
      }

      _tokenDecoder = new StreamingTokenDecoder(encoding);
    }

    /// <inheritdoc/>
    public IEnumerable<int> Tokenize(LLamaContext context, string text, bool addBos = true, bool special = false)
    {
      // Pure delegation to the context; the decoder held by this instance is not touched here.
      return context.Tokenize(text, addBos, special);
    }

    /// <inheritdoc/>
    public string Detokenize(LLamaContext context, int token)
    {
      // Feed the single token to the shared decoder, then return whatever Read() yields.
      // NOTE(review): presumably Read() returns the text decoded so far — confirm against StreamingTokenDecoder.
      _tokenDecoder.Add(token, context.NativeHandle.ModelHandle);
      return _tokenDecoder.Read();
    }

    /// <inheritdoc/>
    public string Detokenize(LLamaContext context, IEnumerable<int> tokens)
    {
      // Same flow as the single-token overload, but the whole sequence is added in one call.
      _tokenDecoder.AddRange(tokens, context.NativeHandle.ModelHandle);
      return _tokenDecoder.Read();
    }
  }
  48. }