
LLamaEmbedder.cs 4.7 kB

April 2024 Binary Update (#662)

Updated binaries, using [this build](https://github.com/SciSharp/LLamaSharp/actions/runs/8654672719/job/23733195669) for llama.cpp commit `f7001ccc5aa359fcf41bba19d1c99c3d25c9bcc7`.

* Added all new functions.
* Moved some functions (e.g. `SafeLlamaModelHandle` specific functions) into `SafeLlamaModelHandle.cs`.
* Exposed tokens on `SafeLlamaModelHandle` and `LLamaWeights` through a `Tokens` property. As new special tokens are added in the future they can be added here.
* Changed all token properties to return nullable tokens, to handle some models not having some tokens.
* Fixed `DefaultSamplingPipeline` to handle no newline token in some models.
* Moved native methods to more specific locations:
  - Context specific things have been moved into `SafeLLamaContextHandle.cs` and made private - they're exposed through C# properties and methods already.
  - Checking that GPU layer count is zero if GPU offload is not supported.
  - Moved methods for creating default structs (`llama_model_quantize_default_params` and `llama_context_default_params`) into relevant structs.
* Removed exception if `GpuLayerCount > 0` when GPU is not supported.
* Added low level wrapper methods for new per-sequence state load/save in `SafeLLamaContextHandle`.
* Added high level wrapper methods (save/load with `State` object or memory mapped file) in `LLamaContext` (see the sketch after this list).
* Moved native methods for per-sequence state load/save into `SafeLLamaContextHandle`.
* Added update and defrag methods for KV cache in `SafeLLamaContextHandle`.
* Updated submodule to `f7001ccc5aa359fcf41bba19d1c99c3d25c9bcc7`.
* Passing the sequence ID when saving a single sequence state.
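The per-sequence state save/load mentioned in the commit notes could be used roughly as below. This is a minimal sketch, not confirmed LLamaSharp API: the `SaveState`/`LoadState` overloads taking a `LLamaSeqId` and the `"model.gguf"` path are assumptions based on the notes above.

using LLama;
using LLama.Common;
using LLama.Native;

// Sketch: save and restore the state of a single sequence, per the commit notes above.
// The per-sequence SaveState/LoadState overloads shown here are assumptions, not confirmed API.
var parameters = new ModelParams("model.gguf");
using var weights = LLamaWeights.LoadFromFile(parameters);
using var context = weights.CreateContext(parameters);

// ... decode some tokens on sequence 0 here ...

context.SaveState("seq0.bin", LLamaSeqId.Zero);  // assumed overload taking a sequence id
context.LoadState("seq0.bin", LLamaSeqId.Zero);  // later: restore only that sequence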
using LLama.Native;
using System;
using LLama.Exceptions;
using LLama.Abstractions;
using Microsoft.Extensions.Logging;
using System.Threading;
using System.Threading.Tasks;

namespace LLama
{
    /// <summary>
    /// The embedder for LLama, which supports getting embeddings from text.
    /// </summary>
    public sealed class LLamaEmbedder
        : IDisposable
    {
        /// <summary>
        /// Dimension of embedding vectors
        /// </summary>
        public int EmbeddingSize => Context.EmbeddingSize;

        /// <summary>
        /// LLama Context
        /// </summary>
        public LLamaContext Context { get; }

        /// <summary>
        /// Create a new embedder, using the given LLamaWeights
        /// </summary>
        /// <param name="weights"></param>
        /// <param name="params"></param>
        /// <param name="logger"></param>
        public LLamaEmbedder(LLamaWeights weights, IContextParams @params, ILogger? logger = null)
        {
            if (!@params.Embeddings)
                throw new ArgumentException("EmbeddingMode must be true", nameof(@params));

            Context = weights.CreateContext(@params, logger);
        }
        /// <summary>
        /// Get the embeddings of the text.
        /// </summary>
        /// <param name="text"></param>
        /// <param name="cancellationToken"></param>
        /// <returns></returns>
        /// <exception cref="RuntimeError"></exception>
        public Task<float[]> GetEmbeddings(string text, CancellationToken cancellationToken = default)
        {
            return GetEmbeddings(text, true, cancellationToken);
        }

        /// <summary>
        /// Get the embeddings of the text.
        /// </summary>
        /// <param name="text"></param>
        /// <param name="addBos">Add bos to the text.</param>
        /// <param name="cancellationToken"></param>
        /// <returns></returns>
        /// <exception cref="RuntimeError"></exception>
        public async Task<float[]> GetEmbeddings(string text, bool addBos, CancellationToken cancellationToken = default)
        {
            var tokens = Context.Tokenize(text, addBos);
            if (tokens.Length > Context.ContextSize)
                throw new ArgumentException($"Embedding prompt is longer than the context window ({tokens.Length} > {Context.ContextSize})", nameof(text));

            // Evaluate prompt in batch-size chunks
            var n_past = 0;
            var batch = new LLamaBatch();
            var batchSize = (int)Context.Params.BatchSize;
            for (var i = 0; i < tokens.Length; i += batchSize)
            {
                var n_eval = tokens.Length - i;
                if (n_eval > batchSize)
                    n_eval = batchSize;

                batch.Clear();
                batch.AddRange(tokens.AsSpan(i, n_eval), n_past, LLamaSeqId.Zero, true);
                n_past += n_eval;

                var returnCode = await Context.DecodeAsync(batch, cancellationToken);
                if (returnCode != 0)
                    throw new LLamaDecodeError(returnCode);
            }

            var embeddings = GetEmbeddingsArray();

            // Remove everything we just evaluated from the context cache
            Context.NativeHandle.KvCacheClear();

            // Normalize the embeddings vector
            // https://github.com/ggerganov/llama.cpp/blob/2891c8aa9af17f4ff636ff3868bc34ff72b56e25/examples/embedding/embedding.cpp#L92
            Normalize(embeddings);

            return embeddings;
        }
        // Read the embeddings for the last decoded batch from the native context,
        // falling back to per-sequence embeddings if whole-context embeddings are unavailable.
        private float[] GetEmbeddingsArray()
        {
            unsafe
            {
                var embeddings = NativeApi.llama_get_embeddings(Context.NativeHandle);

                if (embeddings == null)
                    embeddings = NativeApi.llama_get_embeddings_seq(Context.NativeHandle, LLamaSeqId.Zero);

                if (embeddings == null)
                    return Array.Empty<float>();

                return new Span<float>(embeddings, Context.EmbeddingSize).ToArray();
            }
        }
        private static void Normalize(Span<float> embeddings)
        {
            // Calculate length
            var lengthSqr = 0.0;
            foreach (var value in embeddings)
                lengthSqr += value * value;
            var length = (float)Math.Sqrt(lengthSqr);

            // Do not divide by length if it is zero
            if (length <= float.Epsilon)
                return;

            // Normalize
            for (var i = 0; i < embeddings.Length; i++)
                embeddings[i] /= length;
        }
        /// <inheritdoc />
        public void Dispose()
        {
            Context.Dispose();
        }
    }
}
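For reference, a minimal usage sketch of the class above. The model path is a placeholder, and the `Embeddings` property name on `ModelParams` is an assumption (older LLamaSharp versions call it `EmbeddingMode`); the constructor only requires that `IContextParams.Embeddings` is true.

using System;
using LLama;
using LLama.Common;

// Sketch: compute a normalized embedding for a piece of text with LLamaEmbedder.
// "model.gguf" is a placeholder path; the Embeddings property name is an assumption.
var parameters = new ModelParams("model.gguf") { Embeddings = true };
using var weights = LLamaWeights.LoadFromFile(parameters);
using var embedder = new LLamaEmbedder(weights, parameters);

float[] embedding = await embedder.GetEmbeddings("Hello, world!");
Console.WriteLine($"Got {embedding.Length} dimensions (EmbeddingSize = {embedder.EmbeddingSize})");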