
SafeLLamaContextHandle.cs 17 kB

using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text;
using LLama.Exceptions;

namespace LLama.Native
{
    /// <summary>
    /// A safe wrapper around a llama_context
    /// </summary>
    // ReSharper disable once ClassNeverInstantiated.Global (used implicitly in native API)
    public sealed class SafeLLamaContextHandle
        : SafeLLamaHandleBase
    {
        #region properties and fields
        /// <summary>
        /// Total number of tokens in the vocabulary of this model
        /// </summary>
        public int VocabCount => ThrowIfDisposed().VocabCount;

        /// <summary>
        /// Total number of tokens in the context
        /// </summary>
        public uint ContextSize => NativeApi.llama_n_ctx(this);

        /// <summary>
        /// Dimension of embedding vectors
        /// </summary>
        public int EmbeddingSize => ThrowIfDisposed().EmbeddingSize;

        /// <summary>
        /// Get the maximum batch size for this context
        /// </summary>
        public uint BatchSize => NativeApi.llama_n_batch(this);

        /// <summary>
        /// Get the model which this context is using
        /// </summary>
        public SafeLlamaModelHandle ModelHandle => ThrowIfDisposed();

        private SafeLlamaModelHandle? _model;
        #endregion

        #region construction/destruction
        /// <inheritdoc />
        protected override bool ReleaseHandle()
        {
            llama_free(handle);
            SetHandle(IntPtr.Zero);

            // Decrement refcount on model
            _model?.DangerousRelease();
            _model = null!;

            return true;
        }

        private SafeLlamaModelHandle ThrowIfDisposed()
        {
            if (IsClosed)
                throw new ObjectDisposedException("Cannot use this `SafeLLamaContextHandle` - it has been disposed");
            if (_model == null || _model.IsClosed)
                throw new ObjectDisposedException("Cannot use this `SafeLLamaContextHandle` - `SafeLlamaModelHandle` has been disposed");

            return _model!;
        }

        /// <summary>
        /// Create a new llama_context for the given model
        /// </summary>
        /// <param name="model"></param>
        /// <param name="lparams"></param>
        /// <returns></returns>
        /// <exception cref="RuntimeError"></exception>
        public static SafeLLamaContextHandle Create(SafeLlamaModelHandle model, LLamaContextParams lparams)
        {
            var ctx = llama_new_context_with_model(model, lparams);
            if (ctx == null)
                throw new RuntimeError("Failed to create context from model");

            // Increment the model reference count while this context exists.
            // DangerousAddRef throws if it fails, so there is no need to check "success"
            ctx._model = model;
            var success = false;
            ctx._model.DangerousAddRef(ref success);

            return ctx;
        }
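
        // Usage sketch (illustrative, not part of this class): create a context from an already-loaded
        // model handle. The default-parameters helper and the `n_ctx` field name are assumptions taken
        // from the llama.cpp API surface; adjust to however defaults are obtained in this codebase.
        //
        //     var lparams = NativeApi.llama_context_default_params();   // assumed binding for llama_context_default_params
        //     lparams.n_ctx = 2048;                                     // assumed field; set the desired context length
        //     using var ctx = SafeLLamaContextHandle.Create(model, lparams);
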
        #endregion

        #region Native API
        static SafeLLamaContextHandle()
        {
            // This ensures that `NativeApi` has been loaded before calling the native methods below
            NativeApi.llama_empty_call();
        }

        /// <summary>
        /// Create a new llama_context with the given model. **This should never be called directly! Always use SafeLLamaContextHandle.Create**!
        /// </summary>
        /// <param name="model"></param>
        /// <param name="params"></param>
        /// <returns></returns>
        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
        private static extern SafeLLamaContextHandle llama_new_context_with_model(SafeLlamaModelHandle model, LLamaContextParams @params);

        /// <summary>
        /// Frees all allocated memory in the given llama_context
        /// </summary>
        /// <param name="ctx"></param>
        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
        private static extern void llama_free(IntPtr ctx);

        /// <summary>
        /// Set a callback which can abort computation
        /// </summary>
        /// <param name="ctx"></param>
        /// <param name="abort_callback"></param>
        /// <param name="abort_callback_data"></param>
        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
        private static extern unsafe void llama_set_abort_callback(SafeLLamaContextHandle ctx, GgmlAbortCallback abort_callback, void* abort_callback_data);

        /// <summary>
        /// If this returns true, computation is cancelled
        /// </summary>
        /// <param name="data"></param>
        /// <returns></returns>
        private unsafe delegate bool GgmlAbortCallback(void* data);
        #endregion

        /// <summary>
        /// Token logits obtained from the last call to llama_decode.
        /// The logits for the last token are stored in the last row.
        /// Can be mutated in order to change the probabilities of the next token.<br />
        /// Rows: n_tokens<br />
        /// Cols: n_vocab
        /// </summary>
        /// <returns></returns>
        public Span<float> GetLogits()
        {
            var model = ThrowIfDisposed();

            unsafe
            {
                var logits = NativeApi.llama_get_logits(this);
                return new Span<float>(logits, model.VocabCount);
            }
        }
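
        // Usage sketch (illustrative): the returned span aliases native memory, so mutating it changes
        // what downstream sampling sees. For example, a specific token could be banned before sampling:
        //
        //     var logits = ctx.GetLogits();
        //     logits[(int)bannedToken] = float.NegativeInfinity;   // `bannedToken` is a hypothetical LLamaToken
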
        /// <summary>
        /// Logits for the ith token. Equivalent to: llama_get_logits(ctx) + i*n_vocab
        /// </summary>
        /// <param name="i"></param>
        /// <returns></returns>
        public Span<float> GetLogitsIth(int i)
        {
            var model = ThrowIfDisposed();

            unsafe
            {
                var logits = NativeApi.llama_get_logits_ith(this, i);
                return new Span<float>(logits, model.VocabCount);
            }
        }

        #region tokens
        /// <summary>
        /// Convert the given text into tokens
        /// </summary>
        /// <param name="text">The text to tokenize</param>
        /// <param name="add_bos">Whether the "BOS" token should be added</param>
        /// <param name="special">Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.</param>
        /// <param name="encoding">Encoding to use for the text</param>
        /// <returns></returns>
        /// <exception cref="RuntimeError"></exception>
        public LLamaToken[] Tokenize(string text, bool add_bos, bool special, Encoding encoding)
        {
            return ThrowIfDisposed().Tokenize(text, add_bos, special, encoding);
        }

        /// <summary>
        /// Convert a single llama token into bytes
        /// </summary>
        /// <param name="token">Token to decode</param>
        /// <param name="dest">A span to attempt to write into. If this is too small nothing will be written</param>
        /// <returns>The size of this token. **nothing will be written** if this is larger than `dest`</returns>
        public uint TokenToSpan(LLamaToken token, Span<byte> dest)
        {
            return ThrowIfDisposed().TokenToSpan(token, dest);
        }
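
        // Usage sketch (illustrative): TokenToSpan writes nothing when the buffer is too small, so a
        // caller can grow the buffer and retry. UTF-8 is assumed here; use whatever encoding the model expects.
        //
        //     Span<byte> buffer = stackalloc byte[16];
        //     var size = ctx.TokenToSpan(token, buffer);
        //     if (size > buffer.Length)
        //     {
        //         buffer = new byte[size];
        //         size = ctx.TokenToSpan(token, buffer);
        //     }
        //     var text = Encoding.UTF8.GetString(buffer.Slice(0, (int)size));
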
        #endregion

        #region infer
        /// <summary>
        /// This object exists to ensure there is only ever 1 inference running at a time. This is a workaround for thread safety issues in llama.cpp itself.
        /// Most notably CUDA, which seems to use some global singleton resources and will crash if multiple inferences are run (even against different models).
        ///
        /// For more information see these issues:
        ///  - https://github.com/SciSharp/LLamaSharp/issues/596
        ///  - https://github.com/ggerganov/llama.cpp/issues/3960
        ///
        /// If these are ever resolved this lock can probably be removed.
        /// </summary>
        private static readonly object GlobalInferenceLock = new();

        /// <summary>
        /// Process a batch of tokens with this context (llama_decode)
        /// </summary>
        /// <param name="batch"></param>
        /// <returns>A positive return value does not mean a fatal error, but rather a warning:<br />
        ///  - 0: success<br />
        ///  - 1: could not find a KV slot for the batch (try reducing the size of the batch or increase the context)<br />
        ///  - &lt; 0: error<br />
        /// </returns>
        public DecodeResult Decode(LLamaBatch batch)
        {
            lock (GlobalInferenceLock)
            using (batch.ToNativeBatch(out var nb))
                return (DecodeResult)NativeApi.llama_decode(this, nb);
        }
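
        // Usage sketch (illustrative): fill a batch with prompt tokens (requesting logits only for the
        // last one), decode it, then read the logits for that final position. `prompt` (a token array)
        // and `seq` (a sequence id) are assumptions for the example.
        //
        //     var batch = new LLamaBatch();
        //     for (var i = 0; i < prompt.Length; i++)
        //         batch.Add(prompt[i], i, seq, logits: i == prompt.Length - 1);
        //
        //     if (ctx.Decode(batch) != DecodeResult.Ok)
        //         throw new RuntimeError("Decode failed");
        //
        //     var lastLogits = ctx.GetLogitsIth(batch.TokenCount - 1);
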
        /// <summary>
        /// Decode a set of tokens in batch-size chunks.
        /// </summary>
        /// <param name="tokens"></param>
        /// <param name="id"></param>
        /// <param name="batch"></param>
        /// <param name="n_past"></param>
        /// <returns>A tuple, containing the decode result and the number of tokens that have <b>not</b> been decoded yet.</returns>
        internal (DecodeResult, int) Decode(List<LLamaToken> tokens, LLamaSeqId id, LLamaBatch batch, ref int n_past)
        {
            var batchSize = checked((int)BatchSize);

            // Evaluate the prompt, in chunks smaller than the max batch size
            var n_left = tokens.Count;
            for (var i = 0; i < tokens.Count; i += batchSize)
            {
                var n_eval = tokens.Count - i;
                if (n_eval > batchSize)
                    n_eval = batchSize;

                batch.Clear();

                for (var j = 0; j < n_eval; j++)
                    batch.Add(tokens[i + j], n_past++, id, (i + j) == tokens.Count - 1);

                var returnCode = Decode(batch);
                if (returnCode != DecodeResult.Ok)
                    return (returnCode, n_left);

                n_left -= n_eval;
            }

            return (DecodeResult.Ok, 0);
        }
        #endregion

        #region state
        /// <summary>
        /// Get the size of the state, when saved as bytes
        /// </summary>
        public ulong GetStateSize()
        {
            return NativeApi.llama_get_state_size(this);
        }

        /// <summary>
        /// Get the raw state of this context, encoded as bytes. Data is written into the `dest` pointer.
        /// </summary>
        /// <param name="dest">Destination to write to</param>
        /// <param name="size">Number of bytes available to write to in dest (check required size with `GetStateSize()`)</param>
        /// <returns>The number of bytes written to dest</returns>
        /// <exception cref="ArgumentOutOfRangeException">Thrown if dest is too small</exception>
        public unsafe ulong GetState(byte* dest, ulong size)
        {
            return GetState(new IntPtr(dest), size);
        }

        /// <summary>
        /// Get the raw state of this context, encoded as bytes. Data is written into the `dest` pointer.
        /// </summary>
        /// <param name="dest">Destination to write to</param>
        /// <param name="size">Number of bytes available to write to in dest (check required size with `GetStateSize()`)</param>
        /// <returns>The number of bytes written to dest</returns>
        /// <exception cref="ArgumentOutOfRangeException">Thrown if dest is too small</exception>
        public ulong GetState(IntPtr dest, ulong size)
        {
            var required = GetStateSize();
            if (size < required)
                throw new ArgumentOutOfRangeException(nameof(size), $"Allocated space is too small, {size} < {required}");

            unsafe
            {
                return NativeApi.llama_copy_state_data(this, (byte*)dest.ToPointer());
            }
        }

        /// <summary>
        /// Set the raw state of this context
        /// </summary>
        /// <param name="src">The pointer to read the state from</param>
        /// <returns>Number of bytes read from the src pointer</returns>
        public unsafe ulong SetState(byte* src)
        {
            return SetState(new IntPtr(src));
        }

        /// <summary>
        /// Set the raw state of this context
        /// </summary>
        /// <param name="src">The pointer to read the state from</param>
        /// <returns>Number of bytes read from the src pointer</returns>
        public ulong SetState(IntPtr src)
        {
            unsafe
            {
                return NativeApi.llama_set_state_data(this, (byte*)src.ToPointer());
            }
        }
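
        // Usage sketch (illustrative): round-trip the context state through a managed buffer. The buffer
        // must be at least GetStateSize() bytes, and SetState should only be applied to a context created
        // with compatible parameters.
        //
        //     var buffer = new byte[ctx.GetStateSize()];
        //     unsafe
        //     {
        //         fixed (byte* ptr = buffer)
        //         {
        //             var written = ctx.GetState(ptr, (ulong)buffer.Length);
        //             // ... later, restore the saved state:
        //             ctx.SetState(ptr);
        //         }
        //     }
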
        #endregion

        /// <summary>
        /// Set the RNG seed
        /// </summary>
        /// <param name="seed"></param>
        public void SetSeed(uint seed)
        {
            NativeApi.llama_set_rng_seed(this, seed);
        }

        /// <summary>
        /// Set the number of threads used for decoding
        /// </summary>
        /// <param name="threads">n_threads is the number of threads used for generation (single token)</param>
        /// <param name="threadsBatch">n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)</param>
        public void SetThreads(uint threads, uint threadsBatch)
        {
            NativeApi.llama_set_n_threads(this, threads, threadsBatch);
        }

        #region KV Cache Management
        /// <summary>
        /// Get a new KV cache view that can be used to debug the KV cache
        /// </summary>
        /// <param name="maxSequences"></param>
        /// <returns></returns>
        public LLamaKvCacheViewSafeHandle KvCacheGetDebugView(int maxSequences = 4)
        {
            return LLamaKvCacheViewSafeHandle.Allocate(this, maxSequences);
        }

        /// <summary>
        /// Count the number of used cells in the KV cache (i.e. have at least one sequence assigned to them)
        /// </summary>
        /// <returns></returns>
        public int KvCacheCountCells()
        {
            return NativeApi.llama_get_kv_cache_used_cells(this);
        }

        /// <summary>
        /// Returns the number of tokens in the KV cache (slow, use only for debug).
        /// If a KV cell has multiple sequences assigned to it, it will be counted multiple times.
        /// </summary>
        /// <returns></returns>
        public int KvCacheCountTokens()
        {
            return NativeApi.llama_get_kv_cache_token_count(this);
        }

        /// <summary>
        /// Clear the KV cache
        /// </summary>
        public void KvCacheClear()
        {
            NativeApi.llama_kv_cache_clear(this);
        }

        /// <summary>
        /// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
        /// </summary>
        /// <param name="seq"></param>
        /// <param name="p0"></param>
        /// <param name="p1"></param>
        public void KvCacheRemove(LLamaSeqId seq, LLamaPos p0, LLamaPos p1)
        {
            NativeApi.llama_kv_cache_seq_rm(this, seq, p0, p1);
        }

        /// <summary>
        /// Copy all tokens that belong to the specified sequence to another sequence. Note that
        /// this does not allocate extra KV cache memory - it simply assigns the tokens to the
        /// new sequence.
        /// </summary>
        /// <param name="src"></param>
        /// <param name="dest"></param>
        /// <param name="p0"></param>
        /// <param name="p1"></param>
        public void KvCacheSequenceCopy(LLamaSeqId src, LLamaSeqId dest, LLamaPos p0, LLamaPos p1)
        {
            NativeApi.llama_kv_cache_seq_cp(this, src, dest, p0, p1);
        }

        /// <summary>
        /// Removes all tokens that do not belong to the specified sequence
        /// </summary>
        /// <param name="seq"></param>
        public void KvCacheSequenceKeep(LLamaSeqId seq)
        {
            NativeApi.llama_kv_cache_seq_keep(this, seq);
        }

        /// <summary>
        /// Adds relative position "delta" to all tokens that belong to the specified sequence
        /// and have positions in [p0, p1). If the KV cache is RoPEd, the KV data is updated
        /// accordingly.
        /// </summary>
        /// <param name="seq"></param>
        /// <param name="p0"></param>
        /// <param name="p1"></param>
        /// <param name="delta"></param>
        public void KvCacheSequenceAdd(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int delta)
        {
            NativeApi.llama_kv_cache_seq_add(this, seq, p0, p1, delta);
        }
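
        // Usage sketch (illustrative): a common "context shift" pattern - discard the oldest `discard`
        // tokens of a sequence and slide the remaining cells back so new tokens fit. Passing a negative
        // p1 means "to the end of the sequence" in llama.cpp; the position arithmetic below is illustrative
        // and assumes LLamaPos converts to/from int.
        //
        //     ctx.KvCacheRemove(seq, keepStart, keepStart + discard);
        //     ctx.KvCacheSequenceAdd(seq, keepStart + discard, -1, -discard);
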
        /// <summary>
        /// Integer division of the positions by factor of `d > 1`.
        /// If the KV cache is RoPEd, the KV data is updated accordingly.<br />
        /// p0 &lt; 0 : [0, p1]<br />
        /// p1 &lt; 0 : [p0, inf)
        /// </summary>
        /// <param name="seq"></param>
        /// <param name="p0"></param>
        /// <param name="p1"></param>
        /// <param name="divisor"></param>
        public void KvCacheSequenceDivide(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int divisor)
        {
            NativeApi.llama_kv_cache_seq_div(this, seq, p0, p1, divisor);
        }
        #endregion
    }
}