
SafeLLamaContextHandle.cs 15 kB

using System;
using System.Runtime.InteropServices;
using System.Text;
using LLama.Exceptions;

namespace LLama.Native
{
    /// <summary>
    /// A safe wrapper around a llama_context
    /// </summary>
    // ReSharper disable once ClassNeverInstantiated.Global (used implicitly in native API)
    public sealed class SafeLLamaContextHandle
        : SafeLLamaHandleBase
    {
        #region properties and fields
        /// <summary>
        /// Total number of tokens in the vocabulary of this model
        /// </summary>
        public int VocabCount => ThrowIfDisposed().VocabCount;

        /// <summary>
        /// Total number of tokens in the context
        /// </summary>
        public uint ContextSize => NativeApi.llama_n_ctx(this);

        /// <summary>
        /// Dimension of embedding vectors
        /// </summary>
        public int EmbeddingSize => ThrowIfDisposed().EmbeddingSize;

        /// <summary>
        /// Get the model which this context is using
        /// </summary>
        public SafeLlamaModelHandle ModelHandle => ThrowIfDisposed();

        private SafeLlamaModelHandle? _model;
        #endregion

        #region construction/destruction
        /// <inheritdoc />
        protected override bool ReleaseHandle()
        {
            llama_free(handle);
            SetHandle(IntPtr.Zero);

            // Decrement refcount on model
            _model?.DangerousRelease();
            _model = null!;

            return true;
        }

        private SafeLlamaModelHandle ThrowIfDisposed()
        {
            if (IsClosed)
                throw new ObjectDisposedException("Cannot use this `SafeLLamaContextHandle` - it has been disposed");
            if (_model == null || _model.IsClosed)
                throw new ObjectDisposedException("Cannot use this `SafeLLamaContextHandle` - `SafeLlamaModelHandle` has been disposed");

            return _model!;
        }

        /// <summary>
        /// Create a new llama_context for the given model
        /// </summary>
        /// <param name="model"></param>
        /// <param name="lparams"></param>
        /// <returns></returns>
        /// <exception cref="RuntimeError"></exception>
        public static SafeLLamaContextHandle Create(SafeLlamaModelHandle model, LLamaContextParams lparams)
        {
            var ctx = llama_new_context_with_model(model, lparams);
            if (ctx == null)
                throw new RuntimeError("Failed to create context from model");

            // Increment the model reference count while this context exists.
            // DangerousAddRef throws if it fails, so there is no need to check "success"
            ctx._model = model;
            var success = false;
            ctx._model.DangerousAddRef(ref success);

            return ctx;
        }
        #endregion
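
        // Usage sketch (illustrative only; `llama_model_default_params`,
        // `llama_context_default_params` and `SafeLlamaModelHandle.LoadFromFile`
        // are assumed from the surrounding LLamaSharp API, not defined in this file):
        //
        //     var mparams = NativeApi.llama_model_default_params();
        //     using var model = SafeLlamaModelHandle.LoadFromFile("model.gguf", mparams);
        //
        //     var cparams = NativeApi.llama_context_default_params();
        //     using var ctx = SafeLLamaContextHandle.Create(model, cparams);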

        #region Native API
        static SafeLLamaContextHandle()
        {
            // This ensures that `NativeApi` has been loaded before calling the two native methods below
            NativeApi.llama_empty_call();
        }

        /// <summary>
        /// Create a new llama_context with the given model. **This should never be called directly! Always use SafeLLamaContextHandle.Create!**
        /// </summary>
        /// <param name="model"></param>
        /// <param name="params"></param>
        /// <returns></returns>
        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
        private static extern SafeLLamaContextHandle llama_new_context_with_model(SafeLlamaModelHandle model, LLamaContextParams @params);

        /// <summary>
        /// Frees all allocated memory in the given llama_context
        /// </summary>
        /// <param name="ctx"></param>
        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
        private static extern void llama_free(IntPtr ctx);
        #endregion

        /// <summary>
        /// Token logits obtained from the last call to llama_eval().
        /// The logits for the last token are stored in the last row.
        /// Can be mutated in order to change the probabilities of the next token.<br />
        /// Rows: n_tokens<br />
        /// Cols: n_vocab
        /// </summary>
        /// <returns></returns>
        public Span<float> GetLogits()
        {
            var model = ThrowIfDisposed();

            unsafe
            {
                var logits = NativeApi.llama_get_logits(this);
                return new Span<float>(logits, model.VocabCount);
            }
        }

        /// <summary>
        /// Logits for the ith token. Equivalent to: llama_get_logits(ctx) + i*n_vocab
        /// </summary>
        /// <param name="i"></param>
        /// <returns></returns>
        public Span<float> GetLogitsIth(int i)
        {
            var model = ThrowIfDisposed();

            unsafe
            {
                var logits = NativeApi.llama_get_logits_ith(this, i);
                return new Span<float>(logits, model.VocabCount);
            }
        }
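
        // Usage sketch (illustrative only): the spans above alias native memory,
        // so logits can be mutated in place to bias the next sampling step.
        // Token id 123 below is an arbitrary stand-in.
        //
        //     var logits = ctx.GetLogits();
        //     logits[123] = float.NegativeInfinity; // never sample token 123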

        #region tokens
        /// <summary>
        /// Convert the given text into tokens
        /// </summary>
        /// <param name="text">The text to tokenize</param>
        /// <param name="add_bos">Whether the "BOS" token should be added</param>
        /// <param name="encoding">Encoding to use for the text</param>
        /// <param name="special">Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.</param>
        /// <returns></returns>
        /// <exception cref="RuntimeError"></exception>
        public LLamaToken[] Tokenize(string text, bool add_bos, bool special, Encoding encoding)
        {
            return ThrowIfDisposed().Tokenize(text, add_bos, special, encoding);
        }

        /// <summary>
        /// Convert a single llama token into bytes
        /// </summary>
        /// <param name="token">Token to decode</param>
        /// <param name="dest">A span to attempt to write into. If this is too small nothing will be written</param>
        /// <returns>The size of this token. **Nothing will be written** if this is larger than `dest`</returns>
        public uint TokenToSpan(LLamaToken token, Span<byte> dest)
        {
            return ThrowIfDisposed().TokenToSpan(token, dest);
        }
        #endregion
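
        // Usage sketch (illustrative only): round-trip a prompt through the
        // tokenizer. The 32-byte buffer is an assumption; real callers should
        // retry with a larger buffer when TokenToSpan reports a bigger size.
        //
        //     var tokens = ctx.Tokenize("Hello, world!", add_bos: true, special: false, Encoding.UTF8);
        //     Span<byte> buffer = stackalloc byte[32];
        //     foreach (var token in tokens)
        //     {
        //         var size = ctx.TokenToSpan(token, buffer);
        //         if (size <= (uint)buffer.Length)
        //             Console.Write(Encoding.UTF8.GetString(buffer.Slice(0, (int)size)));
        //     }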

        #region infer
        /// <summary>
        /// Run the llama inference to obtain the logits and probabilities for the next token.
        /// </summary>
        /// <param name="tokens">The provided batch of new tokens to process</param>
        /// <param name="n_past">The number of tokens to use from previous eval calls</param>
        /// <returns>Returns true on success</returns>
        [Obsolete("use llama_decode() instead")]
        public bool Eval(ReadOnlySpan<LLamaToken> tokens, int n_past)
        {
            unsafe
            {
                fixed (LLamaToken* pinned = tokens)
                {
                    // the entire `eval` system needs replacing with the new batch system!
                    var ret = NativeApi.llama_eval(this, pinned, tokens.Length, n_past);
                    return ret == 0;
                }
            }
        }

        /// <summary>
        /// Process a batch of tokens with the model
        /// </summary>
        /// <param name="batch"></param>
        /// <returns>A positive return value does not mean a fatal error, but rather a warning:<br />
        /// - 0: success<br />
        /// - 1: could not find a KV slot for the batch (try reducing the size of the batch or increasing the context)<br />
        /// - &lt; 0: error<br />
        /// </returns>
        public int Decode(LLamaBatch batch)
        {
            using (batch.ToNativeBatch(out var nb))
                return NativeApi.llama_decode(this, nb);
        }
        #endregion
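
        // Usage sketch (illustrative only; assumes `LLamaBatch` exposes an
        // Add(token, position, sequence, logits) overload and that `LLamaPos`/
        // `LLamaSeqId` convert from int - check the batch API for exact shapes):
        //
        //     var batch = new LLamaBatch();
        //     for (var i = 0; i < tokens.Length; i++)
        //         batch.Add(tokens[i], i, (LLamaSeqId)0, logits: i == tokens.Length - 1);
        //
        //     var result = ctx.Decode(batch);
        //     if (result < 0)
        //         throw new RuntimeError($"llama_decode failed ({result})");
        //     if (result == 1)
        //     {
        //         // not fatal: shrink the batch or create a context with more KV space
        //     }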

        #region state
        /// <summary>
        /// Get the size of the state, when saved as bytes
        /// </summary>
        public ulong GetStateSize()
        {
            return NativeApi.llama_get_state_size(this);
        }

        /// <summary>
        /// Get the raw state of this context, encoded as bytes. Data is written into the `dest` pointer.
        /// </summary>
        /// <param name="dest">Destination to write to</param>
        /// <param name="size">Number of bytes available to write to in dest (check required size with `GetStateSize()`)</param>
        /// <returns>The number of bytes written to dest</returns>
        /// <exception cref="ArgumentOutOfRangeException">Thrown if dest is too small</exception>
        public unsafe ulong GetState(byte* dest, ulong size)
        {
            return GetState(new IntPtr(dest), size);
        }

        /// <summary>
        /// Get the raw state of this context, encoded as bytes. Data is written into the `dest` pointer.
        /// </summary>
        /// <param name="dest">Destination to write to</param>
        /// <param name="size">Number of bytes available to write to in dest (check required size with `GetStateSize()`)</param>
        /// <returns>The number of bytes written to dest</returns>
        /// <exception cref="ArgumentOutOfRangeException">Thrown if dest is too small</exception>
        public ulong GetState(IntPtr dest, ulong size)
        {
            var required = GetStateSize();
            if (size < required)
                throw new ArgumentOutOfRangeException(nameof(size), $"Allocated space is too small, {size} < {required}");

            unsafe
            {
                return NativeApi.llama_copy_state_data(this, (byte*)dest.ToPointer());
            }
        }

        /// <summary>
        /// Set the raw state of this context
        /// </summary>
        /// <param name="src">The pointer to read the state from</param>
        /// <returns>Number of bytes read from the src pointer</returns>
        public unsafe ulong SetState(byte* src)
        {
            return SetState(new IntPtr(src));
        }

        /// <summary>
        /// Set the raw state of this context
        /// </summary>
        /// <param name="src">The pointer to read the state from</param>
        /// <returns>Number of bytes read from the src pointer</returns>
        public ulong SetState(IntPtr src)
        {
            unsafe
            {
                return NativeApi.llama_set_state_data(this, (byte*)src.ToPointer());
            }
        }
        #endregion
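
        // Usage sketch (illustrative only): snapshot the context state into a
        // managed buffer and restore it later (the restoring context must use
        // the same model and comparable parameters).
        //
        //     var size = ctx.GetStateSize();
        //     var snapshot = new byte[size];
        //     unsafe
        //     {
        //         fixed (byte* ptr = snapshot)
        //         {
        //             ctx.GetState(ptr, size);   // save
        //             // ... mutate the context, then roll back:
        //             ctx.SetState(ptr);         // restore
        //         }
        //     }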

        /// <summary>
        /// Set the RNG seed
        /// </summary>
        /// <param name="seed"></param>
        public void SetSeed(uint seed)
        {
            NativeApi.llama_set_rng_seed(this, seed);
        }

        /// <summary>
        /// Set the number of threads used for decoding
        /// </summary>
        /// <param name="threads">n_threads is the number of threads used for generation (single token)</param>
        /// <param name="threadsBatch">n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)</param>
        public void SetThreads(uint threads, uint threadsBatch)
        {
            NativeApi.llama_set_n_threads(this, threads, threadsBatch);
        }

        #region KV Cache Management
        /// <summary>
        /// Get a new KV cache view that can be used to debug the KV cache
        /// </summary>
        /// <param name="maxSequences"></param>
        /// <returns></returns>
        public LLamaKvCacheViewSafeHandle KvCacheGetDebugView(int maxSequences = 4)
        {
            return LLamaKvCacheViewSafeHandle.Allocate(this, maxSequences);
        }

        /// <summary>
        /// Count the number of used cells in the KV cache (i.e. those that have at least one sequence assigned to them)
        /// </summary>
        /// <returns></returns>
        public int KvCacheCountCells()
        {
            return NativeApi.llama_get_kv_cache_used_cells(this);
        }

        /// <summary>
        /// Returns the number of tokens in the KV cache (slow, use only for debug).
        /// If a KV cell has multiple sequences assigned to it, it will be counted multiple times.
        /// </summary>
        /// <returns></returns>
        public int KvCacheCountTokens()
        {
            return NativeApi.llama_get_kv_cache_token_count(this);
        }

        /// <summary>
        /// Clear the KV cache
        /// </summary>
        public void KvCacheClear()
        {
            NativeApi.llama_kv_cache_clear(this);
        }

        /// <summary>
        /// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
        /// </summary>
        /// <param name="seq"></param>
        /// <param name="p0"></param>
        /// <param name="p1"></param>
        public void KvCacheRemove(LLamaSeqId seq, LLamaPos p0, LLamaPos p1)
        {
            NativeApi.llama_kv_cache_seq_rm(this, seq, p0, p1);
        }

        /// <summary>
        /// Copy all tokens that belong to the specified sequence to another sequence. Note that
        /// this does not allocate extra KV cache memory - it simply assigns the tokens to the
        /// new sequence.
        /// </summary>
        /// <param name="src"></param>
        /// <param name="dest"></param>
        /// <param name="p0"></param>
        /// <param name="p1"></param>
        public void KvCacheSequenceCopy(LLamaSeqId src, LLamaSeqId dest, LLamaPos p0, LLamaPos p1)
        {
            NativeApi.llama_kv_cache_seq_cp(this, src, dest, p0, p1);
        }

        /// <summary>
        /// Removes all tokens that do not belong to the specified sequence
        /// </summary>
        /// <param name="seq"></param>
        public void KvCacheSequenceKeep(LLamaSeqId seq)
        {
            NativeApi.llama_kv_cache_seq_keep(this, seq);
        }

        /// <summary>
        /// Adds relative position "delta" to all tokens that belong to the specified sequence
        /// and have positions in [p0, p1). If the KV cache is RoPEd, the KV data is updated
        /// accordingly.
        /// </summary>
        /// <param name="seq"></param>
        /// <param name="p0"></param>
        /// <param name="p1"></param>
        /// <param name="delta"></param>
        public void KvCacheSequenceShift(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int delta)
        {
            NativeApi.llama_kv_cache_seq_shift(this, seq, p0, p1, delta);
        }

        /// <summary>
        /// Integer division of the positions by factor of `d > 1`.
        /// If the KV cache is RoPEd, the KV data is updated accordingly.<br />
        /// p0 &lt; 0 : [0, p1]<br />
        /// p1 &lt; 0 : [p0, inf)
        /// </summary>
        /// <param name="seq"></param>
        /// <param name="p0"></param>
        /// <param name="p1"></param>
        /// <param name="divisor"></param>
        public void KvCacheSequenceDivide(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int divisor)
        {
            NativeApi.llama_kv_cache_seq_div(this, seq, p0, p1, divisor);
        }
        #endregion
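
        // Usage sketch (illustrative only): "fork" a shared prompt so two
        // sequences reuse the same KV cells instead of re-evaluating the prompt.
        // The int-to-LLamaSeqId/LLamaPos conversions are assumptions.
        //
        //     // sequence 1 shares positions [0, promptLength) with sequence 0
        //     ctx.KvCacheSequenceCopy((LLamaSeqId)0, (LLamaSeqId)1, 0, promptLength);
        //
        //     // ... generate on both sequences, then discard the fork:
        //     ctx.KvCacheSequenceKeep((LLamaSeqId)0);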
    }
}