
LLamaModel.cs

using LLama.Native;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using LLama.Exceptions;
using System.Linq;
using System.Text.RegularExpressions;
using System.Runtime.InteropServices;
using System.Diagnostics;

namespace LLama
{
    using llama_token = Int32;
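
    /// <summary>
    /// High-level wrapper around a native llama.cpp context: it tokenizes prompts,
    /// optionally caches evaluated prompts in a session file, and samples tokens
    /// with the configured penalties and sampling strategy.
    /// </summary>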
    public class LLamaModel : IChatModel, IDisposable
    {
        LLamaParams _params;
        SafeLLamaContextHandle _ctx;
        string _path_session;
        List<llama_token> _session_tokens;
        List<llama_token> _embed_inp;
        int _n_ctx;
        List<llama_token> _inp_pfx;
        List<llama_token> _inp_sfx;
        List<llama_token> _llama_token_newline;
        List<llama_token> _last_n_tokens;
        bool _is_interacting;
        bool _is_antiprompt;
        bool _input_echo;
        // HACK - because session saving incurs a non-negligible delay, for now skip re-saving session
        // if we loaded a session with at least 75% similarity. It's currently just used to speed up the
        // initial prompt so it doesn't need to be an exact match.
        bool _need_to_save_session;
        int _n_past;
        int _n_remain;
        int _n_consumed;
        int _n_session_consumed;
        List<llama_token> _embed;
        // params related to the chat API only
        bool _first_time_chat = true;
        public string Name { get; set; }
        public SafeLLamaContextHandle NativeHandle => _ctx;
        public LLamaModel(string model_path, string model_name, bool echo_input = false, bool verbose = false, int seed = 0, int n_threads = -1, int n_predict = -1,
            int n_parts = -1, int n_ctx = 512, int n_batch = 512, int n_keep = 0, int n_gpu_layers = 0,
            Dictionary<llama_token, float> logit_bias = null, int top_k = 40, float top_p = 0.95f,
            float tfs_z = 1.00f, float typical_p = 1.00f, float temp = 0.80f, float repeat_penalty = 1.10f,
            int repeat_last_n = 64, float frequency_penalty = 0.00f, float presence_penalty = 0.00f,
            int mirostat = 0, float mirostat_tau = 5.00f, float mirostat_eta = 0.10f, string prompt = "",
            string path_session = "", string input_prefix = "", string input_suffix = "",
            List<string> antiprompt = null, string lora_adapter = "", string lora_base = "",
            bool memory_f16 = true, bool random_prompt = false, bool use_color = false, bool interactive = false,
            bool embedding = false, bool interactive_first = false, bool prompt_cache_all = false, bool instruct = false, bool penalize_nl = true,
            bool perplexity = false, bool use_mmap = true, bool use_mlock = false, bool mem_test = false,
            bool verbose_prompt = false) : this(new LLamaParams(seed, n_threads, n_predict, n_parts, n_ctx, n_batch,
                n_keep, n_gpu_layers, logit_bias, top_k, top_p, tfs_z, typical_p, temp, repeat_penalty, repeat_last_n, frequency_penalty,
                presence_penalty, mirostat, mirostat_tau, mirostat_eta, model_path, prompt, path_session, input_prefix,
                input_suffix, antiprompt, lora_adapter, lora_base, memory_f16, random_prompt, use_color, interactive, embedding,
                interactive_first, prompt_cache_all, instruct, penalize_nl, perplexity, use_mmap, use_mlock, mem_test, verbose_prompt), model_name, echo_input, verbose)
        {
        }
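
        /// <summary>
        /// Initializes the native context from the given parameters, attempts to
        /// restore a cached session (if <c>path_session</c> is set), tokenizes the
        /// prompt, and prepares instruct/interactive-mode state.
        /// </summary>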
        public unsafe LLamaModel(LLamaParams @params, string name = "", bool echo_input = false, bool verbose = false)
        {
            Name = name;
            _params = @params;
            _ctx = Utils.llama_init_from_gpt_params(ref _params);
            // Add a space in front of the first character to match OG llama tokenizer behavior
            _params.prompt = _params.prompt.Insert(0, " ");
            _session_tokens = new List<llama_token>();
            _path_session = @params.path_session;
            if (!string.IsNullOrEmpty(_path_session))
            {
                if (verbose)
                {
                    Logger.Default.Info($"Attempting to load saved session from '{_path_session}'");
                }
                if (!File.Exists(_path_session))
                {
                    Logger.Default.Warn("Session file does not exist, will create.");
                }
                llama_token[] session_tokens = new llama_token[@params.n_ctx];
                ulong n_token_count_out = 0;
                if (!NativeApi.llama_load_session_file(_ctx, _path_session, session_tokens, (ulong)@params.n_ctx, &n_token_count_out))
                {
                    throw new RuntimeError($"Failed to load session file {_path_session}");
                }
                _session_tokens = session_tokens.Take((int)n_token_count_out).ToList();
                if (verbose)
                {
                    Logger.Default.Info($"Loaded a session with prompt size of {_session_tokens.Count} tokens");
                }
            }
            _embed_inp = Utils.llama_tokenize(_ctx, _params.prompt, true);
            _n_ctx = NativeApi.llama_n_ctx(_ctx);
            if (_embed_inp.Count > _n_ctx - 4)
            {
                throw new ArgumentException($"prompt is too long ({_embed_inp.Count} tokens, max {_n_ctx - 4})");
            }
            ulong n_matching_session_tokens = 0;
            if (_session_tokens.Count > 0)
            {
                foreach (var id in _session_tokens)
                {
                    if (n_matching_session_tokens >= (ulong)_embed_inp.Count || id != _embed_inp[(int)n_matching_session_tokens])
                    {
                        break;
                    }
                    n_matching_session_tokens++;
                }
                if (n_matching_session_tokens >= (ulong)_embed_inp.Count && verbose)
                {
                    Logger.Default.Info("Session file has exact match for prompt!");
                }
                else if (n_matching_session_tokens < (ulong)(_embed_inp.Count / 2))
                {
                    Logger.Default.Warn($"Session file has low similarity to prompt ({n_matching_session_tokens} " +
                        $"/ {_embed_inp.Count} tokens); it will mostly be re-evaluated.");
                }
                else if (verbose)
                {
                    Logger.Default.Info($"Session file matches {n_matching_session_tokens} / {_embed_inp.Count} " +
                        $"tokens of prompt.");
                }
            }
            // number of tokens to keep when resetting context
            if (_params.n_keep < 0 || _params.n_keep > _embed_inp.Count || _params.instruct)
            {
                _params.n_keep = _embed_inp.Count;
            }
            // prefix & suffix for instruct mode
            _inp_pfx = Utils.llama_tokenize(_ctx, "\n\n### Instruction:\n\n", true);
            _inp_sfx = Utils.llama_tokenize(_ctx, "\n\n### Response:\n\n", false);
            // in instruct mode, we inject a prefix and a suffix into each input by the user
            if (_params.instruct)
            {
                _params.interactive_first = true;
                _params.antiprompt.Add("### Instruction:\n\n");
            }
            // enable interactive mode if a reverse prompt or interactive start is specified
            if (_params.antiprompt.Count != 0 || _params.interactive_first)
            {
                _params.interactive = true;
            }
            // determine newline token
            _llama_token_newline = Utils.llama_tokenize(_ctx, "\n", false);
            if (_params.verbose_prompt)
            {
                Logger.Default.Info("\n");
                Logger.Default.Info($"prompt: '{_params.prompt}'");
                Logger.Default.Info($"number of tokens in prompt = {_embed_inp.Count}");
                for (int i = 0; i < _embed_inp.Count; i++)
                {
                    Logger.Default.Info($"{_embed_inp[i]} -> '{Utils.PtrToStringUTF8(NativeApi.llama_token_to_str(_ctx, _embed_inp[i]))}'");
                }
                if (_params.n_keep > 0)
                {
                    Logger.Default.Info("static prompt based on n_keep: '");
                    for (int i = 0; i < _params.n_keep; i++)
                    {
                        Logger.Default.Info($"{Utils.PtrToStringUTF8(NativeApi.llama_token_to_str(_ctx, _embed_inp[i]))}");
                    }
                    Logger.Default.Info("'\n");
                }
                Logger.Default.Info("\n");
            }
            if (_params.interactive && verbose)
            {
                Logger.Default.Info("interactive mode on.");
            }
            if (verbose)
            {
                Logger.Default.Info($"sampling: repeat_last_n = {_params.repeat_last_n}, " +
                    $"repeat_penalty = {_params.repeat_penalty}, presence_penalty = {_params.presence_penalty}, " +
                    $"frequency_penalty = {_params.frequency_penalty}, top_k = {_params.top_k}, tfs_z = {_params.tfs_z}, " +
                    $"top_p = {_params.top_p}, typical_p = {_params.typical_p}, temp = {_params.temp}, mirostat = {_params.mirostat}, " +
                    $"mirostat_lr = {_params.mirostat_eta}, mirostat_ent = {_params.mirostat_tau}");
                Logger.Default.Info($"generate: n_ctx = {_n_ctx}, n_batch = {_params.n_batch}, n_predict = {_params.n_predict}, " +
                    $"n_keep = {_params.n_keep}");
                Logger.Default.Info("\n");
            }
            _last_n_tokens = Enumerable.Repeat(0, _n_ctx).ToList();
            if (_params.interactive)
            {
                if (verbose)
                {
                    Logger.Default.Info("== Running in interactive mode. ==");
                }
                _is_interacting = _params.interactive_first;
            }
            _is_antiprompt = false;
            _input_echo = echo_input;
            _need_to_save_session = !string.IsNullOrEmpty(_path_session) && n_matching_session_tokens < (ulong)(_embed_inp.Count * 3 / 4);
            _n_past = 0;
            _n_remain = _params.n_predict;
            _n_consumed = 0;
            _n_session_consumed = 0;
            _embed = new List<llama_token>();
        }
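
        /// <summary>
        /// Replaces the current prompt and re-tokenizes it; throws if the prompt
        /// does not fit in the context window.
        /// </summary>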
        public LLamaModel WithPrompt(string prompt)
        {
            _params.prompt = prompt;
            // As in the constructor, prepend a space to match OG llama tokenizer behavior.
            if (!_params.prompt.StartsWith(" "))
            {
                _params.prompt = _params.prompt.Insert(0, " ");
            }
            _embed_inp = Utils.llama_tokenize(_ctx, _params.prompt, true);
            if (_embed_inp.Count > _n_ctx - 4)
            {
                throw new ArgumentException($"prompt is too long ({_embed_inp.Count} tokens, max {_n_ctx - 4})");
            }
            return this;
        }
        public LLamaModel WithPromptFile(string promptFileName)
        {
            return WithPrompt(File.ReadAllText(promptFileName));
        }
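
        /// <summary>
        /// Applies the configured input prefix/suffix (and instruct-mode markers)
        /// to the user's text, tokenizes it, and queues the tokens for evaluation.
        /// </summary>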
        private string ProcessTextBeforeInfer(string text)
        {
            if (!string.IsNullOrEmpty(_params.input_prefix))
            {
                text = _params.input_prefix + text;
            }
            if (!text.EndsWith("\n"))
            {
                text += "\n";
            }
            if (text.Length > 1)
            {
                // append input suffix if any
                if (!string.IsNullOrEmpty(_params.input_suffix))
                {
                    text += _params.input_suffix;
                    Console.Write(_params.input_suffix);
                }
                // instruct mode: insert instruction prefix
                if (_params.instruct && !_is_antiprompt)
                {
                    _n_consumed = _embed_inp.Count;
                    _embed_inp.AddRange(_inp_pfx);
                }
                var line_inp = Utils.llama_tokenize(_ctx, text, false);
                _embed_inp.AddRange(line_inp);
                // instruct mode: insert response suffix
                if (_params.instruct)
                {
                    _embed_inp.AddRange(_inp_sfx);
                }
                _n_remain -= line_inp.Count;
            }
            return text;
        }
        public void InitChatPrompt(string prompt)
        {
            WithPrompt(prompt);
        }
        public void InitChatAntiprompt(string[] antiprompt)
        {
            _params.antiprompt = antiprompt.ToList();
        }
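
        /// <summary>
        /// Chat entry point: forces interactive mode, disables echoing of the
        /// user's input, optionally resets the prompt, and streams the reply.
        /// </summary>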
        public IEnumerable<string> Chat(string text, string? prompt = null)
        {
            _params.interactive = true;
            _input_echo = false;
            if (!string.IsNullOrEmpty(prompt))
            {
                WithPrompt(prompt);
            }
            return Call(text);
        }
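
        /// <summary>
        /// Core generation loop. Evaluates queued tokens in batches, samples the
        /// next token, and yields the decoded text piece by piece until the
        /// sampling budget runs out or an antiprompt is hit.
        /// </summary>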
        public IEnumerable<string> Call(string text)
        {
            _is_interacting = _is_antiprompt = false;
            ProcessTextBeforeInfer(text);
            while ((_n_remain != 0 || _params.interactive) && !_is_interacting)
            {
                if (_embed.Count > 0)
                {
                    // infinite text generation via context swapping
                    // if we run out of context:
                    // - take the n_keep first tokens from the original prompt (via n_past)
                    // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
                    if (_n_past + _embed.Count > _n_ctx)
                    {
                        int n_left = _n_past - _params.n_keep;
                        _n_past = _params.n_keep;
                        // insert n_left/2 tokens at the start of embed from last_n_tokens
                        _embed.InsertRange(0, _last_n_tokens.GetRange(_n_ctx - n_left / 2 - _embed.Count, n_left / 2));
                        // stop saving session if we run out of context
                        _path_session = "";
                        // Console.WriteLine("\n---\n");
                        // Console.Write("resetting: '");
                        // for (int i = 0; i < embed.Count; i++) {
                        //     Console.Write(llama_token_to_str(ctx, embed[i]));
                        // }
                        // Console.WriteLine("'\n");
                        // Console.WriteLine("\n---\n");
                    }
                    // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
                    // REVIEW
                    if (_n_session_consumed < _session_tokens.Count)
                    {
                        int i = 0;
                        for (; i < _embed.Count; i++)
                        {
                            if (!_embed[i].Equals(_session_tokens[_n_session_consumed]))
                            {
                                _session_tokens.RemoveRange(_n_session_consumed, _session_tokens.Count - _n_session_consumed);
                                break;
                            }
                            _n_past++;
                            _n_session_consumed++;
                            if (_n_session_consumed >= _session_tokens.Count)
                            {
                                i++;
                                break;
                            }
                        }
                        if (i > 0)
                        {
                            _embed.RemoveRange(0, i);
                        }
                    }
                    // evaluate tokens in batches
                    // embed is typically prepared beforehand to fit within a batch, but not always
                    for (int i = 0; i < _embed.Count; i += _params.n_batch)
                    {
                        int n_eval = _embed.Count - i;
                        if (n_eval > _params.n_batch)
                        {
                            n_eval = _params.n_batch;
                        }
                        var array = _embed.GetRange(i, n_eval).ToArray();
                        if (NativeApi.llama_eval(_ctx, array, n_eval, _n_past, _params.n_threads) != 0)
                        {
                            Logger.Default.Error("Failed to eval.");
                            throw new RuntimeError("Failed to eval.");
                        }
                        _n_past += n_eval;
                    }
                    if (_embed.Count > 0 && !string.IsNullOrEmpty(_path_session))
                    {
                        _session_tokens.AddRange(_embed);
                        _n_session_consumed = _session_tokens.Count;
                    }
                }
                _embed.Clear();
                if (_embed_inp.Count <= _n_consumed && !_is_interacting)
                {
                    var temp = _params.temp;
                    var top_k = _params.top_k <= 0 ? NativeApi.llama_n_vocab(_ctx) : _params.top_k;
                    var top_p = _params.top_p;
                    var tfs_z = _params.tfs_z;
                    var typical_p = _params.typical_p;
                    var repeat_last_n = _params.repeat_last_n < 0 ? _n_ctx : _params.repeat_last_n;
                    var repeat_penalty = _params.repeat_penalty;
                    var alpha_presence = _params.presence_penalty;
                    var alpha_frequency = _params.frequency_penalty;
                    var mirostat = _params.mirostat;
                    var mirostat_tau = _params.mirostat_tau;
                    var mirostat_eta = _params.mirostat_eta;
                    var penalize_nl = _params.penalize_nl;
                    // optionally save the session on first sample (for faster prompt loading next time)
                    if (!string.IsNullOrEmpty(_path_session) && _need_to_save_session)
                    {
                        _need_to_save_session = false;
                        NativeApi.llama_save_session_file(_ctx, _path_session, _session_tokens.ToArray(), (ulong)_session_tokens.Count);
                    }
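                    // Sample the next token, mirroring llama.cpp's pipeline: build the
                    // candidate list from the logits, apply repetition/frequency/presence
                    // penalties, then pick a token greedily, via mirostat (v1/v2), or via
                    // top-k / tail-free / typical / top-p filtering plus temperature.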
                    llama_token id = 0;
                    {
                        var n_vocab = NativeApi.llama_n_vocab(_ctx);
                        var logits = Utils.llama_get_logits(_ctx, n_vocab);
                        // apply the params.logit_bias map
                        foreach (KeyValuePair<int, float> it in _params.logit_bias)
                        {
                            logits[it.Key] += it.Value;
                        }
                        var candidates = new List<LLamaTokenData>();
                        candidates.Capacity = n_vocab;
                        for (llama_token token_id = 0; token_id < n_vocab; token_id++)
                        {
                            candidates.Add(new LLamaTokenData(token_id, logits[token_id], 0.0f));
                        }
                        LLamaTokenDataArray candidates_p = new LLamaTokenDataArray(candidates.ToArray(), (ulong)candidates.Count, false);
                        // apply penalties
                        float nl_logit = logits[NativeApi.llama_token_nl()];
                        var last_n_repeat = Math.Min(Math.Min(_last_n_tokens.Count, repeat_last_n), _n_ctx);
                        SamplingApi.llama_sample_repetition_penalty(_ctx, candidates_p,
                            _last_n_tokens.GetRange(_last_n_tokens.Count - last_n_repeat, last_n_repeat).ToArray(),
                            (ulong)last_n_repeat, repeat_penalty);
                        SamplingApi.llama_sample_frequency_and_presence_penalties(_ctx, candidates_p,
                            _last_n_tokens.GetRange(_last_n_tokens.Count - last_n_repeat, last_n_repeat).ToArray(),
                            (ulong)last_n_repeat, alpha_frequency, alpha_presence);
                        if (!penalize_nl)
                        {
                            logits[NativeApi.llama_token_nl()] = nl_logit;
                        }
                        if (temp <= 0)
                        {
                            // greedy sampling
                            id = SamplingApi.llama_sample_token_greedy(_ctx, candidates_p);
                        }
                        else
                        {
                            if (mirostat == 1)
                            {
                                float mirostat_mu = 2.0f * mirostat_tau;
                                const int mirostat_m = 100;
                                SamplingApi.llama_sample_temperature(_ctx, candidates_p, temp);
                                id = SamplingApi.llama_sample_token_mirostat(_ctx, candidates_p, mirostat_tau, mirostat_eta, mirostat_m, mirostat_mu);
                            }
                            else if (mirostat == 2)
                            {
                                float mirostat_mu = 2.0f * mirostat_tau;
                                SamplingApi.llama_sample_temperature(_ctx, candidates_p, temp);
                                id = SamplingApi.llama_sample_token_mirostat_v2(_ctx, candidates_p, mirostat_tau, mirostat_eta, mirostat_mu);
                            }
                            else
                            {
                                // temperature sampling
                                SamplingApi.llama_sample_top_k(_ctx, candidates_p, top_k, 1);
                                SamplingApi.llama_sample_tail_free(_ctx, candidates_p, tfs_z, 1);
                                SamplingApi.llama_sample_typical(_ctx, candidates_p, typical_p, 1);
                                SamplingApi.llama_sample_top_p(_ctx, candidates_p, top_p, 1);
                                SamplingApi.llama_sample_temperature(_ctx, candidates_p, temp);
                                id = SamplingApi.llama_sample_token(_ctx, candidates_p);
                            }
                        }
                        _last_n_tokens.RemoveAt(0);
                        _last_n_tokens.Add(id);
                    }
                    // replace end-of-text token with newline token when in interactive mode
                    if (id == NativeApi.llama_token_eos() && _params.interactive && !_params.instruct)
                    {
                        id = _llama_token_newline[0];
                        if (_params.antiprompt.Count != 0)
                        {
                            // tokenize and inject the first reverse prompt
                            var first_antiprompt = Utils.llama_tokenize(_ctx, _params.antiprompt[0], false);
                            _embed_inp.AddRange(first_antiprompt);
                        }
                    }
                    // add it to the context
                    _embed.Add(id);
                    // echo this to the console
                    _input_echo = true;
                    // decrement remaining sampling budget
                    _n_remain--;
                }
                else
                {
                    // some of the prompt/user input has not been evaluated yet:
                    // move it into the pending batch, up to n_batch tokens at a time
                    while (_embed_inp.Count > _n_consumed)
                    {
                        _embed.Add(_embed_inp[_n_consumed]);
                        _last_n_tokens.RemoveAt(0);
                        _last_n_tokens.Add(_embed_inp[_n_consumed]);
                        _n_consumed++;
                        if (_embed.Count >= _params.n_batch)
                        {
                            break;
                        }
                    }
                }
                if (_input_echo)
                {
                    foreach (var id in _embed)
                    {
                        yield return Utils.PtrToStringUTF8(NativeApi.llama_token_to_str(_ctx, id));
                    }
                }
                if (_params.interactive && _embed_inp.Count <= _n_consumed)
                {
                    if (_params.antiprompt.Count > 0)
                    {
                        string last_output = "";
                        foreach (var id in _last_n_tokens)
                        {
                            last_output += Utils.PtrToStringUTF8(NativeApi.llama_token_to_str(_ctx, id));
                        }
                        _is_antiprompt = false;
                        foreach (var antiprompt in _params.antiprompt)
                        {
                            if (last_output.EndsWith(antiprompt))
                            {
                                _is_interacting = true;
                                _is_antiprompt = true;
                                break;
                            }
                        }
                    }
                    if (_n_past > 0 && _is_interacting)
                    {
                        _input_echo = false;
                        break;
                    }
                    if (_embed.Count > 0 && _embed.Last() == NativeApi.llama_token_eos())
                    {
                        if (_params.instruct)
                        {
                            _is_interacting = true;
                        }
                        else
                        {
                            Logger.Default.Info(" [end of text]");
                        }
                    }
                    if (_params.interactive && _n_remain <= 0 && _params.n_predict != -1)
                    {
                        _n_remain = _params.n_predict;
                        _is_interacting = true;
                    }
                }
            }
            if (!string.IsNullOrEmpty(_path_session) && _params.prompt_cache_all)
            {
                Logger.Default.Info($"saving final output to session file {_path_session}");
                var session_token_array = _session_tokens.ToArray();
                NativeApi.llama_save_session_file(_ctx, _path_session, session_token_array, (ulong)session_token_array.Length);
            }
        }
        public void Dispose()
        {
            _ctx.Dispose();
        }
    }
}

An easy-to-use, high-performance LLM inference framework for C#/.NET, supporting the LLaMA and LLaVA model families.
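
A minimal usage sketch based on the public surface above. The model path, model name, and prompt strings are placeholder assumptions, not part of the library; only the constructor parameters, Chat, InitChatAntiprompt, and Dispose come from the class itself.

using LLama;
using System;

// Hypothetical local model file; substitute any ggml-format LLaMA weights.
var model = new LLamaModel("./models/ggml-model-q4_0.bin", "sample-chat",
    n_ctx: 512, interactive: true);
// Stop generating whenever the model emits the reverse prompt "User:".
model.InitChatAntiprompt(new[] { "User:" });
foreach (var piece in model.Chat("Hello! How are you?",
    prompt: "Transcript of a dialog between User and Assistant.\nUser:"))
{
    Console.Write(piece); // tokens stream out as they are sampled
}
model.Dispose(); // releases the native context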