@@ -15,6 +15,7 @@ namespace LLama.Abstractions
         /// The loaded model for this executor.
         /// </summary>
         public LLamaModel Model { get; }
+
         /// <summary>
         /// Infers a response from the model.
         /// </summary>
@@ -24,6 +25,13 @@ namespace LLama.Abstractions
         /// <returns></returns>
         IEnumerable<string> Infer(string text, InferenceParams? inferenceParams = null, CancellationToken token = default);
 
+        /// <summary>
+        /// Asynchronously infers a response from the model.
+        /// </summary>
+        /// <param name="text">Your prompt</param>
+        /// <param name="inferenceParams">Any additional parameters</param>
+        /// <param name="token">A cancellation token.</param>
+        /// <returns></returns>
         IAsyncEnumerable<string> InferAsync(string text, InferenceParams? inferenceParams = null, CancellationToken token = default);
     }
 }
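To make the shape of this interface concrete, here is a minimal consumption sketch (not part of the diff). The method signatures are taken verbatim from the hunk above; the `ILLamaExecutor` instance and the default-constructed `InferenceParams` are assumptions for illustration.

```csharp
// Sketch only: `executor` is assumed to be obtained elsewhere in the host app.
async Task StreamBothWaysAsync(ILLamaExecutor executor, CancellationToken token)
{
    var inferenceParams = new InferenceParams(); // assumed default-constructible

    // Synchronous streaming: each yielded string is the next piece of text.
    foreach (var piece in executor.Infer("Q: what is a llama? A:", inferenceParams, token))
        Console.Write(piece);

    // Asynchronous streaming via the newly documented InferAsync,
    // cancellable between pieces through the same token.
    await foreach (var piece in executor.InferAsync("And a follow-up:", inferenceParams, token))
        Console.Write(piece);
}
```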
@@ -134,8 +134,9 @@ namespace LLama
         /// <summary>
         /// Get the response from the LLama model with chat histories.
         /// </summary>
-        /// <param name="prompt"></param>
+        /// <param name="history"></param>
         /// <param name="inferenceParams"></param>
+        /// <param name="cancellationToken"></param>
         /// <returns></returns>
         public IEnumerable<string> Chat(ChatHistory history, InferenceParams? inferenceParams = null, CancellationToken cancellationToken = default)
         {
@@ -156,6 +157,7 @@ namespace LLama
         /// </summary>
         /// <param name="prompt"></param>
         /// <param name="inferenceParams"></param>
+        /// <param name="cancellationToken"></param>
         /// <returns></returns>
         public IEnumerable<string> Chat(string prompt, InferenceParams? inferenceParams = null, CancellationToken cancellationToken = default)
         {
@@ -176,8 +178,9 @@ namespace LLama
         /// <summary>
         /// Get the response from the LLama model with chat histories.
         /// </summary>
-        /// <param name="prompt"></param>
+        /// <param name="history"></param>
         /// <param name="inferenceParams"></param>
+        /// <param name="cancellationToken"></param>
         /// <returns></returns>
         public async IAsyncEnumerable<string> ChatAsync(ChatHistory history, InferenceParams? inferenceParams = null, [EnumeratorCancellation] CancellationToken cancellationToken = default)
         {
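A hedged usage sketch of the corrected overloads follows. The `Chat`/`ChatAsync` signatures are exactly those shown above; constructing the session from an executor and `ChatHistory.AddMessage` are assumptions about the surrounding codebase, not something these hunks show.

```csharp
// Sketch only: the ChatSession(executor) constructor is an assumption.
var session = new ChatSession(executor);

var history = new ChatHistory();
history.AddMessage(AuthorRole.User, "Hello!"); // AddMessage assumed

// Synchronous overload taking a plain prompt.
foreach (var piece in session.Chat("Hello!", new InferenceParams()))
    Console.Write(piece);

// Asynchronous overload taking a history, using the newly documented token.
using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(30));
await foreach (var piece in session.ChatAsync(history, cancellationToken: cts.Token))
    Console.Write(piece);
```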
@@ -1,16 +1,33 @@
-using System;
-using System.Collections.Generic;
-using System.Text;
+using System.Collections.Generic;
 
 namespace LLama.Common
 {
+    /// <summary>
+    /// Role of the message author, e.g. user/assistant/system
+    /// </summary>
     public enum AuthorRole
     {
+        /// <summary>
+        /// Role is unknown
+        /// </summary>
        Unknown = -1,
+
+        /// <summary>
+        /// Message comes from a "system" prompt, not written by a user or language model
+        /// </summary>
        System = 0,
+
+        /// <summary>
+        /// Message comes from the user
+        /// </summary>
        User = 1,
+
+        /// <summary>
+        /// Messages was generated by the language model
+        /// </summary>
        Assistant = 2,
     }
+
     // copy from semantic-kernel
     /// <summary>
     /// The chat history class
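For illustration, building a history from the newly documented roles might look like the sketch below. `AddMessage` is assumed from the semantic-kernel-style `ChatHistory` that the comment above says this file copies.

```csharp
var history = new ChatHistory();
history.AddMessage(AuthorRole.System, "You are a concise assistant.");      // system prompt
history.AddMessage(AuthorRole.User, "Summarise llama.cpp in one sentence.");
// AuthorRole.Unknown (-1) is the fallback for text that cannot be
// attributed to the system prompt, the user, or the model.
```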
@@ -24,7 +24,7 @@ namespace LLama.Common
         }
 
         /// <summary>
-        /// Fill the quene with the data. Please ensure that data.Count <= size
+        /// Fill the quene with the data. Please ensure that data.Count &lt;= size
         /// </summary>
         /// <param name="size"></param>
         /// <param name="data"></param>
@@ -1,6 +1,4 @@
 using System;
-using System.Collections.Generic;
-using System.Text;
 
 namespace LLama.Exceptions
 {
@@ -18,10 +18,12 @@ namespace LLama
         /// <param name="dstFilename">The path to save the quantized model.</param>
         /// <param name="ftype">The type of quantization.</param>
         /// <param name="nthread">Thread to be used during the quantization. By default it's the physical core number.</param>
+        /// <param name="allowRequantize"></param>
+        /// <param name="quantizeOutputTensor"></param>
         /// <returns>Whether the quantization is successful.</returns>
         /// <exception cref="ArgumentException"></exception>
         public static unsafe bool Quantize(string srcFileName, string dstFilename, LLamaFtype ftype, int nthread = -1, bool allowRequantize = true,
-            bool quantizeOutputTensor = false)
+                                           bool quantizeOutputTensor = false)
         {
             if (!ValidateFtype(ftype))
             {
@@ -45,10 +47,12 @@ namespace LLama
         /// <param name="dstFilename">The path to save the quantized model.</param>
         /// <param name="ftype">The type of quantization.</param>
         /// <param name="nthread">Thread to be used during the quantization. By default it's the physical core number.</param>
+        /// <param name="allowRequantize"></param>
+        /// <param name="quantizeOutputTensor"></param>
         /// <returns>Whether the quantization is successful.</returns>
         /// <exception cref="ArgumentException"></exception>
         public static bool Quantize(string srcFileName, string dstFilename, string ftype, int nthread = -1, bool allowRequantize = true,
-            bool quantizeOutputTensor = false)
+                                    bool quantizeOutputTensor = false)
         {
             return Quantize(srcFileName, dstFilename, StringToFtype(ftype), nthread, allowRequantize, quantizeOutputTensor);
         }
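The newly documented parameters line up with a call like the following sketch. The parameter names are taken verbatim from the signatures above; the paths, the `"q4_0"` type string, and the enclosing class name `LLamaQuantizer` are illustrative assumptions.

```csharp
// Illustrative call of the string-ftype overload shown in this hunk.
bool ok = LLamaQuantizer.Quantize(          // class name assumed
    srcFileName: "ggml-model-f16.bin",      // hypothetical input path
    dstFilename: "ggml-model-q4_0.bin",     // hypothetical output path
    ftype: "q4_0",                          // parsed via StringToFtype
    nthread: -1,                            // <= 0: use the physical core count
    allowRequantize: true,                  // permit re-quantizing a quantized model
    quantizeOutputTensor: false);           // leave the output tensor unquantized
```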
@@ -159,8 +159,8 @@ namespace LLama
         /// <param name="keywords">Keywords that you want to remove from the response.</param>
         /// <param name="redundancyLength">The extra length when searching for the keyword. For example, if your only keyword is "highlight",
         /// maybe the token you get is "\r\nhighligt". In this condition, if redundancyLength=0, the token cannot be successfully matched because the length of "\r\nhighligt" (10)
-        /// has already exceeded the maximum length of the keywords (8). On the contrary, setting redundancyLengyh >= 2 leads to successful match.
-        /// The larger the redundancyLength is, the lower the processing speed. But as an experience, it won't introduce too much performance impact when redundancyLength <= 5 </param>
+        /// has already exceeded the maximum length of the keywords (8). On the contrary, setting redundancyLengyh &gt;= 2 leads to successful match.
+        /// The larger the redundancyLength is, the lower the processing speed. But as an experience, it won't introduce too much performance impact when redundancyLength &lt;= 5 </param>
         /// <param name="removeAllMatchedTokens">If set to true, when getting a matched keyword, all the related tokens will be removed. Otherwise only the part of keyword will be removed.</param>
         public KeywordTextOutputStreamTransform(IEnumerable<string> keywords, int redundancyLength = 3, bool removeAllMatchedTokens = false)
         {
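Given that explanation of `redundancyLength`, constructing the transform looks like the sketch below; the constructor signature is exactly the one shown in this hunk, and the `"User:"` keyword is illustrative.

```csharp
// Strip "User:" from streamed output; search 3 extra characters around each
// candidate so keywords split across tokens (e.g. "\r\nUser:") still match.
var transform = new KeywordTextOutputStreamTransform(
    new[] { "User:" },
    redundancyLength: 3,            // per the docs, <= 5 costs little performance
    removeAllMatchedTokens: false); // remove only the keyword, keep surrounding text
```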
@@ -8,7 +8,7 @@ namespace LLama.Native
     public struct LLamaModelQuantizeParams
     {
         /// <summary>
-        /// number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        /// number of threads to use for quantizing, if &lt;=0 will use std::thread::hardware_concurrency()
         /// </summary>
         public int nthread;
         /// <summary>
@@ -12,8 +12,7 @@ namespace LLama.Native
         /// </summary>
         /// <param name="fname_inp"></param>
         /// <param name="fname_out"></param>
-        /// <param name="ftype"></param>
-        /// <param name="nthread">how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given</param>
+        /// <param name="param"></param>
         /// <remarks>not great API - very likely to change</remarks>
         /// <returns>Returns 0 on success</returns>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
@@ -39,9 +39,9 @@ namespace LLama.OldVersion
         }
 
         /// <summary>
-        /// Set the keyword to split the return value of chat AI.
+        /// Set the keywords to split the return value of chat AI.
         /// </summary>
-        /// <param name="humanName"></param>
+        /// <param name="antiprompt"></param>
         /// <returns></returns>
         public ChatSession<T> WithAntiprompt(string[] antiprompt)
         {
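A short fluent-usage sketch: the method is exactly the one documented above, but how the old-version `ChatSession<T>` is created lies outside this hunk, so `session` here is assumed.

```csharp
// Stop generation whenever the model starts writing the user's turn again.
session = session.WithAntiprompt(new[] { "User:" }); // `session` assumed
```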
@@ -796,6 +796,7 @@ namespace LLama.OldVersion
             }
         }
 
+        /// <inheritdoc />
         public void Dispose()
         {
             _ctx.Dispose();