From 6a7e74e71b15ce585aea06a35674da01ff81d84c Mon Sep 17 00:00:00 2001 From: Yaohui Liu Date: Sat, 4 Nov 2023 22:38:06 +0800 Subject: [PATCH 01/12] build: add package for kernel-memory integration. --- .../LLamaSharp.KernelMemory.csproj | 23 ++++++++++++++++++- .../LLamaSharp.SemanticKernel.csproj | 6 ++--- README.md | 9 +++++++- 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj b/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj index 54766b02..de5f42a5 100644 --- a/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj +++ b/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj @@ -1,9 +1,30 @@ - net6.0 + netstandard2.0;net6.0;net7.0 enable enable + + 0.7.1 + Xbotter + SciSharp STACK + true + MIT, SciSharp STACK $([System.DateTime]::UtcNow.ToString(yyyy)) + https://github.com/SciSharp/LLamaSharp + git + https://avatars3.githubusercontent.com/u/44989469?s=200&v=4 + LLama, LLM, GPT, ChatGPT, kernel-memory, vector search, SciSharp + + The integration of LLamaSharp and Microsoft kernel-memory. It could make it easy to support document search for LLamaSharp model inference. + + + Support integration with kernel-memory + + MIT + packages + AnyCPU;x64;Arm64 + LLamaSharp.kernel-memory + Debug;Release;GPU diff --git a/LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj b/LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj index 77596d57..c6ece4e7 100644 --- a/LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj +++ b/LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj @@ -10,8 +10,8 @@ enable enable - 0.6.2-beta1 - Tim Miller + 0.7.1 + Tim Miller, Xbotter SciSharp STACK true MIT, SciSharp STACK $([System.DateTime]::UtcNow.ToString(yyyy)) @@ -20,7 +20,7 @@ https://avatars3.githubusercontent.com/u/44989469?s=200&v=4 LLama, LLM, GPT, ChatGPT, semantic-kernel, SciSharp - The integration of LLamaSharp ans semantic-kernel. + The integration of LLamaSharp and Microsoft semantic-kernel. Support integration with semantic-kernel diff --git a/README.md b/README.md index c3b73f8c..d116d1a4 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,13 @@ For [microsoft semantic-kernel](https://github.com/microsoft/semantic-kernel) in LLamaSharp.semantic-kernel ``` +For [microsoft kernel-memory](https://github.com/microsoft/kernel-memory) integration, please search and install the following package: + +``` +LLamaSharp.kernel-memory +``` + + ### Tips for choosing a version In general, there may be some break changes between two minor releases, for example 0.5.1 and 0.6.0. On the contrary, we don't introduce API break changes in patch release. Therefore it's recommended to keep the highest patch version of a minor release. For example, keep 0.5.6 instead of 0.5.3. @@ -196,7 +203,7 @@ Another choice is generate gguf format file yourself with a pytorch weight (or a 🔳 Fine-tune -⚠️ Local document search (enabled by kernel-memory now) +✅ Local document search (enabled by kernel-memory now) 🔳 MAUI Integration From 0f12566f654f430f50480128ae65e5f588f6dc45 Mon Sep 17 00:00:00 2001 From: Yaohui Liu Date: Sun, 5 Nov 2023 02:55:41 +0800 Subject: [PATCH 02/12] build: use only net6.0 with kernel-memory. 
--- LLama.KernelMemory/LLamaSharp.KernelMemory.csproj | 2 +- README.md | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj b/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj index de5f42a5..7fd99e2c 100644 --- a/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj +++ b/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj @@ -1,7 +1,7 @@ - netstandard2.0;net6.0;net7.0 + net6.0 enable enable diff --git a/README.md b/README.md index d116d1a4..96c9883a 100644 --- a/README.md +++ b/README.md @@ -54,13 +54,12 @@ For [microsoft semantic-kernel](https://github.com/microsoft/semantic-kernel) in LLamaSharp.semantic-kernel ``` -For [microsoft kernel-memory](https://github.com/microsoft/kernel-memory) integration, please search and install the following package: +For [microsoft kernel-memory](https://github.com/microsoft/kernel-memory) integration, please search and install the following package (currently kernel-memory only supports net6.0): ``` LLamaSharp.kernel-memory ``` - ### Tips for choosing a version In general, there may be some break changes between two minor releases, for example 0.5.1 and 0.6.0. On the contrary, we don't introduce API break changes in patch release. Therefore it's recommended to keep the highest patch version of a minor release. For example, keep 0.5.6 instead of 0.5.3. From 457958435b07603e9f41dad62af2ca60e621bae1 Mon Sep 17 00:00:00 2001 From: Yaohui Liu Date: Sun, 5 Nov 2023 02:59:41 +0800 Subject: [PATCH 03/12] build: use semantic-kernel beta1 in examples. --- LLama.Examples/LLama.Examples.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LLama.Examples/LLama.Examples.csproj b/LLama.Examples/LLama.Examples.csproj index 3ecacdfe..c1761829 100644 --- a/LLama.Examples/LLama.Examples.csproj +++ b/LLama.Examples/LLama.Examples.csproj @@ -29,7 +29,7 @@ - + From f9087015d646a81c33d4083f561d65108d1809f1 Mon Sep 17 00:00:00 2001 From: xbotter Date: Mon, 6 Nov 2023 22:04:44 +0800 Subject: [PATCH 04/12] =?UTF-8?q?=F0=9F=94=80=20Refactor=20TestRunner.cs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Handle invalid choice in Examples.TryGetValue - Add exit condition for choice "Exit" - Display a rule before executing example - Clear console after each iteration 📝 This change refactors the TestRunner.cs file. It handles invalid choices in Examples.TryGetValue, adds an exit condition for the choice "Exit", displays a rule before executing an example, and clears the console after each iteration. 
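For reference, the menu loop introduced below reduces to roughly the following pattern. This is a sketch, not part of the patch: it assumes the example table is a Dictionary<string, Func<Task>> keyed by the description strings, and it uses Spectre.Console's AnsiConsole / SelectionPrompt API the same way the diff does. GetEmbeddings and QuantizeModel refer to the example classes in LLama.Examples.NewVersion.

    using System;
    using System.Collections.Generic;
    using System.Threading.Tasks;
    using Spectre.Console;

    // Sketch only: description string -> example entry point (Func<Task> assumed).
    var examples = new Dictionary<string, Func<Task>>
    {
        { "Get embeddings from LLama model.", () => GetEmbeddings.Run() },
        { "Quantize the model.", () => QuantizeModel.Run() },
        { "Exit", () => Task.CompletedTask }
    };

    AnsiConsole.Write(new Rule("LLamaSharp Examples"));
    while (true)
    {
        // Render a selection menu instead of parsing a typed number.
        var choice = AnsiConsole.Prompt(
            new SelectionPrompt<string>()
                .Title("Please choose [green]an example[/] to run: ")
                .AddChoices(examples.Keys));

        if (examples.TryGetValue(choice, out var example))
        {
            if (choice == "Exit")
                break;                           // exit condition for the "Exit" choice

            AnsiConsole.Write(new Rule(choice)); // rule displayed before running the example
            await example();
        }

        AnsiConsole.Clear();                     // console cleared after each iteration
    }
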
--- LLama.Examples/LLama.Examples.csproj | 1 + LLama.Examples/NewVersion/GetEmbeddings.cs | 3 +- LLama.Examples/NewVersion/QuantizeModel.cs | 4 +- LLama.Examples/NewVersion/TestRunner.cs | 135 ++++++--------------- 4 files changed, 46 insertions(+), 97 deletions(-) diff --git a/LLama.Examples/LLama.Examples.csproj b/LLama.Examples/LLama.Examples.csproj index 3ecacdfe..dcd40c19 100644 --- a/LLama.Examples/LLama.Examples.csproj +++ b/LLama.Examples/LLama.Examples.csproj @@ -30,6 +30,7 @@ + diff --git a/LLama.Examples/NewVersion/GetEmbeddings.cs b/LLama.Examples/NewVersion/GetEmbeddings.cs index fe9e3ea8..1e5b19be 100644 --- a/LLama.Examples/NewVersion/GetEmbeddings.cs +++ b/LLama.Examples/NewVersion/GetEmbeddings.cs @@ -4,7 +4,7 @@ namespace LLama.Examples.NewVersion { public class GetEmbeddings { - public static void Run() + public static Task Run() { Console.Write("Please input your model path: "); var modelPath = Console.ReadLine(); @@ -23,6 +23,7 @@ namespace LLama.Examples.NewVersion Console.WriteLine(string.Join(", ", embedder.GetEmbeddings(text))); Console.WriteLine(); } + return Task.CompletedTask; } } } diff --git a/LLama.Examples/NewVersion/QuantizeModel.cs b/LLama.Examples/NewVersion/QuantizeModel.cs index 71966af8..456d8929 100644 --- a/LLama.Examples/NewVersion/QuantizeModel.cs +++ b/LLama.Examples/NewVersion/QuantizeModel.cs @@ -2,7 +2,7 @@ { public class QuantizeModel { - public static void Run() + public static Task Run() { Console.Write("Please input your original model path: "); var inputPath = Console.ReadLine(); @@ -21,6 +21,8 @@ { Console.WriteLine("Quantization failed!"); } + + return Task.CompletedTask; } } } diff --git a/LLama.Examples/NewVersion/TestRunner.cs b/LLama.Examples/NewVersion/TestRunner.cs index a21a2eed..c89cba30 100644 --- a/LLama.Examples/NewVersion/TestRunner.cs +++ b/LLama.Examples/NewVersion/TestRunner.cs @@ -1,109 +1,54 @@ -namespace LLama.Examples.NewVersion +using System.Linq.Expressions; +using Spectre.Console; + +namespace LLama.Examples.NewVersion { public class NewVersionTestRunner { + static Dictionary> Examples = new Dictionary> + { + {"Run a chat session without stripping the role names.", () => ChatSessionWithRoleName.Run()}, + {"Run a chat session with the role names stripped.",()=> ChatSessionStripRoleName.Run()}, + {"Interactive mode chat by using executor.",()=> InteractiveModeExecute.Run()}, + {"Instruct mode chat by using executor.",()=> InstructModeExecute.Run()}, + {"Stateless mode chat by using executor.",()=> StatelessModeExecute.Run()}, + {"Load and save chat session.",()=> SaveAndLoadSession.Run()}, + {"Load and save state of model and executor.",()=> LoadAndSaveState.Run()}, + {"Get embeddings from LLama model.",()=> GetEmbeddings.Run()}, + {"Quantize the model.",()=> QuantizeModel.Run()}, + {"Automatic conversation.",()=> TalkToYourself.Run()}, + {"Constrain response to json format using grammar.",()=> GrammarJsonResponse.Run()}, + {"Semantic Kernel Prompt.",()=> SemanticKernelPrompt.Run()}, + {"Semantic Kernel Chat.",()=> SemanticKernelChat.Run()}, + {"Semantic Kernel Memory.",()=> SemanticKernelMemory.Run()}, + {"Coding Assistant.",()=> CodingAssistant.Run()}, + {"Batch Decoding.",()=> BatchedDecoding.Run()}, + {"SK Kernel Memory.",()=> KernelMemory.Run()}, + {"Exit", ()=> Task.CompletedTask} + }; public static async Task Run() { - Console.WriteLine("================LLamaSharp Examples (New Version)==================\n"); - - Console.WriteLine("Please input a number to choose an example to run:"); - Console.WriteLine("0: 
Run a chat session without stripping the role names."); - Console.WriteLine("1: Run a chat session with the role names stripped."); - Console.WriteLine("2: Interactive mode chat by using executor."); - Console.WriteLine("3: Instruct mode chat by using executor."); - Console.WriteLine("4: Stateless mode chat by using executor."); - Console.WriteLine("5: Load and save chat session."); - Console.WriteLine("6: Load and save state of model and executor."); - Console.WriteLine("7: Get embeddings from LLama model."); - Console.WriteLine("8: Quantize the model."); - Console.WriteLine("9: Automatic conversation."); - Console.WriteLine("10: Constrain response to json format using grammar."); - Console.WriteLine("11: Semantic Kernel Prompt."); - Console.WriteLine("12: Semantic Kernel Chat."); - Console.WriteLine("13: Semantic Kernel Memory."); - Console.WriteLine("14: Coding Assistant."); - Console.WriteLine("15: Batch Decoding."); - Console.WriteLine("16: SK Kernel Memory."); + AnsiConsole.Write(new Rule("LLamaSharp Examples")); while (true) { - Console.Write("\nYour choice: "); - int choice = int.Parse(Console.ReadLine()); + var choice = AnsiConsole.Prompt( + new SelectionPrompt() + .Title("Please choose[green] an example[/] to run: ") + .AddChoices(Examples.Keys)); - if (choice == 0) - { - await ChatSessionWithRoleName.Run(); - } - else if (choice == 1) - { - await ChatSessionStripRoleName.Run(); - } - else if (choice == 2) - { - await InteractiveModeExecute.Run(); - } - else if (choice == 3) - { - await InstructModeExecute.Run(); - } - else if (choice == 4) - { - await StatelessModeExecute.Run(); - } - else if (choice == 5) - { - await SaveAndLoadSession.Run(); - } - else if (choice == 6) - { - await LoadAndSaveState.Run(); - } - else if (choice == 7) - { - GetEmbeddings.Run(); - } - else if (choice == 8) - { - QuantizeModel.Run(); - } - else if (choice == 9) - { - await TalkToYourself.Run(); - } - else if (choice == 10) - { - await GrammarJsonResponse.Run(); - } - else if (choice == 11) - { - await SemanticKernelPrompt.Run(); - } - else if (choice == 12) - { - await SemanticKernelChat.Run(); - } - else if (choice == 13) - { - await SemanticKernelMemory.Run(); - } - else if (choice == 14) - { - await CodingAssistant.Run(); - } - else if (choice == 15) - { - await BatchedDecoding.Run(); - } - else if (choice == 16) - { - await KernelMemory.Run(); - } - else + + if (Examples.TryGetValue(choice, out var example)) { - Console.WriteLine("Cannot parse your choice. 
Please select again."); - continue; + if (choice == "Exit") + { + break; + } + AnsiConsole.Write(new Rule(choice)); + await example(); } - break; + + AnsiConsole.Clear(); } } } From 1dad1ff834e0f1425e2c85cace765df34930f41a Mon Sep 17 00:00:00 2001 From: Udayshankar Ravikumar Date: Tue, 7 Nov 2023 03:22:05 +0530 Subject: [PATCH 05/12] Enhance framework compatibility --- LLama/Common/FixedSizeQueue.cs | 4 +++- LLama/Extensions/DictionaryExtensions.cs | 4 +++- LLama/Extensions/EncodingExtensions.cs | 4 +++- LLama/Extensions/IEnumerableExtensions.cs | 4 +++- LLama/Extensions/KeyValuePairExtensions.cs | 4 +++- LLama/Extensions/ListExtensions.cs | 4 +++- 6 files changed, 18 insertions(+), 6 deletions(-) diff --git a/LLama/Common/FixedSizeQueue.cs b/LLama/Common/FixedSizeQueue.cs index d4577a47..4b665969 100644 --- a/LLama/Common/FixedSizeQueue.cs +++ b/LLama/Common/FixedSizeQueue.cs @@ -43,11 +43,13 @@ namespace LLama.Common /// public FixedSizeQueue(int size, IEnumerable data) { -#if !NETSTANDARD2_0 +#if NET6_0_OR_GREATER // Try to check the size without enumerating the entire IEnumerable. This may not be able to get the count, // in which case we'll have to check later if (data.TryGetNonEnumeratedCount(out var dataCount) && dataCount > size) throw new ArgumentException($"The max size set for the quene is {size}, but got {dataCount} initial values."); +#elif !NETSTANDARD2_0_OR_GREATER +#error Target framework not supported! #endif // Size of "data" is unknown, copy it all into a list diff --git a/LLama/Extensions/DictionaryExtensions.cs b/LLama/Extensions/DictionaryExtensions.cs index a39ed7e8..326a1686 100644 --- a/LLama/Extensions/DictionaryExtensions.cs +++ b/LLama/Extensions/DictionaryExtensions.cs @@ -4,11 +4,13 @@ namespace LLama.Extensions { internal static class DictionaryExtensions { -#if NETSTANDARD2_0 +#if NETSTANDARD2_0_OR_GREATER public static TValue GetValueOrDefault(this IReadOnlyDictionary dictionary, TKey key, TValue defaultValue) { return GetValueOrDefaultImpl(dictionary, key, defaultValue); } +#elif !NET6_0_OR_GREATER +#error Target framework not supported! #endif internal static TValue GetValueOrDefaultImpl(IReadOnlyDictionary dictionary, TKey key, TValue defaultValue) diff --git a/LLama/Extensions/EncodingExtensions.cs b/LLama/Extensions/EncodingExtensions.cs index e88d83a7..7766c244 100644 --- a/LLama/Extensions/EncodingExtensions.cs +++ b/LLama/Extensions/EncodingExtensions.cs @@ -5,7 +5,7 @@ namespace LLama.Extensions; internal static class EncodingExtensions { -#if NETSTANDARD2_0 +#if NETSTANDARD2_0_OR_GREATER public static int GetChars(this Encoding encoding, ReadOnlySpan bytes, Span output) { return GetCharsImpl(encoding, bytes, output); @@ -15,6 +15,8 @@ internal static class EncodingExtensions { return GetCharCountImpl(encoding, bytes); } +#elif !NET6_0_OR_GREATER +#error Target framework not supported! #endif internal static int GetCharsImpl(Encoding encoding, ReadOnlySpan bytes, Span output) diff --git a/LLama/Extensions/IEnumerableExtensions.cs b/LLama/Extensions/IEnumerableExtensions.cs index 9e01feb8..ab1a2a78 100644 --- a/LLama/Extensions/IEnumerableExtensions.cs +++ b/LLama/Extensions/IEnumerableExtensions.cs @@ -5,11 +5,13 @@ namespace LLama.Extensions { internal static class IEnumerableExtensions { -#if NETSTANDARD2_0 +#if NETSTANDARD2_0_OR_GREATER public static IEnumerable TakeLast(this IEnumerable source, int count) { return TakeLastImpl(source, count); } +#elif !NET6_0_OR_GREATER +#error Target framework not supported! 
#endif internal static IEnumerable TakeLastImpl(IEnumerable source, int count) diff --git a/LLama/Extensions/KeyValuePairExtensions.cs b/LLama/Extensions/KeyValuePairExtensions.cs index 6e12654d..610701b7 100644 --- a/LLama/Extensions/KeyValuePairExtensions.cs +++ b/LLama/Extensions/KeyValuePairExtensions.cs @@ -5,7 +5,7 @@ /// internal static class KeyValuePairExtensions { -#if NETSTANDARD2_0 +#if NETSTANDARD2_0_OR_GREATER /// /// Deconstruct a KeyValuePair into it's constituent parts. /// @@ -19,5 +19,7 @@ internal static class KeyValuePairExtensions first = pair.Key; second = pair.Value; } +#elif !NET6_0_OR_GREATER +#error Target framework not supported! #endif } \ No newline at end of file diff --git a/LLama/Extensions/ListExtensions.cs b/LLama/Extensions/ListExtensions.cs index 11a1d4f0..08a841bd 100644 --- a/LLama/Extensions/ListExtensions.cs +++ b/LLama/Extensions/ListExtensions.cs @@ -5,12 +5,14 @@ namespace LLama.Extensions { internal static class ListExtensions { -#if NETSTANDARD2_0 +#if NETSTANDARD2_0_OR_GREATER public static void EnsureCapacity(this List list, int capacity) { if (list.Capacity < capacity) list.Capacity = capacity; } +#elif !NET6_0_OR_GREATER +#error Target framework not supported! #endif public static void AddSpan(this List list, ReadOnlySpan items) From df310e15da331647ba1aff2dfffe22df51cb0880 Mon Sep 17 00:00:00 2001 From: Udayshankar Ravikumar Date: Tue, 7 Nov 2023 04:16:14 +0530 Subject: [PATCH 06/12] Fixed preprocessor directives --- LLama/Extensions/DictionaryExtensions.cs | 4 ++-- LLama/Extensions/EncodingExtensions.cs | 4 ++-- LLama/Extensions/IEnumerableExtensions.cs | 4 ++-- LLama/Extensions/KeyValuePairExtensions.cs | 4 ++-- LLama/Extensions/ListExtensions.cs | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/LLama/Extensions/DictionaryExtensions.cs b/LLama/Extensions/DictionaryExtensions.cs index 326a1686..1af0e9e1 100644 --- a/LLama/Extensions/DictionaryExtensions.cs +++ b/LLama/Extensions/DictionaryExtensions.cs @@ -4,12 +4,12 @@ namespace LLama.Extensions { internal static class DictionaryExtensions { -#if NETSTANDARD2_0_OR_GREATER +#if NETSTANDARD2_0 public static TValue GetValueOrDefault(this IReadOnlyDictionary dictionary, TKey key, TValue defaultValue) { return GetValueOrDefaultImpl(dictionary, key, defaultValue); } -#elif !NET6_0_OR_GREATER +#elif !NET6_0_OR_GREATER && !NETSTANDARD2_1_OR_GREATER #error Target framework not supported! #endif diff --git a/LLama/Extensions/EncodingExtensions.cs b/LLama/Extensions/EncodingExtensions.cs index 7766c244..5005b16c 100644 --- a/LLama/Extensions/EncodingExtensions.cs +++ b/LLama/Extensions/EncodingExtensions.cs @@ -5,7 +5,7 @@ namespace LLama.Extensions; internal static class EncodingExtensions { -#if NETSTANDARD2_0_OR_GREATER +#if NETSTANDARD2_0 public static int GetChars(this Encoding encoding, ReadOnlySpan bytes, Span output) { return GetCharsImpl(encoding, bytes, output); @@ -15,7 +15,7 @@ internal static class EncodingExtensions { return GetCharCountImpl(encoding, bytes); } -#elif !NET6_0_OR_GREATER +#elif !NET6_0_OR_GREATER && !NETSTANDARD2_1_OR_GREATER #error Target framework not supported! 
#endif diff --git a/LLama/Extensions/IEnumerableExtensions.cs b/LLama/Extensions/IEnumerableExtensions.cs index ab1a2a78..17428d29 100644 --- a/LLama/Extensions/IEnumerableExtensions.cs +++ b/LLama/Extensions/IEnumerableExtensions.cs @@ -5,12 +5,12 @@ namespace LLama.Extensions { internal static class IEnumerableExtensions { -#if NETSTANDARD2_0_OR_GREATER +#if NETSTANDARD2_0 public static IEnumerable TakeLast(this IEnumerable source, int count) { return TakeLastImpl(source, count); } -#elif !NET6_0_OR_GREATER +#elif !NET6_0_OR_GREATER && !NETSTANDARD2_1_OR_GREATER #error Target framework not supported! #endif diff --git a/LLama/Extensions/KeyValuePairExtensions.cs b/LLama/Extensions/KeyValuePairExtensions.cs index 610701b7..233195ed 100644 --- a/LLama/Extensions/KeyValuePairExtensions.cs +++ b/LLama/Extensions/KeyValuePairExtensions.cs @@ -5,7 +5,7 @@ /// internal static class KeyValuePairExtensions { -#if NETSTANDARD2_0_OR_GREATER +#if NETSTANDARD2_0 /// /// Deconstruct a KeyValuePair into it's constituent parts. /// @@ -19,7 +19,7 @@ internal static class KeyValuePairExtensions first = pair.Key; second = pair.Value; } -#elif !NET6_0_OR_GREATER +#elif !NET6_0_OR_GREATER && !NETSTANDARD2_1_OR_GREATER #error Target framework not supported! #endif } \ No newline at end of file diff --git a/LLama/Extensions/ListExtensions.cs b/LLama/Extensions/ListExtensions.cs index 08a841bd..003797dc 100644 --- a/LLama/Extensions/ListExtensions.cs +++ b/LLama/Extensions/ListExtensions.cs @@ -5,7 +5,7 @@ namespace LLama.Extensions { internal static class ListExtensions { -#if NETSTANDARD2_0_OR_GREATER +#if NETSTANDARD2_0 || NETSTANDARD2_1 public static void EnsureCapacity(this List list, int capacity) { if (list.Capacity < capacity) From a288e7c02bf0cc7b74aaca989e6dbd3913040db5 Mon Sep 17 00:00:00 2001 From: Philipp Bauer Date: Mon, 6 Nov 2023 18:20:07 -0600 Subject: [PATCH 07/12] Prevent duplication of user prompts / chat history in ChatSession. The way ChatSession.ChatAsync was using the provided methods from a IHistoryTransform interface implementation created unexpected duplication of the chat history messages. It also prevented loading previous history into the current session. --- LLama/ChatSession.cs | 50 ++++++++++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/LLama/ChatSession.cs b/LLama/ChatSession.cs index 457e7e48..358d70c3 100644 --- a/LLama/ChatSession.cs +++ b/LLama/ChatSession.cs @@ -95,11 +95,11 @@ namespace LLama Directory.CreateDirectory(path); } _executor.Context.SaveState(Path.Combine(path, _modelStateFilename)); - if(Executor is StatelessExecutor) + if (Executor is StatelessExecutor) { } - else if(Executor is StatefulExecutorBase statefulExecutor) + else if (Executor is StatefulExecutorBase statefulExecutor) { statefulExecutor.SaveState(Path.Combine(path, _executorStateFilename)); } @@ -135,30 +135,54 @@ namespace LLama } /// - /// Get the response from the LLama model. Note that prompt could not only be the preset words, - /// but also the question you want to ask. + /// Generates a response for a given user prompt and manages history state for the user. + /// This will always pass the whole history to the model. Don't pass a whole history + /// to this method as the user prompt will be appended to the history of the current session. + /// If more control is needed, use the other overload of this method that accepts a ChatHistory object. /// /// /// /// - /// + /// Returns generated tokens of the assistant message. 
public async IAsyncEnumerable ChatAsync(string prompt, IInferenceParams? inferenceParams = null, [EnumeratorCancellation] CancellationToken cancellationToken = default) { - foreach(var inputTransform in InputTransformPipeline) + foreach (var inputTransform in InputTransformPipeline) prompt = inputTransform.Transform(prompt); - - History.Messages.AddRange(HistoryTransform.TextToHistory(AuthorRole.User, prompt).Messages); + + History.Messages.Add(new ChatHistory.Message(AuthorRole.User, prompt)); + + string internalPrompt = HistoryTransform.HistoryToText(History); + StringBuilder sb = new(); - await foreach (var result in ChatAsyncInternal(prompt, inferenceParams, cancellationToken)) + + await foreach (var result in ChatAsyncInternal(internalPrompt, inferenceParams, cancellationToken)) { yield return result; sb.Append(result); } - History.Messages.AddRange(HistoryTransform.TextToHistory(AuthorRole.Assistant, sb.ToString()).Messages); + + string assistantMessage = sb.ToString(); + + // Remove end tokens from the assistant message + // if defined in inferenceParams.AntiPrompts. + // We only want the response that was generated and not tokens + // that are delimiting the beginning or end of the response. + if (inferenceParams?.AntiPrompts != null) + { + foreach (var stopToken in inferenceParams.AntiPrompts) + { + assistantMessage = assistantMessage.Replace(stopToken, ""); + } + } + + History.Messages.Add(new ChatHistory.Message(AuthorRole.Assistant, assistantMessage)); } /// - /// Get the response from the LLama model with chat histories. + /// Generates a response for a given chat history. This method does not manage history state for the user. + /// If you want to e.g. truncate the history of a session to fit into the model's context window, + /// use this method and pass the truncated history to it. If you don't need this control, use the other + /// overload of this method that accepts a user prompt instead. /// /// /// @@ -167,14 +191,14 @@ namespace LLama public async IAsyncEnumerable ChatAsync(ChatHistory history, IInferenceParams? inferenceParams = null, [EnumeratorCancellation] CancellationToken cancellationToken = default) { var prompt = HistoryTransform.HistoryToText(history); - History.Messages.AddRange(HistoryTransform.TextToHistory(AuthorRole.User, prompt).Messages); + StringBuilder sb = new(); + await foreach (var result in ChatAsyncInternal(prompt, inferenceParams, cancellationToken)) { yield return result; sb.Append(result); } - History.Messages.AddRange(HistoryTransform.TextToHistory(AuthorRole.Assistant, sb.ToString()).Messages); } private async IAsyncEnumerable ChatAsyncInternal(string prompt, IInferenceParams? 
inferenceParams = null, [EnumeratorCancellation] CancellationToken cancellationToken = default) From 4071c1f5fcf2976246469100e756cd4a5a76e016 Mon Sep 17 00:00:00 2001 From: Udayshankar Ravikumar Date: Tue, 7 Nov 2023 14:05:00 +0530 Subject: [PATCH 08/12] Updated preprocessor directives --- LLama/Common/FixedSizeQueue.cs | 2 -- LLama/Extensions/ListExtensions.cs | 4 +--- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/LLama/Common/FixedSizeQueue.cs b/LLama/Common/FixedSizeQueue.cs index 4b665969..37fb1cf5 100644 --- a/LLama/Common/FixedSizeQueue.cs +++ b/LLama/Common/FixedSizeQueue.cs @@ -48,8 +48,6 @@ namespace LLama.Common // in which case we'll have to check later if (data.TryGetNonEnumeratedCount(out var dataCount) && dataCount > size) throw new ArgumentException($"The max size set for the quene is {size}, but got {dataCount} initial values."); -#elif !NETSTANDARD2_0_OR_GREATER -#error Target framework not supported! #endif // Size of "data" is unknown, copy it all into a list diff --git a/LLama/Extensions/ListExtensions.cs b/LLama/Extensions/ListExtensions.cs index 003797dc..eb30a07a 100644 --- a/LLama/Extensions/ListExtensions.cs +++ b/LLama/Extensions/ListExtensions.cs @@ -5,14 +5,12 @@ namespace LLama.Extensions { internal static class ListExtensions { -#if NETSTANDARD2_0 || NETSTANDARD2_1 +#if !NET6_0_OR_GREATER public static void EnsureCapacity(this List list, int capacity) { if (list.Capacity < capacity) list.Capacity = capacity; } -#elif !NET6_0_OR_GREATER -#error Target framework not supported! #endif public static void AddSpan(this List list, ReadOnlySpan items) From db1bc741b039b6944a61477486a6e3c4ec10d51d Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Tue, 7 Nov 2023 19:41:44 +0000 Subject: [PATCH 09/12] Modified `ContextSize` in parameters to be nullable. A null value means autodetect from the model. --- LLama.Web/Common/ModelOptions.cs | 4 +- LLama/Abstractions/IContextParams.cs | 4 +- LLama/Common/ModelParams.cs | 101 ++++++------------- LLama/Extensions/IContextParamsExtensions.cs | 2 +- LLama/Native/LLamaContextParams.cs | 2 +- 5 files changed, 38 insertions(+), 75 deletions(-) diff --git a/LLama.Web/Common/ModelOptions.cs b/LLama.Web/Common/ModelOptions.cs index 6a63ccc3..8cbf2f09 100644 --- a/LLama.Web/Common/ModelOptions.cs +++ b/LLama.Web/Common/ModelOptions.cs @@ -17,9 +17,9 @@ namespace LLama.Web.Common public int MaxInstances { get; set; } /// - /// Model context size (n_ctx) + /// Model context size (n_ctx). Null to use value from model. /// - public uint ContextSize { get; set; } = 512; + public uint? ContextSize { get; set; } /// /// the GPU that is used for scratch and small tensors diff --git a/LLama/Abstractions/IContextParams.cs b/LLama/Abstractions/IContextParams.cs index 8ff6d7cc..a2ac24f1 100644 --- a/LLama/Abstractions/IContextParams.cs +++ b/LLama/Abstractions/IContextParams.cs @@ -8,9 +8,9 @@ namespace LLama.Abstractions; public interface IContextParams { /// - /// Model context size (n_ctx) + /// Model context size (n_ctx). Null to use value from model file. /// - uint ContextSize { get; set; } + uint? 
ContextSize { get; set; } /// /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch) diff --git a/LLama/Common/ModelParams.cs b/LLama/Common/ModelParams.cs index ee5bd3e4..9561e482 100644 --- a/LLama/Common/ModelParams.cs +++ b/LLama/Common/ModelParams.cs @@ -12,105 +12,68 @@ namespace LLama.Common public record ModelParams : ILLamaParams { - /// - /// Model context size (n_ctx) - /// - public uint ContextSize { get; set; } = 512; - /// - /// the GPU that is used for scratch and small tensors - /// + /// + public uint? ContextSize { get; set; } + + /// public int MainGpu { get; set; } = 0; - /// - /// Number of layers to run in VRAM / GPU memory (n_gpu_layers) - /// + /// public int GpuLayerCount { get; set; } = 20; - /// - /// Seed for the random number generator (seed) - /// + + /// public uint Seed { get; set; } = 0xFFFFFFFF; - /// - /// Use f16 instead of f32 for memory kv (memory_f16) - /// + + /// public bool UseFp16Memory { get; set; } = true; - /// - /// Use mmap for faster loads (use_mmap) - /// + + /// public bool UseMemorymap { get; set; } = true; - /// - /// Use mlock to keep model in memory (use_mlock) - /// + + /// public bool UseMemoryLock { get; set; } - /// - /// Compute perplexity over the prompt (perplexity) - /// + + /// public bool Perplexity { get; set; } - /// - /// Model path (model) - /// + + /// public string ModelPath { get; set; } - /// - /// List of LoRAs to apply - /// + /// public AdapterCollection LoraAdapters { get; set; } = new(); - /// - /// base model path for the lora adapter (lora_base) - /// + /// public string LoraBase { get; set; } = string.Empty; - /// - /// Number of threads (null = autodetect) (n_threads) - /// + /// public uint? Threads { get; set; } - /// - /// Number of threads to use for batch processing (null = autodetect) (n_threads) - /// + /// public uint? BatchThreads { get; set; } - /// - /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch) - /// + /// public uint BatchSize { get; set; } = 512; - /// - /// Whether to use embedding mode. (embedding) Note that if this is set to true, - /// The LLamaModel won't produce text response anymore. - /// + /// public bool EmbeddingMode { get; set; } - /// - /// how split tensors should be distributed across GPUs. - /// - /// "[ 3, 2 ]" will assign 60% of the data to GPU 0 and 40% to GPU 1. + /// [JsonConverter(typeof(TensorSplitsCollectionConverter))] public TensorSplitsCollection TensorSplits { get; set; } = new(); - /// - /// RoPE base frequency - /// - public float? RopeFrequencyBase { get; set; } + /// + public float? RopeFrequencyBase { get; set; } - /// - /// RoPE frequency scaling factor - /// - public float? RopeFrequencyScale { get; set; } + /// + public float? 
RopeFrequencyScale { get; set; } - /// - /// Use experimental mul_mat_q kernels - /// - public bool MulMatQ { get; set; } + /// + public bool MulMatQ { get; set; } - /// - /// Load vocab only (no weights) - /// + /// public bool VocabOnly { get; set; } - /// - /// The encoding to use to convert text for the model - /// + /// [JsonConverter(typeof(EncodingConverter))] public Encoding Encoding { get; set; } = Encoding.UTF8; diff --git a/LLama/Extensions/IContextParamsExtensions.cs b/LLama/Extensions/IContextParamsExtensions.cs index fcc9d372..ed59c9df 100644 --- a/LLama/Extensions/IContextParamsExtensions.cs +++ b/LLama/Extensions/IContextParamsExtensions.cs @@ -21,7 +21,7 @@ namespace LLama.Extensions public static void ToLlamaContextParams(this IContextParams @params, out LLamaContextParams result) { result = NativeApi.llama_context_default_params(); - result.n_ctx = @params.ContextSize; + result.n_ctx = @params.ContextSize ?? 0; result.n_batch = @params.BatchSize; result.seed = @params.Seed; result.f16_kv = @params.UseFp16Memory; diff --git a/LLama/Native/LLamaContextParams.cs b/LLama/Native/LLamaContextParams.cs index 0a397a3d..9a0b2a8e 100644 --- a/LLama/Native/LLamaContextParams.cs +++ b/LLama/Native/LLamaContextParams.cs @@ -22,7 +22,7 @@ namespace LLama.Native public uint seed; /// - /// text context + /// text context, 0 = from model /// public uint n_ctx; From 13233147f470add12bdf1d2edfabd20ee002af98 Mon Sep 17 00:00:00 2001 From: Rinne Date: Thu, 9 Nov 2023 00:54:31 +0800 Subject: [PATCH 10/12] docs: fix typo in README. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ea72e8a4..397f641e 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ **The C#/.NET binding of [llama.cpp](https://github.com/ggerganov/llama.cpp). It provides higher-level APIs to inference the LLaMA Models and deploy it on local device with C#/.NET. It works on -both Windows, Linux and MAC without requirment for compiling llama.cpp yourself. Even without GPU or not enought GPU memory, you can still apply LLaMA models well with this repo. 🤗** +both Windows, Linux and MAC without requirment for compiling llama.cpp yourself. Even without GPU or not enough GPU memory, you can still apply LLaMA models well with this repo. 
🤗** **Furthermore, it provides integrations with other projects such as [semantic-kernel](https://github.com/microsoft/semantic-kernel), [kernel-memory](https://github.com/microsoft/kernel-memory) and [BotSharp](https://github.com/SciSharp/BotSharp) to provide higher-level applications.** From 6ea40d15461a5243bfdd453bd6d2ede3cdaa5eaa Mon Sep 17 00:00:00 2001 From: Philipp Bauer Date: Wed, 8 Nov 2023 13:18:32 -0600 Subject: [PATCH 11/12] Use full history only when the ChatSession runs the first time --- LLama/ChatSession.cs | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/LLama/ChatSession.cs b/LLama/ChatSession.cs index 358d70c3..68c3c093 100644 --- a/LLama/ChatSession.cs +++ b/LLama/ChatSession.cs @@ -1,11 +1,14 @@ using LLama.Abstractions; using LLama.Common; +using System; using System.Collections.Generic; using System.IO; +using System.Linq; using System.Runtime.CompilerServices; using System.Text; using System.Threading; using System.Threading.Tasks; +using static LLama.InteractiveExecutor; namespace LLama { @@ -151,11 +154,17 @@ namespace LLama History.Messages.Add(new ChatHistory.Message(AuthorRole.User, prompt)); - string internalPrompt = HistoryTransform.HistoryToText(History); + if (_executor is InteractiveExecutor executor) + { + InteractiveExecutorState state = (InteractiveExecutorState)executor.GetStateData(); + prompt = state.IsPromptRun + ? HistoryTransform.HistoryToText(History) + : prompt; + } StringBuilder sb = new(); - await foreach (var result in ChatAsyncInternal(internalPrompt, inferenceParams, cancellationToken)) + await foreach (var result in ChatAsyncInternal(prompt, inferenceParams, cancellationToken)) { yield return result; sb.Append(result); @@ -190,14 +199,28 @@ namespace LLama /// public async IAsyncEnumerable ChatAsync(ChatHistory history, IInferenceParams? inferenceParams = null, [EnumeratorCancellation] CancellationToken cancellationToken = default) { - var prompt = HistoryTransform.HistoryToText(history); + if (history.Messages.Count == 0) + { + throw new ArgumentException("History must contain at least one message."); + } - StringBuilder sb = new(); + string prompt; + if (_executor is InteractiveExecutor executor) + { + InteractiveExecutorState state = (InteractiveExecutorState)executor.GetStateData(); + + prompt = state.IsPromptRun + ? HistoryTransform.HistoryToText(History) + : history.Messages.Last().Content; + } + else + { + prompt = history.Messages.Last().Content; + } await foreach (var result in ChatAsyncInternal(prompt, inferenceParams, cancellationToken)) { yield return result; - sb.Append(result); } } From d2b544afb8225600ff9b4d07112315d8eddbffd7 Mon Sep 17 00:00:00 2001 From: Philipp Bauer Date: Wed, 8 Nov 2023 13:23:21 -0600 Subject: [PATCH 12/12] Improved method return description --- LLama/ChatSession.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/LLama/ChatSession.cs b/LLama/ChatSession.cs index 68c3c093..7ee99590 100644 --- a/LLama/ChatSession.cs +++ b/LLama/ChatSession.cs @@ -146,7 +146,7 @@ namespace LLama /// /// /// - /// Returns generated tokens of the assistant message. + /// Returns generated text of the assistant message. public async IAsyncEnumerable ChatAsync(string prompt, IInferenceParams? inferenceParams = null, [EnumeratorCancellation] CancellationToken cancellationToken = default) { foreach (var inputTransform in InputTransformPipeline) @@ -196,7 +196,7 @@ namespace LLama /// /// /// - /// + /// Returns generated text of the assistant message. 
public async IAsyncEnumerable ChatAsync(ChatHistory history, IInferenceParams? inferenceParams = null, [EnumeratorCancellation] CancellationToken cancellationToken = default) { if (history.Messages.Count == 0)