Merge branch 'master' into YaRN_scaling_parameters

Tag: v0.8.0
Author: Martin Evans · 2 years ago
Commit: 31244ae691
19 changed files with 188 additions and 193 deletions

1. LLama.Examples/LLama.Examples.csproj (+2, -1)
2. LLama.Examples/NewVersion/GetEmbeddings.cs (+2, -1)
3. LLama.Examples/NewVersion/QuantizeModel.cs (+3, -1)
4. LLama.Examples/NewVersion/TestRunner.cs (+40, -95)
5. LLama.KernelMemory/LLamaSharp.KernelMemory.csproj (+21, -0)
6. LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj (+3, -3)
7. LLama.Web/Common/ModelOptions.cs (+2, -2)
8. LLama/Abstractions/IContextParams.cs (+2, -2)
9. LLama/ChatSession.cs (+63, -16)
10. LLama/Common/FixedSizeQueue.cs (+1, -1)
11. LLama/Common/ModelParams.cs (+30, -66)
12. LLama/Extensions/DictionaryExtensions.cs (+2, -0)
13. LLama/Extensions/EncodingExtensions.cs (+2, -0)
14. LLama/Extensions/IContextParamsExtensions.cs (+1, -1)
15. LLama/Extensions/IEnumerableExtensions.cs (+2, -0)
16. LLama/Extensions/KeyValuePairExtensions.cs (+2, -0)
17. LLama/Extensions/ListExtensions.cs (+1, -1)
18. LLama/Native/LLamaContextParams.cs (+1, -1)
19. README.md (+8, -2)

LLama.Examples/LLama.Examples.csproj (+2, -1)

@@ -29,7 +29,8 @@

<ItemGroup>
<PackageReference Include="Microsoft.Extensions.Logging.Console" Version="7.0.0" />
<PackageReference Include="Microsoft.SemanticKernel" Version="1.0.0-beta4" />
<PackageReference Include="Microsoft.SemanticKernel" Version="1.0.0-beta1" />
<PackageReference Include="Spectre.Console" Version="0.47.0" />
</ItemGroup>

<ItemGroup>


LLama.Examples/NewVersion/GetEmbeddings.cs (+2, -1)

@@ -4,7 +4,7 @@ namespace LLama.Examples.NewVersion
{
public class GetEmbeddings
{
public static void Run()
public static Task Run()
{
Console.Write("Please input your model path: ");
var modelPath = Console.ReadLine();
@@ -23,6 +23,7 @@ namespace LLama.Examples.NewVersion
Console.WriteLine(string.Join(", ", embedder.GetEmbeddings(text)));
Console.WriteLine();
}
return Task.CompletedTask;
}
}
}

LLama.Examples/NewVersion/QuantizeModel.cs (+3, -1)

@@ -2,7 +2,7 @@
{
public class QuantizeModel
{
public static void Run()
public static Task Run()
{
Console.Write("Please input your original model path: ");
var inputPath = Console.ReadLine();
@@ -21,6 +21,8 @@
{
Console.WriteLine("Quantization failed!");
}

return Task.CompletedTask;
}
}
}

LLama.Examples/NewVersion/TestRunner.cs (+40, -95)

@@ -1,109 +1,54 @@
namespace LLama.Examples.NewVersion
using System.Linq.Expressions;
using Spectre.Console;

namespace LLama.Examples.NewVersion
{
public class NewVersionTestRunner
{
static Dictionary<string, Func<Task>> Examples = new Dictionary<string, Func<Task>>
{
{"Run a chat session without stripping the role names.", () => ChatSessionWithRoleName.Run()},
{"Run a chat session with the role names stripped.",()=> ChatSessionStripRoleName.Run()},
{"Interactive mode chat by using executor.",()=> InteractiveModeExecute.Run()},
{"Instruct mode chat by using executor.",()=> InstructModeExecute.Run()},
{"Stateless mode chat by using executor.",()=> StatelessModeExecute.Run()},
{"Load and save chat session.",()=> SaveAndLoadSession.Run()},
{"Load and save state of model and executor.",()=> LoadAndSaveState.Run()},
{"Get embeddings from LLama model.",()=> GetEmbeddings.Run()},
{"Quantize the model.",()=> QuantizeModel.Run()},
{"Automatic conversation.",()=> TalkToYourself.Run()},
{"Constrain response to json format using grammar.",()=> GrammarJsonResponse.Run()},
{"Semantic Kernel Prompt.",()=> SemanticKernelPrompt.Run()},
{"Semantic Kernel Chat.",()=> SemanticKernelChat.Run()},
{"Semantic Kernel Memory.",()=> SemanticKernelMemory.Run()},
{"Coding Assistant.",()=> CodingAssistant.Run()},
{"Batch Decoding.",()=> BatchedDecoding.Run()},
{"SK Kernel Memory.",()=> KernelMemory.Run()},
{"Exit", ()=> Task.CompletedTask}
};
public static async Task Run()
{
Console.WriteLine("================LLamaSharp Examples (New Version)==================\n");

Console.WriteLine("Please input a number to choose an example to run:");
Console.WriteLine("0: Run a chat session without stripping the role names.");
Console.WriteLine("1: Run a chat session with the role names stripped.");
Console.WriteLine("2: Interactive mode chat by using executor.");
Console.WriteLine("3: Instruct mode chat by using executor.");
Console.WriteLine("4: Stateless mode chat by using executor.");
Console.WriteLine("5: Load and save chat session.");
Console.WriteLine("6: Load and save state of model and executor.");
Console.WriteLine("7: Get embeddings from LLama model.");
Console.WriteLine("8: Quantize the model.");
Console.WriteLine("9: Automatic conversation.");
Console.WriteLine("10: Constrain response to json format using grammar.");
Console.WriteLine("11: Semantic Kernel Prompt.");
Console.WriteLine("12: Semantic Kernel Chat.");
Console.WriteLine("13: Semantic Kernel Memory.");
Console.WriteLine("14: Coding Assistant.");
Console.WriteLine("15: Batch Decoding.");
Console.WriteLine("16: SK Kernel Memory.");
AnsiConsole.Write(new Rule("LLamaSharp Examples"));

while (true)
{
Console.Write("\nYour choice: ");
int choice = int.Parse(Console.ReadLine());
var choice = AnsiConsole.Prompt(
new SelectionPrompt<string>()
.Title("Please choose[green] an example[/] to run: ")
.AddChoices(Examples.Keys));

if (choice == 0)
{
await ChatSessionWithRoleName.Run();
}
else if (choice == 1)
{
await ChatSessionStripRoleName.Run();
}
else if (choice == 2)
{
await InteractiveModeExecute.Run();
}
else if (choice == 3)
{
await InstructModeExecute.Run();
}
else if (choice == 4)
{
await StatelessModeExecute.Run();
}
else if (choice == 5)
{
await SaveAndLoadSession.Run();
}
else if (choice == 6)
{
await LoadAndSaveState.Run();
}
else if (choice == 7)
{
GetEmbeddings.Run();
}
else if (choice == 8)
{
QuantizeModel.Run();
}
else if (choice == 9)
{
await TalkToYourself.Run();
}
else if (choice == 10)
{
await GrammarJsonResponse.Run();
}
else if (choice == 11)
{
await SemanticKernelPrompt.Run();
}
else if (choice == 12)
{
await SemanticKernelChat.Run();
}
else if (choice == 13)
{
await SemanticKernelMemory.Run();
}
else if (choice == 14)
{
await CodingAssistant.Run();
}
else if (choice == 15)
{
await BatchedDecoding.Run();
}
else if (choice == 16)
{
await KernelMemory.Run();
}
else

if (Examples.TryGetValue(choice, out var example))
{
Console.WriteLine("Cannot parse your choice. Please select again.");
continue;
if (choice == "Exit")
{
break;
}
AnsiConsole.Write(new Rule(choice));
await example();
}
break;

AnsiConsole.Clear();
}
}
}


LLama.KernelMemory/LLamaSharp.KernelMemory.csproj (+21, -0)

@@ -4,6 +4,27 @@
<TargetFramework>net6.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>

<Version>0.7.1</Version>
<Authors>Xbotter</Authors>
<Company>SciSharp STACK</Company>
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
<Copyright>MIT, SciSharp STACK $([System.DateTime]::UtcNow.ToString(yyyy))</Copyright>
<RepositoryUrl>https://github.com/SciSharp/LLamaSharp</RepositoryUrl>
<RepositoryType>git</RepositoryType>
<PackageIconUrl>https://avatars3.githubusercontent.com/u/44989469?s=200&amp;v=4</PackageIconUrl>
<PackageTags>LLama, LLM, GPT, ChatGPT, kernel-memory, vector search, SciSharp</PackageTags>
<Description>
The integration of LLamaSharp and Microsoft kernel-memory. It could make it easy to support document search for LLamaSharp model inference.
</Description>
<PackageReleaseNotes>
Support integration with kernel-memory
</PackageReleaseNotes>
<PackageLicenseExpression>MIT</PackageLicenseExpression>
<PackageOutputPath>packages</PackageOutputPath>
<Platforms>AnyCPU;x64;Arm64</Platforms>
<PackageId>LLamaSharp.kernel-memory</PackageId>
<Configurations>Debug;Release;GPU</Configurations>
</PropertyGroup>

<ItemGroup>


LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj (+3, -3)

@@ -10,8 +10,8 @@
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>

<Version>0.6.2-beta1</Version>
<Authors>Tim Miller</Authors>
<Version>0.7.1</Version>
<Authors>Tim Miller, Xbotter</Authors>
<Company>SciSharp STACK</Company>
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
<Copyright>MIT, SciSharp STACK $([System.DateTime]::UtcNow.ToString(yyyy))</Copyright>
@@ -20,7 +20,7 @@
<PackageIconUrl>https://avatars3.githubusercontent.com/u/44989469?s=200&amp;v=4</PackageIconUrl>
<PackageTags>LLama, LLM, GPT, ChatGPT, semantic-kernel, SciSharp</PackageTags>
<Description>
The integration of LLamaSharp ans semantic-kernel.
The integration of LLamaSharp and Microsoft semantic-kernel.
</Description>
<PackageReleaseNotes>
Support integration with semantic-kernel


LLama.Web/Common/ModelOptions.cs (+2, -2)

@@ -18,9 +18,9 @@ namespace LLama.Web.Common
public int MaxInstances { get; set; }

/// <summary>
/// Model context size (n_ctx)
/// Model context size (n_ctx). Null to use value from model.
/// </summary>
public uint ContextSize { get; set; } = 512;
public uint? ContextSize { get; set; }

/// <summary>
/// the GPU that is used for scratch and small tensors


LLama/Abstractions/IContextParams.cs (+2, -2)

@@ -9,9 +9,9 @@ namespace LLama.Abstractions;
public interface IContextParams
{
/// <summary>
/// Model context size (n_ctx)
/// Model context size (n_ctx). Null to use value from model file.
/// </summary>
uint ContextSize { get; set; }
uint? ContextSize { get; set; }

/// <summary>
/// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
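
Note on the nullable `ContextSize`: as the changes to IContextParamsExtensions and LLamaContextParams further down show, a `null` value is passed to the native layer as `n_ctx = 0`, which llama.cpp treats as "use the context length stored in the model file". A minimal sketch of how a caller might rely on that, assuming the `ModelParams` / `LLamaWeights.LoadFromFile` / `CreateContext` loading pattern used elsewhere in the repository; the model path is a placeholder:

```csharp
using LLama;
using LLama.Common;

// Placeholder model path, purely illustrative.
var parameters = new ModelParams("models/llama-2-7b.Q4_K_M.gguf")
{
    // ContextSize is left null: the native layer receives n_ctx = 0 and
    // llama.cpp falls back to the context length stored in the GGUF metadata.
    GpuLayerCount = 20
};

using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
// The context is now sized by the model's own default rather than a hard-coded 512.
```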


+ 63
- 16
LLama/ChatSession.cs View File

@@ -1,11 +1,14 @@
using LLama.Abstractions;
using LLama.Common;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using static LLama.InteractiveExecutor;

namespace LLama
{
@@ -95,11 +98,11 @@ namespace LLama
Directory.CreateDirectory(path);
}
_executor.Context.SaveState(Path.Combine(path, _modelStateFilename));
if(Executor is StatelessExecutor)
if (Executor is StatelessExecutor)
{

}
else if(Executor is StatefulExecutorBase statefulExecutor)
else if (Executor is StatefulExecutorBase statefulExecutor)
{
statefulExecutor.SaveState(Path.Combine(path, _executorStateFilename));
}
@@ -135,46 +138,90 @@ namespace LLama
}

/// <summary>
/// Get the response from the LLama model. Note that prompt could not only be the preset words,
/// but also the question you want to ask.
/// Generates a response for a given user prompt and manages history state for the user.
/// This will always pass the whole history to the model. Don't pass a whole history
/// to this method as the user prompt will be appended to the history of the current session.
/// If more control is needed, use the other overload of this method that accepts a ChatHistory object.
/// </summary>
/// <param name="prompt"></param>
/// <param name="inferenceParams"></param>
/// <param name="cancellationToken"></param>
/// <returns></returns>
/// <returns>Returns generated text of the assistant message.</returns>
public async IAsyncEnumerable<string> ChatAsync(string prompt, IInferenceParams? inferenceParams = null, [EnumeratorCancellation] CancellationToken cancellationToken = default)
{
foreach(var inputTransform in InputTransformPipeline)
foreach (var inputTransform in InputTransformPipeline)
prompt = inputTransform.Transform(prompt);
History.Messages.AddRange(HistoryTransform.TextToHistory(AuthorRole.User, prompt).Messages);

History.Messages.Add(new ChatHistory.Message(AuthorRole.User, prompt));

if (_executor is InteractiveExecutor executor)
{
InteractiveExecutorState state = (InteractiveExecutorState)executor.GetStateData();
prompt = state.IsPromptRun
? HistoryTransform.HistoryToText(History)
: prompt;
}

StringBuilder sb = new();

await foreach (var result in ChatAsyncInternal(prompt, inferenceParams, cancellationToken))
{
yield return result;
sb.Append(result);
}
History.Messages.AddRange(HistoryTransform.TextToHistory(AuthorRole.Assistant, sb.ToString()).Messages);

string assistantMessage = sb.ToString();

// Remove end tokens from the assistant message
// if defined in inferenceParams.AntiPrompts.
// We only want the response that was generated and not tokens
// that are delimiting the beginning or end of the response.
if (inferenceParams?.AntiPrompts != null)
{
foreach (var stopToken in inferenceParams.AntiPrompts)
{
assistantMessage = assistantMessage.Replace(stopToken, "");
}
}

History.Messages.Add(new ChatHistory.Message(AuthorRole.Assistant, assistantMessage));
}

/// <summary>
/// Get the response from the LLama model with chat histories.
/// Generates a response for a given chat history. This method does not manage history state for the user.
/// If you want to e.g. truncate the history of a session to fit into the model's context window,
/// use this method and pass the truncated history to it. If you don't need this control, use the other
/// overload of this method that accepts a user prompt instead.
/// </summary>
/// <param name="history"></param>
/// <param name="inferenceParams"></param>
/// <param name="cancellationToken"></param>
/// <returns></returns>
/// <returns>Returns generated text of the assistant message.</returns>
public async IAsyncEnumerable<string> ChatAsync(ChatHistory history, IInferenceParams? inferenceParams = null, [EnumeratorCancellation] CancellationToken cancellationToken = default)
{
var prompt = HistoryTransform.HistoryToText(history);
History.Messages.AddRange(HistoryTransform.TextToHistory(AuthorRole.User, prompt).Messages);
StringBuilder sb = new();
if (history.Messages.Count == 0)
{
throw new ArgumentException("History must contain at least one message.");
}

string prompt;
if (_executor is InteractiveExecutor executor)
{
InteractiveExecutorState state = (InteractiveExecutorState)executor.GetStateData();

prompt = state.IsPromptRun
? HistoryTransform.HistoryToText(History)
: history.Messages.Last().Content;
}
else
{
prompt = history.Messages.Last().Content;
}

await foreach (var result in ChatAsyncInternal(prompt, inferenceParams, cancellationToken))
{
yield return result;
sb.Append(result);
}
History.Messages.AddRange(HistoryTransform.TextToHistory(AuthorRole.Assistant, sb.ToString()).Messages);
}

private async IAsyncEnumerable<string> ChatAsyncInternal(string prompt, IInferenceParams? inferenceParams = null, [EnumeratorCancellation] CancellationToken cancellationToken = default)
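
A short usage sketch of the two reworked `ChatAsync` overloads may help here. The executor and model setup are assumed (they follow the repository's chat examples) and the prompt and anti-prompt values are illustrative only:

```csharp
// Inside an async method; `executor` is assumed to be an InteractiveExecutor
// created from a loaded model context (not part of this diff).
var session = new ChatSession(executor);

var inferenceParams = new InferenceParams
{
    // Anti-prompts mark where generation stops; with this change they are also
    // stripped from the assistant message before it is stored in History.
    AntiPrompts = new List<string> { "User:" }
};

// Overload 1: pass only the new user prompt; the session appends the prompt
// and the generated answer to its own History.
await foreach (var token in session.ChatAsync("Hello, who are you?", inferenceParams))
    Console.Write(token);

// Overload 2: pass an explicit ChatHistory (e.g. one you truncated yourself);
// per the new doc comment, this overload does not manage history state for you.
var history = new ChatHistory();
history.Messages.Add(new ChatHistory.Message(AuthorRole.User, "And what can you do?"));
await foreach (var token in session.ChatAsync(history, inferenceParams))
    Console.Write(token);
```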


LLama/Common/FixedSizeQueue.cs (+1, -1)

@@ -43,7 +43,7 @@ namespace LLama.Common
/// <param name="data"></param>
public FixedSizeQueue(int size, IEnumerable<T> data)
{
#if !NETSTANDARD2_0
#if NET6_0_OR_GREATER
// Try to check the size without enumerating the entire IEnumerable. This may not be able to get the count,
// in which case we'll have to check later
if (data.TryGetNonEnumeratedCount(out var dataCount) && dataCount > size)
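
The guard now keys off `NET6_0_OR_GREATER` because `TryGetNonEnumeratedCount` only exists from .NET 6 onwards. A tiny standalone sketch (not LLamaSharp code) of what that BCL call does, compiled against net6.0 or later:

```csharp
using System.Collections.Generic;
using System.Linq;

IEnumerable<int> source = new[] { 1, 2, 3, 4 };

// Returns true only when the count is already known (arrays, lists, ICollection<T>, ...),
// so a lazy sequence is never enumerated just to validate its length.
if (source.TryGetNonEnumeratedCount(out var count) && count > 3)
{
    // e.g. throw, as FixedSizeQueue does when the initial data exceeds the queue size.
}
```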


LLama/Common/ModelParams.cs (+30, -66)

@@ -13,92 +13,60 @@ namespace LLama.Common
public record ModelParams
: ILLamaParams
{
/// <summary>
/// Model context size (n_ctx)
/// </summary>
public uint ContextSize { get; set; } = 512;
/// <summary>
/// the GPU that is used for scratch and small tensors
/// </summary>
/// <inheritdoc />
public uint? ContextSize { get; set; }

/// <inheritdoc />
public int MainGpu { get; set; } = 0;

/// <summary>
/// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
/// </summary>
/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;
/// <summary>
/// Seed for the random number generator (seed)
/// </summary>

/// <inheritdoc />
public uint Seed { get; set; } = 0xFFFFFFFF;
/// <summary>
/// Use f16 instead of f32 for memory kv (memory_f16)
/// </summary>

/// <inheritdoc />
public bool UseFp16Memory { get; set; } = true;
/// <summary>
/// Use mmap for faster loads (use_mmap)
/// </summary>

/// <inheritdoc />
public bool UseMemorymap { get; set; } = true;
/// <summary>
/// Use mlock to keep model in memory (use_mlock)
/// </summary>

/// <inheritdoc />
public bool UseMemoryLock { get; set; }
/// <summary>
/// Compute perplexity over the prompt (perplexity)
/// </summary>

/// <inheritdoc />
public bool Perplexity { get; set; }
/// <summary>
/// Model path (model)
/// </summary>

/// <inheritdoc />
public string ModelPath { get; set; }

/// <summary>
/// List of LoRAs to apply
/// </summary>
/// <inheritdoc />
public AdapterCollection LoraAdapters { get; set; } = new();

/// <summary>
/// base model path for the lora adapter (lora_base)
/// </summary>
/// <inheritdoc />
public string LoraBase { get; set; } = string.Empty;

/// <summary>
/// Number of threads (null = autodetect) (n_threads)
/// </summary>
/// <inheritdoc />
public uint? Threads { get; set; }

/// <summary>
/// Number of threads to use for batch processing (null = autodetect) (n_threads)
/// </summary>
/// <inheritdoc />
public uint? BatchThreads { get; set; }


/// <summary>
/// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
/// </summary>
/// <inheritdoc />
public uint BatchSize { get; set; } = 512;

/// <summary>
/// Whether to use embedding mode. (embedding) Note that if this is set to true,
/// The LLamaModel won't produce text response anymore.
/// </summary>
/// <inheritdoc />
public bool EmbeddingMode { get; set; }

/// <summary>
/// how split tensors should be distributed across GPUs.
/// </summary>
/// <remarks>"[ 3, 2 ]" will assign 60% of the data to GPU 0 and 40% to GPU 1.</remarks>
/// <inheritdoc />
[JsonConverter(typeof(TensorSplitsCollectionConverter))]
public TensorSplitsCollection TensorSplits { get; set; } = new();

/// <summary>
/// RoPE base frequency
/// </summary>
public float? RopeFrequencyBase { get; set; }
/// <inheritdoc />
public float? RopeFrequencyBase { get; set; }

/// <summary>
/// RoPE frequency scaling factor
/// </summary>
public float? RopeFrequencyScale { get; set; }
/// <inheritdoc />
public float? RopeFrequencyScale { get; set; }

/// <inheritdoc />
public float? YarnExtrapolationFactor { get; set; }
@@ -123,14 +91,10 @@ namespace LLama.Common
/// </summary>
public bool MulMatQ { get; set; }

/// <summary>
/// Load vocab only (no weights)
/// </summary>
/// <inheritdoc />
public bool VocabOnly { get; set; }

/// <summary>
/// The encoding to use to convert text for the model
/// </summary>
/// <inheritdoc />
[JsonConverter(typeof(EncodingConverter))]
public Encoding Encoding { get; set; } = Encoding.UTF8;
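
Since this merge exists to bring master into the YaRN_scaling_parameters branch, a brief sketch of how these scaling knobs on `ModelParams` might be set; only property names visible in this diff are used, and the numeric values and path are purely illustrative:

```csharp
using LLama.Common;

var parameters = new ModelParams("models/llama-2-7b.Q4_K_M.gguf") // placeholder path
{
    ContextSize = 8192,            // explicit override; leave null to take the model's own n_ctx
    RopeFrequencyBase = 10000f,    // illustrative value
    RopeFrequencyScale = 0.5f,     // illustrative value
    YarnExtrapolationFactor = 1f   // illustrative value; null keeps the native default
};
```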



LLama/Extensions/DictionaryExtensions.cs (+2, -0)

@@ -9,6 +9,8 @@ namespace LLama.Extensions
{
return GetValueOrDefaultImpl(dictionary, key, defaultValue);
}
#elif !NET6_0_OR_GREATER && !NETSTANDARD2_1_OR_GREATER
#error Target framework not supported!
#endif

internal static TValue GetValueOrDefaultImpl<TKey, TValue>(IReadOnlyDictionary<TKey, TValue> dictionary, TKey key, TValue defaultValue)
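
The two added lines turn a silent gap into a compile-time failure: on netstandard2.0 the local `*Impl` polyfill is exposed, on net6.0+/netstandard2.1+ the BCL method is used, and any other target now trips `#error`. A generic sketch of the same pattern with hypothetical names (the same shape is applied to the other extension classes in this commit):

```csharp
internal static class StringExtensionsPolyfill // hypothetical example class
{
#if NETSTANDARD2_0
    // Older target: expose the polyfill as the public surface.
    public static bool Contains(this string haystack, char needle)
    {
        return ContainsImpl(haystack, needle);
    }
#elif !NET6_0_OR_GREATER && !NETSTANDARD2_1_OR_GREATER
#error Target framework not supported!
#endif

    // Shared implementation, always compiled so it can be unit tested on every TFM.
    internal static bool ContainsImpl(string haystack, char needle)
    {
        foreach (var c in haystack)
            if (c == needle) return true;
        return false;
    }
}
```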


LLama/Extensions/EncodingExtensions.cs (+2, -0)

@@ -15,6 +15,8 @@ internal static class EncodingExtensions
{
return GetCharCountImpl(encoding, bytes);
}
#elif !NET6_0_OR_GREATER && !NETSTANDARD2_1_OR_GREATER
#error Target framework not supported!
#endif

internal static int GetCharsImpl(Encoding encoding, ReadOnlySpan<byte> bytes, Span<char> output)


LLama/Extensions/IContextParamsExtensions.cs (+1, -1)

@@ -21,7 +21,7 @@ namespace LLama.Extensions
public static void ToLlamaContextParams(this IContextParams @params, out LLamaContextParams result)
{
result = NativeApi.llama_context_default_params();
result.n_ctx = @params.ContextSize;
result.n_ctx = @params.ContextSize ?? 0;
result.n_batch = @params.BatchSize;
result.seed = @params.Seed;
result.f16_kv = @params.UseFp16Memory;


LLama/Extensions/IEnumerableExtensions.cs (+2, -0)

@@ -10,6 +10,8 @@ namespace LLama.Extensions
{
return TakeLastImpl(source, count);
}
#elif !NET6_0_OR_GREATER && !NETSTANDARD2_1_OR_GREATER
#error Target framework not supported!
#endif

internal static IEnumerable<T> TakeLastImpl<T>(IEnumerable<T> source, int count)


LLama/Extensions/KeyValuePairExtensions.cs (+2, -0)

@@ -19,5 +19,7 @@ internal static class KeyValuePairExtensions
first = pair.Key;
second = pair.Value;
}
#elif !NET6_0_OR_GREATER && !NETSTANDARD2_1_OR_GREATER
#error Target framework not supported!
#endif
}

LLama/Extensions/ListExtensions.cs (+1, -1)

@@ -5,7 +5,7 @@ namespace LLama.Extensions
{
internal static class ListExtensions
{
#if NETSTANDARD2_0
#if !NET6_0_OR_GREATER
public static void EnsureCapacity<T>(this List<T> list, int capacity)
{
if (list.Capacity < capacity)


LLama/Native/LLamaContextParams.cs (+1, -1)

@@ -22,7 +22,7 @@ namespace LLama.Native
public uint seed;

/// <summary>
/// text context
/// text context, 0 = from model
/// </summary>
public uint n_ctx;



README.md (+8, -2)

@@ -11,7 +11,7 @@


**The C#/.NET binding of [llama.cpp](https://github.com/ggerganov/llama.cpp). It provides higher-level APIs to inference the LLaMA Models and deploy it on local device with C#/.NET. It works on
both Windows, Linux and MAC without requirment for compiling llama.cpp yourself. Even without GPU or not enought GPU memory, you can still apply LLaMA models well with this repo. 🤗**
both Windows, Linux and MAC without requirment for compiling llama.cpp yourself. Even without GPU or not enough GPU memory, you can still apply LLaMA models well with this repo. 🤗**

**Furthermore, it provides integrations with other projects such as [semantic-kernel](https://github.com/microsoft/semantic-kernel), [kernel-memory](https://github.com/microsoft/kernel-memory) and [BotSharp](https://github.com/SciSharp/BotSharp) to provide higher-level applications.**

@@ -54,6 +54,12 @@ For [microsoft semantic-kernel](https://github.com/microsoft/semantic-kernel) in
LLamaSharp.semantic-kernel
```

For [microsoft kernel-memory](https://github.com/microsoft/kernel-memory) integration, please search and install the following package (currently kernel-memory only supports net6.0):

```
LLamaSharp.kernel-memory
```

### Tips for choosing a version

In general, there may be some break changes between two minor releases, for example 0.5.1 and 0.6.0. On the contrary, we don't introduce API break changes in patch release. Therefore it's recommended to keep the highest patch version of a minor release. For example, keep 0.5.6 instead of 0.5.3.
@@ -196,7 +202,7 @@ Another choice is generate gguf format file yourself with a pytorch weight (or a

🔳 Fine-tune

⚠️ Local document search (enabled by kernel-memory now)
Local document search (enabled by kernel-memory now)

🔳 MAUI Integration
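
As a practical note on the kernel-memory section added to the README above: the package id comes straight from the new LLamaSharp.KernelMemory.csproj in this commit, and it can be installed with the standard .NET CLI, for example:

```
dotnet add package LLamaSharp.kernel-memory
```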


