diff --git a/Assets/LLamaSharp-Integrations.png b/Assets/LLamaSharp-Integrations.png
new file mode 100644
index 00000000..0533e0cb
Binary files /dev/null and b/Assets/LLamaSharp-Integrations.png differ
diff --git a/Assets/LLamaSharp-Integrations.vsdx b/Assets/LLamaSharp-Integrations.vsdx
new file mode 100644
index 00000000..f1b67a58
Binary files /dev/null and b/Assets/LLamaSharp-Integrations.vsdx differ
diff --git a/Assets/llava_demo.gif b/Assets/llava_demo.gif
new file mode 100644
index 00000000..3c5c9e68
Binary files /dev/null and b/Assets/llava_demo.gif differ
diff --git a/LLama.Examples/Examples/ChatChineseGB2312.cs b/LLama.Examples/Examples/ChatChineseGB2312.cs
index a1d78b09..f3a964b4 100644
--- a/LLama.Examples/Examples/ChatChineseGB2312.cs
+++ b/LLama.Examples/Examples/ChatChineseGB2312.cs
@@ -3,6 +3,7 @@ using LLama.Common;
namespace LLama.Examples.Examples;
+// This example shows how to handle Chinese input with GB2312 encoding.
public class ChatChineseGB2312
{
private static string ConvertEncoding(string input, Encoding original, Encoding target)
diff --git a/LLama.Examples/Examples/ChatSessionStripRoleName.cs b/LLama.Examples/Examples/ChatSessionStripRoleName.cs
index 6b8b6187..b46c92e4 100644
--- a/LLama.Examples/Examples/ChatSessionStripRoleName.cs
+++ b/LLama.Examples/Examples/ChatSessionStripRoleName.cs
@@ -2,6 +2,8 @@
namespace LLama.Examples.Examples;
+// When using ChatSession, it is common to want to strip the role names from the output
+// rather than display them. This example shows how to use transforms to strip them.
public class ChatSessionStripRoleName
{
public static async Task Run()
diff --git a/LLama.Examples/Examples/InstructModeExecute.cs b/LLama.Examples/Examples/InstructModeExecute.cs
index 54a9858d..1f88c2f1 100644
--- a/LLama.Examples/Examples/InstructModeExecute.cs
+++ b/LLama.Examples/Examples/InstructModeExecute.cs
@@ -2,6 +2,7 @@
namespace LLama.Examples.Examples
{
+ // This example shows how to use InstructExecutor to generate the response.
public class InstructModeExecute
{
public static async Task Run()
diff --git a/LLama.Examples/Examples/InteractiveModeExecute.cs b/LLama.Examples/Examples/InteractiveModeExecute.cs
index 40d84df8..20544f8d 100644
--- a/LLama.Examples/Examples/InteractiveModeExecute.cs
+++ b/LLama.Examples/Examples/InteractiveModeExecute.cs
@@ -2,6 +2,7 @@
namespace LLama.Examples.Examples
{
+ // This example shows how to chat with an LLM using the InteractiveExecutor.
public class InteractiveModeExecute
{
public static async Task Run()
diff --git a/LLama.Examples/Examples/LlavaInteractiveModeExecute.cs b/LLama.Examples/Examples/LlavaInteractiveModeExecute.cs
index 4932a2ae..34c7aca0 100644
--- a/LLama.Examples/Examples/LlavaInteractiveModeExecute.cs
+++ b/LLama.Examples/Examples/LlavaInteractiveModeExecute.cs
@@ -5,6 +5,8 @@ using Spectre.Console;
namespace LLama.Examples.Examples
{
+ // This example shows how to chat with a LLaVA model using both image and text as input.
+ // It uses the InteractiveExecutor for inference.
public class LlavaInteractiveModeExecute
{
public static async Task Run()
diff --git a/LLama.Examples/Examples/LoadAndSaveState.cs b/LLama.Examples/Examples/LoadAndSaveState.cs
index 75c597bf..730b7080 100644
--- a/LLama.Examples/Examples/LoadAndSaveState.cs
+++ b/LLama.Examples/Examples/LoadAndSaveState.cs
@@ -2,6 +2,7 @@
namespace LLama.Examples.Examples
{
+ // This example shows how to save/load state of the executor.
public class LoadAndSaveState
{
public static async Task Run()
diff --git a/README.md b/README.md
index 6935efe9..efb7385a 100644
--- a/README.md
+++ b/README.md
@@ -11,41 +11,28 @@
[](https://www.nuget.org/packages/LLamaSharp.Backend.OpenCL)
-**The C#/.NET binding of [llama.cpp](https://github.com/ggerganov/llama.cpp). It provides higher-level APIs to inference the LLaMA Models and deploy it on local device with C#/.NET. It works on Windows, Linux and Mac without need to compile llama.cpp yourself. Even without a GPU or not enough GPU memory, you can still use LLaMA models! π€**
+**LLamaSharp is a cross-platform library to run 🦙LLaMA/LLaVA models (and others) on your local device. Based on [llama.cpp](https://github.com/ggerganov/llama.cpp), inference with LLamaSharp is efficient on both CPU and GPU. With the higher-level APIs and RAG support, it's convenient to deploy LLMs (Large Language Models) in your applications with LLamaSharp.**
+
+**Please star the repo to show your support for this project!**
+
+---
-**Furthermore, it provides integrations with other projects such as [semantic-kernel](https://github.com/microsoft/semantic-kernel), [kernel-memory](https://github.com/microsoft/kernel-memory) and [BotSharp](https://github.com/SciSharp/BotSharp) to provide higher-level applications.**
-**Discussions about the roadmap to v1.0.0: [#287](https://github.com/SciSharp/LLamaSharp/issues/287)**
Table of Contents
-
## Documentation
- [Quick start](https://scisharp.github.io/LLamaSharp/latest/GetStarted/)
@@ -53,42 +40,80 @@
- [Full documentation](https://scisharp.github.io/LLamaSharp/latest/)
- [API reference](https://scisharp.github.io/LLamaSharp/latest/xmldocs/)
-## Examples
+
+## Console Demo
+
+| LLaMA | LLaVA |
+| :---: | :---: |
+| *(console demo GIF)* | ![LLaVA demo](./Assets/llava_demo.gif) |
+
+## Toolkits & Examples
+
+There are integrations for the following libraries, making it easier to develop your app. The integrations for semantic-kernel and kernel-memory are developed in the LLamaSharp repository, while others are developed in their own repositories.
+
+- [semantic-kernel](https://github.com/microsoft/semantic-kernel): an SDK that integrates LLMs like OpenAI, Azure OpenAI, and Hugging Face.
+- [kernel-memory](https://github.com/microsoft/kernel-memory): a multi-modal AI Service specialized in the efficient indexing of datasets through custom continuous data hybrid pipelines, with support for RAG ([Retrieval Augmented Generation](https://en.wikipedia.org/wiki/Prompt_engineering#Retrieval-augmented_generation)), synthetic memory, prompt engineering, and custom semantic memory processing.
+- [BotSharp](https://github.com/SciSharp/BotSharp): an open source machine learning framework for building AI bot platforms.
+- [Langchain](https://github.com/tryAGI/LangChain): a framework for developing applications powered by language models.
+
+
+The following examples show how to build apps with LLamaSharp.
+
- [Official Console Examples](./LLama.Examples/)
- [Unity Demo](https://github.com/eublefar/LLAMASharpUnityDemo)
-- [LLamaStack (with WPF and Web support)](https://github.com/saddam213/LLamaStack)
+- [LLamaStack (with WPF and Web demo)](https://github.com/saddam213/LLamaStack)
- [Blazor Demo (with Model Explorer)](https://github.com/alexhiggins732/BLlamaSharp.ChatGpt.Blazor)
+- [ASP.NET Demo](./LLama.Web/)
+
+
-## Installation
-1. Install [`LLamaSharp`](https://www.nuget.org/packages/LLamaSharp) package in NuGet:
+## Get started
+
+### Installation
+
+To gain high performance, LLamaSharp interacts with a native library compiled from C++, called the `backend`. We provide backend packages for Windows, Linux and Mac with CPU, CUDA, Metal and OpenCL support. You **don't** need to deal with C++ at all; just install the backend packages.
+
+If no published backend matches your device, please open an issue to let us know. If compiling C++ code is not a problem for you, you can also follow [this guide](./docs/ContributingGuide.md) to compile a backend yourself and run LLamaSharp with it.
+
+1. Install [LLamaSharp](https://www.nuget.org/packages/LLamaSharp) package on NuGet:
```
PM> Install-Package LLamaSharp
```
-2. Install **one** of these backends:
+2. Install one or more of these backends (see the example after this list), or use a self-compiled backend.
- - [`LLamaSharp.Backend.Cpu`](https://www.nuget.org/packages/LLamaSharp.Backend.Cpu): Pure CPU for Windows & Linux. Metal for Mac.
- - [`LLamaSharp.Backend.Cuda11`](https://www.nuget.org/packages/LLamaSharp.Backend.Cuda11): CUDA11 for Windows and Linux
- - [`LLamaSharp.Backend.Cuda12`](https://www.nuget.org/packages/LLamaSharp.Backend.Cuda12): CUDA 12 for Windows and Linux
- - [`LLamaSharp.Backend.OpenCL`](https://www.nuget.org/packages/LLamaSharp.Backend.OpenCL): OpenCL for Windows and Linux
- - If none of these backends is suitable you can compile [llama.cpp](https://github.com/ggerganov/llama.cpp) yourself. In this case, please **DO NOT** install the backend packages! Instead, add your DLL to your project and ensure it will be copied to the output directory when compiling your project. If you do this you must use exactly the correct llama.cpp commit, refer to the version table further down.
+ - [`LLamaSharp.Backend.Cpu`](https://www.nuget.org/packages/LLamaSharp.Backend.Cpu): Pure CPU for Windows, Linux & Mac. Metal (GPU) support for Mac.
+ - [`LLamaSharp.Backend.Cuda11`](https://www.nuget.org/packages/LLamaSharp.Backend.Cuda11): CUDA11 for Windows & Linux.
+ - [`LLamaSharp.Backend.Cuda12`](https://www.nuget.org/packages/LLamaSharp.Backend.Cuda12): CUDA 12 for Windows & Linux.
+ - [`LLamaSharp.Backend.OpenCL`](https://www.nuget.org/packages/LLamaSharp.Backend.OpenCL): OpenCL for Windows & Linux.
3. (optional) For [Microsoft semantic-kernel](https://github.com/microsoft/semantic-kernel) integration, install the [LLamaSharp.semantic-kernel](https://www.nuget.org/packages/LLamaSharp.semantic-kernel) package.
-4. (optional) For [Microsoft kernel-memory](https://github.com/microsoft/kernel-memory) integration, install the [LLamaSharp.kernel-memory](https://www.nuget.org/packages/LLamaSharp.kernel-memory) package (this package currently only supports `net6.0`).
+4. (optional) To enable RAG support, install the [LLamaSharp.kernel-memory](https://www.nuget.org/packages/LLamaSharp.kernel-memory) package (this package currently only supports `net6.0` or higher), which is based on the [Microsoft kernel-memory](https://github.com/microsoft/kernel-memory) integration.
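+
+For example, to use the CPU backend (a sketch; pick the backend package that matches your device):
+
+```
+PM> Install-Package LLamaSharp.Backend.Cpu
+```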
-### Tips for choosing a version
+### Model preparation
-Llama.cpp is a fast moving project with frequent breaking changes, therefore breaking changes are expected frequently in LLamaSharp. LLamaSharp follows [semantic versioning](https://semver.org/) and will not introduce breaking API changes on patch versions.
+There are two popular formats of model files for LLMs: the PyTorch format (.pth) and the Huggingface format (.bin). LLamaSharp uses a `GGUF` format file, which can be converted from these two formats. To get a `GGUF` file, there are two options:
-It is suggested to update to the latest patch version as soon as it is released, and to update to new major versions as soon as possible.
+1. Search the model name + 'gguf' on [Huggingface](https://huggingface.co); you will find lots of model files that have already been converted to GGUF format. Please pay attention to their publishing time, because some older files may only work with older versions of LLamaSharp.
-## Quick Start
+2. Convert the PyTorch or Huggingface format to GGUF format yourself. Please follow the instructions in [this part of the llama.cpp readme](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#prepare-and-quantize) to convert them with the Python scripts.
-#### Model Inference and Chat Session
+Generally, we recommend downloading quantized models rather than fp16 ones, because quantization significantly reduces the required memory while only slightly impacting generation quality.
-LLamaSharp provides two ways to run inference: `LLamaExecutor` and `ChatSession`. The chat session is a higher-level wrapping of the executor and the model. Here's a simple example to use chat session.
+
+### Example of LLaMA chat session
+
+Here is a simple example of chatting with an LLM-based bot in LLamaSharp. Please replace the model path with your own.
```cs
using LLama.Common;
@@ -140,45 +165,36 @@ while (userInput != "exit")
}
```
-For more usage, please refer to [Examples](./LLama.Examples).
+For more examples, please refer to [LLamaSharp.Examples](./LLama.Examples).
-#### Web API
-We provide [an integration with ASP.NET core](./LLama.WebAPI) and a [web app demo](./LLama.Web). Since we are in short of hands, if you're familiar with ASP.NET core, we'll appreciate it if you would like to help upgrading the Web API integration.
+## FAQ
-## Features
+#### Why is the GPU not used when I have installed CUDA?
----
+1. If you are using backend packages, please make sure you have installed the CUDA backend package which matches the CUDA version of your device. Please note that before LLamaSharp v0.10.0, only one backend package should be installed at a time.
+2. Add `NativeLibraryConfig.Instance.WithLogs(LLamaLogLevel.Info)` to the very beginning of your code. The log will show which native library file is loaded. If the CPU library is loaded, please try to compile the native library yourself and open an issue for that. If the CUDA library is loaded, please check if `GpuLayerCount > 0` when loading the model weights.
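+
+For example, a minimal sketch of tip 2 (the model path and layer count are placeholders):
+
+```cs
+using LLama;
+using LLama.Common;
+using LLama.Native;
+
+// Print which native library (CPU or CUDA) is loaded.
+NativeLibraryConfig.Instance.WithLogs(LLamaLogLevel.Info);
+
+// Offload layers to the GPU when loading the model weights.
+var parameters = new ModelParams("<your model path>")
+{
+    GpuLayerCount = 32 // any value > 0 offloads layers to the GPU
+};
+using var model = LLamaWeights.LoadFromFile(parameters);
+```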
-✅: completed. ⚠️: outdated for latest release but will be updated. 🔳: not completed
+#### Why is the inference slow?
----
+Firstly, due to the large size of LLM models, generating outputs requires more time than with other models, especially when you are using models larger than 30B.
-✅ LLaMa model inference
-✅ Embeddings generation, tokenization and detokenization
-✅ Chat session
-✅ Quantization
-✅ Grammar
-✅ State saving and loading
-✅ BotSharp Integration [Online Demo](https://victorious-moss-007e11310.4.azurestaticapps.net/)
-✅ ASP.NET core Integration
-✅ Semantic-kernel Integration
-🔳 Fine-tune
-✅ Local document search (enabled by kernel-memory)
-🔳 MAUI Integration
+To see if that's a LLamaSharp performance issue, please follow the two tips below.
-## Console Demo
+1. If you are using CUDA, Metal or OpenCL, please set `GpuLayerCount` as large as possible.
+2. If it's still slower than you expect, please try to run the same model with the same settings in the [llama.cpp examples](https://github.com/ggerganov/llama.cpp/tree/master/examples). If llama.cpp significantly outperforms LLamaSharp, it's likely a LLamaSharp bug; please report it to us.
-
-## FAQ
+#### Why does the program crash before any output is generated?
+
+Generally, there are two possible cases for this problem:
-1. GPU out of memory: Please try setting `n_gpu_layers` to a smaller number.
-2. Unsupported model: `llama.cpp` is under quick development and often has breaking changes. Please check the release date of the model and find a suitable version of LLamaSharp to install, or generate `gguf` format weights from original weights yourself.
-3. Cannot load native library:
- - Ensure you have installed one of the backend packages.
- - Run `NativeLibraryConfig.WithLogs()` at the very beginning of your code to print more information.
-4. Models in GGUF format are compatible with LLamaSharp. It's a good idea to search for [`gguf` on huggingface](https://huggingface.co/models?search=gguf) to find a model. Another choice is generate a GGUF format file yourself, please refer to [convert.py](https://github.com/ggerganov/llama.cpp/blob/master/convert.py) for more information.
+1. The native library (backend) you are using is not compatible with the LLamaSharp version. If you compiled the native library yourself, please make sure you have checked out llama.cpp at the commit corresponding to your LLamaSharp version, which can be found at the bottom of the README (see the sketch below).
+2. The model file you are using is not compatible with the backend. If you are using a GGUF file downloaded from huggingface, please check its publishing time.
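+
+For case 1, a sketch of checking out the matching llama.cpp commit (the commit id is a placeholder; take it from the version table at the bottom of this README):
+
+```
+cd llama.cpp
+git checkout <llama.cpp commit id from the version table>
+```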
+
+#### Why does my model generate output infinitely?
+
+Please set an anti-prompt or a max token count when executing the inference.
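+
+For example, a sketch of such inference parameters (the values shown are illustrative):
+
+```cs
+using LLama.Common;
+
+var inferenceParams = new InferenceParams()
+{
+    // Stop generating once the model emits this string.
+    AntiPrompts = new List<string> { "User:" },
+    // Cap the number of generated tokens.
+    MaxTokens = 256
+};
+```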
## Contributing
@@ -193,15 +209,13 @@ You can also do one of the followings to help us make LLamaSharp better:
- Help to develop Web API and UI integration.
- Just open an issue about the problem you met!
-## Contact us
+## Join the community
Join our chat on [Discord](https://discord.gg/7wNVU65ZDY) (please contact Rinne to join the dev channel if you want to be a contributor).
Join [QQ group](http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=sN9VVMwbWjs5L0ATpizKKxOcZdEPMrp8&authKey=RLDw41bLTrEyEgZZi%2FzT4pYk%2BwmEFgFcrhs8ZbkiVY7a4JFckzJefaYNW6Lk4yPX&noverify=0&group_code=985366726)
-## Appendix
-
-### LLamaSharp and llama.cpp versions
+## Map of LLamaSharp and llama.cpp versions
If you want to compile llama.cpp yourself you **must** use the exact commit ID listed for each version.
| LLamaSharp | Verified Model Resources | llama.cpp commit id |
diff --git a/docs/Architecture.md b/docs/Architecture.md
index 7ab6776e..49980e77 100644
--- a/docs/Architecture.md
+++ b/docs/Architecture.md
@@ -2,22 +2,14 @@
## Architecture of main functions
-The figure below shows the core framework structure, which is separated to four levels.
+The figure below shows the core framework structure of LLamaSharp.
-- **LLamaContext**: The holder of a model which directly interact with native library and provide some basic APIs such as tokenization and embedding. Currently it includes three classes: `LLamaContext`, `LLamaEmbedder` and `LLamaQuantizer`.
-- **LLamaExecutors**: Executors which define the way to run the LLama model. It provides text-to-text APIs to make it easy to use. Currently we provide three kinds of executors: `InteractiveExecutor`, `InstructExecutor` and `StatelessExecutor`.
+- **Native APIs**: LLamaSharp calls the exported C APIs to load and run the model. The APIs defined in LLamaSharp specifically for calling the C APIs are named `Native APIs`. We have made all the native APIs public under the namespace `LLama.Native`. However, it's strongly recommended not to use them unless you know what you are doing.
+- **LLamaWeights**: The holder of the model weights.
+- **LLamaContext**: A context which directly interacts with the native library and provides some basic APIs such as tokenization and embedding. It makes use of `LLamaWeights`.
+- **LLamaExecutors**: Executors which define the way to run the LLama model. It provides text-to-text and image-to-text APIs to make it easy to use. Currently we provide four kinds of executors: `InteractiveExecutor`, `InstructExecutor`, `StatelessExecutor` and `BatchedExecutor`.
- **ChatSession**: A wrapping for `InteractiveExecutor` and `LLamaContext`, which supports interactive tasks and saving/re-loading sessions. It also provides a flexible way to customize the text process by `IHistoryTransform`, `ITextTransform` and `ITextStreamTransform`.
-- **High-level Applications**: Some applications that provides higher-level integration. For example, [BotSharp](https://github.com/SciSharp/BotSharp) provides integration for vector search, Chatbot UI and Web APIs. [semantic-kernel](https://github.com/microsoft/semantic-kernel) provides various APIs for manipulations related with LLM. If you've made an integration, please tell us and add it to the doc!
+- **Integrations**: Integrations with other libraries to expand the applications of LLamaSharp. For example, if you want to do RAG ([Retrieval Augmented Generation](https://en.wikipedia.org/wiki/Prompt_engineering#Retrieval-augmented_generation)), the kernel-memory integration is a good option for you.
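+
+A minimal sketch of how these components fit together (the model path is a placeholder):
+
+```cs
+using LLama;
+using LLama.Common;
+
+// LLamaWeights holds the model weights loaded from a GGUF file.
+var parameters = new ModelParams("<your model path>") { ContextSize = 1024 };
+using var model = LLamaWeights.LoadFromFile(parameters);
+
+// LLamaContext wraps the native context and makes use of the weights.
+using var context = model.CreateContext(parameters);
+
+// An executor defines how the model is run; ChatSession wraps an InteractiveExecutor.
+var executor = new InteractiveExecutor(context);
+var session = new ChatSession(executor);
+```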

-
-## Recommended Use
-
-Since `LLamaContext` interact with native library, it's not recommended to use the methods of it directly unless you know what you are doing. So does the `NativeApi`, which is not included in the architecture figure above.
-
-`ChatSession` is recommended to be used when you want to build an application similar to ChatGPT, or the ChatBot, because it works best with `InteractiveExecutor`. Though other executors are also allowed to passed as a parameter to initialize a `ChatSession`, it's not encouraged if you are new to LLamaSharp and LLM.
-
-High-level applications, such as BotSharp, are supposed to be used when you concentrate on the part not related with LLM. For example, if you want to deploy a chat bot to help you remember your schedules, using BotSharp may be a good choice.
-
-Note that the APIs of the high-level applications may not be stable now. Please take it into account when using them.
\ No newline at end of file
diff --git a/docs/ChatSession/basic-usages.md b/docs/ChatSession/basic-usages.md
deleted file mode 100644
index fb1826bb..00000000
--- a/docs/ChatSession/basic-usages.md
+++ /dev/null
@@ -1,36 +0,0 @@
-# Basic usages of ChatSession
-
-`ChatSession` is a higher-level abstraction than the executors. In the context of a chat application like ChatGPT, a "chat session" refers to an interactive conversation or exchange of messages between the user and the chatbot. It represents a continuous flow of communication where the user enters input or asks questions, and the chatbot responds accordingly. A chat session typically starts when the user initiates a conversation with the chatbot and continues until the interaction comes to a natural end or is explicitly terminated by either the user or the system. During a chat session, the chatbot maintains the context of the conversation, remembers previous messages, and generates appropriate responses based on the user's inputs and the ongoing dialogue.
-
-## Initialize a session
-
-Currently, the only parameter that is accepted is an `ILLamaExecutor`, because this is the only parameter that we're sure to exist in all the future versions. Since it's the high-level abstraction, we're conservative to the API designs. In the future, there may be more kinds of constructors added.
-
-```cs
-InteractiveExecutor ex = new(new LLamaModel(new ModelParams(modelPath)));
-ChatSession session = new ChatSession(ex);
-```
-
-## Chat with the bot
-
-There'll be two kinds of input accepted by the `Chat` API, which are `ChatHistory` and `String`. The API with string is quite similar to that of the executors. Meanwhile, the API with `ChatHistory` is aimed to provide more flexible usages. For example, you have had a chat with the bot in session A before you open the session B. Now session B has no memory for what you said before. Therefore, you can feed the history of A to B.
-
-```cs
-string prompt = "What is C#?";
-
-await foreach (var text in session.ChatAsync(prompt, new InferenceParams() { Temperature = 0.6f, AntiPrompts = new List<string> { "User:" } })) // the inference params should be changed depending on your statement
-{
- Console.Write(text);
-}
-```
-
-## Get the history
-
-Currently `History` is a property of `ChatSession`.
-
-```cs
-foreach(var rec in session.History.Messages)
-{
- Console.WriteLine($"{rec.AuthorRole}: {rec.Content}");
-}
-```
diff --git a/docs/ChatSession/save-load-session.md b/docs/ChatSession/save-load-session.md
deleted file mode 100644
index cfa6669c..00000000
--- a/docs/ChatSession/save-load-session.md
+++ /dev/null
@@ -1,14 +0,0 @@
-# Save/Load Chat Session
-
-Generally, the chat session could be switched, which requires the ability of loading and saving session.
-
-When building a chat bot app, it's **NOT encouraged** to initialize many chat sessions and keep them in memory to wait for being switched, because the memory consumption of both CPU and GPU is expensive. It's recommended to save the current session before switching to a new session, and load the file when switching back to the session.
-
-The API is also quite simple, the files will be saved into a directory you specified. If the path does not exist, a new directory will be created.
-
-```cs
-string savePath = "";
-session.SaveSession(savePath);
-
-session.LoadSession(savePath);
-```
diff --git a/docs/ContributingGuide.md b/docs/ContributingGuide.md
index 458a4511..922059cb 100644
--- a/docs/ContributingGuide.md
+++ b/docs/ContributingGuide.md
@@ -2,21 +2,65 @@
Hi, welcome to develop LLamaSharp with us together! We are always open for every contributor and any format of contributions! If you want to maintain this library actively together, please contact us to get the write access after some PRs. (Email: AsakusaRinne@gmail.com)
-In this page, we'd like to introduce how to make contributions here easily. π
+In this page, we introduce how to make contributions here easily.
-## Compile the native library from source
+## The goal of LLamaSharp
-Firstly, please clone the [llama.cpp](https://github.com/ggerganov/llama.cpp) repository and following the instructions in [llama.cpp readme](https://github.com/ggerganov/llama.cpp#build) to configure your local environment.
+At the beginning, LLamaSharp was a C# binding of [llama.cpp](https://github.com/ggerganov/llama.cpp). It provided only some wrappers for llama.cpp so that C#/.NET users could run LLM models on their local devices efficiently, even without any experience with C++. After around a year of development, more tools and integrations have been added to LLamaSharp, significantly expanding its applications. Though llama.cpp is still the only backend of LLamaSharp, the goal of this repository is to be an efficient and easy-to-use library for LLM inference, rather than just a binding of llama.cpp.
-If you want to support cublas in the compilation, please make sure that you've installed the cuda.
+In this way, our development of LLamaSharp is divided into two main directions:
-When building from source, please add `-DBUILD_SHARED_LIBS=ON` to the cmake instruction. For example, when building with cublas but without openblas, use the following instruction:
+1. To make LLamaSharp more efficient. For example, `BatchedExecutor` can accept multiple queries and generate responses for them at the same time, which significantly improves the throughput. This part is closely related to the native APIs and executors in LLamaSharp.
+2. To make it easier to use LLamaSharp. We believe the best library lets users build powerful functionality with simple code. Higher-level APIs and integrations with other libraries are the key points of this direction.
+
+
+## How to compile the native library from source
+
+If you want to contribute to the first direction of our goal, you may need to compile the native library yourself.
+
+Firstly, please follow the instructions in the [llama.cpp readme](https://github.com/ggerganov/llama.cpp#build) to configure your local environment. Most importantly, CMake with a version higher than 3.14 should be installed on your device.
+
+Secondly, clone the llama.cpp repository. You could manually clone it and check out the right commit according to [Map of LLamaSharp and llama.cpp versions](https://github.com/SciSharp/LLamaSharp?tab=readme-ov-file#map-of-llamasharp-and-llama.cpp-versions), or clone it as a submodule when cloning LLamaSharp recursively:
+
+```shell
+git clone --recursive https://github.com/SciSharp/LLamaSharp.git
+```
+
+If you want to enable cuBLAS support in the compilation, please make sure that you've installed CUDA. If you are using an Intel CPU, please check the highest AVX ([Advanced Vector Extensions](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions)) level that is supported by your device.
+
+As shown in the [llama.cpp cmake file](https://github.com/ggerganov/llama.cpp/blob/master/CMakeLists.txt), there are many options that can be enabled or disabled when building the library. The following ones are commonly used when building it as a native library for LLamaSharp.
+
+```cpp
+option(BUILD_SHARED_LIBS "build shared libraries") // Please always enable it
+option(LLAMA_NATIVE "llama: enable -march=native flag") // Could be disabled
+option(LLAMA_AVX "llama: enable AVX") // Enable it if the highest supported avx level is AVX
+option(LLAMA_AVX2 "llama: enable AVX2") // Enable it if the highest supported avx level is AVX2
+option(LLAMA_AVX512 "llama: enable AVX512") // Enable it if the highest supported avx level is AVX512
+option(LLAMA_BLAS "llama: use BLAS") // Enable it if you want to use a BLAS library to accelerate the computation on CPU
+option(LLAMA_CUDA "llama: use CUDA") // Enable it if you have CUDA device
+option(LLAMA_CLBLAST "llama: use CLBlast") // Enable it if you have a device with CLBlast or OpenCL support, for example, some AMD GPUs.
+option(LLAMA_VULKAN "llama: use Vulkan") // Enable it if you have a device with Vulkan support
+option(LLAMA_METAL "llama: use Metal") // Enable it if you are using a Mac with a Metal device.
+option(LLAMA_BUILD_TESTS "llama: build tests") // Please disable it.
+option(LLAMA_BUILD_EXAMPLES "llama: build examples") // Please disable it.
+option(LLAMA_BUILD_SERVER "llama: build server example")// Please disable it.
+```
+
+Most importantly, `-DBUILD_SHARED_LIBS=ON` must be added to the cmake instruction, and the other options depend on your needs. For example, when building with cuBLAS but without OpenBLAS, use the following instructions:
```bash
+mkdir build && cd build
cmake .. -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
+cmake --build . --config Release
```
-After running `cmake --build . --config Release`, you could find the `llama.dll`, `llama.so` or `llama.dylib` in your build directory. After pasting it to `LLamaSharp/LLama/runtimes` , you can use it as the native library in LLamaSharp.
+Now you could find the `llama.dll`, `libllama.so` or `llama.dylib` in your build directory (or `build/bin`).
+
+To load the compiled native library, please add the following code to the very beginning of your code.
+
+```cs
+NativeLibraryConfig.Instance.WithLibrary("<path to your compiled native library>");
+```
## Add a new feature to LLamaSharp
@@ -24,7 +68,7 @@ After running `cmake --build . --config Release`, you could find the `llama.dll`
After refactoring the framework in `v0.4.0`, LLamaSharp will try to maintain the backward compatibility. However, in the following cases a breaking change will be required:
1. Due to some break changes in [llama.cpp](https://github.com/ggerganov/llama.cpp), making a breaking change will help to maintain the good abstraction and friendly user APIs.
-2. A very important feature cannot be implemented unless refactoring some parts.
+2. An important feature cannot be implemented unless refactoring some parts.
3. After some discussions, an agreement was reached that making the break change is reasonable.
If a new feature could be added without introducing any break change, please **open a PR** rather than open an issue first. We will never refuse the PR but help to improve it, unless it's malicious.
@@ -39,19 +83,19 @@ You could use exactly the same prompt, the same model and the same parameters to
If the experiment showed that it worked well in llama.cpp but didn't in LLamaSharp, a search for the problem could be started. While the reason of the problem could be various, the best way I think is to add log-print in the code of llama.cpp and use it in LLamaSharp after compilation. Thus, when running LLamaSharp, you could see what happened in the native library.
-After finding out the reason, a painful but happy process comes. When working on the BUG fix, there's only one rule to follow, that is keeping the examples working well. If the modification fixed the BUG but impact on other functions, it would not be a good fix.
-
-During the BUG fix process, please don't hesitate to discuss together when you stuck on something.
+During the BUG fix process, please don't hesitate to discuss with us when you are blocked.
## Add integrations
-All kinds of integration are welcomed here! Currently the following integrations are under work or on our schedule:
+All kinds of integrations are welcome here! Currently the following integrations have been added but still need improvement:
+
+1. semantic-kernel
+2. kernel-memory
+3. BotSharp (maintained in SciSharp/BotSharp repo)
+4. Langchain (maintained in tryAGI/LangChain repo)
-1. BotSharp
-2. semantic-kernel
-3. Unity
+If you find another library that would be good to integrate, please open an issue to let us know!
-Besides, for some other integrations, like `ASP.NET core`, `SQL`, `Blazor` and so on, we'll appreciate it if you could help with that. If the time is limited for you, providing an example for it also means a lot!
## Add examples
@@ -62,4 +106,4 @@ There're mainly two ways to add an example:
## Add documents
-LLamaSharp uses [mkdocs](https://github.com/mkdocs/mkdocs) to build the documentation, please follow the tutorial of mkdocs to add or modify documents in LLamaSharp.
\ No newline at end of file
+LLamaSharp uses [mkdocs](https://github.com/mkdocs/mkdocs) to build the documentation, please follow the tutorial of mkdocs to add or modify documents in LLamaSharp.
diff --git a/docs/Examples/BatchDecoding.md b/docs/Examples/BatchDecoding.md
deleted file mode 100644
index 0b03ed04..00000000
--- a/docs/Examples/BatchDecoding.md
+++ /dev/null
@@ -1,170 +0,0 @@
-# Batch decoding
-
-```cs
-using System.Diagnostics;
-using System.Text;
-using LLama.Common;
-using LLama.Native;
-using LLama.Sampling;
-
-public class BatchedDecoding
-{
- private const int n_parallel = 8;
- private const int n_len = 32;
-
- public static async Task Run()
- {
- Console.Write("Please input your model path: ");
- var modelPath = Console.ReadLine();
-
- Console.WriteLine("Prompt (leave blank to select automatically):");
- var prompt = Console.ReadLine();
- if (string.IsNullOrWhiteSpace(prompt))
- prompt = "Not many people know that";
-
- // Load model
- var parameters = new ModelParams(modelPath);
-
- using var model = LLamaWeights.LoadFromFile(parameters);
-
- // Tokenize prompt
- var prompt_tokens = model.Tokenize(prompt, true, false, Encoding.UTF8);
- var n_kv_req = prompt_tokens.Length + (n_len - prompt_tokens.Length) * n_parallel;
-
- // Create a context
- parameters.ContextSize = (uint)model.ContextSize;
- parameters.BatchSize = (uint)Math.Max(n_len, n_parallel);
- using var context = model.CreateContext(parameters);
-
- var n_ctx = context.ContextSize;
-
- // make sure the KV cache is big enough to hold all the prompt and generated tokens
- if (n_kv_req > n_ctx)
- {
- await Console.Error.WriteLineAsync($"error: n_kv_req ({n_kv_req}) > n_ctx, the required KV cache size is not big enough\n");
- await Console.Error.WriteLineAsync(" either reduce n_parallel or increase n_ctx\n");
- return;
- }
-
- var batch = new LLamaBatch();
-
- // evaluate the initial prompt
- batch.AddRange(prompt_tokens, 0, LLamaSeqId.Zero, true);
-
- if (await context.DecodeAsync(batch) != DecodeResult.Ok)
- {
- await Console.Error.WriteLineAsync("llama_decode failed");
- return;
- }
-
- // assign the system KV cache to all parallel sequences
- // this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
- for (var i = 1; i < n_parallel; ++i)
- {
- context.NativeHandle.KvCacheSequenceCopy((LLamaSeqId)0, (LLamaSeqId)i, 0, batch.TokenCount);
- }
-
- if (n_parallel > 1)
- {
- Console.WriteLine();
- Console.WriteLine($"generating {n_parallel} sequences...");
- }
-
- // remember the batch index of the last token for each parallel sequence
- // we need this to determine which logits to sample from
- List<int> i_batch = new();
- for (var i = 0; i < n_parallel; i++)
- i_batch.Add(batch.TokenCount - 1);
-
- // Create per-stream decoder and sampler
- var decoders = new StreamingTokenDecoder[n_parallel];
- var samplers = new ISamplingPipeline[n_parallel];
- for (var i = 0; i < n_parallel; i++)
- {
- decoders[i] = new StreamingTokenDecoder(context);
- samplers[i] = new DefaultSamplingPipeline
- {
- Temperature = 0.1f + (float)i / n_parallel,
- MinP = 0.25f,
- };
- }
-
- var n_cur = batch.TokenCount;
- var n_decode = 0;
-
- var timer = new Stopwatch();
- timer.Start();
- while (n_cur <= n_len)
- {
- batch.Clear();
-
- for (var i = 0; i < n_parallel; i++)
- {
- // Skip completed streams
- if (i_batch[i] < 0)
- continue;
-
- // Use the sampling pipeline to select a token
- var new_token_id = samplers[i].Sample(
- context.NativeHandle,
- context.NativeHandle.GetLogitsIth(i_batch[i]),
- Array.Empty<LLamaToken>()
- );
-
- // Finish this stream early if necessary
- if (new_token_id == model.EndOfSentenceToken || new_token_id == model.NewlineToken)
- {
- i_batch[i] = -1;
- Console.WriteLine($"Completed Stream {i} early");
- continue;
- }
-
- // Add this token to the decoder, so it will be turned into text
- decoders[i].Add(new_token_id);
-
- i_batch[i] = batch.TokenCount;
-
- // push this new token for next evaluation
- batch.Add(new_token_id, n_cur, (LLamaSeqId)i, true);
-
- n_decode++;
- }
-
- // Check if all streams are finished
- if (batch.TokenCount == 0)
- {
- break;
- }
-
- n_cur++;
-
- // evaluate the current batch with the transformer model
- if (await context.DecodeAsync(batch) != 0)
- {
- await Console.Error.WriteLineAsync("failed to eval");
- return;
- }
- }
-
- timer.Stop();
- Console.ForegroundColor = ConsoleColor.Yellow;
- Console.WriteLine();
- Console.WriteLine($"Decoded {n_decode} tokens in {timer.ElapsedMilliseconds}ms");
- Console.WriteLine($"Rate: {n_decode / timer.Elapsed.TotalSeconds:##.000} tokens/second");
-
- var index = 0;
- foreach (var stream in decoders)
- {
- var text = stream.Read();
-
- Console.ForegroundColor = ConsoleColor.Green;
- Console.Write($"{index++}. {prompt}");
- Console.ForegroundColor = ConsoleColor.Red;
- Console.WriteLine(text);
- }
-
- Console.WriteLine("Press any key to exit demo");
- Console.ReadKey(true);
- }
-}
-```
\ No newline at end of file
diff --git a/docs/Examples/BatchedExecutorFork.md b/docs/Examples/BatchedExecutorFork.md
new file mode 100644
index 00000000..ad391dd1
--- /dev/null
+++ b/docs/Examples/BatchedExecutorFork.md
@@ -0,0 +1,148 @@
+# Batched executor - multi-output to one input
+
+```cs
+using LLama.Batched;
+using LLama.Common;
+using LLama.Native;
+using LLama.Sampling;
+using Spectre.Console;
+
+namespace LLama.Examples.Examples;
+
+/// <summary>
+/// This demonstrates generating multiple replies to the same prompt, with a shared cache
+/// </summary>
+public class BatchedExecutorFork
+{
+ private const int n_split = 16;
+ private const int n_len = 72;
+
+ public static async Task Run()
+ {
+ string modelPath = UserSettings.GetModelPath();
+
+ var parameters = new ModelParams(modelPath);
+ using var model = LLamaWeights.LoadFromFile(parameters);
+
+ var prompt = AnsiConsole.Ask("Prompt (or ENTER for default):", "Not many people know that");
+
+ // Create an executor that can evaluate a batch of conversations together
+ using var executor = new BatchedExecutor(model, parameters);
+
+ // Print some info
+ var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
+ Console.WriteLine($"Created executor with model: {name}");
+
+ // Evaluate the initial prompt to create one conversation
+ using var start = executor.Create();
+ start.Prompt(prompt);
+ await executor.Infer();
+
+ // Create the root node of the tree
+ var root = new Node(start);
+
+ await AnsiConsole
+ .Progress()
+ .StartAsync(async progress =>
+ {
+ var reporter = progress.AddTask("Running Inference (1)", maxValue: n_len);
+
+ // Run inference loop
+ for (var i = 0; i < n_len; i++)
+ {
+ if (i != 0)
+ await executor.Infer();
+
+ // Occasionally fork all the active conversations
+ if (i != 0 && i % n_split == 0)
+ root.Split();
+
+ // Sample all active conversations
+ root.Sample();
+
+ // Update progress bar
+ reporter.Increment(1);
+ reporter.Description($"Running Inference ({root.ActiveConversationCount})");
+ }
+
+ // Display results
+ var display = new Tree(prompt);
+ root.Display(display);
+ AnsiConsole.Write(display);
+ });
+ }
+
+ private class Node
+ {
+ private readonly StreamingTokenDecoder _decoder;
+
+ private readonly DefaultSamplingPipeline _sampler;
+ private Conversation? _conversation;
+
+ private Node? _left;
+ private Node? _right;
+
+ public int ActiveConversationCount => _conversation != null ? 1 : _left!.ActiveConversationCount + _right!.ActiveConversationCount;
+
+ public Node(Conversation conversation)
+ {
+ _sampler = new DefaultSamplingPipeline();
+ _conversation = conversation;
+ _decoder = new StreamingTokenDecoder(conversation.Executor.Context);
+ }
+
+ public void Sample()
+ {
+ if (_conversation == null)
+ {
+ _left?.Sample();
+ _right?.Sample();
+ return;
+ }
+
+ if (_conversation.RequiresInference)
+ return;
+
+ // Sample one token
+ var ctx = _conversation.Executor.Context.NativeHandle;
+ var token = _sampler.Sample(ctx, _conversation.Sample(), Array.Empty<LLamaToken>());
+ _sampler.Accept(ctx, token);
+ _decoder.Add(token);
+
+ // Prompt the conversation with this token, to continue generating from there
+ _conversation.Prompt(token);
+ }
+
+ public void Split()
+ {
+ if (_conversation != null)
+ {
+ _left = new Node(_conversation.Fork());
+ _right = new Node(_conversation.Fork());
+
+ _conversation.Dispose();
+ _conversation = null;
+ }
+ else
+ {
+ _left?.Split();
+ _right?.Split();
+ }
+ }
+
+ public void Display<T>(T tree, int depth = 0)
+ where T : IHasTreeNodes
+ {
+ var colors = new[] { "red", "green", "blue", "yellow", "white" };
+ var color = colors[depth % colors.Length];
+
+ var message = Markup.Escape(_decoder.Read().ReplaceLineEndings(""));
+
+ var n = tree.AddNode($"[{color}]{message}[/]");
+
+ _left?.Display(n, depth + 1);
+ _right?.Display(n, depth + 1);
+ }
+ }
+}
+```
\ No newline at end of file
diff --git a/docs/Examples/BatchedExecutorGuidance.md b/docs/Examples/BatchedExecutorGuidance.md
new file mode 100644
index 00000000..94d0ef86
--- /dev/null
+++ b/docs/Examples/BatchedExecutorGuidance.md
@@ -0,0 +1,130 @@
+# Batched executor - basic guidance
+
+```cs
+using LLama.Batched;
+using LLama.Common;
+using LLama.Native;
+using LLama.Sampling;
+using Spectre.Console;
+
+namespace LLama.Examples.Examples;
+
+/// <summary>
+/// This demonstrates using a batch to generate two sequences and then using one
+/// sequence as the negative guidance ("classifier free guidance") for the other.
+/// </summary>
+public class BatchedExecutorGuidance
+{
+ private const int n_len = 32;
+
+ public static async Task Run()
+ {
+ string modelPath = UserSettings.GetModelPath();
+
+ var parameters = new ModelParams(modelPath);
+ using var model = LLamaWeights.LoadFromFile(parameters);
+
+ var positivePrompt = AnsiConsole.Ask("Positive Prompt (or ENTER for default):", "My favourite colour is").Trim();
+ var negativePrompt = AnsiConsole.Ask("Negative Prompt (or ENTER for default):", "I hate the colour red. My favourite colour is").Trim();
+ var weight = AnsiConsole.Ask("Guidance Weight (or ENTER for default):", 2.0f);
+
+ // Create an executor that can evaluate a batch of conversations together
+ using var executor = new BatchedExecutor(model, parameters);
+
+ // Print some info
+ var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
+ Console.WriteLine($"Created executor with model: {name}");
+
+ // Load the two prompts into two conversations
+ using var guided = executor.Create();
+ guided.Prompt(positivePrompt);
+ using var guidance = executor.Create();
+ guidance.Prompt(negativePrompt);
+
+ // Run inference to evaluate prompts
+ await AnsiConsole
+ .Status()
+ .Spinner(Spinner.Known.Line)
+ .StartAsync("Evaluating Prompts...", _ => executor.Infer());
+
+ // Fork the "guided" conversation. We'll run this one without guidance for comparison
+ using var unguided = guided.Fork();
+
+ // Run inference loop
+ var unguidedSampler = new GuidedSampler(null, weight);
+ var unguidedDecoder = new StreamingTokenDecoder(executor.Context);
+ var guidedSampler = new GuidedSampler(guidance, weight);
+ var guidedDecoder = new StreamingTokenDecoder(executor.Context);
+ await AnsiConsole
+ .Progress()
+ .StartAsync(async progress =>
+ {
+ var reporter = progress.AddTask("Running Inference", maxValue: n_len);
+
+ for (var i = 0; i < n_len; i++)
+ {
+ if (i != 0)
+ await executor.Infer();
+
+ // Sample from the "unguided" conversation. This is just a conversation using the same prompt, without any
+ // guidance. This serves as a comparison to show the effect of guidance.
+ var u = unguidedSampler.Sample(executor.Context.NativeHandle, unguided.Sample(), Array.Empty<LLamaToken>());
+ unguidedDecoder.Add(u);
+ unguided.Prompt(u);
+
+ // Sample from the "guided" conversation. This sampler will internally use the "guidance" conversation
+ // to steer the conversation. See how this is done in GuidedSampler.ProcessLogits (bottom of this file).
+ var g = guidedSampler.Sample(executor.Context.NativeHandle, guided.Sample(), Array.Empty<LLamaToken>());
+ guidedDecoder.Add(g);
+
+ // Use this token to advance both guided _and_ guidance. Keeping them in sync (except for the initial prompt).
+ guided.Prompt(g);
+ guidance.Prompt(g);
+
+ // Early exit if we reach the natural end of the guided sentence
+ if (g == model.EndOfSentenceToken)
+ break;
+
+ // Update progress bar
+ reporter.Increment(1);
+ }
+ });
+
+ AnsiConsole.MarkupLine($"[green]Unguided:[/][white]{unguidedDecoder.Read().ReplaceLineEndings(" ")}[/]");
+ AnsiConsole.MarkupLine($"[green]Guided:[/][white]{guidedDecoder.Read().ReplaceLineEndings(" ")}[/]");
+ }
+
+ private class GuidedSampler(Conversation? guidance, float weight)
+ : BaseSamplingPipeline
+ {
+ public override void Accept(SafeLLamaContextHandle ctx, LLamaToken token)
+ {
+ }
+
+ public override ISamplingPipeline Clone()
+ {
+ throw new NotSupportedException();
+ }
+
+ protected override void ProcessLogits(SafeLLamaContextHandle ctx, Span<float> logits, ReadOnlySpan<LLamaToken> lastTokens)
+ {
+ if (guidance == null)
+ return;
+
+ // Get the logits generated by the guidance sequences
+ var guidanceLogits = guidance.Sample();
+
+ // Use those logits to guide this sequence
+ NativeApi.llama_sample_apply_guidance(ctx, logits, guidanceLogits, weight);
+ }
+
+ protected override LLamaToken ProcessTokenDataArray(SafeLLamaContextHandle ctx, LLamaTokenDataArray candidates, ReadOnlySpan<LLamaToken> lastTokens)
+ {
+ candidates.Temperature(ctx, 0.8f);
+ candidates.TopK(ctx, 25);
+
+ return candidates.SampleToken(ctx);
+ }
+ }
+}
+```
\ No newline at end of file
diff --git a/docs/Examples/BatchedExecutorRewind.md b/docs/Examples/BatchedExecutorRewind.md
new file mode 100644
index 00000000..06287b7c
--- /dev/null
+++ b/docs/Examples/BatchedExecutorRewind.md
@@ -0,0 +1,121 @@
+# Batched executor - rewinding to an earlier state
+
+```cs
+using LLama.Batched;
+using LLama.Common;
+using LLama.Native;
+using LLama.Sampling;
+using Spectre.Console;
+
+namespace LLama.Examples.Examples;
+
+/// <summary>
+/// This demonstrates generating tokens and then rewinding to an earlier state
+/// </summary>
+public class BatchedExecutorRewind
+{
+ private const int n_generate = 24;
+ private const int n_rewind = 12;
+ private const int n_repeats = 6;
+
+ public static async Task Run()
+ {
+ string modelPath = UserSettings.GetModelPath();
+
+ var parameters = new ModelParams(modelPath);
+ using var model = LLamaWeights.LoadFromFile(parameters);
+
+ var prompt = AnsiConsole.Ask("Prompt (or ENTER for default):", "Not many people know that");
+
+ // Create an executor that can evaluate a batch of conversations together
+ using var executor = new BatchedExecutor(model, parameters);
+
+ // Print some info
+ var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
+ Console.WriteLine($"Created executor with model: {name}");
+
+ // Evaluate the initial prompt to create one conversation
+ using var conversation = executor.Create();
+ conversation.Prompt(prompt);
+
+ // Create the start node wrapping the conversation
+ var node = new Node(executor.Context);
+
+ // Print the prompt
+ Console.ForegroundColor = ConsoleColor.Green;
+ Console.WriteLine(prompt);
+
+ for (var i = 0; i < n_repeats; i++)
+ {
+ for (var j = 0; j < n_generate; j++)
+ {
+ // Run inference
+ await executor.Infer();
+
+ // Sample a token
+ var token = node.Sample(conversation);
+
+ // Continue conversation with this token
+ if (j != n_generate - 1)
+ conversation.Prompt(token);
+ }
+
+ // Write out what we generated
+ node.Write(n_rewind, i + 1);
+
+ // Rewind back a few tokens
+ conversation.Rewind(n_rewind + 1);
+
+ // Prompt with a token
+ conversation.Prompt(node.GetToken(n_generate - n_rewind - 1));
+
+ // Create a new node around the rewound conversation
+ node = new Node(executor.Context);
+ }
+
+ Console.WriteLine("Press any key to exit demo");
+ Console.ReadKey(true);
+ }
+
+ private class Node
+ {
+ private readonly LLamaContext _context;
+
+ private readonly List<LLamaToken> _tokens = new List<LLamaToken>();
+ private readonly DefaultSamplingPipeline Sampler;
+
+ public Node(LLamaContext context)
+ {
+ _context = context;
+ Sampler = new DefaultSamplingPipeline();
+ }
+
+ public LLamaToken Sample(Conversation conversation)
+ {
+ var token = Sampler.Sample(_context.NativeHandle, conversation.Sample(), Array.Empty<LLamaToken>());
+ _tokens.Add(token);
+ return token;
+ }
+
+ public void Write(int n_rewind, int depth)
+ {
+ var decoder = new StreamingTokenDecoder(_context);
+
+ for (var i = 0; i < _tokens.Count - n_rewind; i++)
+ decoder.Add(_tokens[i]);
+
+ AnsiConsole.MarkupLine($"[green]{new string(' ', depth * 3) + decoder.Read().ReplaceLineEndings(" ")}[/]");
+
+ for (var i = _tokens.Count - n_rewind; i < _tokens.Count; i++)
+ decoder.Add(_tokens[i]);
+
+ AnsiConsole.MarkupLine($"[maroon]{decoder.Read().ReplaceLineEndings(" ")}[/]");
+ }
+
+ public LLamaToken GetToken(int index)
+ {
+ return _tokens[index];
+ }
+ }
+}
+```
\ No newline at end of file
diff --git a/docs/Examples/ChatChineseGB2312.md b/docs/Examples/ChatChineseGB2312.md
index a5a0defe..2a5748ee 100644
--- a/docs/Examples/ChatChineseGB2312.md
+++ b/docs/Examples/ChatChineseGB2312.md
@@ -1,9 +1,12 @@
-# Chat Chinese
+# Chinese LLM - with GB2312 encoding
```cs
using System.Text;
using LLama.Common;
+namespace LLama.Examples.Examples;
+
+// This example shows how to handle Chinese input with GB2312 encoding.
public class ChatChineseGB2312
{
private static string ConvertEncoding(string input, Encoding original, Encoding target)
@@ -23,8 +26,7 @@ public class ChatChineseGB2312
" to use https://huggingface.co/hfl/chinese-alpaca-2-7b-gguf/blob/main/ggml-model-q5_0.gguf, which has been verified by LLamaSharp developers.");
Console.ForegroundColor = ConsoleColor.White;
- Console.Write("Please input your model path: ");
- var modelPath = Console.ReadLine();
+ string modelPath = UserSettings.GetModelPath();
var parameters = new ModelParams(modelPath)
{
@@ -121,5 +123,4 @@ public class ChatChineseGB2312
}
}
}
-
```
\ No newline at end of file
diff --git a/docs/Examples/ChatSessionStripRoleName.md b/docs/Examples/ChatSessionStripRoleName.md
index 03000154..3afd5de7 100644
--- a/docs/Examples/ChatSessionStripRoleName.md
+++ b/docs/Examples/ChatSessionStripRoleName.md
@@ -1,19 +1,17 @@
-ο»Ώ# Use chat session and strip role names
+# ChatSession - stripping role names
```cs
using LLama.Common;
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Threading.Tasks;
+namespace LLama.Examples.Examples;
+
+// When using ChatSession, it is common to want to strip the role names from the output
+// rather than display them. This example shows how to use transforms to strip them.
public class ChatSessionStripRoleName
{
- public static void Run()
+ public static async Task Run()
{
- Console.Write("Please input your model path: ");
- var modelPath = Console.ReadLine();
+ string modelPath = UserSettings.GetModelPath();
var parameters = new ModelParams(modelPath)
{
@@ -65,4 +63,5 @@ public class ChatSessionStripRoleName
}
}
}
+
```
\ No newline at end of file
diff --git a/docs/Examples/ChatSessionWithHistory.md b/docs/Examples/ChatSessionWithHistory.md
index a07a269e..ae9091b5 100644
--- a/docs/Examples/ChatSessionWithHistory.md
+++ b/docs/Examples/ChatSessionWithHistory.md
@@ -1,16 +1,16 @@
-# Chat session with history
+# ChatSession - with history
```cs
using LLama.Common;
namespace LLama.Examples.Examples;
+// This example shows how to save the state and history of a chat session and load it again.
public class ChatSessionWithHistory
{
public static async Task Run()
{
- Console.Write("Please input your model path: ");
- var modelPath = Console.ReadLine();
+ string modelPath = UserSettings.GetModelPath();
var parameters = new ModelParams(modelPath)
{
@@ -52,6 +52,10 @@ public class ChatSessionWithHistory
Console.ForegroundColor = ConsoleColor.Yellow;
Console.WriteLine("The chat session has started.");
+ Console.WriteLine("Type 'exit' to end the chat session.");
+ Console.WriteLine("Type 'save' to save the chat session to disk.");
+ Console.WriteLine("Type 'load' to load the chat session from disk.");
+ Console.WriteLine("Type 'regenerate' to regenerate the last response.");
// show the prompt
Console.ForegroundColor = ConsoleColor.Green;
@@ -59,12 +63,20 @@ public class ChatSessionWithHistory
while (userInput != "exit")
{
+ // Save the chat state to disk
if (userInput == "save")
{
session.SaveSession("Assets/chat-with-bob");
Console.ForegroundColor = ConsoleColor.Yellow;
Console.WriteLine("Session saved.");
}
+ // Load the chat state from disk
+ else if (userInput == "load")
+ {
+ session.LoadSession("Assets/chat-with-bob");
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine("Session loaded.");
+ }
else if (userInput == "regenerate")
{
Console.ForegroundColor = ConsoleColor.Yellow;
@@ -99,6 +111,4 @@ public class ChatSessionWithHistory
}
}
}
-
-
```
\ No newline at end of file
diff --git a/docs/Examples/ChatSessionWithRestart.md b/docs/Examples/ChatSessionWithRestart.md
new file mode 100644
index 00000000..2b7bc7c5
--- /dev/null
+++ b/docs/Examples/ChatSessionWithRestart.md
@@ -0,0 +1,112 @@
+# ChatSession - restarting
+
+```cs
+using LLama.Common;
+
+namespace LLama.Examples.Examples;
+
+// This example shows how to restart the chat session
+public class ChatSessionWithRestart
+{
+ public static async Task Run()
+ {
+ string modelPath = UserSettings.GetModelPath();
+
+ var parameters = new ModelParams(modelPath)
+ {
+ ContextSize = 1024,
+ Seed = 1337,
+ GpuLayerCount = 5
+ };
+ using var model = LLamaWeights.LoadFromFile(parameters);
+ using var context = model.CreateContext(parameters);
+ var executor = new InteractiveExecutor(context);
+
+ var chatHistoryJson = File.ReadAllText("Assets/chat-with-bob.json");
+ ChatHistory chatHistory = ChatHistory.FromJson(chatHistoryJson) ?? new ChatHistory();
+ ChatSession prototypeSession =
+ await ChatSession.InitializeSessionFromHistoryAsync(executor, chatHistory);
+ prototypeSession.WithOutputTransform(new LLamaTransforms.KeywordTextOutputStreamTransform(
+ new string[] { "User:", "Assistant:" },
+ redundancyLength: 8));
+ var resetState = prototypeSession.GetSessionState();
+
+ ChatSession session = new ChatSession(executor);
+ session.LoadSession(resetState);
+
+ InferenceParams inferenceParams = new InferenceParams()
+ {
+ Temperature = 0.9f,
+ AntiPrompts = new List<string> { "User:" }
+ };
+
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine("The chat session has started. Starting point saved.");
+ Console.WriteLine("Type 'exit' to end the chat session.");
+ Console.WriteLine("Type 'save' to save chat session state in memory.");
+ Console.WriteLine("Type 'reset' to reset the chat session to its saved state.");
+ Console.WriteLine("Type 'answer for assistant' to add and process provided user and assistant messages.");
+
+ // show the prompt
+ Console.ForegroundColor = ConsoleColor.Green;
+ string userInput = Console.ReadLine() ?? "";
+
+ while (userInput != "exit")
+ {
+ // Load the session state from the reset state
+ if(userInput == "reset")
+ {
+ session.LoadSession(resetState);
+ Console.WriteLine($"Reset to history:\n{session.HistoryTransform.HistoryToText(session.History)}");
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine("Session reset.");
+ }
+ // Assign new reset state.
+ else if (userInput == "save")
+ {
+ resetState = session.GetSessionState();
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine("Session saved.");
+ }
+ // Provide user and override assistant answer with your own.
+ else if (userInput == "answer for assistant")
+ {
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine("Provide user input: ");
+
+ Console.ForegroundColor = ConsoleColor.Green;
+ string userInputOverride = Console.ReadLine() ?? "";
+
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine("Provide assistant input: ");
+
+ Console.ForegroundColor = ConsoleColor.Green;
+ string assistantInputOverride = Console.ReadLine() ?? "";
+
+ await session.AddAndProcessUserMessage(userInputOverride);
+ await session.AddAndProcessAssistantMessage(assistantInputOverride);
+
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine("User and assistant messages processed. Provide next user message:");
+ }
+ else
+ {
+ await foreach (
+ var text
+ in session.ChatAsync(
+ new ChatHistory.Message(AuthorRole.User, userInput),
+ inferenceParams))
+ {
+ Console.ForegroundColor = ConsoleColor.White;
+ Console.Write(text);
+ }
+ }
+
+ Console.ForegroundColor = ConsoleColor.Green;
+ userInput = Console.ReadLine() ?? "";
+
+ Console.ForegroundColor = ConsoleColor.White;
+ }
+ }
+}
+```
\ No newline at end of file
diff --git a/docs/Examples/ChatSessionWithRoleName.md b/docs/Examples/ChatSessionWithRoleName.md
index 4f15ade4..35851b7b 100644
--- a/docs/Examples/ChatSessionWithRoleName.md
+++ b/docs/Examples/ChatSessionWithRoleName.md
@@ -1,19 +1,16 @@
-# Use chat session without removing role names
+# ChatSession - Basic
```cs
using LLama.Common;
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Threading.Tasks;
+namespace LLama.Examples.Examples;
+
+// The basic example for using ChatSession
public class ChatSessionWithRoleName
{
- public static void Run()
+ public static async Task Run()
{
- Console.Write("Please input your model path: ");
- var modelPath = Console.ReadLine();
+ string modelPath = UserSettings.GetModelPath();
var parameters = new ModelParams(modelPath)
{
@@ -62,4 +59,5 @@ public class ChatSessionWithRoleName
}
}
}
+
```
\ No newline at end of file
diff --git a/docs/Examples/CodingAssistant.md b/docs/Examples/CodingAssistant.md
index 3069f301..4c59281c 100644
--- a/docs/Examples/CodingAssistant.md
+++ b/docs/Examples/CodingAssistant.md
@@ -1,97 +1,73 @@
-# Coding Assistant
+# Coding assistant
```cs
-using LLama.Common;
-using System;
-using System.Reflection;
-
-internal class CodingAssistant
+namespace LLama.Examples.Examples
{
- const string DefaultModelUri = "https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct.Q4_K_S.gguf";
-
- // Source paper with example prompts:
- // https://doi.org/10.48550/arXiv.2308.12950
- const string InstructionPrefix = "[INST]";
- const string InstructionSuffix = "[/INST]";
- const string SystemInstruction = "You're an intelligent, concise coding assistant. Wrap code in ``` for readability. Don't repeat yourself. Use best practice and good coding standards.";
- private static string ModelsDirectory = Path.Combine(Directory.GetParent(Assembly.GetExecutingAssembly().Location)!.FullName, "Models");
+ using LLama.Common;
+ using System;
- public static async Task Run()
+ // This example shows how to apply code completion as a coding assistant
+ internal class CodingAssistant
{
- Console.Write("Please input your model path (if left empty, a default model will be downloaded for you): ");
- var modelPath = Console.ReadLine();
-
- if(string.IsNullOrWhiteSpace(modelPath) )
- {
- modelPath = await GetDefaultModel();
- }
-
- var parameters = new ModelParams(modelPath)
- {
- ContextSize = 4096
- };
- using var model = LLamaWeights.LoadFromFile(parameters);
- using var context = model.CreateContext(parameters);
- var executor = new InstructExecutor(context, InstructionPrefix, InstructionSuffix, null);
-
- Console.ForegroundColor = ConsoleColor.Yellow;
- Console.WriteLine("The executor has been enabled. In this example, the LLM will follow your instructions." +
- "\nIt's a 7B Code Llama, so it's trained for programming tasks like \"Write a C# function reading a file name from a given URI\" or \"Write some programming interview questions\"." +
- "\nWrite 'exit' to exit");
- Console.ForegroundColor = ConsoleColor.White;
-
- var inferenceParams = new InferenceParams() {
- Temperature = 0.8f,
- MaxTokens = -1,
- };
-
- string instruction = $"{SystemInstruction}\n\n";
- await Console.Out.WriteAsync("Instruction: ");
- instruction += Console.ReadLine() ?? "Ask me for instructions.";
- while (instruction != "exit")
+ // Source paper with example prompts:
+ // https://doi.org/10.48550/arXiv.2308.12950
+ const string InstructionPrefix = "[INST]";
+ const string InstructionSuffix = "[/INST]";
+ const string SystemInstruction = "You're an intelligent, concise coding assistant. " +
+ "Wrap code in ``` for readability. Don't repeat yourself. " +
+ "Use best practice and good coding standards.";
+
+ public static async Task Run()
{
-
- Console.ForegroundColor = ConsoleColor.Green;
- await foreach (var text in executor.InferAsync(instruction + System.Environment.NewLine, inferenceParams))
+ string modelPath = UserSettings.GetModelPath();
+ if (!modelPath.Contains("codellama", StringComparison.InvariantCultureIgnoreCase))
{
- Console.Write(text);
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine("WARNING: the model you selected is not a Code LLama model!");
+ Console.WriteLine("For this example we specifically recommend 'codellama-7b-instruct.Q4_K_S.gguf'");
+ Console.WriteLine("Press ENTER to continue...");
+ Console.ReadLine();
}
+
+ var parameters = new ModelParams(modelPath)
+ {
+ ContextSize = 4096
+ };
+ using var model = LLamaWeights.LoadFromFile(parameters);
+ using var context = model.CreateContext(parameters);
+ var executor = new InstructExecutor(context, InstructionPrefix, InstructionSuffix, null);
+
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine("The executor has been enabled. In this example, the LLM will follow your instructions." +
+ "\nIt's a 7B Code Llama, so it's trained for programming tasks like \"Write a C# function reading " +
+ "a file name from a given URI\" or \"Write some programming interview questions\"." +
+ "\nWrite 'exit' to exit");
Console.ForegroundColor = ConsoleColor.White;
+ var inferenceParams = new InferenceParams()
+ {
+ Temperature = 0.8f,
+ MaxTokens = -1,
+ };
+
+ string instruction = $"{SystemInstruction}\n\n";
await Console.Out.WriteAsync("Instruction: ");
- instruction = Console.ReadLine() ?? "Ask me for instructions.";
- }
- }
+ instruction += Console.ReadLine() ?? "Ask me for instructions.";
+ while (instruction != "exit")
+ {
- private static async Task<string> GetDefaultModel()
- {
- var uri = new Uri(DefaultModelUri);
- var modelName = uri.Segments[^1];
- await Console.Out.WriteLineAsync($"The following model will be used: {modelName}");
- var modelPath = Path.Combine(ModelsDirectory, modelName);
- if(!Directory.Exists(ModelsDirectory))
- {
- Directory.CreateDirectory(ModelsDirectory);
- }
+ Console.ForegroundColor = ConsoleColor.Green;
+ await foreach (var text in executor.InferAsync(instruction + Environment.NewLine, inferenceParams))
+ {
+ Console.Write(text);
+ }
+ Console.ForegroundColor = ConsoleColor.White;
- if (File.Exists(modelPath))
- {
- await Console.Out.WriteLineAsync($"Existing model found, using {modelPath}");
- }
- else
- {
- await Console.Out.WriteLineAsync($"Model not found locally, downloading {DefaultModelUri}...");
- using var http = new HttpClient();
- await using var downloadStream = await http.GetStreamAsync(uri);
- await using var fileStream = new FileStream(modelPath, FileMode.Create, FileAccess.Write);
- await downloadStream.CopyToAsync(fileStream);
- await Console.Out.WriteLineAsync($"Model downloaded and saved to {modelPath}");
+ await Console.Out.WriteAsync("Instruction: ");
+ instruction = Console.ReadLine() ?? "Ask me for instructions.";
+ }
}
-
-
- return modelPath;
}
}
-
```
\ No newline at end of file
diff --git a/docs/Examples/GetEmbeddings.md b/docs/Examples/GetEmbeddings.md
index 56c0b995..560371e9 100644
--- a/docs/Examples/GetEmbeddings.md
+++ b/docs/Examples/GetEmbeddings.md
@@ -1,32 +1,49 @@
-# Get embeddings
+# Get embeddings
```cs
using LLama.Common;
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Threading.Tasks;
-public class GetEmbeddings
+namespace LLama.Examples.Examples
{
- public static void Run()
+ // This example shows how to get embeddings from a text prompt.
+ public class GetEmbeddings
{
- Console.Write("Please input your model path: ");
- string modelPath = Console.ReadLine();
- var modelParams = new ModelParams(modelPath) { EmbeddingMode = true };
- var embedder = new LLamaEmbedder(modelParams);
-
- while (true)
+ public static void Run()
{
- Console.Write("Please input your text: ");
- Console.ForegroundColor = ConsoleColor.Green;
- var text = Console.ReadLine();
- Console.ForegroundColor = ConsoleColor.White;
+ string modelPath = UserSettings.GetModelPath();
+
+ Console.ForegroundColor = ConsoleColor.DarkGray;
+ var @params = new ModelParams(modelPath) { EmbeddingMode = true };
+ using var weights = LLamaWeights.LoadFromFile(@params);
+ var embedder = new LLamaEmbedder(weights, @params);
+
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine(
+ """
+ This example displays embeddings from a text prompt.
+ Embeddings are numerical codes that represent information like words, images, or concepts.
+ These codes capture important relationships between those objects,
+ like how similar words are in meaning or how close images are visually.
+ This allows machine learning models to efficiently understand and process complex data.
+ Embeddings of a text in LLM is sometimes useful, for example, to train other MLP models.
+ """); // NOTE: this description was AI generated
- Console.WriteLine(string.Join(", ", embedder.GetEmbeddings(text)));
- Console.WriteLine();
+ while (true)
+ {
+ Console.ForegroundColor = ConsoleColor.White;
+ Console.Write("Please input your text: ");
+ Console.ForegroundColor = ConsoleColor.Green;
+ var text = Console.ReadLine();
+ Console.ForegroundColor = ConsoleColor.White;
+
+ float[] embeddings = embedder.GetEmbeddings(text).Result;
+ Console.WriteLine($"Embeddings contain {embeddings.Length:N0} floating point values:");
+ Console.ForegroundColor = ConsoleColor.DarkGray;
+ Console.WriteLine(string.Join(", ", embeddings.Take(20)) + ", ...");
+ Console.WriteLine();
+ }
}
}
}
+
```
\ No newline at end of file
diff --git a/docs/Examples/GrammarJsonResponse.md b/docs/Examples/GrammarJsonResponse.md
new file mode 100644
index 00000000..5054b27a
--- /dev/null
+++ b/docs/Examples/GrammarJsonResponse.md
@@ -0,0 +1,58 @@
+# Grammar - json response
+
+```cs
+using LLama.Common;
+using LLama.Grammars;
+
+namespace LLama.Examples.Examples
+{
+ // This example shows how to get response in json format using grammar.
+ public class GrammarJsonResponse
+ {
+ public static async Task Run()
+ {
+ string modelPath = UserSettings.GetModelPath();
+
+ var gbnf = File.ReadAllText("Assets/json.gbnf").Trim();
+ var grammar = Grammar.Parse(gbnf, "root");
+
+ var parameters = new ModelParams(modelPath)
+ {
+ ContextSize = 1024,
+ Seed = 1337,
+ GpuLayerCount = 5
+ };
+ using var model = LLamaWeights.LoadFromFile(parameters);
+ var ex = new StatelessExecutor(model, parameters);
+
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine("The executor has been enabled. In this example, the LLM will follow your instructions and always respond in a JSON format. For example, you can input \"Tell me the attributes of a good dish\"");
+ Console.ForegroundColor = ConsoleColor.White;
+
+ using var grammarInstance = grammar.CreateInstance();
+ var inferenceParams = new InferenceParams()
+ {
+ Temperature = 0.6f,
+ AntiPrompts = new List<string> { "Question:", "#", "Question: ", ".\n" },
+ MaxTokens = 50,
+ Grammar = grammarInstance
+ };
+
+ while (true)
+ {
+ Console.Write("\nQuestion: ");
+ Console.ForegroundColor = ConsoleColor.Green;
+ var prompt = Console.ReadLine();
+ Console.ForegroundColor = ConsoleColor.White;
+ Console.Write("Answer: ");
+ prompt = $"Question: {prompt?.Trim()} Answer: ";
+ await foreach (var text in ex.InferAsync(prompt, inferenceParams))
+ {
+ Console.Write(text);
+ }
+ }
+ }
+ }
+}
+
+```
\ No newline at end of file
diff --git a/docs/Examples/GrammerJsonResponse.md b/docs/Examples/GrammerJsonResponse.md
deleted file mode 100644
index 59672d68..00000000
--- a/docs/Examples/GrammerJsonResponse.md
+++ /dev/null
@@ -1,55 +0,0 @@
-# Grammer json response
-
-```cs
-using LLama.Common;
-using LLama.Grammars;
-
-public class GrammarJsonResponse
-{
- public static async Task Run()
- {
- var gbnf = (await File.ReadAllTextAsync("Assets/json.gbnf")).Trim();
- var grammar = Grammar.Parse(gbnf, "root");
-
- Console.Write("Please input your model path: ");
- var modelPath = Console.ReadLine();
-
- var parameters = new ModelParams(modelPath)
- {
- ContextSize = 1024,
- Seed = 1337,
- GpuLayerCount = 5
- };
- using var model = LLamaWeights.LoadFromFile(parameters);
- var ex = new StatelessExecutor(model, parameters);
-
- Console.ForegroundColor = ConsoleColor.Yellow;
- Console.WriteLine("The executor has been enabled. In this example, the LLM will follow your instructions and always respond in a JSON format. For example, you can input \"Tell me the attributes of a good dish\"");
- Console.ForegroundColor = ConsoleColor.White;
-
- using var grammarInstance = grammar.CreateInstance();
- var inferenceParams = new InferenceParams()
- {
- Temperature = 0.6f,
- AntiPrompts = new List<string> { "Question:", "#", "Question: ", ".\n" },
- MaxTokens = 50,
- Grammar = grammarInstance
- };
-
- while (true)
- {
- Console.Write("\nQuestion: ");
- Console.ForegroundColor = ConsoleColor.Green;
- var prompt = Console.ReadLine();
- Console.ForegroundColor = ConsoleColor.White;
- Console.Write("Answer: ");
- prompt = $"Question: {prompt?.Trim()} Answer: ";
- await foreach (var text in ex.InferAsync(prompt, inferenceParams))
- {
- Console.Write(text);
- }
- }
- }
-}
-
-```
\ No newline at end of file
diff --git a/docs/Examples/InstructModeExecute.md b/docs/Examples/InstructModeExecute.md
index 8daba174..a404d898 100644
--- a/docs/Examples/InstructModeExecute.md
+++ b/docs/Examples/InstructModeExecute.md
@@ -1,40 +1,48 @@
-# Use instruct executor
+# Instruct executor - basic
```cs
using LLama.Common;
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Threading.Tasks;
-public class InstructModeExecute
+namespace LLama.Examples.Examples
{
- public static void Run()
+ // This example shows how to use InstructExecutor to generate the response.
+ public class InstructModeExecute
{
- Console.Write("Please input your model path: ");
- string modelPath = Console.ReadLine();
- var prompt = File.ReadAllText("Assets/dan.txt").Trim();
+ public static async Task Run()
+ {
+ string modelPath = UserSettings.GetModelPath();
- InstructExecutor ex = new(new LLamaModel(new ModelParams(modelPath, contextSize: 1024)));
+ var prompt = File.ReadAllText("Assets/dan.txt").Trim();
- Console.ForegroundColor = ConsoleColor.Yellow;
- Console.WriteLine("The executor has been enabled. In this example, the LLM will follow your instructions. For example, you can input \"Write a story about a fox who want to " +
- "make friend with human, no less than 200 words.\"");
- Console.ForegroundColor = ConsoleColor.White;
+ var parameters = new ModelParams(modelPath)
+ {
+ ContextSize = 1024,
+ Seed = 1337,
+ GpuLayerCount = 5
+ };
+ using var model = LLamaWeights.LoadFromFile(parameters);
+ using var context = model.CreateContext(parameters);
+ var executor = new InstructExecutor(context);
- var inferenceParams = new InferenceParams() { Temperature = 0.8f, MaxTokens = 300 };
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine("The executor has been enabled. In this example, the LLM will follow your instructions. For example, you can input \"Write a story about a fox who want to " +
+ "make friend with human, no less than 200 words.\"");
+ Console.ForegroundColor = ConsoleColor.White;
- while (true)
- {
- foreach (var text in ex.Infer(prompt, inferenceParams))
+ var inferenceParams = new InferenceParams() { Temperature = 0.8f, MaxTokens = 600 };
+
+ while (true)
{
- Console.Write(text);
+ await foreach (var text in executor.InferAsync(prompt, inferenceParams))
+ {
+ Console.Write(text);
+ }
+ Console.ForegroundColor = ConsoleColor.Green;
+ prompt = Console.ReadLine();
+ Console.ForegroundColor = ConsoleColor.White;
}
- Console.ForegroundColor = ConsoleColor.Green;
- prompt = Console.ReadLine();
- Console.ForegroundColor = ConsoleColor.White;
}
}
}
+
```
\ No newline at end of file
diff --git a/docs/Examples/InteractiveModeExecute.md b/docs/Examples/InteractiveModeExecute.md
index 9b15667b..97e26e3b 100644
--- a/docs/Examples/InteractiveModeExecute.md
+++ b/docs/Examples/InteractiveModeExecute.md
@@ -1,41 +1,49 @@
-# Use interactive executor
+# Interactive executor - basic
```cs
using LLama.Common;
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Threading.Tasks;
-public class InteractiveModeExecute
+namespace LLama.Examples.Examples
{
- public async static Task Run()
+ // This is an example which shows how to chat with LLM with InteractiveExecutor.
+ public class InteractiveModeExecute
{
- Console.Write("Please input your model path: ");
- string modelPath = Console.ReadLine();
- var prompt = File.ReadAllText("Assets/chat-with-bob.txt").Trim();
+ public static async Task Run()
+ {
+ string modelPath = UserSettings.GetModelPath();
- InteractiveExecutor ex = new(new LLamaModel(new ModelParams(modelPath, contextSize: 256)));
+ var prompt = (await File.ReadAllTextAsync("Assets/chat-with-bob.txt")).Trim();
- Console.ForegroundColor = ConsoleColor.Yellow;
- Console.WriteLine("The executor has been enabled. In this example, the prompt is printed, the maximum tokens is set to 64 and the context size is 256. (an example for small scale usage)");
- Console.ForegroundColor = ConsoleColor.White;
+ var parameters = new ModelParams(modelPath)
+ {
+ ContextSize = 1024,
+ Seed = 1337,
+ GpuLayerCount = 5
+ };
+ using var model = LLamaWeights.LoadFromFile(parameters);
+ using var context = model.CreateContext(parameters);
+ var ex = new InteractiveExecutor(context);
- Console.Write(prompt);
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine("The executor has been enabled. In this example, the prompt is printed, the maximum tokens is set to 128 and the context size is 256. (an example for small scale usage)");
+ Console.ForegroundColor = ConsoleColor.White;
- var inferenceParams = new InferenceParams() { Temperature = 0.6f, AntiPrompts = new List<string> { "User:" }, MaxTokens = 64 };
+ Console.Write(prompt);
- while (true)
- {
- await foreach (var text in ex.InferAsync(prompt, inferenceParams))
+ var inferenceParams = new InferenceParams() { Temperature = 0.6f, AntiPrompts = new List<string> { "User:" }, MaxTokens = 128 };
+
+ while (true)
{
- Console.Write(text);
+ await foreach (var text in ex.InferAsync(prompt, inferenceParams))
+ {
+ Console.Write(text);
+ }
+ Console.ForegroundColor = ConsoleColor.Green;
+ prompt = Console.ReadLine();
+ Console.ForegroundColor = ConsoleColor.White;
}
- Console.ForegroundColor = ConsoleColor.Green;
- prompt = Console.ReadLine();
- Console.ForegroundColor = ConsoleColor.White;
}
}
}
+
```
\ No newline at end of file
diff --git a/docs/Examples/KernelMemory.md b/docs/Examples/KernelMemory.md
index 76ac266a..0b9c076e 100644
--- a/docs/Examples/KernelMemory.md
+++ b/docs/Examples/KernelMemory.md
@@ -1,62 +1,114 @@
-# Kernel memory
+# Kernel memory integration - basic
```cs
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Threading.Tasks;
using LLamaSharp.KernelMemory;
using Microsoft.KernelMemory;
using Microsoft.KernelMemory.Configuration;
-using Microsoft.KernelMemory.Handlers;
+using System.Diagnostics;
-public class KernelMemory
+namespace LLama.Examples.Examples
{
- public static async Task Run()
+ // This example is from Microsoft's official kernel memory "custom prompts" example:
+ // https://github.com/microsoft/kernel-memory/blob/6d516d70a23d50c6cb982e822e6a3a9b2e899cfa/examples/101-dotnet-custom-Prompts/Program.cs#L1-L86
+
+ // Microsoft.KernelMemory has more features than Microsoft.SemanticKernel.
+ // See https://microsoft.github.io/kernel-memory/ for details.
+
+ public class KernelMemory
{
- Console.WriteLine("Example from: https://github.com/microsoft/kernel-memory/blob/main/examples/101-using-core-nuget/Program.cs");
- Console.Write("Please input your model path: ");
- var modelPath = Console.ReadLine();
- var searchClientConfig = new SearchClientConfig
+ public static async Task Run()
{
- MaxMatchesCount = 1,
- AnswerTokens = 100,
- };
- var memory = new KernelMemoryBuilder()
- .WithLLamaSharpDefaults(new LLamaSharpConfig(modelPath)
- {
- DefaultInferenceParams = new Common.InferenceParams
- {
- AntiPrompts = new List<string> { "\n\n" }
- }
- })
- .WithSearchClientConfig(searchClientConfig)
- .With(new TextPartitioningOptions
- {
- MaxTokensPerParagraph = 300,
- MaxTokensPerLine = 100,
- OverlappingTokens = 30
- })
- .Build();
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine(
+ """
- await memory.ImportDocumentAsync(@"./Assets/sample-SK-Readme.pdf", steps: Constants.PipelineWithoutSummary);
+ This program uses the Microsoft.KernelMemory package to ingest documents
+ and answer questions about them in an interactive chat prompt.
- var question = "What's Semantic Kernel?";
+ """);
- Console.WriteLine($"\n\nQuestion: {question}");
+ // Setup the kernel memory with the LLM model
+ string modelPath = UserSettings.GetModelPath();
+ IKernelMemory memory = CreateMemory(modelPath);
- var answer = await memory.AskAsync(question);
+ // Ingest documents (format is automatically detected from the filename)
+ string[] filesToIngest = [
+ Path.GetFullPath(@"./Assets/sample-SK-Readme.pdf"),
+ Path.GetFullPath(@"./Assets/sample-KM-Readme.pdf"),
+ ];
- Console.WriteLine($"\nAnswer: {answer.Result}");
+ for (int i = 0; i < filesToIngest.Length; i++)
+ {
+ string path = filesToIngest[i];
+ Stopwatch sw = Stopwatch.StartNew();
+ Console.ForegroundColor = ConsoleColor.Blue;
+ Console.WriteLine($"Importing {i + 1} of {filesToIngest.Length}: {path}");
+ await memory.ImportDocumentAsync(path, steps: Constants.PipelineWithoutSummary);
+ Console.WriteLine($"Completed in {sw.Elapsed}\n");
+ }
- Console.WriteLine("\n\n Sources:\n");
+ // Ask a predefined question
+ Console.ForegroundColor = ConsoleColor.Green;
+ string question1 = "What formats does KM support";
+ Console.WriteLine($"Question: {question1}");
+ await AnswerQuestion(memory, question1);
- foreach (var x in answer.RelevantSources)
+ // Let the user ask additional questions
+ while (true)
+ {
+ Console.ForegroundColor = ConsoleColor.Green;
+ Console.Write("Question: ");
+ string question = Console.ReadLine()!;
+ if (string.IsNullOrEmpty(question))
+ return;
+
+ await AnswerQuestion(memory, question);
+ }
+ }
+
+ private static IKernelMemory CreateMemory(string modelPath)
{
- Console.WriteLine($" - {x.SourceName} - {x.Link} [{x.Partitions.First().LastUpdate:D}]");
+ Common.InferenceParams infParams = new() { AntiPrompts = ["\n\n"] };
+
+ LLamaSharpConfig lsConfig = new(modelPath) { DefaultInferenceParams = infParams };
+
+ SearchClientConfig searchClientConfig = new()
+ {
+ MaxMatchesCount = 1,
+ AnswerTokens = 100,
+ };
+
+ TextPartitioningOptions parseOptions = new()
+ {
+ MaxTokensPerParagraph = 300,
+ MaxTokensPerLine = 100,
+ OverlappingTokens = 30
+ };
+
+ return new KernelMemoryBuilder()
+ .WithLLamaSharpDefaults(lsConfig)
+ .WithSearchClientConfig(searchClientConfig)
+ .With(parseOptions)
+ .Build();
+ }
+
+ private static async Task AnswerQuestion(IKernelMemory memory, string question)
+ {
+ Stopwatch sw = Stopwatch.StartNew();
+ Console.ForegroundColor = ConsoleColor.DarkGray;
+ Console.WriteLine($"Generating answer...");
+
+ MemoryAnswer answer = await memory.AskAsync(question);
+ Console.WriteLine($"Answer generated in {sw.Elapsed}");
+
+ Console.ForegroundColor = ConsoleColor.Gray;
+ Console.WriteLine($"Answer: {answer.Result}");
+ foreach (var source in answer.RelevantSources)
+ {
+ Console.WriteLine($"Source: {source.SourceName}");
+ }
+ Console.WriteLine();
}
}
}
-
```
\ No newline at end of file
diff --git a/docs/Examples/KernelMemorySaveAndLoad.md b/docs/Examples/KernelMemorySaveAndLoad.md
new file mode 100644
index 00000000..dd1c670a
--- /dev/null
+++ b/docs/Examples/KernelMemorySaveAndLoad.md
@@ -0,0 +1,166 @@
+# Kernel-memory - save & load
+
+```cs
+using LLamaSharp.KernelMemory;
+using Microsoft.KernelMemory;
+using Microsoft.KernelMemory.Configuration;
+using Microsoft.KernelMemory.ContentStorage.DevTools;
+using Microsoft.KernelMemory.FileSystem.DevTools;
+using Microsoft.KernelMemory.MemoryStorage.DevTools;
+using System.Diagnostics;
+
+namespace LLama.Examples.Examples;
+
+// This example shows how to use kernel-memory integration with pre-saved embeddings.
+public class KernelMemorySaveAndLoad
+{
+ static string StorageFolder => Path.GetFullPath($"./storage-{nameof(KernelMemorySaveAndLoad)}");
+ static bool StorageExists => Directory.Exists(StorageFolder) && Directory.GetDirectories(StorageFolder).Length > 0;
+
+ public static async Task Run()
+ {
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine(
+ """
+
+ This program uses the Microsoft.KernelMemory package to ingest documents
+ and store the embeddings as local files so they can be quickly recalled
+ when this application is launched again.
+
+ """);
+
+ string modelPath = UserSettings.GetModelPath();
+ IKernelMemory memory = CreateMemoryWithLocalStorage(modelPath);
+
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ if (StorageExists)
+ {
+ Console.WriteLine(
+ """
+
+ Kernel memory files have been located!
+ Information about previously analyzed documents has been loaded.
+
+ """);
+ }
+ else
+ {
+ Console.WriteLine(
+ $"""
+
+ Existing kernel memory was not found.
+ Documents will be analyzed (slow) and information saved to disk.
+ Analysis will not be required the next time this program is run.
+ Press ENTER to proceed...
+
+ """);
+ Console.ReadLine();
+ await IngestDocuments(memory);
+ }
+
+ await AskSingleQuestion(memory, "What formats does KM support?");
+ await StartUserChatSession(memory);
+ }
+
+ private static IKernelMemory CreateMemoryWithLocalStorage(string modelPath)
+ {
+ Common.InferenceParams infParams = new() { AntiPrompts = ["\n\n"] };
+
+ LLamaSharpConfig lsConfig = new(modelPath) { DefaultInferenceParams = infParams };
+
+ SearchClientConfig searchClientConfig = new()
+ {
+ MaxMatchesCount = 1,
+ AnswerTokens = 100,
+ };
+
+ TextPartitioningOptions parseOptions = new()
+ {
+ MaxTokensPerParagraph = 300,
+ MaxTokensPerLine = 100,
+ OverlappingTokens = 30
+ };
+
+ SimpleFileStorageConfig storageConfig = new()
+ {
+ Directory = StorageFolder,
+ StorageType = FileSystemTypes.Disk,
+ };
+
+ SimpleVectorDbConfig vectorDbConfig = new()
+ {
+ Directory = StorageFolder,
+ StorageType = FileSystemTypes.Disk,
+ };
+
+ Console.ForegroundColor = ConsoleColor.Blue;
+ Console.WriteLine($"Kernel memory folder: {StorageFolder}");
+
+ Console.ForegroundColor = ConsoleColor.DarkGray;
+ return new KernelMemoryBuilder()
+ .WithSimpleFileStorage(storageConfig)
+ .WithSimpleVectorDb(vectorDbConfig)
+ .WithLLamaSharpDefaults(lsConfig)
+ .WithSearchClientConfig(searchClientConfig)
+ .With(parseOptions)
+ .Build();
+ }
+
+ private static async Task AskSingleQuestion(IKernelMemory memory, string question)
+ {
+ Console.ForegroundColor = ConsoleColor.Green;
+ Console.WriteLine($"Question: {question}");
+ await ShowAnswer(memory, question);
+ }
+
+ private static async Task StartUserChatSession(IKernelMemory memory)
+ {
+ while (true)
+ {
+ Console.ForegroundColor = ConsoleColor.Green;
+ Console.Write("Question: ");
+ string question = Console.ReadLine()!;
+ if (string.IsNullOrEmpty(question))
+ return;
+
+ await ShowAnswer(memory, question);
+ }
+ }
+
+ private static async Task IngestDocuments(IKernelMemory memory)
+ {
+ string[] filesToIngest = [
+ Path.GetFullPath(@"./Assets/sample-SK-Readme.pdf"),
+ Path.GetFullPath(@"./Assets/sample-KM-Readme.pdf"),
+ ];
+
+ for (int i = 0; i < filesToIngest.Length; i++)
+ {
+ string path = filesToIngest[i];
+ Stopwatch sw = Stopwatch.StartNew();
+ Console.ForegroundColor = ConsoleColor.Blue;
+ Console.WriteLine($"Importing {i + 1} of {filesToIngest.Length}: {path}");
+ await memory.ImportDocumentAsync(path, steps: Constants.PipelineWithoutSummary);
+ Console.WriteLine($"Completed in {sw.Elapsed}\n");
+ }
+ }
+
+ private static async Task ShowAnswer(IKernelMemory memory, string question)
+ {
+ Stopwatch sw = Stopwatch.StartNew();
+ Console.ForegroundColor = ConsoleColor.DarkGray;
+ Console.WriteLine($"Generating answer...");
+
+ MemoryAnswer answer = await memory.AskAsync(question);
+ Console.WriteLine($"Answer generated in {sw.Elapsed}");
+
+ Console.ForegroundColor = ConsoleColor.Gray;
+ Console.WriteLine($"Answer: {answer.Result}");
+ foreach (var source in answer.RelevantSources)
+ {
+ Console.WriteLine($"Source: {source.SourceName}");
+ }
+ Console.WriteLine();
+ }
+}
+```
\ No newline at end of file
diff --git a/docs/Examples/LLavaInteractiveModeExecute.md b/docs/Examples/LLavaInteractiveModeExecute.md
new file mode 100644
index 00000000..9c6faa4f
--- /dev/null
+++ b/docs/Examples/LLavaInteractiveModeExecute.md
@@ -0,0 +1,127 @@
+# LLaVA - basic
+
+```cs
+using System.Text.RegularExpressions;
+using LLama.Batched;
+using LLama.Common;
+using Spectre.Console;
+
+namespace LLama.Examples.Examples
+{
+ // This example shows how to chat with LLaVA model with both image and text as input.
+ // It uses the interactive executor to run inference.
+ public class LlavaInteractiveModeExecute
+ {
+ public static async Task Run()
+ {
+ string multiModalProj = UserSettings.GetMMProjPath();
+ string modelPath = UserSettings.GetModelPath();
+ string modelImage = UserSettings.GetImagePath();
+ const int maxTokens = 1024;
+
+ var prompt = $"{{{modelImage}}}\nUSER:\nProvide a full description of the image.\nASSISTANT:\n";
+
+ var parameters = new ModelParams(modelPath)
+ {
+ ContextSize = 4096,
+ Seed = 1337,
+ };
+ using var model = LLamaWeights.LoadFromFile(parameters);
+ using var context = model.CreateContext(parameters);
+
+ // Llava Init
+ using var clipModel = LLavaWeights.LoadFromFile(multiModalProj);
+
+ var ex = new InteractiveExecutor(context, clipModel );
+
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine("The executor has been enabled. In this example, the prompt is printed, the maximum tokens is set to {0} and the context size is {1}.", maxTokens, parameters.ContextSize );
+ Console.WriteLine("To send an image, enter its filename in curly braces, like this {c:/image.jpg}.");
+
+ var inferenceParams = new InferenceParams() { Temperature = 0.1f, AntiPrompts = new List<string> { "\nUSER:" }, MaxTokens = maxTokens };
+
+ do
+ {
+
+ // Evaluate if we have images
+ //
+ var imageMatches = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Value);
+ var imageCount = imageMatches.Count();
+ var hasImages = imageCount > 0;
+ byte[][] imageBytes = null;
+
+ if (hasImages)
+ {
+ var imagePathsWithCurlyBraces = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Value);
+ var imagePaths = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Groups[1].Value);
+
+ try
+ {
+ imageBytes = imagePaths.Select(File.ReadAllBytes).ToArray();
+ }
+ catch (IOException exception)
+ {
+ Console.ForegroundColor = ConsoleColor.Red;
+ Console.Write(
+ $"Could not load your {(imageCount == 1 ? "image" : "images")}:");
+ Console.Write($"{exception.Message}");
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine("Please try again.");
+ break;
+ }
+
+
+ int index = 0;
+ foreach (var path in imagePathsWithCurlyBraces)
+ {
+ // First image: replace the tag with "<image>"; for the rest of the images, delete the tag
+ if (index++ == 0)
+ prompt = prompt.Replace(path, "<image>");
+ else
+ prompt = prompt.Replace(path, "");
+ }
+
+
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine($"Here are the images, that are sent to the chat model in addition to your message.");
+ Console.WriteLine();
+
+ foreach (var consoleImage in imageBytes?.Select(bytes => new CanvasImage(bytes)))
+ {
+ consoleImage.MaxWidth = 50;
+ AnsiConsole.Write(consoleImage);
+ }
+
+ Console.WriteLine();
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine($"The images were scaled down for the console only, the model gets full versions.");
+ Console.WriteLine($"Write /exit or press Ctrl+c to return to main menu.");
+ Console.WriteLine();
+
+
+ // Initialize images in executor
+ //
+ ex.ImagePaths = imagePaths.ToList();
+ }
+
+ Console.ForegroundColor = Color.White;
+ await foreach (var text in ex.InferAsync(prompt, inferenceParams))
+ {
+ Console.Write(text);
+ }
+ Console.Write(" ");
+ Console.ForegroundColor = ConsoleColor.Green;
+ prompt = Console.ReadLine();
+ Console.WriteLine();
+
+ // let the user finish with exit
+ //
+ if (prompt.Equals("/exit", StringComparison.OrdinalIgnoreCase))
+ break;
+
+ }
+ while(true);
+ }
+ }
+}
+```
\ No newline at end of file
diff --git a/docs/Examples/LoadAndSaveSession.md b/docs/Examples/LoadAndSaveSession.md
index 01a9957d..2f97f9bb 100644
--- a/docs/Examples/LoadAndSaveSession.md
+++ b/docs/Examples/LoadAndSaveSession.md
@@ -1,66 +1,84 @@
-# Load and save chat session
+# ChatSession - load & save
+
+⚠️ Warning: this example is outdated for the latest version of LLamaSharp. Please refer to [this example](./ChatSessionWithRestart.md) to see how to save and load state for `ChatSession`. If you are using an older version of LLamaSharp, this example may still help you.
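+
+For reference, the newer API keeps the whole session state in memory rather than on disk. A minimal sketch, assuming the API used in the `ChatSessionWithRestart` example added in this PR (not a complete program):
+
+```cs
+// Sketch only: snapshot and restore a ChatSession with the newer state API
+var session = new ChatSession(executor);
+var savedState = session.GetSessionState();   // captures history, transforms and executor state
+// ... run some chat turns with session.ChatAsync(...) ...
+session.LoadSession(savedState);              // roll the session back to the snapshot
+```
+
+The original example for the older API follows.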
```cs
using LLama.Common;
-using LLama.OldVersion;
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Threading.Tasks;
-public class SaveAndLoadSession
+namespace LLama.Examples.Examples
{
- public static void Run()
+ public class SaveAndLoadSession
{
- Console.Write("Please input your model path: ");
- string modelPath = Console.ReadLine();
- var prompt = File.ReadAllText("Assets/chat-with-bob.txt").Trim();
- InteractiveExecutor ex = new(new LLamaModel(new ModelParams(modelPath, contextSize: 1024, seed: 1337, gpuLayerCount: 5)));
- ChatSession session = new ChatSession(ex); // The only change is to remove the transform for the output text stream.
+ public static async Task Run()
+ {
+ string modelPath = UserSettings.GetModelPath();
- Console.ForegroundColor = ConsoleColor.Yellow;
- Console.WriteLine("The chat session has started. In this example, the prompt is printed for better visual result. Input \"save\" to save and reload the session.");
- Console.ForegroundColor = ConsoleColor.White;
+ var prompt = (await File.ReadAllTextAsync("Assets/chat-with-bob.txt")).Trim();
- // show the prompt
- Console.Write(prompt);
- while (true)
- {
- foreach (var text in session.Chat(prompt, new InferenceParams() { Temperature = 0.6f, AntiPrompts = new List<string> { "User:" } }))
+ var parameters = new ModelParams(modelPath)
{
- Console.Write(text);
- }
+ ContextSize = 1024,
+ Seed = 1337,
+ GpuLayerCount = 5
+ };
+ using var model = LLamaWeights.LoadFromFile(parameters);
+ using var context = model.CreateContext(parameters);
+ var ex = new InteractiveExecutor(context);
- Console.ForegroundColor = ConsoleColor.Green;
- prompt = Console.ReadLine();
+ var session = new ChatSession(ex);
+
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine("The chat session has started. In this example, the prompt is printed for better visual result. Input \"save\" to save and reload the session.");
Console.ForegroundColor = ConsoleColor.White;
- if (prompt == "save")
+
+ // show the prompt
+ Console.Write(prompt);
+ while (true)
{
- Console.Write("Preparing to save the state, please input the path you want to save it: ");
+ await foreach (
+ var text
+ in session.ChatAsync(
+ new ChatHistory.Message(AuthorRole.User, prompt),
+ new InferenceParams()
+ {
+ Temperature = 0.6f,
+ AntiPrompts = new List<string> { "User:" }
+ }))
+ {
+ Console.Write(text);
+ }
+
Console.ForegroundColor = ConsoleColor.Green;
- var statePath = Console.ReadLine();
- session.SaveSession(statePath);
- Console.ForegroundColor = ConsoleColor.White;
- Console.ForegroundColor = ConsoleColor.Yellow;
- Console.WriteLine("Saved session!");
+ prompt = Console.ReadLine();
Console.ForegroundColor = ConsoleColor.White;
+ if (prompt == "save")
+ {
+ Console.Write("Preparing to save the state, please input the path you want to save it: ");
+ Console.ForegroundColor = ConsoleColor.Green;
+ var statePath = Console.ReadLine();
+ session.SaveSession(statePath);
+ Console.ForegroundColor = ConsoleColor.White;
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine("Saved session!");
+ Console.ForegroundColor = ConsoleColor.White;
- ex.Model.Dispose();
- ex = new(new LLamaModel(new ModelParams(modelPath, contextSize: 1024, seed: 1337, gpuLayerCount: 5)));
- session = new ChatSession(ex).WithOutputTransform(new LLamaTransforms.KeywordTextOutputStreamTransform(new string[] { "User:", "Bob:" }, redundancyLength: 8));
- session.LoadSession(statePath);
+ ex.Context.Dispose();
+ ex = new(new LLamaContext(model, parameters));
+ session = new ChatSession(ex);
+ session.LoadSession(statePath);
- Console.ForegroundColor = ConsoleColor.Yellow;
- Console.WriteLine("Loaded session!");
- Console.ForegroundColor = ConsoleColor.White;
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine("Loaded session!");
+ Console.ForegroundColor = ConsoleColor.White;
- Console.Write("Now you can continue your session: ");
- Console.ForegroundColor = ConsoleColor.Green;
- prompt = Console.ReadLine();
- Console.ForegroundColor = ConsoleColor.White;
+ Console.Write("Now you can continue your session: ");
+ Console.ForegroundColor = ConsoleColor.Green;
+ prompt = Console.ReadLine();
+ Console.ForegroundColor = ConsoleColor.White;
+ }
}
}
}
}
+
```
\ No newline at end of file
diff --git a/docs/Examples/LoadAndSaveState.md b/docs/Examples/LoadAndSaveState.md
index 377dc215..69b978e3 100644
--- a/docs/Examples/LoadAndSaveState.md
+++ b/docs/Examples/LoadAndSaveState.md
@@ -1,67 +1,76 @@
-# Load and save model/executor state
+# Executor - save/load state
```cs
using LLama.Common;
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Threading.Tasks;
-public class LoadAndSaveState
+namespace LLama.Examples.Examples
{
- public static void Run()
+ // This example shows how to save/load state of the executor.
+ public class LoadAndSaveState
{
- Console.Write("Please input your model path: ");
- string modelPath = Console.ReadLine();
- var prompt = File.ReadAllText("Assets/chat-with-bob.txt").Trim();
+ public static async Task Run()
+ {
+ string modelPath = UserSettings.GetModelPath();
- InteractiveExecutor ex = new(new LLamaModel(new ModelParams(modelPath, contextSize: 256)));
+ var prompt = (await File.ReadAllTextAsync("Assets/chat-with-bob.txt")).Trim();
- Console.ForegroundColor = ConsoleColor.Yellow;
- Console.WriteLine("The executor has been enabled. In this example, the prompt is printed, the maximum tokens is set to 64 and the context size is 256. (an example for small scale usage)");
- Console.ForegroundColor = ConsoleColor.White;
+ var parameters = new ModelParams(modelPath)
+ {
+ ContextSize = 1024,
+ Seed = 1337,
+ GpuLayerCount = 5
+ };
+ using var model = LLamaWeights.LoadFromFile(parameters);
+ using var context = model.CreateContext(parameters);
+ var ex = new InteractiveExecutor(context);
- Console.Write(prompt);
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine("The executor has been enabled. In this example, the prompt is printed, " +
+ "the maximum tokens is set to 64 and the context size is 256. (an example for small scale usage)");
- var inferenceParams = new InferenceParams() { Temperature = 0.6f, AntiPrompts = new List<string> { "User:" } };
+ Console.ForegroundColor = ConsoleColor.White;
+ Console.Write(prompt);
- while (true)
- {
- foreach (var text in ex.Infer(prompt, inferenceParams))
- {
- Console.Write(text);
- }
+ var inferenceParams = new InferenceParams() { Temperature = 0.6f, AntiPrompts = new List<string> { "User:" } };
- prompt = Console.ReadLine();
- if (prompt == "save")
+ while (true)
{
- Console.Write("Your path to save model state: ");
- string modelStatePath = Console.ReadLine();
- ex.Model.SaveState(modelStatePath);
+ await foreach (var text in ex.InferAsync(prompt, inferenceParams))
+ {
+ Console.Write(text);
+ }
- Console.Write("Your path to save executor state: ");
- string executorStatePath = Console.ReadLine();
- ex.SaveState(executorStatePath);
+ prompt = Console.ReadLine();
+ if (prompt == "save")
+ {
+ Console.Write("Your path to save model state: ");
+ var modelStatePath = Console.ReadLine();
+ ex.Context.SaveState(modelStatePath);
- Console.ForegroundColor = ConsoleColor.Yellow;
- Console.WriteLine("All states saved!");
- Console.ForegroundColor = ConsoleColor.White;
+ Console.Write("Your path to save executor state: ");
+ var executorStatePath = Console.ReadLine();
+ await ex.SaveState(executorStatePath);
- var model = ex.Model;
- model.LoadState(modelStatePath);
- ex = new InteractiveExecutor(model);
- ex.LoadState(executorStatePath);
- Console.ForegroundColor = ConsoleColor.Yellow;
- Console.WriteLine("Loaded state!");
- Console.ForegroundColor = ConsoleColor.White;
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine("All states saved!");
+ Console.ForegroundColor = ConsoleColor.White;
- Console.Write("Now you can continue your session: ");
- Console.ForegroundColor = ConsoleColor.Green;
- prompt = Console.ReadLine();
- Console.ForegroundColor = ConsoleColor.White;
+ var ctx = ex.Context;
+ ctx.LoadState(modelStatePath);
+ ex = new InteractiveExecutor(ctx);
+ await ex.LoadState(executorStatePath);
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine("Loaded state!");
+ Console.ForegroundColor = ConsoleColor.White;
+
+ Console.Write("Now you can continue your session: ");
+ Console.ForegroundColor = ConsoleColor.Green;
+ prompt = Console.ReadLine();
+ Console.ForegroundColor = ConsoleColor.White;
+ }
}
}
}
}
+
```
\ No newline at end of file
diff --git a/docs/Examples/QuantizeModel.md b/docs/Examples/QuantizeModel.md
index 22596657..3c198583 100644
--- a/docs/Examples/QuantizeModel.md
+++ b/docs/Examples/QuantizeModel.md
@@ -1,30 +1,28 @@
-# Quantize model
+# Quantization
```cs
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Threading;
-using System.Threading.Tasks;
-
-public class QuantizeModel
+namespace LLama.Examples.Examples
{
- public static void Run()
+ public class QuantizeModel
{
- Console.Write("Please input your original model path: ");
- var inputPath = Console.ReadLine();
- Console.Write("Please input your output model path: ");
- var outputPath = Console.ReadLine();
- Console.Write("Please input the quantize type (one of q4_0, q4_1, q5_0, q5_1, q8_0): ");
- var quantizeType = Console.ReadLine();
- if (LLamaQuantizer.Quantize(inputPath, outputPath, quantizeType))
- {
- Console.WriteLine("Quantization succeed!");
- }
- else
+ public static void Run()
{
- Console.WriteLine("Quantization failed!");
+ string inputPath = UserSettings.GetModelPath();
+
+ Console.Write("Please input your output model path: ");
+ var outputPath = Console.ReadLine();
+
+ Console.Write("Please input the quantize type (one of q4_0, q4_1, q5_0, q5_1, q8_0): ");
+ var quantizeType = Console.ReadLine();
+
+ if (LLamaQuantizer.Quantize(inputPath, outputPath, quantizeType))
+ {
+ Console.WriteLine("Quantization succeeded!");
+ }
+ else
+ {
+ Console.WriteLine("Quantization failed!");
+ }
}
}
}
diff --git a/docs/Examples/SemanticKernelChat.md b/docs/Examples/SemanticKernelChat.md
new file mode 100644
index 00000000..c936d162
--- /dev/null
+++ b/docs/Examples/SemanticKernelChat.md
@@ -0,0 +1,67 @@
+# Semantic-kernel - chat
+
+```cs
+using LLama.Common;
+using LLamaSharp.SemanticKernel.ChatCompletion;
+using Microsoft.SemanticKernel.ChatCompletion;
+
+namespace LLama.Examples.Examples
+{
+ public class SemanticKernelChat
+ {
+ public static async Task Run()
+ {
+ string modelPath = UserSettings.GetModelPath();
+
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine("This example is from: \n" +
+ "https://github.com/microsoft/semantic-kernel/blob/main/dotnet/samples/KernelSyntaxExamples/Example17_ChatGPT.cs");
+
+ // Load weights into memory
+ var parameters = new ModelParams(modelPath);
+ using var model = LLamaWeights.LoadFromFile(parameters);
+ var ex = new StatelessExecutor(model, parameters);
+
+ var chatGPT = new LLamaSharpChatCompletion(ex);
+
+ var chatHistory = chatGPT.CreateNewChat("This is a conversation between the " +
+ "assistant and the user. \n\n You are a librarian, expert about books. ");
+
+ Console.WriteLine("Chat content:");
+ Console.WriteLine("------------------------");
+
+ chatHistory.AddUserMessage("Hi, I'm looking for book suggestions");
+ await MessageOutputAsync(chatHistory);
+
+ // First bot assistant message
+ var reply = await chatGPT.GetChatMessageContentAsync(chatHistory);
+ chatHistory.AddAssistantMessage(reply.Content);
+ await MessageOutputAsync(chatHistory);
+
+ // Second user message
+ chatHistory.AddUserMessage("I love history and philosophy, I'd like to learn " +
+ "something new about Greece, any suggestion");
+ await MessageOutputAsync(chatHistory);
+
+ // Second bot assistant message
+ reply = await chatGPT.GetChatMessageContentAsync(chatHistory);
+ chatHistory.AddAssistantMessage(reply.Content);
+ await MessageOutputAsync(chatHistory);
+ }
+
+ /// <summary>
+ /// Outputs the last message of the chat history
+ /// </summary>
+ private static Task MessageOutputAsync(Microsoft.SemanticKernel.ChatCompletion.ChatHistory chatHistory)
+ {
+ var message = chatHistory.Last();
+
+ Console.WriteLine($"{message.Role}: {message.Content}");
+ Console.WriteLine("------------------------");
+
+ return Task.CompletedTask;
+ }
+ }
+}
+
+```
\ No newline at end of file
diff --git a/docs/Examples/SemanticKernelMemory.md b/docs/Examples/SemanticKernelMemory.md
index 6ea7bd48..b124fc57 100644
--- a/docs/Examples/SemanticKernelMemory.md
+++ b/docs/Examples/SemanticKernelMemory.md
@@ -1,169 +1,170 @@
-# Semantic kernel memory
+# Semantic-kernel - with kernel-memory
+
+Semantic Memory lets you store your data like traditional DBs and adds the ability to query it using natural language.
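+
+In short, items are stored with `SaveReferenceAsync` and queried with `SearchAsync`. A minimal sketch distilled from the full example below (same collection name and sample query; not a standalone program):
+
+```cs
+// Sketch only: store one reference and query it back with a natural-language question
+await memory.SaveReferenceAsync(
+    collection: "SKGitHub",
+    externalSourceName: "GitHub",
+    externalId: "https://github.com/microsoft/semantic-kernel/blob/main/README.md",
+    description: "README: Installation, getting started, and how to contribute",
+    text: "README: Installation, getting started, and how to contribute");
+
+await foreach (var result in memory.SearchAsync("SKGitHub", "How do I get started?", limit: 10, minRelevanceScore: 0.5))
+    Console.WriteLine($"{result.Metadata.Id} ({result.Relevance})");
+```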
```cs
using LLama.Common;
-using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.Memory;
using LLamaSharp.SemanticKernel.TextEmbedding;
-using Microsoft.SemanticKernel.AI.Embeddings;
-using Microsoft.SemanticKernel.Plugins.Memory;
-
-public class SemanticKernelMemory
+namespace LLama.Examples.Examples
{
- private const string MemoryCollectionName = "SKGitHub";
-
- public static async Task Run()
+ public class SemanticKernelMemory
{
- var loggerFactory = ConsoleLogger.LoggerFactory;
- Console.WriteLine("Example from: https://github.com/microsoft/semantic-kernel/blob/main/dotnet/samples/KernelSyntaxExamples/Example14_SemanticMemory.cs");
- Console.Write("Please input your model path: ");
- var modelPath = Console.ReadLine();
-
- var seed = 1337u;
- // Load weights into memory
- var parameters = new ModelParams(modelPath)
- {
- Seed = seed,
- EmbeddingMode = true
- };
-
- using var model = LLamaWeights.LoadFromFile(parameters);
- var embedding = new LLamaEmbedder(model, parameters);
-
- Console.WriteLine("====================================================");
- Console.WriteLine("======== Semantic Memory (volatile, in RAM) ========");
- Console.WriteLine("====================================================");
-
- /* You can build your own semantic memory combining an Embedding Generator
- * with a Memory storage that supports search by similarity (ie semantic search).
- *
- * In this example we use a volatile memory, a local simulation of a vector DB.
- *
- * You can replace VolatileMemoryStore with Qdrant (see QdrantMemoryStore connector)
- * or implement your connectors for Pinecone, Vespa, Postgres + pgvector, SQLite VSS, etc.
- */
+ private const string MemoryCollectionName = "SKGitHub";
- var memory = new MemoryBuilder()
- .WithTextEmbeddingGeneration(new LLamaSharpEmbeddingGeneration(embedding))
- .WithMemoryStore(new VolatileMemoryStore())
- .Build();
+ public static async Task Run()
+ {
+ string modelPath = UserSettings.GetModelPath();
+
+ Console.WriteLine("This example is from: \n" +
+ "https://github.com/microsoft/semantic-kernel/blob/main/dotnet/samples/KernelSyntaxExamples/Example14_SemanticMemory.cs");
+
+ var seed = 1337u;
+ // Load weights into memory
+ var parameters = new ModelParams(modelPath)
+ {
+ Seed = seed,
+ EmbeddingMode = true
+ };
+
+ using var model = LLamaWeights.LoadFromFile(parameters);
+ var embedding = new LLamaEmbedder(model, parameters);
+
+ Console.WriteLine("====================================================");
+ Console.WriteLine("======== Semantic Memory (volatile, in RAM) ========");
+ Console.WriteLine("====================================================");
+
+ /* You can build your own semantic memory combining an Embedding Generator
+ * with a Memory storage that supports search by similarity (ie semantic search).
+ *
+ * In this example we use a volatile memory, a local simulation of a vector DB.
+ *
+ * You can replace VolatileMemoryStore with Qdrant (see QdrantMemoryStore connector)
+ * or implement your connectors for Pinecone, Vespa, Postgres + pgvector, SQLite VSS, etc.
+ */
+
+ var memory = new MemoryBuilder()
+ .WithTextEmbeddingGeneration(new LLamaSharpEmbeddingGeneration(embedding))
+ .WithMemoryStore(new VolatileMemoryStore())
+ .Build();
+
+ await RunExampleAsync(memory);
+ }
- await RunExampleAsync(memory);
- }
+ private static async Task RunExampleAsync(ISemanticTextMemory memory)
+ {
+ await StoreMemoryAsync(memory);
- private static async Task RunExampleAsync(ISemanticTextMemory memory)
- {
- await StoreMemoryAsync(memory);
+ await SearchMemoryAsync(memory, "How do I get started?");
- await SearchMemoryAsync(memory, "How do I get started?");
+ /*
+ Output:
- /*
- Output:
+ Query: How do I get started?
- Query: How do I get started?
+ Result 1:
+ URL: : https://github.com/microsoft/semantic-kernel/blob/main/README.md
+ Title : README: Installation, getting started, and how to contribute
- Result 1:
- URL: : https://github.com/microsoft/semantic-kernel/blob/main/README.md
- Title : README: Installation, getting started, and how to contribute
+ Result 2:
+ URL: : https://github.com/microsoft/semantic-kernel/blob/main/samples/dotnet-jupyter-notebooks/00-getting-started.ipynb
+ Title : Jupyter notebook describing how to get started with the Semantic Kernel
- Result 2:
- URL: : https://github.com/microsoft/semantic-kernel/blob/main/samples/dotnet-jupyter-notebooks/00-getting-started.ipynb
- Title : Jupyter notebook describing how to get started with the Semantic Kernel
+ */
- */
+ await SearchMemoryAsync(memory, "Can I build a chat with SK?");
- await SearchMemoryAsync(memory, "Can I build a chat with SK?");
+ /*
+ Output:
- /*
- Output:
+ Query: Can I build a chat with SK?
- Query: Can I build a chat with SK?
+ Result 1:
+ URL: : https://github.com/microsoft/semantic-kernel/tree/main/samples/skills/ChatSkill/ChatGPT
+ Title : Sample demonstrating how to create a chat skill interfacing with ChatGPT
- Result 1:
- URL: : https://github.com/microsoft/semantic-kernel/tree/main/samples/skills/ChatSkill/ChatGPT
- Title : Sample demonstrating how to create a chat skill interfacing with ChatGPT
+ Result 2:
+ URL: : https://github.com/microsoft/semantic-kernel/blob/main/samples/apps/chat-summary-webapp-react/README.md
+ Title : README: README associated with a sample chat summary react-based webapp
- Result 2:
- URL: : https://github.com/microsoft/semantic-kernel/blob/main/samples/apps/chat-summary-webapp-react/README.md
- Title : README: README associated with a sample chat summary react-based webapp
+ */
- */
+ await SearchMemoryAsync(memory, "Jupyter notebook");
- await SearchMemoryAsync(memory, "Jupyter notebook");
+ await SearchMemoryAsync(memory, "README: README associated with a sample chat summary react-based webapp");
- await SearchMemoryAsync(memory, "README: README associated with a sample chat summary react-based webapp");
+ await SearchMemoryAsync(memory, "Jupyter notebook describing how to pass prompts from a file to a semantic skill or function");
+ }
- await SearchMemoryAsync(memory, "Jupyter notebook describing how to pass prompts from a file to a semantic skill or function");
- }
+ private static async Task SearchMemoryAsync(ISemanticTextMemory memory, string query)
+ {
+ Console.WriteLine("\nQuery: " + query + "\n");
- private static async Task SearchMemoryAsync(ISemanticTextMemory memory, string query)
- {
- Console.WriteLine("\nQuery: " + query + "\n");
+ var memories = memory.SearchAsync(MemoryCollectionName, query, limit: 10, minRelevanceScore: 0.5);
- var memories = memory.SearchAsync(MemoryCollectionName, query, limit: 10, minRelevanceScore: 0.5);
+ int i = 0;
+ await foreach (MemoryQueryResult result in memories)
+ {
+ Console.WriteLine($"Result {++i}:");
+ Console.WriteLine(" URL: : " + result.Metadata.Id);
+ Console.WriteLine(" Title : " + result.Metadata.Description);
+ Console.WriteLine(" Relevance: " + result.Relevance);
+ Console.WriteLine();
+ }
- int i = 0;
- await foreach (MemoryQueryResult result in memories)
- {
- Console.WriteLine($"Result {++i}:");
- Console.WriteLine(" URL: : " + result.Metadata.Id);
- Console.WriteLine(" Title : " + result.Metadata.Description);
- Console.WriteLine(" Relevance: " + result.Relevance);
- Console.WriteLine();
+ Console.WriteLine("----------------------");
}
- Console.WriteLine("----------------------");
- }
-
- private static async Task StoreMemoryAsync(ISemanticTextMemory memory)
- {
- /* Store some data in the semantic memory.
- *
- * When using Azure Cognitive Search the data is automatically indexed on write.
- *
- * When using the combination of VolatileStore and Embedding generation, SK takes
- * care of creating and storing the index
- */
-
- Console.WriteLine("\nAdding some GitHub file URLs and their descriptions to the semantic memory.");
- var githubFiles = SampleData();
- var i = 0;
- foreach (var entry in githubFiles)
+ private static async Task StoreMemoryAsync(ISemanticTextMemory memory)
{
- var result = await memory.SaveReferenceAsync(
- collection: MemoryCollectionName,
- externalSourceName: "GitHub",
- externalId: entry.Key,
- description: entry.Value,
- text: entry.Value);
-
- Console.WriteLine($"#{++i} saved.");
- Console.WriteLine(result);
+ /* Store some data in the semantic memory.
+ *
+ * When using Azure Cognitive Search the data is automatically indexed on write.
+ *
+ * When using the combination of VolatileStore and Embedding generation, SK takes
+ * care of creating and storing the index
+ */
+
+ Console.WriteLine("\nAdding some GitHub file URLs and their descriptions to the semantic memory.");
+ var githubFiles = SampleData();
+ var i = 0;
+ foreach (var entry in githubFiles)
+ {
+ var result = await memory.SaveReferenceAsync(
+ collection: MemoryCollectionName,
+ externalSourceName: "GitHub",
+ externalId: entry.Key,
+ description: entry.Value,
+ text: entry.Value);
+
+ Console.WriteLine($"#{++i} saved.");
+ Console.WriteLine(result);
+ }
+
+ Console.WriteLine("\n----------------------");
}
- Console.WriteLine("\n----------------------");
- }
-
- private static Dictionary<string, string> SampleData()
- {
- return new Dictionary<string, string>
+ private static Dictionary<string, string> SampleData()
{
- ["https://github.com/microsoft/semantic-kernel/blob/main/README.md"]
- = "README: Installation, getting started, and how to contribute",
- ["https://github.com/microsoft/semantic-kernel/blob/main/dotnet/notebooks/02-running-prompts-from-file.ipynb"]
- = "Jupyter notebook describing how to pass prompts from a file to a semantic skill or function",
- ["https://github.com/microsoft/semantic-kernel/blob/main/dotnet/notebooks//00-getting-started.ipynb"]
- = "Jupyter notebook describing how to get started with the Semantic Kernel",
- ["https://github.com/microsoft/semantic-kernel/tree/main/samples/skills/ChatSkill/ChatGPT"]
- = "Sample demonstrating how to create a chat skill interfacing with ChatGPT",
- ["https://github.com/microsoft/semantic-kernel/blob/main/dotnet/src/SemanticKernel/Memory/VolatileMemoryStore.cs"]
- = "C# class that defines a volatile embedding store",
- ["https://github.com/microsoft/semantic-kernel/blob/main/samples/dotnet/KernelHttpServer/README.md"]
- = "README: How to set up a Semantic Kernel Service API using Azure Function Runtime v4",
- ["https://github.com/microsoft/semantic-kernel/blob/main/samples/apps/chat-summary-webapp-react/README.md"]
- = "README: README associated with a sample chat summary react-based webapp",
- };
+ return new Dictionary<string, string>
+ {
+ ["https://github.com/microsoft/semantic-kernel/blob/main/README.md"]
+ = "README: Installation, getting started, and how to contribute",
+ ["https://github.com/microsoft/semantic-kernel/blob/main/dotnet/notebooks/02-running-prompts-from-file.ipynb"]
+ = "Jupyter notebook describing how to pass prompts from a file to a semantic skill or function",
+ ["https://github.com/microsoft/semantic-kernel/blob/main/dotnet/notebooks//00-getting-started.ipynb"]
+ = "Jupyter notebook describing how to get started with the Semantic Kernel",
+ ["https://github.com/microsoft/semantic-kernel/tree/main/samples/skills/ChatSkill/ChatGPT"]
+ = "Sample demonstrating how to create a chat skill interfacing with ChatGPT",
+ ["https://github.com/microsoft/semantic-kernel/blob/main/dotnet/src/SemanticKernel/Memory/VolatileMemoryStore.cs"]
+ = "C# class that defines a volatile embedding store",
+ ["https://github.com/microsoft/semantic-kernel/blob/main/samples/dotnet/KernelHttpServer/README.md"]
+ = "README: How to set up a Semantic Kernel Service API using Azure Function Runtime v4",
+ ["https://github.com/microsoft/semantic-kernel/blob/main/samples/apps/chat-summary-webapp-react/README.md"]
+ = "README: README associated with a sample chat summary react-based webapp",
+ };
+ }
}
}
diff --git a/docs/Examples/SemanticKernelPrompt.md b/docs/Examples/SemanticKernelPrompt.md
index f7589a44..a57ec0ea 100644
--- a/docs/Examples/SemanticKernelPrompt.md
+++ b/docs/Examples/SemanticKernelPrompt.md
@@ -1,7 +1,6 @@
-# Semantic kernel mode
+# Semantic-kernel - basic
```cs
-using System.Security.Cryptography;
using LLama.Common;
using LLamaSharp.SemanticKernel.ChatCompletion;
using Microsoft.SemanticKernel;
@@ -9,47 +8,51 @@ using LLamaSharp.SemanticKernel.TextCompletion;
using Microsoft.SemanticKernel.TextGeneration;
using Microsoft.Extensions.DependencyInjection;
-
-public class SemanticKernelPrompt
+namespace LLama.Examples.Examples
{
- public static async Task Run()
+ // The basic example for using the semantic-kernel integration
+ public class SemanticKernelPrompt
{
- Console.WriteLine("Example from: https://github.com/microsoft/semantic-kernel/blob/main/dotnet/README.md");
- Console.Write("Please input your model path: ");
- var modelPath = Console.ReadLine();
+ public static async Task Run()
+ {
+ string modelPath = UserSettings.GetModelPath();
+
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine("This example is from: " +
+ "https://github.com/microsoft/semantic-kernel/blob/main/dotnet/README.md");
- // Load weights into memory
- var parameters = new ModelParams(modelPath);
- using var model = LLamaWeights.LoadFromFile(parameters);
- var ex = new StatelessExecutor(model, parameters);
+ // Load weights into memory
+ var parameters = new ModelParams(modelPath);
+ using var model = LLamaWeights.LoadFromFile(parameters);
+ var ex = new StatelessExecutor(model, parameters);
- var builder = Kernel.CreateBuilder();
- builder.Services.AddKeyedSingleton("local-llama", new LLamaSharpTextCompletion(ex));
+ var builder = Kernel.CreateBuilder();
+ builder.Services.AddKeyedSingleton<ITextGenerationService>("local-llama", new LLamaSharpTextCompletion(ex));
- var kernel = builder.Build();
+ var kernel = builder.Build();
- var prompt = @"{{$input}}
+ var prompt = @"{{$input}}
One line TLDR with the fewest words.";
- ChatRequestSettings settings = new() { MaxTokens = 100 };
- var summarize = kernel.CreateFunctionFromPrompt(prompt, settings);
+ ChatRequestSettings settings = new() { MaxTokens = 100 };
+ var summarize = kernel.CreateFunctionFromPrompt(prompt, settings);
- string text1 = @"
+ string text1 = @"
1st Law of Thermodynamics - Energy cannot be created or destroyed.
2nd Law of Thermodynamics - For a spontaneous process, the entropy of the universe increases.
3rd Law of Thermodynamics - A perfect crystal at zero Kelvin has zero entropy.";
- string text2 = @"
+ string text2 = @"
1. An object at rest remains at rest, and an object in motion remains in motion at constant speed and in a straight line unless acted on by an unbalanced force.
2. The acceleration of an object depends on the mass of the object and the amount of force applied.
3. Whenever one object exerts a force on another object, the second object exerts an equal and opposite on the first.";
- Console.WriteLine((await kernel.InvokeAsync(summarize, new() { ["input"] = text1 })).GetValue());
+ Console.WriteLine((await kernel.InvokeAsync(summarize, new() { ["input"] = text1 })).GetValue());
- Console.WriteLine((await kernel.InvokeAsync(summarize, new() { ["input"] = text2 })).GetValue());
+ Console.WriteLine((await kernel.InvokeAsync(summarize, new() { ["input"] = text2 })).GetValue());
+ }
}
}
-
```
\ No newline at end of file
diff --git a/docs/Examples/StatelessModeExecute.md b/docs/Examples/StatelessModeExecute.md
index e41ca6b7..ad512e7e 100644
--- a/docs/Examples/StatelessModeExecute.md
+++ b/docs/Examples/StatelessModeExecute.md
@@ -1,44 +1,51 @@
-ο»Ώ# Use stateless executor
+# Stateless executor
```cs
using LLama.Common;
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Threading.Tasks;
+using LLama.Examples.Extensions;
-public class StatelessModeExecute
+namespace LLama.Examples.Examples
{
- public static void Run()
+ // Basic usage of the stateless executor.
+ public class StatelessModeExecute
{
- Console.Write("Please input your model path: ");
- string modelPath = Console.ReadLine();
+ public static async Task Run()
+ {
+ string modelPath = UserSettings.GetModelPath();
- StatelessExecutor ex = new(new LLamaModel(new ModelParams(modelPath, contextSize: 256)));
+ var parameters = new ModelParams(modelPath)
+ {
+ ContextSize = 1024,
+ Seed = 1337,
+ GpuLayerCount = 5
+ };
+ using var model = LLamaWeights.LoadFromFile(parameters);
+ var ex = new StatelessExecutor(model, parameters);
- Console.ForegroundColor = ConsoleColor.Yellow;
- Console.WriteLine("The executor has been enabled. In this example, the inference is an one-time job. That says, the previous input and response has " +
- "no impact on the current response. Now you can ask it questions. Note that in this example, no prompt was set for LLM and the maximum response tokens is 50. " +
- "It may not perform well because of lack of prompt. This is also an example that could indicate the improtance of prompt in LLM. To improve it, you can add " +
- "a prompt for it yourself!");
- Console.ForegroundColor = ConsoleColor.White;
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine("The executor has been enabled. In this example, the inference is an one-time job. That says, the previous input and response has " +
+ "no impact on the current response. Now you can ask it questions. Note that in this example, no prompt was set for LLM and the maximum response tokens is 50. " +
+ "It may not perform well because of lack of prompt. This is also an example that could indicate the importance of prompt in LLM. To improve it, you can add " +
+ "a prompt for it yourself!");
+ Console.ForegroundColor = ConsoleColor.White;
- var inferenceParams = new InferenceParams() { Temperature = 0.6f, AntiPrompts = new List { "Question:", "#", "Question: ", ".\n" }, MaxTokens = 50 };
+ var inferenceParams = new InferenceParams() { Temperature = 0.6f, AntiPrompts = new List<string> { "Question:", "#", "Question: ", ".\n" }, MaxTokens = 50 };
- while (true)
- {
- Console.Write("\nQuestion: ");
- Console.ForegroundColor = ConsoleColor.Green;
- string prompt = Console.ReadLine();
- Console.ForegroundColor = ConsoleColor.White;
- Console.Write("Answer: ");
- prompt = $"Question: {prompt.Trim()} Answer: ";
- foreach (var text in ex.Infer(prompt, inferenceParams))
+ while (true)
{
- Console.Write(text);
+ Console.Write("\nQuestion: ");
+ Console.ForegroundColor = ConsoleColor.Green;
+ var prompt = Console.ReadLine();
+ Console.ForegroundColor = ConsoleColor.White;
+ Console.Write("Answer: ");
+ prompt = $"Question: {prompt?.Trim()} Answer: ";
+ await foreach (var text in ex.InferAsync(prompt, inferenceParams).Spinner())
+ {
+ Console.Write(text);
+ }
}
}
}
}
+
```
\ No newline at end of file
diff --git a/docs/Examples/TalkToYourself.md b/docs/Examples/TalkToYourself.md
index a45676b1..6099ad49 100644
--- a/docs/Examples/TalkToYourself.md
+++ b/docs/Examples/TalkToYourself.md
@@ -1,72 +1,74 @@
# Talk to yourself
```cs
-using System.Security.Cryptography;
using System.Text;
using LLama.Abstractions;
using LLama.Common;
-public class TalkToYourself
+namespace LLama.Examples.Examples
{
- public static async Task Run()
+ // Let two bots chat with each other.
+ public class TalkToYourself
{
- Console.Write("Please input your model path: ");
- var modelPath = Console.ReadLine();
+ public static async Task Run()
+ {
+ string modelPath = UserSettings.GetModelPath();
- // Load weights into memory
- var @params = new ModelParams(modelPath);
- using var weights = LLamaWeights.LoadFromFile(@params);
+ // Load weights into memory
+ var @params = new ModelParams(modelPath);
+ using var weights = LLamaWeights.LoadFromFile(@params);
- // Create 2 contexts sharing the same weights
- using var aliceCtx = weights.CreateContext(@params);
- var alice = new InteractiveExecutor(aliceCtx);
- using var bobCtx = weights.CreateContext(@params);
- var bob = new InteractiveExecutor(bobCtx);
+ // Create 2 contexts sharing the same weights
+ using var aliceCtx = weights.CreateContext(@params);
+ var alice = new InteractiveExecutor(aliceCtx);
+ using var bobCtx = weights.CreateContext(@params);
+ var bob = new InteractiveExecutor(bobCtx);
- // Initial alice prompt
- var alicePrompt = "Transcript of a dialog, where the Alice interacts a person named Bob. Alice is friendly, kind, honest and good at writing.\nAlice: Hello";
- var aliceResponse = await Prompt(alice, ConsoleColor.Green, alicePrompt, false, false);
+ // Initial alice prompt
+ var alicePrompt = "Transcript of a dialog, where Alice interacts with a person named Bob. Alice is friendly, kind, honest and good at writing.\nAlice: Hello";
+ var aliceResponse = await Prompt(alice, ConsoleColor.Green, alicePrompt, false, false);
- // Initial bob prompt
- var bobPrompt = $"Transcript of a dialog, where the Bob interacts a person named Alice. Bob is smart, intellectual and good at writing.\nAlice: Hello{aliceResponse}";
- var bobResponse = await Prompt(bob, ConsoleColor.Red, bobPrompt, true, true);
+ // Initial bob prompt
+ var bobPrompt = $"Transcript of a dialog, where Bob interacts with a person named Alice. Bob is smart, intellectual and good at writing.\nAlice: Hello{aliceResponse}";
+ var bobResponse = await Prompt(bob, ConsoleColor.Red, bobPrompt, true, true);
- // swap back and forth from Alice to Bob
- while (true)
- {
- aliceResponse = await Prompt(alice, ConsoleColor.Green, bobResponse, false, true);
- bobResponse = await Prompt(bob, ConsoleColor.Red, aliceResponse, false, true);
+ // swap back and forth from Alice to Bob
+ while (true)
+ {
+ aliceResponse = await Prompt(alice, ConsoleColor.Green, bobResponse, false, true);
+ bobResponse = await Prompt(bob, ConsoleColor.Red, aliceResponse, false, true);
- if (Console.KeyAvailable)
- break;
+ if (Console.KeyAvailable)
+ break;
+ }
}
- }
- private static async Task Prompt(ILLamaExecutor executor, ConsoleColor color, string prompt, bool showPrompt, bool showResponse)
- {
- var inferenceParams = new InferenceParams
+ private static async Task<string> Prompt(ILLamaExecutor executor, ConsoleColor color, string prompt, bool showPrompt, bool showResponse)
{
- Temperature = 0.9f,
- AntiPrompts = new List { "Alice:", "Bob:", "User:" },
- MaxTokens = 128,
- Mirostat = MirostatType.Mirostat2,
- MirostatTau = 10,
- };
+ var inferenceParams = new InferenceParams
+ {
+ Temperature = 0.9f,
+ AntiPrompts = new List<string> { "Alice:", "Bob:", "User:" },
+ MaxTokens = 128,
+ Mirostat = MirostatType.Mirostat2,
+ MirostatTau = 10,
+ };
- Console.ForegroundColor = ConsoleColor.White;
- if (showPrompt)
- Console.Write(prompt);
+ Console.ForegroundColor = ConsoleColor.White;
+ if (showPrompt)
+ Console.Write(prompt);
- Console.ForegroundColor = color;
- var builder = new StringBuilder();
- await foreach (var text in executor.InferAsync(prompt, inferenceParams))
- {
- builder.Append(text);
- if (showResponse)
- Console.Write(text);
- }
+ Console.ForegroundColor = color;
+ var builder = new StringBuilder();
+ await foreach (var text in executor.InferAsync(prompt, inferenceParams))
+ {
+ builder.Append(text);
+ if (showResponse)
+ Console.Write(text);
+ }
- return builder.ToString();
+ return builder.ToString();
+ }
}
}
diff --git a/docs/FAQ.md b/docs/FAQ.md
new file mode 100644
index 00000000..6b5781fb
--- /dev/null
+++ b/docs/FAQ.md
@@ -0,0 +1,64 @@
+# Frequently asked questions
+
+Sometimes, your application built with an LLM and LLamaSharp may behave unexpectedly. Here are some frequently asked questions which may help you deal with the problem.
+
+## Why is the GPU not used when I have installed CUDA
+
+1. If you are using backend packages, please make sure you have installed the CUDA backend package which matches the CUDA version of your device. Please note that before LLamaSharp v0.10.0, only one backend package should be installed.
+2. Add `NativeLibraryConfig.Instance.WithLogs(LLamaLogLevel.Info)` to the very beginning of your code (see the sketch below). The log will show which native library file is loaded. If the CPU library is loaded, please try to compile the native library yourself and open an issue about it. If the CUDA library is loaded, please check whether `GpuLayerCount > 0` when loading the model weights.
+
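+For example, a minimal sketch of this check could look like the snippet below (the `using` directives and the exact layer count are assumptions; adjust them to your project):
+
+```cs
+using LLama;
+using LLama.Common;
+using LLama.Native;
+
+// Call this before any model is loaded, so that the log shows which
+// native library file (CPU or CUDA) is actually picked up.
+NativeLibraryConfig.Instance.WithLogs(LLamaLogLevel.Info);
+
+var parameters = new ModelParams(@"<your model path>")
+{
+    GpuLayerCount = 32 // must be > 0, otherwise no layers are offloaded to the GPU
+};
+using var model = LLamaWeights.LoadFromFile(parameters);
+```
+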
+## Why is the inference slow
+
+Firstly, due to the large size of LLM models, generating outputs requires more time than with other kinds of models, especially when you are using models larger than 30B.
+
+To see if that's a LLamaSharp performance issue, please follow the two tips below.
+
+1. If you are using CUDA, Metal or OpenCL, please set `GpuLayerCount` as large as possible.
+2. If it's still slower than you expect, please try to run the same model with the same settings in the [llama.cpp examples](https://github.com/ggerganov/llama.cpp/tree/master/examples). If llama.cpp outperforms LLamaSharp significantly, it's likely a LLamaSharp bug; please report it to us.
+
+
+## Why does the program crash before any output is generated
+
+Generally, there are two possible cases for this problem:
+
+1. The native library (backend) you are using is not compatible with the LLamaSharp version. If you compiled the native library yourself, please make sure you have checked out llama.cpp at the commit corresponding to your LLamaSharp version, which can be found at the bottom of the README.
+2. The model file you are using is not compatible with the backend. If you are using a GGUF file downloaded from huggingface, please check its publishing time.
+
+
+## Why is my model generating output infinitely
+
+Please set an anti-prompt or a maximum token count when executing the inference; a minimal sketch is shown at the end of this section.
+
+An anti-prompt, also called a "stop keyword", decides when to stop the response generation. Under interactive mode, the maximum token count is often left unset, which lets the LLM generate responses indefinitely. Therefore, setting the anti-prompt correctly helps a lot to avoid strange behaviours. For example, the prompt file `chat-with-bob.txt` has the following content:
+
+```
+Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
+
+User: Hello, Bob.
+Bob: Hello. How may I help you today?
+User: Please tell me the largest city in Europe.
+Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
+User:
+```
+
+Therefore, the anti-prompt should be set to "User:". If the last line of the prompt were removed, the LLM would automatically generate a question (as the user) and a response (as Bob) once when the chat session starts. For this reason, it is suggested to append the anti-prompt to the prompt when starting a chat session.
+
+What if an extra line is appended? The string "User:" in the prompt will then be followed by a "\n" character. Thus, when running the model, the automatic generation of a question-and-response pair may still appear, because the anti-prompt is "User:" but the last token is "User:\n". Whether it appears is undefined behaviour, depending on the implementation inside the `LLamaExecutor`. Since it may lead to unexpected behaviours, it's recommended to trim your prompt and keep it carefully consistent with your anti-prompt.
+
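+As a minimal sketch (the values below are only examples), both guards can be set on `InferenceParams`:
+
+```cs
+using LLama.Common;
+
+// Stop once the model emits the anti-prompt, and never generate more than 256 new tokens.
+var inferenceParams = new InferenceParams
+{
+    AntiPrompts = new List<string> { "User:" },
+    MaxTokens = 256
+};
+```
+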
+## How to run LLM with non-English languages
+
+English is the most popular language in the world, and also in the realm of LLMs. If you want to accept inputs and generate outputs in other languages, please follow the tips below.
+
+1. Ensure the model you selected is well-trained on data in your language. For example, the original [LLaMA](https://github.com/meta-llama/llama) used little Chinese text during pretraining, while [Chinese-LLaMA-Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) fine-tuned LLaMA with a large amount of Chinese text. Therefore, the quality of the Chinese output of Chinese-LLaMA-Alpaca is much better than that of LLaMA.
+
+## Pay attention to the length of prompt
+
+Sometimes we want to input a long prompt to execute a task. However, the context size may limit the inference of the LLaMA model. Please ensure the inequality below holds.
+
+$$ len(prompt) + len(response) < len(context) $$
+
+In this inequality, `len(response)` refers to the number of tokens you expect the LLM to generate. A rough check is sketched below.
+
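+As a rough check, you can tokenize the prompt before running the inference and compare the token count with the context size. The sketch below assumes a `Tokenize` method on `LLamaContext` and uses example numbers:
+
+```cs
+using LLama;
+using LLama.Common;
+
+const int contextSize = 1024;
+string prompt = "<your long prompt>";
+
+var parameters = new ModelParams(@"<your model path>") { ContextSize = contextSize };
+using var model = LLamaWeights.LoadFromFile(parameters);
+using var context = model.CreateContext(parameters);
+
+// Leave room for the tokens you expect the model to generate (e.g. your MaxTokens value).
+int promptTokens = context.Tokenize(prompt).Length;
+int expectedResponseTokens = 256;
+if (promptTokens + expectedResponseTokens >= contextSize)
+    Console.WriteLine("The prompt is too long for the configured context size.");
+```
+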
+## Choose model weights depending on your task
+
+The differences between models may lead to very different behaviours on the same task. For example, if you're building a chat bot for a non-English language, a model fine-tuned specifically for that language will have a huge effect on the performance.
diff --git a/docs/GetStarted.md b/docs/GetStarted.md
deleted file mode 100644
index 8c0ff835..00000000
--- a/docs/GetStarted.md
+++ /dev/null
@@ -1,118 +0,0 @@
-# Get Started
-
-## Install packages
-
-Firstly, search `LLamaSharp` in nuget package manager and install it.
-
-```
-PM> Install-Package LLamaSharp
-```
-
-Then, search and install one of the following backends:
-
-```
-LLamaSharp.Backend.Cpu
-LLamaSharp.Backend.Cuda11
-LLamaSharp.Backend.Cuda12
-```
-
-Here's the mapping of them and corresponding model samples provided by `LLamaSharp`. If you're not sure which model is available for a version, please try our sample model.
-
-| LLamaSharp.Backend | LLamaSharp | Verified Model Resources | llama.cpp commit id |
-| - | - | -- | - |
-| - | v0.2.0 | This version is not recommended to use. | - |
-| - | v0.2.1 | [WizardLM](https://huggingface.co/TheBloke/wizardLM-7B-GGML/tree/previous_llama), [Vicuna (filenames with "old")](https://huggingface.co/eachadea/ggml-vicuna-13b-1.1/tree/main) | - |
-| v0.2.2 | v0.2.2, v0.2.3 | [WizardLM](https://huggingface.co/TheBloke/wizardLM-7B-GGML/tree/previous_llama_ggmlv2), [Vicuna (filenames without "old")](https://huggingface.co/eachadea/ggml-vicuna-13b-1.1/tree/main) | 63d2046 |
-| v0.3.0 | v0.3.0 | [LLamaSharpSamples v0.3.0](https://huggingface.co/AsakusaRinne/LLamaSharpSamples/tree/v0.3.0), [WizardLM](https://huggingface.co/TheBloke/wizardLM-7B-GGML/tree/main) | 7e4ea5b |
-
-
-## Download a model
-
-One of the following models could be okay:
-
-- LLaMA π¦
-- [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
-- [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
-- [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
-- [Vigogne (French)](https://github.com/bofenghuang/vigogne)
-- [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
-- [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
-- [OpenBuddy πΆ (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
-- [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
-- [WizardLM](https://github.com/nlpxucan/WizardLM)
-
-**Note that because `llama.cpp` is under fast development now and often introduce break changes, some model weights on huggingface which works under a version may be invalid with another version. If it's your first time to configure LLamaSharp, we'd like to suggest for using verified model weights in the table above.**
-
-## Run the program
-
-Please create a console program with dotnet runtime >= netstandard 2.0 (>= net6.0 is more recommended). Then, paste the following code to `program.cs`;
-
-```cs
-using LLama.Common;
-using LLama;
-
-string modelPath = "" // change it to your own model path
-var prompt = "Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\r\n\r\nUser: Hello, Bob.\r\nBob: Hello. How may I help you today?\r\nUser: Please tell me the largest city in Europe.\r\nBob: Sure. The largest city in Europe is Moscow, the capital of Russia.\r\nUser:"; // use the "chat-with-bob" prompt here.
-
-// Load model
-var parameters = new ModelParams(modelPath)
-{
- ContextSize = 1024
-};
-using var model = LLamaWeights.LoadFromFile(parameters);
-
-// Initialize a chat session
-using var context = model.CreateContext(parameters);
-var ex = new InteractiveExecutor(context);
-ChatSession session = new ChatSession(ex);
-
-// show the prompt
-Console.WriteLine();
-Console.Write(prompt);
-
-// run the inference in a loop to chat with LLM
-while (true)
-{
- await foreach (var text in session.ChatAsync(prompt, new InferenceParams() { Temperature = 0.6f, AntiPrompts = new List { "User:" } }))
- {
- Console.Write(text);
- }
-
- Console.ForegroundColor = ConsoleColor.Green;
- prompt = Console.ReadLine();
- Console.ForegroundColor = ConsoleColor.White;
-}
-```
-
-After starting it, you'll see the following outputs.
-
-```
-Please input your model path: D:\development\llama\weights\wizard-vicuna-13B.ggmlv3.q4_1.bin
-llama.cpp: loading model from D:\development\llama\weights\wizard-vicuna-13B.ggmlv3.q4_1.bin
-llama_model_load_internal: format = ggjt v3 (latest)
-llama_model_load_internal: n_vocab = 32000
-llama_model_load_internal: n_ctx = 1024
-llama_model_load_internal: n_embd = 5120
-llama_model_load_internal: n_mult = 256
-llama_model_load_internal: n_head = 40
-llama_model_load_internal: n_layer = 40
-llama_model_load_internal: n_rot = 128
-llama_model_load_internal: ftype = 3 (mostly Q4_1)
-llama_model_load_internal: n_ff = 13824
-llama_model_load_internal: n_parts = 1
-llama_model_load_internal: model size = 13B
-llama_model_load_internal: ggml ctx size = 7759.48 MB
-llama_model_load_internal: mem required = 9807.48 MB (+ 1608.00 MB per state)
-....................................................................................................
-llama_init_from_file: kv self size = 800.00 MB
-
-Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
-
-User: Hello, Bob.
-Bob: Hello. How may I help you today?
-User: Please tell me the largest city in Europe.
-Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
-User:
-```
-
-Now, enjoy chatting with LLM!
\ No newline at end of file
diff --git a/docs/HighLevelApps/semantic-kernel.md b/docs/HighLevelApps/semantic-kernel.md
deleted file mode 100644
index b6ebe65c..00000000
--- a/docs/HighLevelApps/semantic-kernel.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# The Usage of semantic-kernel Integration
-
-Please see [this doc](../../LLama.SemanticKernel/README.md)
\ No newline at end of file
diff --git a/docs/HighLevelApps/bot-sharp.md b/docs/Integrations/bot-sharp.md
similarity index 56%
rename from docs/HighLevelApps/bot-sharp.md
rename to docs/Integrations/bot-sharp.md
index 7dfbb28a..9e949645 100644
--- a/docs/HighLevelApps/bot-sharp.md
+++ b/docs/Integrations/bot-sharp.md
@@ -1,3 +1,3 @@
-# The Usage of BotSharp Integration
+# BotSharp integration
The document is under work, please have a wait. Thank you for your support! :)
\ No newline at end of file
diff --git a/docs/Integrations/kernel-memory.md b/docs/Integrations/kernel-memory.md
new file mode 100644
index 00000000..4966871a
--- /dev/null
+++ b/docs/Integrations/kernel-memory.md
@@ -0,0 +1,3 @@
+# LLamaSharp.kernel-memory
+
+This document is still a work in progress. Thank you for your support! :)
\ No newline at end of file
diff --git a/docs/Integrations/langchain.md b/docs/Integrations/langchain.md
new file mode 100644
index 00000000..b3d8940e
--- /dev/null
+++ b/docs/Integrations/langchain.md
@@ -0,0 +1,3 @@
+# Langchain integration
+
+This document is still a work in progress. Thank you for your support! :)
\ No newline at end of file
diff --git a/docs/Integrations/semantic-kernel.md b/docs/Integrations/semantic-kernel.md
new file mode 100644
index 00000000..907a9912
--- /dev/null
+++ b/docs/Integrations/semantic-kernel.md
@@ -0,0 +1,38 @@
+# LLamaSharp.SemanticKernel
+
+LLamaSharp.SemanticKernel provides connections to [SemanticKernel](https://github.com/microsoft/semantic-kernel): an SDK for integrating various LLM interfaces into a single implementation. With this, you can add local LLaMA queries as another connection point alongside your existing connections.
+
+For reference on how to implement it, view the following examples:
+
+- [SemanticKernelChat](../../LLama.Examples/Examples/SemanticKernelChat.cs)
+- [SemanticKernelPrompt](../../LLama.Examples/Examples/SemanticKernelPrompt.cs)
+- [SemanticKernelMemory](../../LLama.Examples/Examples/SemanticKernelMemory.cs)
+
+## ITextCompletion
+```csharp
+using var model = LLamaWeights.LoadFromFile(parameters);
+// LLamaSharpTextCompletion can accept ILLamaExecutor.
+var ex = new StatelessExecutor(model, parameters);
+var builder = new KernelBuilder();
+builder.WithAIService<ITextCompletion>("local-llama", new LLamaSharpTextCompletion(ex), true);
+```
+
+## IChatCompletion
+```csharp
+using var model = LLamaWeights.LoadFromFile(parameters);
+using var context = model.CreateContext(parameters);
+// LLamaSharpChatCompletion requires InteractiveExecutor, as it's the best fit for the given command.
+var ex = new InteractiveExecutor(context);
+var chatGPT = new LLamaSharpChatCompletion(ex);
+```
+
+## ITextEmbeddingGeneration
+```csharp
+using var model = LLamaWeights.LoadFromFile(parameters);
+var embedding = new LLamaEmbedder(model, parameters);
+var kernelWithCustomDb = Kernel.Builder
+ .WithLoggerFactory(ConsoleLogger.LoggerFactory)
+ .WithAIService<ITextEmbeddingGeneration>("local-llama-embed", new LLamaSharpEmbeddingGeneration(embedding), true)
+ .WithMemoryStorage(new VolatileMemoryStore())
+ .Build();
+```
diff --git a/docs/LLamaContext/parameters.md b/docs/LLamaContext/parameters.md
deleted file mode 100644
index ba94fb70..00000000
--- a/docs/LLamaContext/parameters.md
+++ /dev/null
@@ -1,208 +0,0 @@
-# LLamaModel Parameters
-
-When initializing a `LLamaModel` object, there're three parameters, `ModelParams Params, string encoding = "UTF-8", ILLamaLogger? logger = null`.
-
-The usage of `logger` will be further introduced in [logger doc](../More/log.md). The `encoding` is the encoding you want to use when dealing with text via this model.
-
-The most important of all, is the `ModelParams`, which is defined as below. We'll explain the parameters step by step in this document.
-
-```cs
-public class ModelParams
-{
- public int ContextSize { get; set; } = 512;
- public int GpuLayerCount { get; set; } = 20;
- public int Seed { get; set; } = 1686349486;
- public bool UseFp16Memory { get; set; } = true;
- public bool UseMemorymap { get; set; } = true;
- public bool UseMemoryLock { get; set; } = false;
- public bool Perplexity { get; set; } = false;
- public string ModelPath { get; set; }
- public string LoraAdapter { get; set; } = string.Empty;
- public string LoraBase { get; set; } = string.Empty;
- public int Threads { get; set; } = Math.Max(Environment.ProcessorCount / 2, 1);
- public int BatchSize { get; set; } = 512;
- public bool ConvertEosToNewLine { get; set; } = false;
-}
-```
-
-
-# ModelParams
-
-Namespace: LLama.Common
-
-```csharp
-public class ModelParams
-```
-
-Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) β [ModelParams]()
-
-## Properties
-
-### **ContextSize**
-
-Model context size (n_ctx)
-
-```csharp
-public int ContextSize { get; set; }
-```
-
-#### Property Value
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **GpuLayerCount**
-
-Number of layers to run in VRAM / GPU memory (n_gpu_layers)
-
-```csharp
-public int GpuLayerCount { get; set; }
-```
-
-#### Property Value
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **Seed**
-
-Seed for the random number generator (seed)
-
-```csharp
-public int Seed { get; set; }
-```
-
-#### Property Value
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **UseFp16Memory**
-
-Use f16 instead of f32 for memory kv (memory_f16)
-
-```csharp
-public bool UseFp16Memory { get; set; }
-```
-
-#### Property Value
-
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-
-### **UseMemorymap**
-
-Use mmap for faster loads (use_mmap)
-
-```csharp
-public bool UseMemorymap { get; set; }
-```
-
-#### Property Value
-
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-
-### **UseMemoryLock**
-
-Use mlock to keep model in memory (use_mlock)
-
-```csharp
-public bool UseMemoryLock { get; set; }
-```
-
-#### Property Value
-
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-
-### **Perplexity**
-
-Compute perplexity over the prompt (perplexity)
-
-```csharp
-public bool Perplexity { get; set; }
-```
-
-#### Property Value
-
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-
-### **ModelPath**
-
-Model path (model)
-
-```csharp
-public string ModelPath { get; set; }
-```
-
-#### Property Value
-
-[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **LoraAdapter**
-
-lora adapter path (lora_adapter)
-
-```csharp
-public string LoraAdapter { get; set; }
-```
-
-#### Property Value
-
-[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **LoraBase**
-
-base model path for the lora adapter (lora_base)
-
-```csharp
-public string LoraBase { get; set; }
-```
-
-#### Property Value
-
-[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **Threads**
-
-Number of threads (-1 = autodetect) (n_threads)
-
-```csharp
-public int Threads { get; set; }
-```
-
-#### Property Value
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **BatchSize**
-
-batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
-
-```csharp
-public int BatchSize { get; set; }
-```
-
-#### Property Value
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **ConvertEosToNewLine**
-
-Whether to convert eos to newline during the inference.
-
-```csharp
-public bool ConvertEosToNewLine { get; set; }
-```
-
-#### Property Value
-
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-
-### **EmbeddingMode**
-
-Whether to use embedding mode. (embedding) Note that if this is set to true,
- The LLamaModel won't produce text response anymore.
-
-```csharp
-public bool EmbeddingMode { get; set; }
-```
-
-#### Property Value
-
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
diff --git a/docs/LLamaContext/quantization.md b/docs/LLamaContext/quantization.md
deleted file mode 100644
index b3bf93b4..00000000
--- a/docs/LLamaContext/quantization.md
+++ /dev/null
@@ -1,23 +0,0 @@
-# Quantization
-
-Quantization is significant to accelerate the model inference. Since there's little accuracy (performance) reduction when quantizing the model, get it easy to quantize it!
-
-To quantize the model, please call `Quantize` from `LLamaQuantizer`, which is a static method.
-
-```cs
-string srcPath = "";
-string dstPath = "";
-LLamaQuantizer.Quantize(srcPath, dstPath, "q4_0");
-// The following overload is also okay.
-// LLamaQuantizer.Quantize(srcPath, dstPath, LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_0);
-```
-
-After calling it, a quantized model file will be saved.
-
-There're currently 5 types of quantization supported:
-
-- q4_0
-- q4_1
-- q5_0
-- q5_1
-- q8_0
\ No newline at end of file
diff --git a/docs/LLamaContext/save-load-state.md b/docs/LLamaContext/save-load-state.md
deleted file mode 100644
index 76cacc7c..00000000
--- a/docs/LLamaContext/save-load-state.md
+++ /dev/null
@@ -1,19 +0,0 @@
-# Save/Load State
-
-There're two ways to load state: loading from path and loading from bite array. Therefore, correspondingly, state data can be extracted as byte array or saved to a file.
-
-```cs
-LLamaModel model = new LLamaModel(new ModelParams(""));
-// do some things...
-model.SaveState("model.st");
-var stateData = model.GetStateData();
-model.Dispose();
-
-LLamaModel model2 = new LLamaModel(new ModelParams(""));
-model2.LoadState(stateData);
-// do some things...
-
-LLamaModel model3 = new LLamaModel(new ModelParams(""));
-model3.LoadState("model.st");
-// do some things...
-```
\ No newline at end of file
diff --git a/docs/LLamaContext/tokenization.md b/docs/LLamaContext/tokenization.md
deleted file mode 100644
index aadff33a..00000000
--- a/docs/LLamaContext/tokenization.md
+++ /dev/null
@@ -1,25 +0,0 @@
-# Tokenization/Detokenization
-
-A pair of APIs to make conversion between text and tokens.
-
-## Tokenization
-
-The basic usage is to call `Tokenize` after initializing the model.
-
-```cs
-LLamaModel model = new LLamaModel(new ModelParams(""));
-string text = "hello";
-int[] tokens = model.Tokenize(text).ToArray();
-```
-
-Depending on different model (or vocab), the output will be various.
-
-## Detokenization
-
-Similar to tokenization, just pass an `IEnumerable` to `Detokenize` method.
-
-```cs
-LLamaModel model = new LLamaModel(new ModelParams(""));
-int[] tokens = new int[] {125, 2568, 13245};
-string text = model.Detokenize(tokens);
-```
diff --git a/docs/LLamaExecutors/differences.md b/docs/LLamaExecutors/differences.md
deleted file mode 100644
index db85d278..00000000
--- a/docs/LLamaExecutors/differences.md
+++ /dev/null
@@ -1,69 +0,0 @@
-## Differences between the executors
-
-There're currently three kinds of executors provided, which are `InteractiveExecutor`, `InstructExecutor` and `StatelessExecutor`.
-
-In a word, `InteractiveExecutor` is suitable for getting answer of your questions from LLM continuously. `InstructExecutor` let LLM execute your instructions, such as "continue writing". `StatelessExecutor` is best for one-time job because the previous inference has no impact on the current inference.
-
-
-## Interactive mode & Instruct mode
-
-Both of them are taking "completing the prompt" as the goal to generate the response. For example, if you input `Long long ago, there was a fox who wanted to make friend with humen. One day`, then the LLM will continue to write the story.
-
-Under interactive mode, you serve a role of user and the LLM serves the role of assistant. Then it will help you with your question or request.
-
-Under instruct mode, you give LLM some instructions and it follows.
-
-Though the behaviors of them sounds similar, it could introduce many differences depending on your prompt. For example, "chat-with-bob" has good performance under interactive mode and `alpaca` does well with instruct mode.
-
-```
-// chat-with-bob
-
-Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
-
-User: Hello, Bob.
-Bob: Hello. How may I help you today?
-User: Please tell me the largest city in Europe.
-Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
-User:
-```
-
-```
-// alpaca
-
-Below is an instruction that describes a task. Write a response that appropriately completes the request.
-```
-
-Therefore, please modify the prompt correspondingly when switching from one mode to the other.
-
-## Stateful mode and Stateless mode.
-
-Despite the differences between interactive mode and instruct mode, both of them are stateful mode. That is, your previous question/instruction will impact on the current response from LLM. On the contrary, the stateless executor does not have such a "memory". No matter how many times you talk to it, it will only concentrate on what you say in this time.
-
-Since the stateless executor has no memory of conversations before, you need to input your question with the whole prompt into it to get the better answer.
-
-For example, if you feed `Q: Who is Trump? A: ` to the stateless executor, it may give the following answer with the antiprompt `Q: `.
-
-```
-Donald J. Trump, born June 14, 1946, is an American businessman, television personality, politician and the 45th President of the United States (2017-2021). # Anexo:Torneo de Hamburgo 2022 (individual masculino)
-
-## PresentaciΓ³n previa
-
-* Defensor del tΓtulo: Daniil MedvΓ©dev
-```
-
-It seems that things went well at first. However, after answering the question itself, LLM began to talk about some other things until the answer reached the token count limit. The reason of this strange behavior is the anti-prompt cannot be match. With the input, LLM cannot decide whether to append a string "A: " at the end of the response.
-
-As an improvement, let's take the following text as the input:
-
-```
-Q: What is the capital of the USA? A: Washingtong. Q: What is the sum of 1 and 2? A: 3. Q: Who is Trump? A:
-```
-
-Then, I got the following answer with the anti-prompt `Q: `.
-
-```
-45th president of the United States.
-```
-
-At this time, by repeating the same mode of `Q: xxx? A: xxx.`, LLM outputs the anti-prompt we want to help to decide where to stop the generation.
-
diff --git a/docs/LLamaExecutors/parameters.md b/docs/LLamaExecutors/parameters.md
deleted file mode 100644
index 47ef1951..00000000
--- a/docs/LLamaExecutors/parameters.md
+++ /dev/null
@@ -1,261 +0,0 @@
-# Inference Parameters
-
-Different from `LLamaModel`, when using an executor, `InferenceParams` is passed to the `Infer` method instead of constructor. This is because executors only define the ways to run the model, therefore in each run, you can change the settings for this time inference.
-
-
-# InferenceParams
-
-Namespace: LLama.Common
-
-```csharp
-public class InferenceParams
-```
-
-Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) β [InferenceParams]()
-
-## Properties
-
-### **TokensKeep**
-
-number of tokens to keep from initial prompt
-
-```csharp
-public int TokensKeep { get; set; }
-```
-
-#### Property Value
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **MaxTokens**
-
-how many new tokens to predict (n_predict), set to -1 to infinitely generate response
- until it complete.
-
-```csharp
-public int MaxTokens { get; set; }
-```
-
-#### Property Value
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **LogitBias**
-
-logit bias for specific tokens
-
-```csharp
-public Dictionary LogitBias { get; set; }
-```
-
-#### Property Value
-
-[Dictionary<Int32, Single>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.dictionary-2)
-
-### **AntiPrompts**
-
-Sequences where the model will stop generating further tokens.
-
-```csharp
-public IEnumerable AntiPrompts { get; set; }
-```
-
-#### Property Value
-
-[IEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
-
-### **PathSession**
-
-path to file for saving/loading model eval state
-
-```csharp
-public string PathSession { get; set; }
-```
-
-#### Property Value
-
-[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **InputSuffix**
-
-string to suffix user inputs with
-
-```csharp
-public string InputSuffix { get; set; }
-```
-
-#### Property Value
-
-[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **InputPrefix**
-
-string to prefix user inputs with
-
-```csharp
-public string InputPrefix { get; set; }
-```
-
-#### Property Value
-
-[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **TopK**
-
-0 or lower to use vocab size
-
-```csharp
-public int TopK { get; set; }
-```
-
-#### Property Value
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **TopP**
-
-1.0 = disabled
-
-```csharp
-public float TopP { get; set; }
-```
-
-#### Property Value
-
-[Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-
-### **TfsZ**
-
-1.0 = disabled
-
-```csharp
-public float TfsZ { get; set; }
-```
-
-#### Property Value
-
-[Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-
-### **TypicalP**
-
-1.0 = disabled
-
-```csharp
-public float TypicalP { get; set; }
-```
-
-#### Property Value
-
-[Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-
-### **Temperature**
-
-1.0 = disabled
-
-```csharp
-public float Temperature { get; set; }
-```
-
-#### Property Value
-
-[Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-
-### **RepeatPenalty**
-
-1.0 = disabled
-
-```csharp
-public float RepeatPenalty { get; set; }
-```
-
-#### Property Value
-
-[Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-
-### **RepeatLastTokensCount**
-
-last n tokens to penalize (0 = disable penalty, -1 = context size) (repeat_last_n)
-
-```csharp
-public int RepeatLastTokensCount { get; set; }
-```
-
-#### Property Value
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **FrequencyPenalty**
-
-frequency penalty coefficient
- 0.0 = disabled
-
-```csharp
-public float FrequencyPenalty { get; set; }
-```
-
-#### Property Value
-
-[Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-
-### **PresencePenalty**
-
-presence penalty coefficient
- 0.0 = disabled
-
-```csharp
-public float PresencePenalty { get; set; }
-```
-
-#### Property Value
-
-[Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-
-### **Mirostat**
-
-Mirostat uses tokens instead of words.
- algorithm described in the paper https://arxiv.org/abs/2007.14966.
- 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-
-```csharp
-public MiroStateType Mirostat { get; set; }
-```
-
-#### Property Value
-
-[MiroStateType]()
-
-### **MirostatTau**
-
-target entropy
-
-```csharp
-public float MirostatTau { get; set; }
-```
-
-#### Property Value
-
-[Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-
-### **MirostatEta**
-
-learning rate
-
-```csharp
-public float MirostatEta { get; set; }
-```
-
-#### Property Value
-
-[Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-
-### **PenalizeNL**
-
-consider newlines as a repeatable token (penalize_nl)
-
-```csharp
-public bool PenalizeNL { get; set; }
-```
-
-#### Property Value
-
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
\ No newline at end of file
diff --git a/docs/LLamaExecutors/save-load-state.md b/docs/LLamaExecutors/save-load-state.md
deleted file mode 100644
index 3af6a8c7..00000000
--- a/docs/LLamaExecutors/save-load-state.md
+++ /dev/null
@@ -1,27 +0,0 @@
-# Save/Load State of Executor
-
-Similar to `LLamaModel`, an executor also has its state, which can be saved and loaded. **Note that in most of cases, the state of executor and the state of the model should be loaded and saved at the same time.**
-
-To decouple the model and executor, we provide APIs to save/load state for model and executor respectively. However, during the inference, the processed information will leave footprint in `LLamaModel`'s native context. Therefore, if you just load a state from another executor but keep the model unmodified, some strange things may happen. So will loading model state only.
-
-Is there a condition that requires to load one of them only? The answer is YES. For example, after resetting the model state, if you don't want the inference starting from the new position, leaving the executor unmodified is okay. But, anyway, this flexible usage may cause some unexpected behaviors, therefore please ensure you know what you're doing before using it in this way.
-
-In the future version, we'll open the access for some variables inside the executor to support more flexible usages.
-
-The APIs to load/save state of the executors is similar to that of `LLamaModel`. However, note that `StatelessExecutor` doesn't have such APIs because it's stateless itself. Besides, the output of `GetStateData` is an object of type `ExecutorBaseState`.
-
-```cs
-LLamaModel model = new LLamaModel(new ModelParams(""));
-InteractiveExecutor executor = new InteractiveExecutor(model);
-// do some things...
-executor.SaveState("executor.st");
-var stateData = model.GetStateData();
-
-InteractiveExecutor executor2 = new InteractiveExecutor(model);
-executor2.LoadState(stateData);
-// do some things...
-
-InteractiveExecutor executor3 = new InteractiveExecutor(model);
-executor3.LoadState("executor.st");
-// do some things...
-```
\ No newline at end of file
diff --git a/docs/LLamaExecutors/text-to-text-apis.md b/docs/LLamaExecutors/text-to-text-apis.md
deleted file mode 100644
index eeb2440c..00000000
--- a/docs/LLamaExecutors/text-to-text-apis.md
+++ /dev/null
@@ -1,18 +0,0 @@
-# Text-to-Text APIs of the executors
-
-All the executors implements the interface `ILLamaExecutor`, which provides two APIs to execute text-to-text tasks.
-
-```cs
-public interface ILLamaExecutor
-{
- public LLamaModel Model { get; }
-
- IEnumerable Infer(string text, InferenceParams? inferenceParams = null, CancellationToken token = default);
-
- IAsyncEnumerable InferAsync(string text, InferenceParams? inferenceParams = null, CancellationToken token = default);
-}
-```
-
-Just pass the text to the executor with the inference parameters. For the inference parameters, please refer to [executor inference parameters doc](./parameters.md).
-
-The output of both two APIs are **yield enumerable**. Therefore, when receiving the output, you can directly use `foreach` to take actions on each word you get by order, instead of waiting for the whole process completed.
\ No newline at end of file
diff --git a/docs/NonEnglishUsage/Chinese.md b/docs/NonEnglishUsage/Chinese.md
deleted file mode 100644
index 2d03f3bc..00000000
--- a/docs/NonEnglishUsage/Chinese.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Use LLamaSharp with Chinese
-
-It's supported now but the document is under work. Please wait for some time. Thank you for your support! :)
\ No newline at end of file
diff --git a/docs/QuickStart.md b/docs/QuickStart.md
new file mode 100644
index 00000000..96151d83
--- /dev/null
+++ b/docs/QuickStart.md
@@ -0,0 +1,197 @@
+# Quick start
+
+## Installation
+
+To gain high performance, LLamaSharp interacts with a native library compiled from C++, which is called the `backend`. We provide backend packages for Windows, Linux and macOS with CPU, CUDA, Metal and OpenCL support. You **don't** need to deal with the C++ part yourself; just install the backend packages.
+
+If no published backend matches your device, please open an issue to let us know. If compiling C++ code is not difficult for you, you could also follow [this guide](./ContributingGuide.md) to compile a backend yourself and run LLamaSharp with it.
+
+1. Install [LLamaSharp](https://www.nuget.org/packages/LLamaSharp) package on NuGet:
+
+```
+PM> Install-Package LLamaSharp
+```
+
+2. Install one or more of these backends, or use self-compiled backend.
+
+ - [`LLamaSharp.Backend.Cpu`](https://www.nuget.org/packages/LLamaSharp.Backend.Cpu): Pure CPU for Windows, Linux & macOS. Metal (GPU) support for macOS.
+ - [`LLamaSharp.Backend.Cuda11`](https://www.nuget.org/packages/LLamaSharp.Backend.Cuda11): CUDA11 for Windows & Linux.
+ - [`LLamaSharp.Backend.Cuda12`](https://www.nuget.org/packages/LLamaSharp.Backend.Cuda12): CUDA 12 for Windows & Linux.
+ - [`LLamaSharp.Backend.OpenCL`](https://www.nuget.org/packages/LLamaSharp.Backend.OpenCL): OpenCL for Windows & Linux.
+
+3. (optional) For [Microsoft semantic-kernel](https://github.com/microsoft/semantic-kernel) integration, install the [LLamaSharp.semantic-kernel](https://www.nuget.org/packages/LLamaSharp.semantic-kernel) package.
+4. (optional) To enable RAG support, install the [LLamaSharp.kernel-memory](https://www.nuget.org/packages/LLamaSharp.kernel-memory) package (this package currently only supports `net6.0` or higher), which is based on the [Microsoft kernel-memory](https://github.com/microsoft/kernel-memory) integration.
+
+## Model preparation
+
+There are two popular formats of LLM model files now: the PyTorch format (.pth) and the Huggingface format (.bin). LLamaSharp uses the `GGUF` format, which can be converted from these two formats. To get a `GGUF` file, there are two options:
+
+1. Search the model name + 'gguf' on [Huggingface](https://huggingface.co); you will find lots of model files that have already been converted to GGUF format. Please pay attention to their publishing time, because some old files may only work with old versions of LLamaSharp.
+
+2. Convert the PyTorch or Huggingface format to GGUF format yourself. Please follow the instructions in [this part of the llama.cpp readme](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#prepare-and-quantize) to convert them with the Python scripts.
+
+Generally, we recommend downloading quantized models rather than fp16 ones, because quantization significantly reduces the required memory while only slightly impacting generation quality.
+
+
+## Example of LLaMA chat session
+
+Here is a simple example of chatting with a bot based on an LLM in LLamaSharp. Please replace the model path with your own.
+
+
+
+```cs
+using LLama.Common;
+using LLama;
+
+string modelPath = @""; // change it to your own model path.
+
+var parameters = new ModelParams(modelPath)
+{
+ ContextSize = 1024, // The longest length of chat as memory.
+ GpuLayerCount = 5 // How many layers to offload to GPU. Please adjust it according to your GPU memory.
+};
+using var model = LLamaWeights.LoadFromFile(parameters);
+using var context = model.CreateContext(parameters);
+var executor = new InteractiveExecutor(context);
+
+// Add chat histories as prompt to tell AI how to act.
+var chatHistory = new ChatHistory();
+chatHistory.AddMessage(AuthorRole.System, "Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.");
+chatHistory.AddMessage(AuthorRole.User, "Hello, Bob.");
+chatHistory.AddMessage(AuthorRole.Assistant, "Hello. How may I help you today?");
+
+ChatSession session = new(executor, chatHistory);
+
+InferenceParams inferenceParams = new InferenceParams()
+{
+ MaxTokens = 256, // No more than 256 tokens should appear in answer. Remove it if antiprompt is enough for control.
+ AntiPrompts = new List<string> { "User:" } // Stop generation once antiprompts appear.
+};
+
+Console.ForegroundColor = ConsoleColor.Yellow;
+Console.Write("The chat session has started.\nUser: ");
+Console.ForegroundColor = ConsoleColor.Green;
+string userInput = Console.ReadLine() ?? "";
+
+while (userInput != "exit")
+{
+ await foreach ( // Generate the response streamingly.
+ var text
+ in session.ChatAsync(
+ new ChatHistory.Message(AuthorRole.User, userInput),
+ inferenceParams))
+ {
+ Console.ForegroundColor = ConsoleColor.White;
+ Console.Write(text);
+ }
+ Console.ForegroundColor = ConsoleColor.Green;
+ userInput = Console.ReadLine() ?? "";
+}
+```
+
+
+## Examples of chatting with LLaVA
+
+This example shows how to chat with LLaVA, asking it to describe a picture.
+
+
+```cs
+using System.Text.RegularExpressions;
+using LLama;
+using LLama.Common;
+
+string multiModalProj = @"";
+string modelPath = @"";
+string modelImage = @"";
+const int maxTokens = 1024; // The max tokens that could be generated.
+
+var prompt = $"{{{modelImage}}}\nUSER:\nProvide a full description of the image.\nASSISTANT:\n";
+
+var parameters = new ModelParams(modelPath)
+{
+ ContextSize = 4096,
+ Seed = 1337,
+};
+using var model = LLamaWeights.LoadFromFile(parameters);
+using var context = model.CreateContext(parameters);
+
+// Llava Init
+using var clipModel = LLavaWeights.LoadFromFile(multiModalProj);
+
+var ex = new InteractiveExecutor(context, clipModel);
+
+Console.ForegroundColor = ConsoleColor.Yellow;
+Console.WriteLine("The executor has been enabled. In this example, the prompt is printed, the maximum tokens is set to {0} and the context size is {1}.", maxTokens, parameters.ContextSize);
+Console.WriteLine("To send an image, enter its filename in curly braces, like this {c:/image.jpg}.");
+
+var inferenceParams = new InferenceParams() { Temperature = 0.1f, AntiPrompts = new List<string> { "\nUSER:" }, MaxTokens = maxTokens };
+
+do
+{
+
+ // Evaluate if we have images
+ //
+ var imageMatches = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Value);
+ var imageCount = imageMatches.Count();
+ var hasImages = imageCount > 0;
+ byte[][] imageBytes = null;
+
+ if (hasImages)
+ {
+ var imagePathsWithCurlyBraces = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Value);
+ var imagePaths = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Groups[1].Value);
+
+ try
+ {
+ imageBytes = imagePaths.Select(File.ReadAllBytes).ToArray();
+ }
+ catch (IOException exception)
+ {
+ Console.ForegroundColor = ConsoleColor.Red;
+ Console.Write(
+ $"Could not load your {(imageCount == 1 ? "image" : "images")}:");
+ Console.Write($"{exception.Message}");
+ Console.ForegroundColor = ConsoleColor.Yellow;
+ Console.WriteLine("Please try again.");
+ break;
+ }
+
+
+ int index = 0;
+ foreach (var path in imagePathsWithCurlyBraces)
+ {
+ // First image: replace the path with the <image> tag; for the rest of the images, delete the tag.
+ if (index++ == 0)
+ prompt = prompt.Replace(path, "<image>");
+ else
+ prompt = prompt.Replace(path, "");
+ }
+ Console.WriteLine();
+
+
+ // Initialize images in the executor
+ //
+ ex.ImagePaths = imagePaths.ToList();
+ }
+
+ Console.ForegroundColor = ConsoleColor.White;
+ await foreach (var text in ex.InferAsync(prompt, inferenceParams))
+ {
+ Console.Write(text);
+ }
+ Console.Write(" ");
+ Console.ForegroundColor = ConsoleColor.Green;
+ prompt = Console.ReadLine();
+ Console.WriteLine();
+
+ // let the user finish with exit
+ //
+ if (prompt.Equals("/exit", StringComparison.OrdinalIgnoreCase))
+ break;
+
+}
+while (true);
+```
+
+
+*For more examples, please refer to [LLamaSharp.Examples](../LLama.Examples).*
diff --git a/docs/Tricks.md b/docs/Tricks.md
deleted file mode 100644
index 4b72f440..00000000
--- a/docs/Tricks.md
+++ /dev/null
@@ -1,44 +0,0 @@
-# Tricks for FAQ
-
-Sometimes, your application with LLM and LLamaSharp may have strange behaviours. Before opening an issue to report the BUG, the following tricks may worth a try.
-
-
-## Carefully set the anti-prompts
-
-Anti-prompt can also be called as "Stop-keyword", which decides when to stop the response generation. Under interactive mode, the maximum tokens count is always not set, which makes the LLM generates responses infinitively. Therefore, setting anti-prompt correctly helps a lot to avoid the strange behaviours. For example, the prompt file `chat-with-bob.txt` has the following content:
-
-```
-Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
-
-User: Hello, Bob.
-Bob: Hello. How may I help you today?
-User: Please tell me the largest city in Europe.
-Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
-User:
-```
-
-Therefore, the anti-prompt should be set as "User:". If the last line of the prompt is removed, LLM will automatically generate a question (user) and a response (bob) for one time when running the chat session. Therefore, the antiprompt is suggested to be appended to the prompt when starting a chat session.
-
-What if an extra line is appended? The string "User:" in the prompt will be followed with a char "\n". Thus when running the model, the automatic generation of a pair of question and response may appear because the anti-prompt is "User:" but the last token is "User:\n". As for whether it will appear, it's an undefined behaviour, which depends on the implementation inside the `LLamaExecutor`. Anyway, since it may leads to unexpected behaviors, it's recommended to trim your prompt or carefully keep consistent with your anti-prompt.
-
-## Pay attention to the length of prompt
-
-Sometimes we want to input a long prompt to execute a task. However, the context size may limit the inference of LLama model. Please ensure the inequality below holds.
-
-$$ len(prompt) + len(response) < len(context) $$
-
-In this inequality, `len(response)` refers to the expected tokens for LLM to generate.
-
-## Try different executors with a prompt
-
-Some prompt works well under interactive mode, such as `chat-with-bob`, some others may work well with instruct mode, such as `alpaca`. Besides, if your input is quite simple and one-time job, such as "Q: what is the satellite of the earth? A: ", stateless mode will be a good choice.
-
-If your chat bot has bad performance, trying different executor will possibly make it work well.
-
-## Choose models weight depending on you task
-
-The differences between modes may lead to much different behaviours under the same task. For example, if you're building a chat bot with non-English, a fine-tuned model specially for the language you want to use will have huge effect on the performance.
-
-## Set the layer count you want to offload to GPU
-
-Currently, the `GpuLayerCount` parameter, which decides the number of layer loaded into GPU, is set to 20 by default. However, if you have some efficient GPUs, setting it as a larger number will attain faster inference.
\ No newline at end of file
diff --git a/docs/ChatSession/transforms.md b/docs/Tutorials/ChatSession.md
similarity index 74%
rename from docs/ChatSession/transforms.md
rename to docs/Tutorials/ChatSession.md
index b0fa66d8..e248fe9b 100644
--- a/docs/ChatSession/transforms.md
+++ b/docs/Tutorials/ChatSession.md
@@ -1,10 +1,70 @@
-# Transforms in Chat Session
+# LLamaSharp chat session
+
+## Basic usages of ChatSession
+
+`ChatSession` is a higher-level abstraction than the executors. In the context of a chat application like ChatGPT, a "chat session" refers to an interactive conversation or exchange of messages between the user and the chatbot. It represents a continuous flow of communication where the user enters input or asks questions, and the chatbot responds accordingly. A chat session typically starts when the user initiates a conversation with the chatbot and continues until the interaction comes to a natural end or is explicitly terminated by either the user or the system. During a chat session, the chatbot maintains the context of the conversation, remembers previous messages, and generates appropriate responses based on the user's inputs and the ongoing dialogue.
+
+### Initialize a session
+
+Currently, the only accepted parameter is an `ILLamaExecutor`, because that is the only parameter we're sure will exist in all future versions. Since this is a high-level abstraction, we're conservative about the API design. More kinds of constructors may be added in the future.
+
+```cs
+var parameters = new ModelParams(modelPath);
+using var model = LLamaWeights.LoadFromFile(parameters);
+InteractiveExecutor ex = new(model.CreateContext(parameters));
+ChatSession session = new ChatSession(ex);
+```
+
+### Chat with the bot
+
+The `Chat` API accepts two kinds of input: `ChatHistory` and `String`. The string overload is quite similar to that of the executors, while the `ChatHistory` overload aims to provide more flexible usage. For example, suppose you chatted with the bot in session A before opening session B; session B has no memory of what you said before, so you can feed the history of A into B.
+
+```cs
+string prompt = "What is C#?";
+
+await foreach (var text in session.ChatAsync(prompt, new InferenceParams() { Temperature = 0.6f, AntiPrompts = new List<string> { "User:" } })) // the inference params should be changed depending on your statement
+{
+ Console.Write(text);
+}
+```
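+
+The `ChatHistory` overload can be used like the hedged sketch below. The history content is illustrative and `session` is assumed to be an initialized `ChatSession`; note that the last message fed in should come from the user.
+
+```cs
+var history = new ChatHistory();
+history.AddMessage(AuthorRole.User, "Hello, Bob.");
+history.AddMessage(AuthorRole.Assistant, "Hello. How may I help you today?");
+history.AddMessage(AuthorRole.User, "Please summarize what we have talked about.");
+
+await foreach (var text in session.ChatAsync(history, new InferenceParams() { AntiPrompts = new List<string> { "User:" } }))
+{
+    Console.Write(text);
+}
+```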
+
+### Get the history
+
+Currently `History` is a property of `ChatSession`.
+
+```cs
+foreach(var rec in session.History.Messages)
+{
+ Console.WriteLine($"{rec.AuthorRole}: {rec.Content}");
+}
+```
+
+## Save/Load Chat Session
+
+Generally, users may switch between chat sessions, which requires the ability to load and save a session.
+
+The API is quite simple: the files will be saved into a directory you specify. If the path does not exist, a new directory will be created.
+
+```cs
+string savePath = "";
+session.SaveSession(savePath);
+
+session.LoadSession(savePath, loadTransforms:true);
+session.LoadSession(savePath, loadTransforms:false);
+```
+
+You could also keep the state in memory and load it with the following APIs.
+
+```cs
+var sessionState = session.GetSessionState();
+session.LoadSession(sessionState, loadTransforms:true);
+session.LoadSession(sessionState, loadTransforms:false);
+```
+
+## Transforms in Chat Session
There's three important elements in `ChatSession`, which are input, output and history. Besides, there're some conversions between them. Since the process of them under different conditions varies, LLamaSharp hands over this part of the power to the users.
Currently, there're three kinds of process that could be customized, as introduced below.
-## Input transform
+### Input transform
In general, the input of the chat API is a text (without stream), therefore `ChatSession` processes it in a pipeline. If you want to use your customized transform, you need to define a transform that implements `ITextTransform` and add it to the pipeline of `ChatSession`.
@@ -35,7 +95,7 @@ public class MyInputTransform2 : ITextTransform
session.AddInputTransform(new MyInputTransform1()).AddInputTransform(new MyInputTransform2());
```
-## Output transform
+### Output transform
Different from the input, the output of chat API is a text stream. Therefore you need to process it word by word, instead of getting the full text at once.
@@ -145,7 +205,7 @@ public class KeywordTextOutputStreamTransform : ITextStreamTransform
}
```
-## History transform
+### History transform
The chat history could be converted to or from a text, which is exactly what the interface of it.
@@ -242,4 +302,4 @@ public class DefaultHistoryTransform : IHistoryTransform
return text;
}
}
-```
\ No newline at end of file
+```
diff --git a/docs/Tutorials/Executors.md b/docs/Tutorials/Executors.md
new file mode 100644
index 00000000..d014da88
--- /dev/null
+++ b/docs/Tutorials/Executors.md
@@ -0,0 +1,349 @@
+# LLamaSharp executors
+
+A LLamaSharp executor defines the behavior of the model when it is called. Currently, there are four kinds of executors: `InteractiveExecutor`, `InstructExecutor`, `StatelessExecutor` and `BatchedExecutor`.
+
+In short, `InteractiveExecutor` is suitable for continuously asking the LLM questions and getting answers. `InstructExecutor` lets the LLM execute your instructions, such as "continue writing". `StatelessExecutor` is best for one-time jobs because previous inferences have no impact on the current one. `BatchedExecutor` can accept multiple inputs and generate outputs for different sessions at the same time, significantly improving the throughput of the program.
+
+## Text-to-Text APIs of the executors
+
+All the executors implement the interface `ILLamaExecutor`, which provides the API to execute text-to-text tasks.
+
+```cs
+public interface ILLamaExecutor
+{
+ ///
+ /// The loaded context for this executor.
+ ///
+ public LLamaContext Context { get; }
+
+ // LLava Section
+ //
+ ///
+ /// Identify if it's a multi-modal model and there is a image to process.
+ ///
+ public bool IsMultiModal { get; }
+ ///
+ /// Muti-Modal Projections / Clip Model weights
+ ///
+ public LLavaWeights? ClipModel { get; }
+
+ ///
+ /// List of images: Image filename and path (jpeg images).
+ ///
+ public List<string> ImagePaths { get; set; }
+
+
+ ///
+ /// Asynchronously infers a response from the model.
+ ///
+ /// Your prompt
+ /// Any additional parameters
+ /// A cancellation token.
+ ///
+ IAsyncEnumerable<string> InferAsync(string text, IInferenceParams? inferenceParams = null, CancellationToken token = default);
+}
+```
+
+The output of this API is a **yielded enumerable**. Therefore, when receiving the output, you can directly use `await foreach` to act on each piece of text in order, instead of waiting for the whole process to complete.
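+
+As a minimal sketch of consuming that stream (assuming `executor` is any `ILLamaExecutor` you have already created, with illustrative prompt and parameter values):
+
+```cs
+var inferenceParams = new InferenceParams()
+{
+    MaxTokens = 128,
+    AntiPrompts = new List<string> { "User:" }
+};
+
+// Print each piece of text as soon as the executor yields it.
+await foreach (var text in executor.InferAsync("User: What is C#?\nAssistant:", inferenceParams))
+{
+    Console.Write(text);
+}
+```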
+
+## InteractiveExecutor & InstructExecutor
+
+Both of them take "completing the prompt" as the goal of generating the response. For example, if you input `Long long ago, there was a fox who wanted to make friends with humans. One day`, the LLM will continue writing the story.
+
+Under interactive mode, you take the role of the user and the LLM takes the role of the assistant, helping you with your questions or requests.
+
+Under instruct mode, you give the LLM instructions and it follows them.
+
+Though their behaviors sound similar, the results can differ a lot depending on your prompt. For example, `chat-with-bob` performs well under interactive mode while `alpaca` does well with instruct mode.
+
+```
+// chat-with-bob
+
+Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
+
+User: Hello, Bob.
+Bob: Hello. How may I help you today?
+User: Please tell me the largest city in Europe.
+Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
+User:
+```
+
+```
+// alpaca
+
+Below is an instruction that describes a task. Write a response that appropriately completes the request.
+```
+
+Therefore, please modify the prompt correspondingly when switching from one mode to the other.
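+
+For reference, a hedged sketch of creating the two executors is shown below. The variable `modelPath` and the parameter values are only examples, and each executor gets its own context here so that their states don't interfere.
+
+```cs
+var parameters = new ModelParams(modelPath) { ContextSize = 1024 };
+using var weights = LLamaWeights.LoadFromFile(parameters);
+
+// Interactive mode: completes a dialogue-style prompt such as chat-with-bob.
+using var interactiveContext = weights.CreateContext(parameters);
+var interactive = new InteractiveExecutor(interactiveContext);
+
+// Instruct mode: expects an alpaca-style instruction prompt.
+using var instructContext = weights.CreateContext(parameters);
+var instruct = new InstructExecutor(instructContext);
+```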
+
+## StatelessExecutor
+
+Despite the differences between interactive mode and instruct mode, both of them are stateful. That is, your previous questions/instructions will impact the current response from the LLM. On the contrary, the stateless executor has no such "memory". No matter how many times you talk to it, it will only concentrate on what you say this time. It is very useful when you want a clean context, without being affected by previous inputs.
+
+Since the stateless executor has no memory of previous conversations, you need to include the whole prompt with your question to get a better answer.
+
+For example, if you feed `Q: Who is Trump? A: ` to the stateless executor, it may give the following answer with the anti-prompt `Q: `.
+
+```
+Donald J. Trump, born June 14, 1946, is an American businessman, television personality, politician and the 45th President of the United States (2017-2021). # Anexo:Torneo de Hamburgo 2022 (individual masculino)
+
+## Presentación previa
+
+* Defensor del título: Daniil Medvédev
+```
+
+It seems that things went well at first. However, after answering the question itself, the LLM began to talk about other things until the answer reached the token count limit. The reason for this strange behavior is that the anti-prompt cannot be matched: with this input, the LLM cannot decide whether to append the string "A: " at the end of the response.
+
+As an improvement, let's take the following text as the input:
+
+```
+Q: What is the capital of the USA? A: Washington. Q: What is the sum of 1 and 2? A: 3. Q: Who is Trump? A: 
+```
+
+Then, I got the following answer with the anti-prompt `Q: `.
+
+```
+45th president of the United States.
+```
+
+This time, by repeating the pattern `Q: xxx? A: xxx.`, the LLM outputs the anti-prompt we want, which helps it decide where to stop the generation.
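+
+Putting this together, a minimal stateless call could look like the sketch below (the variable names and parameter values are illustrative, and `weights`/`parameters` are assumed to be loaded as in the earlier sketch):
+
+```cs
+var executor = new StatelessExecutor(weights, parameters);
+var prompt = "Q: What is the capital of the USA? A: Washington. Q: What is the sum of 1 and 2? A: 3. Q: Who is Trump? A:";
+
+await foreach (var text in executor.InferAsync(prompt, new InferenceParams() { AntiPrompts = new List<string> { "Q:" }, MaxTokens = 64 }))
+{
+    Console.Write(text);
+}
+```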
+
+## BatchedExecutor
+
+Different from the other executors, `BatchedExecutor` can accept inputs from multiple sessions and generate outputs for them at the same time. Here is an example of how to use it.
+
+```cs
+using LLama.Batched;
+using LLama.Common;
+using LLama.Native;
+using LLama.Sampling;
+using Spectre.Console;
+
+namespace LLama.Examples.Examples;
+
+///
+/// This demonstrates using a batch to generate two sequences and then using one
+/// sequence as the negative guidance ("classifier free guidance") for the other.
+///
+public class BatchedExecutorGuidance
+{
+ private const int n_len = 32;
+
+ public static async Task Run()
+ {
+ string modelPath = UserSettings.GetModelPath();
+
+ var parameters = new ModelParams(modelPath);
+ using var model = LLamaWeights.LoadFromFile(parameters);
+
+ var positivePrompt = AnsiConsole.Ask("Positive Prompt (or ENTER for default):", "My favourite colour is").Trim();
+ var negativePrompt = AnsiConsole.Ask("Negative Prompt (or ENTER for default):", "I hate the colour red. My favourite colour is").Trim();
+ var weight = AnsiConsole.Ask("Guidance Weight (or ENTER for default):", 2.0f);
+
+ // Create an executor that can evaluate a batch of conversations together
+ using var executor = new BatchedExecutor(model, parameters);
+
+ // Print some info
+ var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
+ Console.WriteLine($"Created executor with model: {name}");
+
+ // Load the two prompts into two conversations
+ using var guided = executor.Create();
+ guided.Prompt(positivePrompt);
+ using var guidance = executor.Create();
+ guidance.Prompt(negativePrompt);
+
+ // Run inference to evaluate prompts
+ await AnsiConsole
+ .Status()
+ .Spinner(Spinner.Known.Line)
+ .StartAsync("Evaluating Prompts...", _ => executor.Infer());
+
+ // Fork the "guided" conversation. We'll run this one without guidance for comparison
+ using var unguided = guided.Fork();
+
+ // Run inference loop
+ var unguidedSampler = new GuidedSampler(null, weight);
+ var unguidedDecoder = new StreamingTokenDecoder(executor.Context);
+ var guidedSampler = new GuidedSampler(guidance, weight);
+ var guidedDecoder = new StreamingTokenDecoder(executor.Context);
+ await AnsiConsole
+ .Progress()
+ .StartAsync(async progress =>
+ {
+ var reporter = progress.AddTask("Running Inference", maxValue: n_len);
+
+ for (var i = 0; i < n_len; i++)
+ {
+ if (i != 0)
+ await executor.Infer();
+
+ // Sample from the "unguided" conversation. This is just a conversation using the same prompt, without any
+ // guidance. This serves as a comparison to show the effect of guidance.
+ var u = unguidedSampler.Sample(executor.Context.NativeHandle, unguided.Sample(), Array.Empty<LLamaToken>());
+ unguidedDecoder.Add(u);
+ unguided.Prompt(u);
+
+ // Sample from the "guided" conversation. This sampler will internally use the "guidance" conversation
+ // to steer the conversation. See how this is done in GuidedSampler.ProcessLogits (bottom of this file).
+ var g = guidedSampler.Sample(executor.Context.NativeHandle, guided.Sample(), Array.Empty<LLamaToken>());
+ guidedDecoder.Add(g);
+
+ // Use this token to advance both guided _and_ guidance. Keeping them in sync (except for the initial prompt).
+ guided.Prompt(g);
+ guidance.Prompt(g);
+
+ // Early exit if we reach the natural end of the guided sentence
+ if (g == model.EndOfSentenceToken)
+ break;
+
+ // Update progress bar
+ reporter.Increment(1);
+ }
+ });
+
+ AnsiConsole.MarkupLine($"[green]Unguided:[/][white]{unguidedDecoder.Read().ReplaceLineEndings(" ")}[/]");
+ AnsiConsole.MarkupLine($"[green]Guided:[/][white]{guidedDecoder.Read().ReplaceLineEndings(" ")}[/]");
+ }
+
+ private class GuidedSampler(Conversation? guidance, float weight)
+ : BaseSamplingPipeline
+ {
+ public override void Accept(SafeLLamaContextHandle ctx, LLamaToken token)
+ {
+ }
+
+ public override ISamplingPipeline Clone()
+ {
+ throw new NotSupportedException();
+ }
+
+ protected override void ProcessLogits(SafeLLamaContextHandle ctx, Span<float> logits, ReadOnlySpan<LLamaToken> lastTokens)
+ {
+ if (guidance == null)
+ return;
+
+ // Get the logits generated by the guidance sequences
+ var guidanceLogits = guidance.Sample();
+
+ // Use those logits to guide this sequence
+ NativeApi.llama_sample_apply_guidance(ctx, logits, guidanceLogits, weight);
+ }
+
+ protected override LLamaToken ProcessTokenDataArray(SafeLLamaContextHandle ctx, LLamaTokenDataArray candidates, ReadOnlySpan<LLamaToken> lastTokens)
+ {
+ candidates.Temperature(ctx, 0.8f);
+ candidates.TopK(ctx, 25);
+
+ return candidates.SampleToken(ctx);
+ }
+ }
+}
+```
+
+## Inference parameters
+
+Different from the context parameters described in [understand-llama-context](./UnderstandLLamaContext.md), executors accept inference parameters when you call the API to execute the inference. That means you can change the parameters every time you ask the model to generate outputs.
+
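+As a hedged illustration that parameters are per call rather than per executor (the values and prompts are arbitrary, and `executor` is assumed to exist already):
+
+```cs
+var creative = new InferenceParams() { Temperature = 1.0f, MaxTokens = 256 };
+var precise  = new InferenceParams() { Temperature = 0.2f, MaxTokens = 64 };
+
+// The same executor, called twice with different inference parameters.
+await foreach (var t in executor.InferAsync("Write a short poem about the sea.", creative)) Console.Write(t);
+await foreach (var t in executor.InferAsync("What is 12 * 12?", precise)) Console.Write(t);
+```
+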
+Here are the parameters for LLamaSharp executors.
+
+```cs
+///
+ /// The parameters used for inference.
+///
+public record InferenceParams
+ : IInferenceParams
+{
+ ///
+ /// number of tokens to keep from initial prompt
+ ///
+ public int TokensKeep { get; set; } = 0;
+
+ ///
+ /// how many new tokens to predict (n_predict), set to -1 to infinitely generate response
+ /// until it completes.
+ ///
+ public int MaxTokens { get; set; } = -1;
+
+ ///
+ /// logit bias for specific tokens
+ ///
+ public Dictionary<LLamaToken, float>? LogitBias { get; set; } = null;
+
+ ///
+ /// Sequences where the model will stop generating further tokens.
+ ///
+ public IReadOnlyList<string> AntiPrompts { get; set; } = Array.Empty<string>();
+
+ ///
+ public int TopK { get; set; } = 40;
+
+ ///
+ public float TopP { get; set; } = 0.95f;
+
+ ///
+ public float MinP { get; set; } = 0.05f;
+
+ ///
+ public float TfsZ { get; set; } = 1.0f;
+
+ ///
+ public float TypicalP { get; set; } = 1.0f;
+
+ ///
+ public float Temperature { get; set; } = 0.8f;
+
+ ///
+ public float RepeatPenalty { get; set; } = 1.1f;
+
+ ///
+ public int RepeatLastTokensCount { get; set; } = 64;
+
+ ///
+ public float FrequencyPenalty { get; set; } = .0f;
+
+ ///
+ public float PresencePenalty { get; set; } = .0f;
+
+ ///
+ public MirostatType Mirostat { get; set; } = MirostatType.Disable;
+
+ ///
+ public float MirostatTau { get; set; } = 5.0f;
+
+ ///
+ public float MirostatEta { get; set; } = 0.1f;
+
+ ///
+ public bool PenalizeNL { get; set; } = true;
+
+ ///
+ public SafeLLamaGrammarHandle? Grammar { get; set; }
+
+ ///
+ public ISamplingPipeline? SamplingPipeline { get; set; }
+}
+```
+
+
+
+## Save and load executor state
+
+An executor also has its own state, which can be saved and loaded. This matters a lot when you want to support restoring a previous session for the user in your application.
+
+The following code shows how to save and load the executor state.
+
+```cs
+InteractiveExecutor executor = new InteractiveExecutor(context);
+// do some things...
+executor.SaveState("executor.st");
+var stateData = executor.GetStateData();
+
+InteractiveExecutor executor2 = new InteractiveExecutor(context);
+executor2.LoadState(stateData);
+// do some things...
+
+InteractiveExecutor executor3 = new InteractiveExecutor(context);
+executor3.LoadState("executor.st");
+// do some things...
+```
\ No newline at end of file
diff --git a/docs/LLamaContext/embeddings.md b/docs/Tutorials/GetEmbeddings.md
similarity index 87%
rename from docs/LLamaContext/embeddings.md
rename to docs/Tutorials/GetEmbeddings.md
index 6f4b6677..ecdffd63 100644
--- a/docs/LLamaContext/embeddings.md
+++ b/docs/Tutorials/GetEmbeddings.md
@@ -1,4 +1,4 @@
-# Get Embeddings
+# Get embeddings
Getting the embeddings of a text in LLM is sometimes useful, for example, to train other MLP models.
diff --git a/docs/Tutorials/NativeLibraryConfig.md b/docs/Tutorials/NativeLibraryConfig.md
new file mode 100644
index 00000000..4bdd12dc
--- /dev/null
+++ b/docs/Tutorials/NativeLibraryConfig.md
@@ -0,0 +1,42 @@
+# Customize the native library loading
+
+As indicated in [Architecture](../Architecture.md), LLamaSharp uses the native library to run the LLM models. Sometimes you may want to compile the native library yourself, or dynamically load a library depending on the environment of your application's users. Luckily, dynamic loading of the native library has been supported since version 0.7.0, which allows you to customize the native library loading process.
+
+
+## When you should compile the native library yourself
+
+Before introducing the way to customize native library loading, please follow the tips below to see if you need to compile the native library yourself, rather than use the published backend packages, which contain native library files for multiple targets.
+
+1. Your device/environment is not supported by any published backend package. For example, Vulkan is not supported yet. In this case, it would mean a lot if you opened an issue to tell us you are using it. Since our support for a new backend may lag behind, you can compile the library yourself in the meantime.
+2. You want to gain the best performance from LLamaSharp. Because LLamaSharp offloads the model to both GPU and CPU, performance depends significantly on the CPU if your GPU memory is small. AVX ([Advanced Vector Extensions](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions)) and BLAS ([Basic Linear Algebra Subprograms](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms)) are the most important ways to accelerate the CPU computation. By default, LLamaSharp disables BLAS and only uses AVX2 for the CUDA backend. If you would like to enable BLAS or use AVX-512 along with CUDA, please compile the native library yourself, following the [instructions here](../ContributingGuide.md).
+3. You want to debug the C++ code.
+
+
+## Use NativeLibraryConfig
+
+We provide the singleton class `LLama.Native.NativeLibraryConfig` to let users customize the loading process of the native library. Its methods should be called before loading any model, because the native library file must be decided before any model is loaded.
+
+### Load specified native library file
+
+All you need to do is add the following code at the very beginning of your program.
+
+```cs
+NativeLibraryConfig.Instance.WithLibrary("");
+```
+
+### Automatically select one from multiple native library files
+
+Let's consider this case: you don't know your user's device when distributing your application, so you put all the possible native libraries in a folder and want to select the best one depending on the user's device. LLamaSharp allows you to define the strategy for doing this with the options below (a combined sketch follows the list).
+
+- `NativeLibraryConfig.Instance.WithCuda`: decide if you want to use cuda if possible.
+- `NativeLibraryConfig.Instance.WithAvx`: decide the highest AVX level you want to use if possible.
+- `NativeLibraryConfig.Instance.WithSearchDirectory`: specify the directory to search the native library files.
+- `NativeLibraryConfig.Instance.WithAutoFallback`: whether to allow falling back to other options if no native library matching your specified settings can be found.
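+
+A combined sketch under these assumptions is shown below; the exact member names (for example the `AvxLevel` enum) may differ slightly between versions, so treat it as a starting point rather than a definitive configuration.
+
+```cs
+NativeLibraryConfig.Instance
+    .WithCuda(true)                                // prefer a CUDA build when a compatible GPU is present
+    .WithAvx(NativeLibraryConfig.AvxLevel.Avx2)    // the highest AVX level to use on CPU
+    .WithSearchDirectory("./runtimes")             // where your native library files live
+    .WithAutoFallback(true);                       // fall back to other options if nothing matches
+```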
+
+### Set the log level of native library loading
+
+```cs
+NativeLibraryConfig.Instance.WithLogs();
+```
+
+There are four log levels, which are error, warning, info and debug. If you are not sure if the correct library is selected, please set log level to `info` to see the full logs.
diff --git a/docs/Tutorials/Quantization.md b/docs/Tutorials/Quantization.md
new file mode 100644
index 00000000..6e4d5dc3
--- /dev/null
+++ b/docs/Tutorials/Quantization.md
@@ -0,0 +1,54 @@
+# Quantization
+
+Quantization significantly accelerates model inference. Since there is only a small accuracy (model performance) reduction when quantizing the model, don't hesitate to quantize your model!
+
+To quantize the model, please call `Quantize` from `LLamaQuantizer`, which is a static method.
+
+```cs
+string srcPath = "";
+string dstPath = "";
+LLamaQuantizer.Quantize(srcPath, dstPath, "q4_0");
+// The following overload is also okay.
+// LLamaQuantizer.Quantize(srcPath, dstPath, LLamaFtype.LLAMA_FTYPE_MOSTLY_Q4_0);
+```
+
+After calling it, a quantized model file will be saved.
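+
+If you want to check the result programmatically, `Quantize` returns a boolean indicating success; a small sketch (with illustrative paths) could look like this:
+
+```cs
+if (LLamaQuantizer.Quantize(srcPath, dstPath, "q4_0"))
+    Console.WriteLine($"Quantized model written to {dstPath}");
+else
+    Console.WriteLine("Quantization failed.");
+```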
+
+The following quantization types are currently supported:
+
+```cpp
+{ "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", },
+{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
+{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
+{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
+{ "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", },
+{ "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
+{ "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
+{ "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
+{ "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
+{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
+{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
+{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
+{ "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
+{ "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
+{ "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
+{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
+{ "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization" , },
+{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
+{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
+{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
+{ "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
+{ "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
+{ "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
+{ "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
+{ "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
+{ "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
+{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
+{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
+{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
+{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
+{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
+{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
+// Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
+{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
+```
\ No newline at end of file
diff --git a/docs/Tutorials/UnderstandLLamaContext.md b/docs/Tutorials/UnderstandLLamaContext.md
new file mode 100644
index 00000000..7be85a7c
--- /dev/null
+++ b/docs/Tutorials/UnderstandLLamaContext.md
@@ -0,0 +1,122 @@
+# Understand LLamaSharp context
+
+`LLamaContext` is the most important component, acting as a link between the native APIs and the higher-level APIs. It contains the basic settings for model inference and holds the kv-cache, which can significantly accelerate model inference. Since `LLamaContext` is not coupled with `LLamaWeights`, it's possible to create multiple contexts based on one set of model weights. Each `ILLamaExecutor` holds a `LLamaContext` instance, but it's possible to switch an executor to a different context.
+
+If your application has multiple sessions, please take care to manage the `LLamaContext` instances.
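+
+As a hedged sketch of sharing one set of weights across several contexts (for example, one per user session; `modelPath` and the parameter values are illustrative):
+
+```cs
+var parameters = new ModelParams(modelPath) { ContextSize = 2048 };
+using var weights = LLamaWeights.LoadFromFile(parameters);
+
+// Two independent contexts over the same weights, each with its own kv-cache.
+using var contextA = weights.CreateContext(parameters);
+using var contextB = weights.CreateContext(parameters);
+
+var executorA = new InteractiveExecutor(contextA);
+var executorB = new InteractiveExecutor(contextB);
+```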
+
+`LLamaContext` takes the following parameters as its settings. Note that the parameters cannot be changed once the context has been created.
+
+```cs
+public interface IContextParams
+{
+ ///
+ /// Model context size (n_ctx)
+ ///
+ uint? ContextSize { get; }
+
+ ///
+ /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
+ ///
+ uint BatchSize { get; }
+
+ ///
+ /// Seed for the random number generator (seed)
+ ///
+ uint Seed { get; }
+
+ ///
+ /// Whether to use embedding mode. (embedding) Note that if this is set to true,
+ /// The LLamaModel won't produce text response anymore.
+ ///
+ bool EmbeddingMode { get; }
+
+ ///
+ /// RoPE base frequency (null to fetch from the model)
+ ///
+ float? RopeFrequencyBase { get; }
+
+ ///
+ /// RoPE frequency scaling factor (null to fetch from the model)
+ ///
+ float? RopeFrequencyScale { get; }
+
+ ///
+ /// The encoding to use for models
+ ///
+ Encoding Encoding { get; }
+
+ ///
+ /// Number of threads (null = autodetect) (n_threads)
+ ///
+ uint? Threads { get; }
+
+ ///
+ /// Number of threads to use for batch processing (null = autodetect) (n_threads)
+ ///
+ uint? BatchThreads { get; }
+
+ ///
+ /// YaRN extrapolation mix factor (null = from model)
+ ///
+ float? YarnExtrapolationFactor { get; }
+
+ ///
+ /// YaRN magnitude scaling factor (null = from model)
+ ///
+ float? YarnAttentionFactor { get; }
+
+ ///
+ /// YaRN low correction dim (null = from model)
+ ///
+ float? YarnBetaFast { get; }
+
+ ///
+ /// YaRN high correction dim (null = from model)
+ ///
+ float? YarnBetaSlow { get; }
+
+ ///
+ /// YaRN original context length (null = from model)
+ ///
+ uint? YarnOriginalContext { get; }
+
+ ///
+ /// YaRN scaling method to use.
+ ///
+ RopeScalingType? YarnScalingType { get; }
+
+ ///
+ /// Override the type of the K cache
+ ///
+ GGMLType? TypeK { get; }
+
+ ///
+ /// Override the type of the V cache
+ ///
+ GGMLType? TypeV { get; }
+
+ ///
+ /// Whether to disable offloading the KQV cache to the GPU
+ ///
+ bool NoKqvOffload { get; }
+
+ ///
+ /// defragment the KV cache if holes/size > defrag_threshold, Set to < 0 to disable (default)
+ ///
+ float DefragThreshold { get; }
+
+ ///
+ /// Whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
+ ///
+ bool DoPooling { get; }
+}
+```
+
+
+`LLamaContext` has its own state, which can be saved and loaded.
+
+```cs
+LLamaContext.SaveState(string filename)
+LLamaContext.GetState()
+```
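+
+A minimal sketch of using them (the file name is arbitrary, and the state should be restored into a context created with the same model and parameters):
+
+```cs
+context.SaveState("context_state.bin");
+// ... later, or in another run ...
+context.LoadState("context_state.bin");
+```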
+
diff --git a/docs/index.md b/docs/index.md
index 5f82ccb9..0e5ec5af 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -2,27 +2,30 @@

-LLamaSharp is the C#/.NET binding of [llama.cpp](https://github.com/ggerganov/llama.cpp). It provides APIs to inference the LLaMa Models and deploy it on native environment or Web. It could help C# developers to deploy the LLM (Large Language Model) locally and integrate with C# apps.
-
-## Main features
-
-- Model inference
-- Model quantization
-- Generating embeddings
-- Grammar parse
-- Interactive/Instruct/Stateless executor mode
-- Chat session APIs
-- Save/load the state
-- Integration with other applications like BotSharp and semantic-kernel
+LLamaSharp is a cross-platform library to run 🦙LLaMA/LLaVA models (and others) on your local device. Based on [llama.cpp](https://github.com/ggerganov/llama.cpp), inference with LLamaSharp is efficient on both CPU and GPU. With the higher-level APIs and RAG support, it's convenient to deploy LLMs (Large Language Models) in your application with LLamaSharp.
## Essential insights for novice learners
If you are new to LLM, here're some tips for you to help you to get start with `LLamaSharp`. If you are experienced in this field, we'd still recommend you to take a few minutes to read it because some things perform differently compared to cpp/python.
-1. The main ability of LLamaSharp is to provide an efficient way to run inference of LLM (Large Language Model) locally (and fine-tune model in the future). The model weights, however, need to be downloaded from other resources such as [huggingface](https://huggingface.co).
-2. Since LLamaSharp supports multiple platforms, The nuget package is split into `LLamaSharp` and `LLama.Backend`. After installing `LLamaSharp`, please install one of `LLama.Backend.Cpu`, `LLama.Backend.Cuda11` or `LLama.Backend.Cuda12`. If you use the source code, dynamic libraries can be found in `LLama/Runtimes`.
-3. `LLaMa` originally refers to the weights released by Meta (Facebook Research). After that, many models are fine-tuned based on it, such as `Vicuna`, `GPT4All`, and `Pyglion`. Though all of these models are supported by LLamaSharp, some steps are necessary with different file formats. There're mainly three kinds of files, which are `.pth`, `.bin (ggml)`, `.bin (quantized)`. If you have the `.bin (quantized)` file, it could be used directly by LLamaSharp. If you have the `.bin (ggml)` file, you could use it directly but get higher inference speed after the quantization. If you have the `.pth` file, you need to follow [the instructions in llama.cpp](https://github.com/ggerganov/llama.cpp#prepare-data--run) to convert it to `.bin (ggml)` file at first.
-4. LLamaSharp supports GPU acceleration, but it requires cuda installation. Please install cuda 11 or cuda 12 on your system before using LLamaSharp to enable GPU. If you have another cuda version, you could compile llama.cpp from source to get the dll. For building from source, please refer to [issue #5](https://github.com/SciSharp/LLamaSharp/issues/5).
+1. The main ability of LLamaSharp is to provide an efficient way to run inference of LLMs on your device (and to fine-tune models in the future). The model weights, however, need to be downloaded from other resources such as [huggingface](https://huggingface.co).
+2. To gain high performance, LLamaSharp interacts with a native library compiled from C++, which is called the `backend`. We provide backend packages for Windows, Linux and macOS with CPU, CUDA, Metal and OpenCL. You **don't** need to handle anything about C++ but just install the backend packages. If no published backend matches your device, please open an issue to let us know. If compiling C++ code is not difficult for you, you could also follow [this guide]() to compile a backend and run LLamaSharp with it.
+3. `LLaMA` originally refers to the weights released by Meta (Facebook Research). After that, many models were fine-tuned based on it, such as `Vicuna`, `GPT4All`, and `Pyglion`. There are two popular file formats for these models now: the PyTorch format (.pth) and the Huggingface format (.bin). LLamaSharp uses a `GGUF` format file, which can be converted from these two formats. There are two options for you to get a GGUF format file. a) Search the model name + 'gguf' on [Huggingface](https://huggingface.co); you will find lots of model files that have already been converted to GGUF format. Please pay attention to their publishing time, because some older files may only work with older versions of LLamaSharp. b) Convert the PyTorch or Huggingface format to GGUF format yourself. Please follow the instructions of [this part of llama.cpp readme](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#prepare-and-quantize) to convert them with the python scripts.
+4. LLamaSharp supports multi-modal models, which means the model can take both text and image as input. Note that there are two model files required for using multi-modal (LLaVA): the main model and the mm-proj model. Here is a huggingface repo which shows that: [link](https://huggingface.co/ShadowBeast/llava-v1.6-mistral-7b-Q5_K_S-GGUF/tree/main).
+
+
+
+## Integrations
+
+There are integrations for the following libraries, which help to expand the applications of LLamaSharp. Integrations for semantic-kernel and kernel-memory are developed in the LLamaSharp repository, while others are developed in their own repositories.
+
+- [semantic-kernel](https://github.com/microsoft/semantic-kernel): an SDK that integrates LLM like OpenAI, Azure OpenAI, and Hugging Face.
+- [kernel-memory](https://github.com/microsoft/kernel-memory): a multi-modal AI Service specialized in the efficient indexing of datasets through custom continuous data hybrid pipelines, with support for RAG ([Retrieval Augmented Generation](https://en.wikipedia.org/wiki/Prompt_engineering#Retrieval-augmented_generation)), synthetic memory, prompt engineering, and custom semantic memory processing.
+- [BotSharp](https://github.com/SciSharp/BotSharp): an open source machine learning framework for AI Bot platform builder.
+- [Langchain](https://github.com/tryAGI/LangChain): a framework for developing applications powered by language models.
+
+
+
## Welcome to join the development!
@@ -32,6 +35,6 @@ Community effort is always one of the most important things in open-source proje
2. Open an PR if you've fixed something. Even if just correcting a typo, it also makes great sense.
3. Help to optimize the documentation.
4. Write an example or blog about how to integrate LLamaSharp with your APPs.
-5. Ask for a missed feature and discuss with other developers.
+5. Ask for a missing feature and discuss with us.
-If you'd like to get deeply involved in development, please touch us in discord channel or send email to `AsakusaRinne@gmail.com`. :)
+If you'd like to get deeply involved in development, please reach out to us in the discord channel or send an email to `AsakusaRinne@gmail.com`. 🤗
diff --git a/docs/media/LLamaSharp-Integrations.png b/docs/media/LLamaSharp-Integrations.png
new file mode 100644
index 00000000..0533e0cb
Binary files /dev/null and b/docs/media/LLamaSharp-Integrations.png differ
diff --git a/docs/media/console_demo.gif b/docs/media/console_demo.gif
new file mode 100644
index 00000000..8ace26c8
Binary files /dev/null and b/docs/media/console_demo.gif differ
diff --git a/docs/media/llava_demo.gif b/docs/media/llava_demo.gif
new file mode 100644
index 00000000..3c5c9e68
Binary files /dev/null and b/docs/media/llava_demo.gif differ
diff --git a/docs/media/structure.jpg b/docs/media/structure.jpg
index 74173977..92855643 100644
Binary files a/docs/media/structure.jpg and b/docs/media/structure.jpg differ
diff --git a/docs/media/structure.vsdx b/docs/media/structure.vsdx
index c36500eb..e891c9b0 100644
Binary files a/docs/media/structure.vsdx and b/docs/media/structure.vsdx differ
diff --git a/docs/sciprts/auto_gen_example_yml.py b/docs/sciprts/auto_gen_example_yml.py
new file mode 100644
index 00000000..46f8ffc5
--- /dev/null
+++ b/docs/sciprts/auto_gen_example_yml.py
@@ -0,0 +1,20 @@
+import os
+
+dir = 'Examples'
+
+if __name__ == '__main__':
+ res = []
+
+ # loop all the files of `dir`
+ for root, dirs, files in os.walk(dir):
+ for file in files:
+ with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
+ first_line = f.readline()
+ title = first_line.split('#')[-1]
+ filename = file.split('/')[-1].split('\\')[-1]
+ res.append(f'- {title.strip()}: {dir}/{filename}')
+
+ for item in res:
+ print(item)
+
+
\ No newline at end of file
diff --git a/docs/xmldocs/index.md b/docs/xmldocs/index.md
index 68daac1b..0d85291b 100644
--- a/docs/xmldocs/index.md
+++ b/docs/xmldocs/index.md
@@ -2,6 +2,8 @@
## LLama
+[AntipromptProcessor](./llama.antipromptprocessor.md)
+
[ChatSession](./llama.chatsession.md)
[InstructExecutor](./llama.instructexecutor.md)
@@ -18,26 +20,66 @@
[LLamaWeights](./llama.llamaweights.md)
+[LLavaWeights](./llama.llavaweights.md)
+
+[SessionState](./llama.sessionstate.md)
+
[StatefulExecutorBase](./llama.statefulexecutorbase.md)
[StatelessExecutor](./llama.statelessexecutor.md)
-[Utils](./llama.utils.md)
+[StreamingTokenDecoder](./llama.streamingtokendecoder.md)
## LLama.Abstractions
+[AdapterCollection](./llama.abstractions.adaptercollection.md)
+
+[IContextParams](./llama.abstractions.icontextparams.md)
+
[IHistoryTransform](./llama.abstractions.ihistorytransform.md)
[IInferenceParams](./llama.abstractions.iinferenceparams.md)
[ILLamaExecutor](./llama.abstractions.illamaexecutor.md)
+[ILLamaParams](./llama.abstractions.illamaparams.md)
+
[IModelParams](./llama.abstractions.imodelparams.md)
[ITextStreamTransform](./llama.abstractions.itextstreamtransform.md)
[ITextTransform](./llama.abstractions.itexttransform.md)
+[LoraAdapter](./llama.abstractions.loraadapter.md)
+
+[MetadataOverride](./llama.abstractions.metadataoverride.md)
+
+[MetadataOverrideConverter](./llama.abstractions.metadataoverrideconverter.md)
+
+[TensorSplitsCollection](./llama.abstractions.tensorsplitscollection.md)
+
+[TensorSplitsCollectionConverter](./llama.abstractions.tensorsplitscollectionconverter.md)
+
+## LLama.Batched
+
+[AlreadyPromptedConversationException](./llama.batched.alreadypromptedconversationexception.md)
+
+[BatchedExecutor](./llama.batched.batchedexecutor.md)
+
+[CannotForkWhileRequiresInferenceException](./llama.batched.cannotforkwhilerequiresinferenceexception.md)
+
+[CannotModifyWhileRequiresInferenceException](./llama.batched.cannotmodifywhilerequiresinferenceexception.md)
+
+[CannotSampleRequiresInferenceException](./llama.batched.cannotsamplerequiresinferenceexception.md)
+
+[CannotSampleRequiresPromptException](./llama.batched.cannotsamplerequirespromptexception.md)
+
+[Conversation](./llama.batched.conversation.md)
+
+[ConversationExtensions](./llama.batched.conversationextensions.md)
+
+[ExperimentalBatchedExecutorException](./llama.batched.experimentalbatchedexecutorexception.md)
+
## LLama.Common
[AuthorRole](./llama.common.authorrole.md)
@@ -46,12 +88,8 @@
[FixedSizeQueue<T>](./llama.common.fixedsizequeue-1.md)
-[ILLamaLogger](./llama.common.illamalogger.md)
-
[InferenceParams](./llama.common.inferenceparams.md)
-[LLamaDefaultLogger](./llama.common.llamadefaultlogger.md)
-
[MirostatType](./llama.common.mirostattype.md)
[ModelParams](./llama.common.modelparams.md)
@@ -78,13 +116,17 @@
[GrammarUnknownEscapeCharacter](./llama.exceptions.grammarunknownescapecharacter.md)
+[LLamaDecodeError](./llama.exceptions.llamadecodeerror.md)
+
+[LoadWeightsFailedException](./llama.exceptions.loadweightsfailedexception.md)
+
[RuntimeError](./llama.exceptions.runtimeerror.md)
## LLama.Extensions
-[IModelParamsExtensions](./llama.extensions.imodelparamsextensions.md)
+[IContextParamsExtensions](./llama.extensions.icontextparamsextensions.md)
-[KeyValuePairExtensions](./llama.extensions.keyvaluepairextensions.md)
+[IModelParamsExtensions](./llama.extensions.imodelparamsextensions.md)
## LLama.Grammars
@@ -94,6 +136,20 @@
## LLama.Native
+[DecodeResult](./llama.native.decoderesult.md)
+
+[GGMLType](./llama.native.ggmltype.md)
+
+[GPUSplitMode](./llama.native.gpusplitmode.md)
+
+[LLamaBatch](./llama.native.llamabatch.md)
+
+[LLamaBeamsState](./llama.native.llamabeamsstate.md)
+
+[LLamaBeamView](./llama.native.llamabeamview.md)
+
+[LLamaChatMessage](./llama.native.llamachatmessage.md)
+
[LLamaContextParams](./llama.native.llamacontextparams.md)
[LLamaFtype](./llama.native.llamaftype.md)
@@ -102,66 +158,76 @@
[LLamaGrammarElementType](./llama.native.llamagrammarelementtype.md)
-[LLamaModelQuantizeParams](./llama.native.llamamodelquantizeparams.md)
+[LLamaKvCacheView](./llama.native.llamakvcacheview.md)
-[LLamaTokenData](./llama.native.llamatokendata.md)
+[LLamaKvCacheViewCell](./llama.native.llamakvcacheviewcell.md)
-[LLamaTokenDataArray](./llama.native.llamatokendataarray.md)
+[LLamaKvCacheViewSafeHandle](./llama.native.llamakvcacheviewsafehandle.md)
-[LLamaTokenDataArrayNative](./llama.native.llamatokendataarraynative.md)
+[LLamaLogLevel](./llama.native.llamaloglevel.md)
-[NativeApi](./llama.native.nativeapi.md)
+[LLamaModelKvOverrideType](./llama.native.llamamodelkvoverridetype.md)
-[SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+[LLamaModelMetadataOverride](./llama.native.llamamodelmetadataoverride.md)
-[SafeLLamaGrammarHandle](./llama.native.safellamagrammarhandle.md)
+[LLamaModelParams](./llama.native.llamamodelparams.md)
-[SafeLLamaHandleBase](./llama.native.safellamahandlebase.md)
+[LLamaModelQuantizeParams](./llama.native.llamamodelquantizeparams.md)
-[SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
+[LLamaNativeBatch](./llama.native.llamanativebatch.md)
+
+[LLamaPoolingType](./llama.native.llamapoolingtype.md)
+
+[LLamaPos](./llama.native.llamapos.md)
+
+[LLamaRopeType](./llama.native.llamaropetype.md)
-[SamplingApi](./llama.native.samplingapi.md)
+[LLamaSeqId](./llama.native.llamaseqid.md)
+
+[LLamaToken](./llama.native.llamatoken.md)
+
+[LLamaTokenData](./llama.native.llamatokendata.md)
-## LLama.OldVersion
+[LLamaTokenDataArray](./llama.native.llamatokendataarray.md)
-[ChatCompletion](./llama.oldversion.chatcompletion.md)
+[LLamaTokenDataArrayNative](./llama.native.llamatokendataarraynative.md)
-[ChatCompletionChoice](./llama.oldversion.chatcompletionchoice.md)
+[LLamaTokenType](./llama.native.llamatokentype.md)
-[ChatCompletionChunk](./llama.oldversion.chatcompletionchunk.md)
+[LLamaVocabType](./llama.native.llamavocabtype.md)
-[ChatCompletionChunkChoice](./llama.oldversion.chatcompletionchunkchoice.md)
+[LLavaImageEmbed](./llama.native.llavaimageembed.md)
-[ChatCompletionChunkDelta](./llama.oldversion.chatcompletionchunkdelta.md)
+[NativeApi](./llama.native.nativeapi.md)
-[ChatCompletionMessage](./llama.oldversion.chatcompletionmessage.md)
+[NativeLibraryConfig](./llama.native.nativelibraryconfig.md)
-[ChatMessageRecord](./llama.oldversion.chatmessagerecord.md)
+[RopeScalingType](./llama.native.ropescalingtype.md)
-[ChatRole](./llama.oldversion.chatrole.md)
+[SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-[ChatSession<T>](./llama.oldversion.chatsession-1.md)
+[SafeLLamaGrammarHandle](./llama.native.safellamagrammarhandle.md)
-[Completion](./llama.oldversion.completion.md)
+[SafeLLamaHandleBase](./llama.native.safellamahandlebase.md)
-[CompletionChoice](./llama.oldversion.completionchoice.md)
+[SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
-[CompletionChunk](./llama.oldversion.completionchunk.md)
+[SafeLlavaImageEmbedHandle](./llama.native.safellavaimageembedhandle.md)
-[CompletionLogprobs](./llama.oldversion.completionlogprobs.md)
+[SafeLlavaModelHandle](./llama.native.safellavamodelhandle.md)
-[CompletionUsage](./llama.oldversion.completionusage.md)
+## LLama.Sampling
-[Embedding](./llama.oldversion.embedding.md)
+[BaseSamplingPipeline](./llama.sampling.basesamplingpipeline.md)
-[EmbeddingData](./llama.oldversion.embeddingdata.md)
+[DefaultSamplingPipeline](./llama.sampling.defaultsamplingpipeline.md)
-[EmbeddingUsage](./llama.oldversion.embeddingusage.md)
+[GreedySamplingPipeline](./llama.sampling.greedysamplingpipeline.md)
-[IChatModel](./llama.oldversion.ichatmodel.md)
+[ISamplingPipeline](./llama.sampling.isamplingpipeline.md)
-[LLamaEmbedder](./llama.oldversion.llamaembedder.md)
+[ISamplingPipelineExtensions](./llama.sampling.isamplingpipelineextensions.md)
-[LLamaModel](./llama.oldversion.llamamodel.md)
+[Mirostate2SamplingPipeline](./llama.sampling.mirostate2samplingpipeline.md)
-[LLamaParams](./llama.oldversion.llamaparams.md)
+[MirostateSamplingPipeline](./llama.sampling.mirostatesamplingpipeline.md)
diff --git a/docs/xmldocs/llama.abstractions.adaptercollection.md b/docs/xmldocs/llama.abstractions.adaptercollection.md
new file mode 100644
index 00000000..4b49d3a7
--- /dev/null
+++ b/docs/xmldocs/llama.abstractions.adaptercollection.md
@@ -0,0 +1,92 @@
+# AdapterCollection
+
+Namespace: LLama.Abstractions
+
+A list of LoraAdapter objects
+
+```csharp
+public sealed class AdapterCollection : System.Collections.Generic.List`1[[LLama.Abstractions.LoraAdapter, LLamaSharp, Version=0.0.0.0, Culture=neutral, PublicKeyToken=null]], System.Collections.Generic.IList`1[[LLama.Abstractions.LoraAdapter, LLamaSharp, Version=0.0.0.0, Culture=neutral, PublicKeyToken=null]], System.Collections.Generic.ICollection`1[[LLama.Abstractions.LoraAdapter, LLamaSharp, Version=0.0.0.0, Culture=neutral, PublicKeyToken=null]], System.Collections.Generic.IEnumerable`1[[LLama.Abstractions.LoraAdapter, LLamaSharp, Version=0.0.0.0, Culture=neutral, PublicKeyToken=null]], System.Collections.IEnumerable, System.Collections.IList, System.Collections.ICollection, System.Collections.Generic.IReadOnlyList`1[[LLama.Abstractions.LoraAdapter, LLamaSharp, Version=0.0.0.0, Culture=neutral, PublicKeyToken=null]], System.Collections.Generic.IReadOnlyCollection`1[[LLama.Abstractions.LoraAdapter, LLamaSharp, Version=0.0.0.0, Culture=neutral, PublicKeyToken=null]], System.IEquatable`1[[LLama.Abstractions.AdapterCollection, LLamaSharp, Version=0.0.0.0, Culture=neutral, PublicKeyToken=null]]
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) β [List<LoraAdapter>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.list-1) β [AdapterCollection](./llama.abstractions.adaptercollection.md)
+Implements [IList<LoraAdapter>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ilist-1), [ICollection<LoraAdapter>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.icollection-1), [IEnumerable<LoraAdapter>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1), [IEnumerable](https://docs.microsoft.com/en-us/dotnet/api/system.collections.ienumerable), [IList](https://docs.microsoft.com/en-us/dotnet/api/system.collections.ilist), [ICollection](https://docs.microsoft.com/en-us/dotnet/api/system.collections.icollection), [IReadOnlyList<LoraAdapter>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ireadonlylist-1), [IReadOnlyCollection<LoraAdapter>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ireadonlycollection-1), [IEquatable<AdapterCollection>](https://docs.microsoft.com/en-us/dotnet/api/system.iequatable-1)
+
+## Properties
+
+### **Capacity**
+
+```csharp
+public int Capacity { get; set; }
+```
+
+#### Property Value
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **Count**
+
+```csharp
+public int Count { get; }
+```
+
+#### Property Value
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **Item**
+
+```csharp
+public LoraAdapter Item { get; set; }
+```
+
+#### Property Value
+
+[LoraAdapter](./llama.abstractions.loraadapter.md)
+
+## Constructors
+
+### **AdapterCollection()**
+
+```csharp
+public AdapterCollection()
+```
+
+## Methods
+
+### **Equals(AdapterCollection)**
+
+```csharp
+public bool Equals(AdapterCollection other)
+```
+
+#### Parameters
+
+`other` [AdapterCollection](./llama.abstractions.adaptercollection.md)
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **Equals(Object)**
+
+```csharp
+public bool Equals(object obj)
+```
+
+#### Parameters
+
+`obj` [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object)
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **GetHashCode()**
+
+```csharp
+public int GetHashCode()
+```
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
diff --git a/docs/xmldocs/llama.abstractions.icontextparams.md b/docs/xmldocs/llama.abstractions.icontextparams.md
new file mode 100644
index 00000000..1cfc4794
--- /dev/null
+++ b/docs/xmldocs/llama.abstractions.icontextparams.md
@@ -0,0 +1,252 @@
+# IContextParams
+
+Namespace: LLama.Abstractions
+
+The parameters for initializing a LLama context from a model.
+
+```csharp
+public interface IContextParams
+```
+
+## Properties
+
+### **ContextSize**
+
+Model context size (n_ctx)
+
+```csharp
+public abstract Nullable ContextSize { get; }
+```
+
+#### Property Value
+
+[Nullable<UInt32>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
+
+### **BatchSize**
+
+batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
+
+```csharp
+public abstract uint BatchSize { get; }
+```
+
+#### Property Value
+
+[UInt32](https://docs.microsoft.com/en-us/dotnet/api/system.uint32)
+
+### **Seed**
+
+Seed for the random number generator (seed)
+
+```csharp
+public abstract uint Seed { get; }
+```
+
+#### Property Value
+
+[UInt32](https://docs.microsoft.com/en-us/dotnet/api/system.uint32)
+
+### **EmbeddingMode**
+
+Whether to use embedding mode. (embedding) Note that if this is set to true,
+ The LLamaModel won't produce text response anymore.
+
+```csharp
+public abstract bool EmbeddingMode { get; }
+```
+
+#### Property Value
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **RopeFrequencyBase**
+
+RoPE base frequency (null to fetch from the model)
+
+```csharp
+public abstract Nullable RopeFrequencyBase { get; }
+```
+
+#### Property Value
+
+[Nullable<Single>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
+
+### **RopeFrequencyScale**
+
+RoPE frequency scaling factor (null to fetch from the model)
+
+```csharp
+public abstract Nullable RopeFrequencyScale { get; }
+```
+
+#### Property Value
+
+[Nullable<Single>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
+
+### **Encoding**
+
+The encoding to use for models
+
+```csharp
+public abstract Encoding Encoding { get; }
+```
+
+#### Property Value
+
+[Encoding](https://docs.microsoft.com/en-us/dotnet/api/system.text.encoding)
+
+### **Threads**
+
+Number of threads (null = autodetect) (n_threads)
+
+```csharp
+public abstract Nullable Threads { get; }
+```
+
+#### Property Value
+
+[Nullable<UInt32>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
+
+### **BatchThreads**
+
+Number of threads to use for batch processing (null = autodetect) (n_threads)
+
+```csharp
+public abstract Nullable BatchThreads { get; }
+```
+
+#### Property Value
+
+[Nullable<UInt32>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
+
+### **YarnExtrapolationFactor**
+
+YaRN extrapolation mix factor (null = from model)
+
+```csharp
+public abstract Nullable YarnExtrapolationFactor { get; }
+```
+
+#### Property Value
+
+[Nullable<Single>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
+
+### **YarnAttentionFactor**
+
+YaRN magnitude scaling factor (null = from model)
+
+```csharp
+public abstract Nullable YarnAttentionFactor { get; }
+```
+
+#### Property Value
+
+[Nullable<Single>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
+
+### **YarnBetaFast**
+
+YaRN low correction dim (null = from model)
+
+```csharp
+public abstract Nullable YarnBetaFast { get; }
+```
+
+#### Property Value
+
+[Nullable<Single>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
+
+### **YarnBetaSlow**
+
+YaRN high correction dim (null = from model)
+
+```csharp
+public abstract Nullable YarnBetaSlow { get; }
+```
+
+#### Property Value
+
+[Nullable<Single>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
+
+### **YarnOriginalContext**
+
+YaRN original context length (null = from model)
+
+```csharp
+public abstract Nullable YarnOriginalContext { get; }
+```
+
+#### Property Value
+
+[Nullable<UInt32>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
+
+### **YarnScalingType**
+
+YaRN scaling method to use.
+
+```csharp
+public abstract Nullable YarnScalingType { get; }
+```
+
+#### Property Value
+
+[Nullable<RopeScalingType>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
+
+### **TypeK**
+
+Override the type of the K cache
+
+```csharp
+public abstract Nullable TypeK { get; }
+```
+
+#### Property Value
+
+[Nullable<GGMLType>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
+
+### **TypeV**
+
+Override the type of the V cache
+
+```csharp
+public abstract Nullable TypeV { get; }
+```
+
+#### Property Value
+
+[Nullable<GGMLType>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
+
+### **NoKqvOffload**
+
+Whether to disable offloading the KQV cache to the GPU
+
+```csharp
+public abstract bool NoKqvOffload { get; }
+```
+
+#### Property Value
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **DefragThreshold**
+
+defragment the KV cache if holes/size > defrag_threshold, Set to < 0 to disable (default)
+
+```csharp
+public abstract float DefragThreshold { get; }
+```
+
+#### Property Value
+
+[Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+### **DoPooling**
+
+Whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
+
+```csharp
+public abstract bool DoPooling { get; }
+```
+
+#### Property Value
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
diff --git a/docs/xmldocs/llama.abstractions.ihistorytransform.md b/docs/xmldocs/llama.abstractions.ihistorytransform.md
index 729e457e..b76503ac 100644
--- a/docs/xmldocs/llama.abstractions.ihistorytransform.md
+++ b/docs/xmldocs/llama.abstractions.ihistorytransform.md
@@ -47,3 +47,15 @@ The chat history as plain text.
[ChatHistory](./llama.common.chathistory.md)
The updated history.
+
+### **Clone()**
+
+Copy the transform.
+
+```csharp
+IHistoryTransform Clone()
+```
+
+#### Returns
+
+[IHistoryTransform](./llama.abstractions.ihistorytransform.md)
diff --git a/docs/xmldocs/llama.abstractions.iinferenceparams.md b/docs/xmldocs/llama.abstractions.iinferenceparams.md
index 5b48b8d5..4a03092a 100644
--- a/docs/xmldocs/llama.abstractions.iinferenceparams.md
+++ b/docs/xmldocs/llama.abstractions.iinferenceparams.md
@@ -40,79 +40,55 @@ public abstract int MaxTokens { get; set; }
logit bias for specific tokens
```csharp
-public abstract Dictionary LogitBias { get; set; }
+public abstract Dictionary LogitBias { get; set; }
```
#### Property Value
-[Dictionary<Int32, Single>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.dictionary-2)
+[Dictionary<LLamaToken, Single>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.dictionary-2)
### **AntiPrompts**
Sequences where the model will stop generating further tokens.
```csharp
-public abstract IEnumerable AntiPrompts { get; set; }
+public abstract IReadOnlyList AntiPrompts { get; set; }
```
#### Property Value
-[IEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
+[IReadOnlyList<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ireadonlylist-1)
-### **PathSession**
-
-path to file for saving/loading model eval state
-
-```csharp
-public abstract string PathSession { get; set; }
-```
-
-#### Property Value
-
-[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **InputSuffix**
-
-string to suffix user inputs with
-
-```csharp
-public abstract string InputSuffix { get; set; }
-```
-
-#### Property Value
-
-[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **InputPrefix**
+### **TopK**
-string to prefix user inputs with
+0 or lower to use vocab size
```csharp
-public abstract string InputPrefix { get; set; }
+public abstract int TopK { get; set; }
```
#### Property Value
-[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-### **TopK**
+### **TopP**
-0 or lower to use vocab size
+1.0 = disabled
```csharp
-public abstract int TopK { get; set; }
+public abstract float TopP { get; set; }
```
#### Property Value
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+[Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-### **TopP**
+### **MinP**
-1.0 = disabled
+0.0 = disabled
```csharp
-public abstract float TopP { get; set; }
+public abstract float MinP { get; set; }
```
#### Property Value
@@ -266,3 +242,15 @@ public abstract SafeLLamaGrammarHandle Grammar { get; set; }
#### Property Value
[SafeLLamaGrammarHandle](./llama.native.safellamagrammarhandle.md)
+
+### **SamplingPipeline**
+
+Set a custom sampling pipeline to use. If this is set, all other sampling parameters are ignored!
+
+```csharp
+public abstract ISamplingPipeline SamplingPipeline { get; set; }
+```
+
+#### Property Value
+
+[ISamplingPipeline](./llama.sampling.isamplingpipeline.md)
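+
+A hedged sketch of wiring in a custom pipeline, assuming the `DefaultSamplingPipeline` type from `LLama.Sampling` and its `Temperature` property; any `ISamplingPipeline` implementation can be used instead.
+
+```csharp
+using LLama.Common;
+using LLama.Sampling;
+
+var inferenceParams = new InferenceParams
+{
+    MaxTokens = 128,
+    // Once a pipeline is assigned, the other sampling properties on
+    // IInferenceParams (TopK, TopP, Temperature, ...) are ignored.
+    SamplingPipeline = new DefaultSamplingPipeline
+    {
+        Temperature = 0.6f,   // assumed property on DefaultSamplingPipeline
+    },
+};
+```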
diff --git a/docs/xmldocs/llama.abstractions.illamaexecutor.md b/docs/xmldocs/llama.abstractions.illamaexecutor.md
index 3091b6f3..72eab1bc 100644
--- a/docs/xmldocs/llama.abstractions.illamaexecutor.md
+++ b/docs/xmldocs/llama.abstractions.illamaexecutor.md
@@ -22,30 +22,43 @@ public abstract LLamaContext Context { get; }
[LLamaContext](./llama.llamacontext.md)
-## Methods
+### **IsMultiModal**
+
+Identify if it's a multi-modal model and there is an image to process.
+
+```csharp
+public abstract bool IsMultiModal { get; }
+```
+
+#### Property Value
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-### **Infer(String, IInferenceParams, CancellationToken)**
+### **ClipModel**
-Infers a response from the model.
+Multi-Modal Projections / Clip Model weights
```csharp
-IEnumerable Infer(string text, IInferenceParams inferenceParams, CancellationToken token)
+public abstract LLavaWeights ClipModel { get; }
```
-#### Parameters
+#### Property Value
-`text` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-Your prompt
+[LLavaWeights](./llama.llavaweights.md)
-`inferenceParams` [IInferenceParams](./llama.abstractions.iinferenceparams.md)
-Any additional parameters
+### **ImagePaths**
-`token` [CancellationToken](https://docs.microsoft.com/en-us/dotnet/api/system.threading.cancellationtoken)
-A cancellation token.
+List of images: image filename and path (JPEG images).
-#### Returns
+```csharp
+public abstract List ImagePaths { get; set; }
+```
+
+#### Property Value
-[IEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
+[List<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.list-1)
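+
+A rough sketch of how an implementation such as `InteractiveExecutor` is typically set up for multi-modal use; the constructor overload taking `LLavaWeights` and the file paths below are assumptions, not documented guarantees.
+
+```csharp
+using LLama;
+using LLama.Common;
+
+var parameters = new ModelParams("path/to/llava-model.gguf");
+using var model = LLamaWeights.LoadFromFile(parameters);
+using var clipModel = LLavaWeights.LoadFromFile("path/to/mmproj.gguf");
+using var context = model.CreateContext(parameters);
+
+// Passing the clip weights makes IsMultiModal true for this executor.
+var executor = new InteractiveExecutor(context, clipModel);
+executor.ImagePaths.Add("path/to/image.jpg");
+```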
+
+## Methods
### **InferAsync(String, IInferenceParams, CancellationToken)**
diff --git a/docs/xmldocs/llama.abstractions.illamaparams.md b/docs/xmldocs/llama.abstractions.illamaparams.md
new file mode 100644
index 00000000..e3d598db
--- /dev/null
+++ b/docs/xmldocs/llama.abstractions.illamaparams.md
@@ -0,0 +1,15 @@
+# ILLamaParams
+
+Namespace: LLama.Abstractions
+
+Convenience interface for implementing both types of parameters.
+
+```csharp
+public interface ILLamaParams : IModelParams, IContextParams
+```
+
+Implements [IModelParams](./llama.abstractions.imodelparams.md), [IContextParams](./llama.abstractions.icontextparams.md)
+
+**Remarks:**
+
+Mostly exists for backwards compatibility reasons, from when these two parameter sets were not split.
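+
+As a sketch (assuming a placeholder model path, and that `ModelParams` from `LLama.Common` implements this interface), a single object can serve both the weight-loading and context-creation APIs:
+
+```csharp
+using LLama;
+using LLama.Abstractions;
+using LLama.Common;
+
+ILLamaParams parameters = new ModelParams("path/to/model.gguf")
+{
+    ContextSize = 4096,
+    GpuLayerCount = 16,
+};
+
+using var model = LLamaWeights.LoadFromFile(parameters);   // uses the IModelParams half
+using var context = model.CreateContext(parameters);       // uses the IContextParams half
+```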
diff --git a/docs/xmldocs/llama.abstractions.imodelparams.md b/docs/xmldocs/llama.abstractions.imodelparams.md
index 140cfaf1..f319a49e 100644
--- a/docs/xmldocs/llama.abstractions.imodelparams.md
+++ b/docs/xmldocs/llama.abstractions.imodelparams.md
@@ -10,21 +10,10 @@ public interface IModelParams
## Properties
-### **ContextSize**
-
-Model context size (n_ctx)
-
-```csharp
-public abstract int ContextSize { get; set; }
-```
-
-#### Property Value
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
### **MainGpu**
-the GPU that is used for scratch and small tensors
+main_gpu interpretation depends on split_mode:
+
+- `None`: the GPU that is used for the entire model.
+- `Row`: the GPU that is used for small tensors and intermediate results.
+- `Layer`: ignored.
```csharp
public abstract int MainGpu { get; set; }
@@ -34,60 +23,36 @@ public abstract int MainGpu { get; set; }
[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-### **LowVram**
+### **SplitMode**
-if true, reduce VRAM usage at the cost of performance
+How to split the model across multiple GPUs
```csharp
-public abstract bool LowVram { get; set; }
+public abstract GPUSplitMode SplitMode { get; }
```
#### Property Value
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+[GPUSplitMode](./llama.native.gpusplitmode.md)
### **GpuLayerCount**
Number of layers to run in VRAM / GPU memory (n_gpu_layers)
```csharp
-public abstract int GpuLayerCount { get; set; }
+public abstract int GpuLayerCount { get; }
```
#### Property Value
[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-### **Seed**
-
-Seed for the random number generator (seed)
-
-```csharp
-public abstract int Seed { get; set; }
-```
-
-#### Property Value
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **UseFp16Memory**
-
-Use f16 instead of f32 for memory kv (memory_f16)
-
-```csharp
-public abstract bool UseFp16Memory { get; set; }
-```
-
-#### Property Value
-
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-
### **UseMemorymap**
Use mmap for faster loads (use_mmap)
```csharp
-public abstract bool UseMemorymap { get; set; }
+public abstract bool UseMemorymap { get; }
```
#### Property Value
@@ -99,19 +64,7 @@ public abstract bool UseMemorymap { get; set; }
Use mlock to keep model in memory (use_mlock)
```csharp
-public abstract bool UseMemoryLock { get; set; }
-```
-
-#### Property Value
-
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-
-### **Perplexity**
-
-Compute perplexity over the prompt (perplexity)
-
-```csharp
-public abstract bool Perplexity { get; set; }
+public abstract bool UseMemoryLock { get; }
```
#### Property Value
@@ -123,154 +76,69 @@ public abstract bool Perplexity { get; set; }
Model path (model)
```csharp
-public abstract string ModelPath { get; set; }
+public abstract string ModelPath { get; }
```
#### Property Value
[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-### **ModelAlias**
-
-model alias
-
-```csharp
-public abstract string ModelAlias { get; set; }
-```
-
-#### Property Value
-
-[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **LoraAdapter**
-
-lora adapter path (lora_adapter)
-
-```csharp
-public abstract string LoraAdapter { get; set; }
-```
-
-#### Property Value
-
-[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **LoraBase**
-
-base model path for the lora adapter (lora_base)
-
-```csharp
-public abstract string LoraBase { get; set; }
-```
-
-#### Property Value
-
-[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **Threads**
-
-Number of threads (-1 = autodetect) (n_threads)
-
-```csharp
-public abstract int Threads { get; set; }
-```
-
-#### Property Value
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **BatchSize**
-
-batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
-
-```csharp
-public abstract int BatchSize { get; set; }
-```
-
-#### Property Value
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **ConvertEosToNewLine**
-
-Whether to convert eos to newline during the inference.
-
-```csharp
-public abstract bool ConvertEosToNewLine { get; set; }
-```
-
-#### Property Value
-
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-
-### **EmbeddingMode**
-
-Whether to use embedding mode. (embedding) Note that if this is set to true,
- The LLamaModel won't produce text response anymore.
-
-```csharp
-public abstract bool EmbeddingMode { get; set; }
-```
-
-#### Property Value
-
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-
### **TensorSplits**
how split tensors should be distributed across GPUs
```csharp
-public abstract Single[] TensorSplits { get; set; }
+public abstract TensorSplitsCollection TensorSplits { get; }
```
#### Property Value
-[Single[]](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+[TensorSplitsCollection](./llama.abstractions.tensorsplitscollection.md)
-### **RopeFrequencyBase**
+### **VocabOnly**
-RoPE base frequency
+Load vocab only (no weights)
```csharp
-public abstract float RopeFrequencyBase { get; set; }
+public abstract bool VocabOnly { get; }
```
#### Property Value
-[Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-### **RopeFrequencyScale**
+### **LoraAdapters**
-RoPE frequency scaling factor
+List of LoRA adapters to apply
```csharp
-public abstract float RopeFrequencyScale { get; set; }
+public abstract AdapterCollection LoraAdapters { get; }
```
#### Property Value
-[Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+[AdapterCollection](./llama.abstractions.adaptercollection.md)
-### **MulMatQ**
+### **LoraBase**
-Use experimental mul_mat_q kernels
+base model path for the lora adapter (lora_base)
```csharp
-public abstract bool MulMatQ { get; set; }
+public abstract string LoraBase { get; }
```
#### Property Value
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-### **Encoding**
+### **MetadataOverrides**
-The encoding to use for models
+Override specific metadata items in the model
```csharp
-public abstract Encoding Encoding { get; set; }
+public abstract List MetadataOverrides { get; }
```
#### Property Value
-[Encoding](https://docs.microsoft.com/en-us/dotnet/api/system.text.encoding)
+[List<MetadataOverride>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.list-1)
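+
+A short sketch of configuring these members through `ModelParams` and loading weights from them; the path, layer count, and the `GPUSplitMode.Layer` member name are illustrative assumptions.
+
+```csharp
+using LLama;
+using LLama.Common;
+using LLama.Native;
+
+var parameters = new ModelParams("path/to/model.gguf")
+{
+    GpuLayerCount = 20,
+    SplitMode = GPUSplitMode.Layer,   // assumed enum member: split by layer across GPUs
+    UseMemorymap = true,
+};
+
+using var model = LLamaWeights.LoadFromFile(parameters);
+```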
diff --git a/docs/xmldocs/llama.abstractions.itextstreamtransform.md b/docs/xmldocs/llama.abstractions.itextstreamtransform.md
index caa50ac5..69e163aa 100644
--- a/docs/xmldocs/llama.abstractions.itextstreamtransform.md
+++ b/docs/xmldocs/llama.abstractions.itextstreamtransform.md
@@ -10,34 +10,30 @@ public interface ITextStreamTransform
## Methods
-### **Transform(IEnumerable<String>)**
+### **TransformAsync(IAsyncEnumerable<String>)**
-Takes a stream of tokens and transforms them, returning a new stream of tokens.
+Takes a stream of tokens and transforms them, returning a new stream of tokens asynchronously.
```csharp
-IEnumerable Transform(IEnumerable tokens)
+IAsyncEnumerable TransformAsync(IAsyncEnumerable tokens)
```
#### Parameters
-`tokens` [IEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
+`tokens` [IAsyncEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.iasyncenumerable-1)
#### Returns
-[IEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
+[IAsyncEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.iasyncenumerable-1)
-### **TransformAsync(IAsyncEnumerable<String>)**
+### **Clone()**
-Takes a stream of tokens and transforms them, returning a new stream of tokens asynchronously.
+Copy the transform.
```csharp
-IAsyncEnumerable TransformAsync(IAsyncEnumerable tokens)
+ITextStreamTransform Clone()
```
-#### Parameters
-
-`tokens` [IAsyncEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.iasyncenumerable-1)
-
#### Returns
-[IAsyncEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.iasyncenumerable-1)
+[ITextStreamTransform](./llama.abstractions.itextstreamtransform.md)
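+
+A minimal illustrative implementation of the updated interface; the uppercase transform itself is arbitrary.
+
+```csharp
+using System.Collections.Generic;
+using LLama.Abstractions;
+
+public class UppercaseStreamTransform : ITextStreamTransform
+{
+    public async IAsyncEnumerable<string> TransformAsync(IAsyncEnumerable<string> tokens)
+    {
+        // Forward each chunk of generated text after transforming it.
+        await foreach (var token in tokens)
+            yield return token.ToUpperInvariant();
+    }
+
+    // Stateless, so a new instance is an adequate copy.
+    public ITextStreamTransform Clone() => new UppercaseStreamTransform();
+}
+```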
diff --git a/docs/xmldocs/llama.abstractions.itexttransform.md b/docs/xmldocs/llama.abstractions.itexttransform.md
index df026ae5..f38c028d 100644
--- a/docs/xmldocs/llama.abstractions.itexttransform.md
+++ b/docs/xmldocs/llama.abstractions.itexttransform.md
@@ -31,3 +31,15 @@ string Transform(string text)
#### Returns
[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **Clone()**
+
+Copy the transform.
+
+```csharp
+ITextTransform Clone()
+```
+
+#### Returns
+
+[ITextTransform](./llama.abstractions.itexttransform.md)
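+
+A minimal sketch of a text transform with the new `Clone()` member; the trimming behaviour is just an example.
+
+```csharp
+using LLama.Abstractions;
+
+public class TrimTextTransform : ITextTransform
+{
+    public string Transform(string text) => text.Trim();
+
+    public ITextTransform Clone() => new TrimTextTransform();
+}
+```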
diff --git a/docs/xmldocs/llama.abstractions.loraadapter.md b/docs/xmldocs/llama.abstractions.loraadapter.md
new file mode 100644
index 00000000..09d48743
--- /dev/null
+++ b/docs/xmldocs/llama.abstractions.loraadapter.md
@@ -0,0 +1,118 @@
+# LoraAdapter
+
+Namespace: LLama.Abstractions
+
+A LoRA adapter to apply to a model
+
+```csharp
+public struct LoraAdapter
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [LoraAdapter](./llama.abstractions.loraadapter.md)
+Implements [IEquatable<LoraAdapter>](https://docs.microsoft.com/en-us/dotnet/api/system.iequatable-1)
+
+## Properties
+
+### **Path**
+
+Path to the LoRA file
+
+```csharp
+public string Path { get; set; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **Scale**
+
+Strength of this LoRA
+
+```csharp
+public float Scale { get; set; }
+```
+
+#### Property Value
+
+[Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+## Constructors
+
+### **LoraAdapter(String, Single)**
+
+A LoRA adapter to apply to a model
+
+```csharp
+LoraAdapter(string Path, float Scale)
+```
+
+#### Parameters
+
+`Path` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+Path to the LoRA file
+
+`Scale` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+Strength of this LoRA
+
+## Methods
+
+### **ToString()**
+
+```csharp
+string ToString()
+```
+
+#### Returns
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **GetHashCode()**
+
+```csharp
+int GetHashCode()
+```
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **Equals(Object)**
+
+```csharp
+bool Equals(object obj)
+```
+
+#### Parameters
+
+`obj` [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object)
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **Equals(LoraAdapter)**
+
+```csharp
+bool Equals(LoraAdapter other)
+```
+
+#### Parameters
+
+`other` [LoraAdapter](./llama.abstractions.loraadapter.md)
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **Deconstruct(String&, Single&)**
+
+```csharp
+void Deconstruct(String& Path, Single& Scale)
+```
+
+#### Parameters
+
+`Path` [String&](https://docs.microsoft.com/en-us/dotnet/api/system.string&)
+
+`Scale` [Single&](https://docs.microsoft.com/en-us/dotnet/api/system.single&)
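+
+A small usage sketch; the file paths and scale are placeholders, and it assumes `ModelParams.LoraAdapters` behaves as an ordinary list of `LoraAdapter` values.
+
+```csharp
+using LLama.Abstractions;
+using LLama.Common;
+
+var parameters = new ModelParams("path/to/base-model.gguf");
+
+// Apply a LoRA at 75% strength when the weights are loaded.
+parameters.LoraAdapters.Add(new LoraAdapter("path/to/adapter.gguf", 0.75f));
+
+// The record struct also supports deconstruction.
+var (path, scale) = parameters.LoraAdapters[0];
+```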
diff --git a/docs/xmldocs/llama.abstractions.metadataoverride.md b/docs/xmldocs/llama.abstractions.metadataoverride.md
new file mode 100644
index 00000000..428c7262
--- /dev/null
+++ b/docs/xmldocs/llama.abstractions.metadataoverride.md
@@ -0,0 +1,150 @@
+# MetadataOverride
+
+Namespace: LLama.Abstractions
+
+An override for a single key/value pair in model metadata
+
+```csharp
+public sealed class MetadataOverride : System.IEquatable`1[[LLama.Abstractions.MetadataOverride, LLamaSharp, Version=0.0.0.0, Culture=neutral, PublicKeyToken=null]]
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [MetadataOverride](./llama.abstractions.metadataoverride.md)
+Implements [IEquatable<MetadataOverride>](https://docs.microsoft.com/en-us/dotnet/api/system.iequatable-1)
+
+## Properties
+
+### **Key**
+
+Get the key being overridden by this override
+
+```csharp
+public string Key { get; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+## Constructors
+
+### **MetadataOverride(String, Int32)**
+
+Create a new override for an int key
+
+```csharp
+public MetadataOverride(string key, int value)
+```
+
+#### Parameters
+
+`key` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`value` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **MetadataOverride(String, Single)**
+
+Create a new override for a float key
+
+```csharp
+public MetadataOverride(string key, float value)
+```
+
+#### Parameters
+
+`key` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`value` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+### **MetadataOverride(String, Boolean)**
+
+Create a new override for a boolean key
+
+```csharp
+public MetadataOverride(string key, bool value)
+```
+
+#### Parameters
+
+`key` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`value` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+## Methods
+
+### **WriteValue(LLamaModelMetadataOverride&)**
+
+```csharp
+internal void WriteValue(LLamaModelMetadataOverride& dest)
+```
+
+#### Parameters
+
+`dest` [LLamaModelMetadataOverride&](./llama.native.llamamodelmetadataoverride&.md)
+
+### **WriteValue(Utf8JsonWriter)**
+
+```csharp
+internal void WriteValue(Utf8JsonWriter writer)
+```
+
+#### Parameters
+
+`writer` Utf8JsonWriter
+
+### **ToString()**
+
+```csharp
+public string ToString()
+```
+
+#### Returns
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **GetHashCode()**
+
+```csharp
+public int GetHashCode()
+```
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **Equals(Object)**
+
+```csharp
+public bool Equals(object obj)
+```
+
+#### Parameters
+
+`obj` [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object)
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **Equals(MetadataOverride)**
+
+```csharp
+public bool Equals(MetadataOverride other)
+```
+
+#### Parameters
+
+`other` [MetadataOverride](./llama.abstractions.metadataoverride.md)
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **<Clone>$()**
+
+```csharp
+public MetadataOverride <Clone>$()
+```
+
+#### Returns
+
+[MetadataOverride](./llama.abstractions.metadataoverride.md)
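+
+A brief sketch of attaching overrides to `ModelParams` before loading; the keys below are hypothetical and must match metadata keys that actually exist in the GGUF file.
+
+```csharp
+using LLama.Abstractions;
+using LLama.Common;
+
+var parameters = new ModelParams("path/to/model.gguf");
+
+// Hypothetical keys, shown only to illustrate the three constructor overloads.
+parameters.MetadataOverrides.Add(new MetadataOverride("example.int_key", 42));
+parameters.MetadataOverrides.Add(new MetadataOverride("example.float_key", 1.5f));
+parameters.MetadataOverrides.Add(new MetadataOverride("example.bool_key", true));
+```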
diff --git a/docs/xmldocs/llama.abstractions.metadataoverrideconverter.md b/docs/xmldocs/llama.abstractions.metadataoverrideconverter.md
new file mode 100644
index 00000000..18afc9d3
--- /dev/null
+++ b/docs/xmldocs/llama.abstractions.metadataoverrideconverter.md
@@ -0,0 +1,65 @@
+# MetadataOverrideConverter
+
+Namespace: LLama.Abstractions
+
+A JSON converter for [MetadataOverride](./llama.abstractions.metadataoverride.md)
+
+```csharp
+public class MetadataOverrideConverter : System.Text.Json.Serialization.JsonConverter`1[[LLama.Abstractions.MetadataOverride, LLamaSharp, Version=0.0.0.0, Culture=neutral, PublicKeyToken=null]]
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → JsonConverter → JsonConverter<MetadataOverride> → [MetadataOverrideConverter](./llama.abstractions.metadataoverrideconverter.md)
+
+## Properties
+
+### **HandleNull**
+
+```csharp
+public bool HandleNull { get; }
+```
+
+#### Property Value
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+## Constructors
+
+### **MetadataOverrideConverter()**
+
+```csharp
+public MetadataOverrideConverter()
+```
+
+## Methods
+
+### **Read(Utf8JsonReader&, Type, JsonSerializerOptions)**
+
+```csharp
+public MetadataOverride Read(Utf8JsonReader& reader, Type typeToConvert, JsonSerializerOptions options)
+```
+
+#### Parameters
+
+`reader` Utf8JsonReader&
+
+`typeToConvert` [Type](https://docs.microsoft.com/en-us/dotnet/api/system.type)
+
+`options` JsonSerializerOptions
+
+#### Returns
+
+[MetadataOverride](./llama.abstractions.metadataoverride.md)
+
+### **Write(Utf8JsonWriter, MetadataOverride, JsonSerializerOptions)**
+
+```csharp
+public void Write(Utf8JsonWriter writer, MetadataOverride value, JsonSerializerOptions options)
+```
+
+#### Parameters
+
+`writer` Utf8JsonWriter
+
+`value` [MetadataOverride](./llama.abstractions.metadataoverride.md)
+
+`options` JsonSerializerOptions
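+
+A rough sketch of round-tripping an override with System.Text.Json using this converter; in practice the converter may already be applied via an attribute on `MetadataOverride`, in which case explicit registration is redundant but harmless.
+
+```csharp
+using System.Text.Json;
+using LLama.Abstractions;
+
+var options = new JsonSerializerOptions();
+options.Converters.Add(new MetadataOverrideConverter());
+
+string json = JsonSerializer.Serialize(new MetadataOverride("example.key", 1.5f), options);
+MetadataOverride? restored = JsonSerializer.Deserialize<MetadataOverride>(json, options);
+```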
diff --git a/docs/xmldocs/llama.abstractions.tensorsplitscollection.md b/docs/xmldocs/llama.abstractions.tensorsplitscollection.md
new file mode 100644
index 00000000..d5723745
--- /dev/null
+++ b/docs/xmldocs/llama.abstractions.tensorsplitscollection.md
@@ -0,0 +1,92 @@
+# TensorSplitsCollection
+
+Namespace: LLama.Abstractions
+
+A fixed size array to set the tensor splits across multiple GPUs
+
+```csharp
+public sealed class TensorSplitsCollection : System.Collections.Generic.IEnumerable`1[[System.Single, System.Private.CoreLib, Version=6.0.0.0, Culture=neutral, PublicKeyToken=7cec85d7bea7798e]], System.Collections.IEnumerable
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [TensorSplitsCollection](./llama.abstractions.tensorsplitscollection.md)
+Implements [IEnumerable<Single>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1), [IEnumerable](https://docs.microsoft.com/en-us/dotnet/api/system.collections.ienumerable)
+
+## Properties
+
+### **Length**
+
+The size of this array
+
+```csharp
+public int Length { get; }
+```
+
+#### Property Value
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **Item**
+
+```csharp
+public float Item { get; set; }
+```
+
+#### Property Value
+
+[Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+## Constructors
+
+### **TensorSplitsCollection(Single[])**
+
+Create a new tensor splits collection, copying the given values
+
+```csharp
+public TensorSplitsCollection(Single[] splits)
+```
+
+#### Parameters
+
+`splits` [Single[]](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+#### Exceptions
+
+[ArgumentException](https://docs.microsoft.com/en-us/dotnet/api/system.argumentexception)
+
+### **TensorSplitsCollection()**
+
+Create a new tensor splits collection with all values initialised to the default
+
+```csharp
+public TensorSplitsCollection()
+```
+
+## Methods
+
+### **Clear()**
+
+Set all values to zero
+
+```csharp
+public void Clear()
+```
+
+### **Pin()**
+
+```csharp
+internal MemoryHandle Pin()
+```
+
+#### Returns
+
+[MemoryHandle](https://docs.microsoft.com/en-us/dotnet/api/system.buffers.memoryhandle)
+
+### **GetEnumerator()**
+
+```csharp
+public IEnumerator GetEnumerator()
+```
+
+#### Returns
+
+[IEnumerator<Single>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerator-1)
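+
+A short sketch using the documented indexer to weight two GPUs; the ratio is illustrative, and it assumes the default collection has at least two slots.
+
+```csharp
+using LLama.Common;
+
+var parameters = new ModelParams("path/to/model.gguf");
+
+// Fixed-size collection: assign a relative weight per GPU in place.
+parameters.TensorSplits[0] = 3f;   // roughly 75% of the split tensors on GPU 0
+parameters.TensorSplits[1] = 1f;   // roughly 25% on GPU 1
+```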
diff --git a/docs/xmldocs/llama.abstractions.tensorsplitscollectionconverter.md b/docs/xmldocs/llama.abstractions.tensorsplitscollectionconverter.md
new file mode 100644
index 00000000..3b16aade
--- /dev/null
+++ b/docs/xmldocs/llama.abstractions.tensorsplitscollectionconverter.md
@@ -0,0 +1,65 @@
+# TensorSplitsCollectionConverter
+
+Namespace: LLama.Abstractions
+
+A JSON converter for [TensorSplitsCollection](./llama.abstractions.tensorsplitscollection.md)
+
+```csharp
+public class TensorSplitsCollectionConverter : System.Text.Json.Serialization.JsonConverter`1[[LLama.Abstractions.TensorSplitsCollection, LLamaSharp, Version=0.0.0.0, Culture=neutral, PublicKeyToken=null]]
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → JsonConverter → JsonConverter<TensorSplitsCollection> → [TensorSplitsCollectionConverter](./llama.abstractions.tensorsplitscollectionconverter.md)
+
+## Properties
+
+### **HandleNull**
+
+```csharp
+public bool HandleNull { get; }
+```
+
+#### Property Value
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+## Constructors
+
+### **TensorSplitsCollectionConverter()**
+
+```csharp
+public TensorSplitsCollectionConverter()
+```
+
+## Methods
+
+### **Read(Utf8JsonReader&, Type, JsonSerializerOptions)**
+
+```csharp
+public TensorSplitsCollection Read(Utf8JsonReader& reader, Type typeToConvert, JsonSerializerOptions options)
+```
+
+#### Parameters
+
+`reader` Utf8JsonReader&
+
+`typeToConvert` [Type](https://docs.microsoft.com/en-us/dotnet/api/system.type)
+
+`options` JsonSerializerOptions
+
+#### Returns
+
+[TensorSplitsCollection](./llama.abstractions.tensorsplitscollection.md)
+
+### **Write(Utf8JsonWriter, TensorSplitsCollection, JsonSerializerOptions)**
+
+```csharp
+public void Write(Utf8JsonWriter writer, TensorSplitsCollection value, JsonSerializerOptions options)
+```
+
+#### Parameters
+
+`writer` Utf8JsonWriter
+
+`value` [TensorSplitsCollection](./llama.abstractions.tensorsplitscollection.md)
+
+`options` JsonSerializerOptions
diff --git a/docs/xmldocs/llama.antipromptprocessor.md b/docs/xmldocs/llama.antipromptprocessor.md
new file mode 100644
index 00000000..73edc277
--- /dev/null
+++ b/docs/xmldocs/llama.antipromptprocessor.md
@@ -0,0 +1,69 @@
+# AntipromptProcessor
+
+Namespace: LLama
+
+AntipromptProcessor keeps track of past tokens, looking for any set Anti-Prompts
+
+```csharp
+public sealed class AntipromptProcessor
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [AntipromptProcessor](./llama.antipromptprocessor.md)
+
+## Constructors
+
+### **AntipromptProcessor(IEnumerable<String>)**
+
+Initializes a new instance of the [AntipromptProcessor](./llama.antipromptprocessor.md) class.
+
+```csharp
+public AntipromptProcessor(IEnumerable antiprompts)
+```
+
+#### Parameters
+
+`antiprompts` [IEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
+The antiprompts.
+
+## Methods
+
+### **AddAntiprompt(String)**
+
+Add an antiprompt to the collection
+
+```csharp
+public void AddAntiprompt(string antiprompt)
+```
+
+#### Parameters
+
+`antiprompt` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **SetAntiprompts(IEnumerable<String>)**
+
+Overwrite all current antiprompts with a new set
+
+```csharp
+public void SetAntiprompts(IEnumerable antiprompts)
+```
+
+#### Parameters
+
+`antiprompts` [IEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
+
+### **Add(String)**
+
+Add some text and check if the buffer now ends with any antiprompt
+
+```csharp
+public bool Add(string text)
+```
+
+#### Parameters
+
+`text` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+true if the text buffer ends with any antiprompt
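+
+A minimal sketch of feeding streamed text through the processor; the antiprompt and text chunks are arbitrary.
+
+```csharp
+using LLama;
+
+var antiprompts = new AntipromptProcessor(new[] { "User:" });
+
+foreach (var chunk in new[] { "Hello ", "there.\n", "User:" })
+{
+    // Add returns true once the accumulated text ends with any antiprompt.
+    if (antiprompts.Add(chunk))
+        break;
+}
+```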
diff --git a/docs/xmldocs/llama.batched.alreadypromptedconversationexception.md b/docs/xmldocs/llama.batched.alreadypromptedconversationexception.md
new file mode 100644
index 00000000..227fb059
--- /dev/null
+++ b/docs/xmldocs/llama.batched.alreadypromptedconversationexception.md
@@ -0,0 +1,96 @@
+# AlreadyPromptedConversationException
+
+Namespace: LLama.Batched
+
+This exception is thrown when "Prompt()" is called on a [Conversation](./llama.batched.conversation.md) which has
+ already been prompted and before "Infer()" has been called on the associated
+ [BatchedExecutor](./llama.batched.batchedexecutor.md).
+
+```csharp
+public class AlreadyPromptedConversationException : ExperimentalBatchedExecutorException, System.Runtime.Serialization.ISerializable
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [Exception](https://docs.microsoft.com/en-us/dotnet/api/system.exception) → [ExperimentalBatchedExecutorException](./llama.batched.experimentalbatchedexecutorexception.md) → [AlreadyPromptedConversationException](./llama.batched.alreadypromptedconversationexception.md)
+Implements [ISerializable](https://docs.microsoft.com/en-us/dotnet/api/system.runtime.serialization.iserializable)
+
+## Properties
+
+### **TargetSite**
+
+```csharp
+public MethodBase TargetSite { get; }
+```
+
+#### Property Value
+
+[MethodBase](https://docs.microsoft.com/en-us/dotnet/api/system.reflection.methodbase)
+
+### **Message**
+
+```csharp
+public string Message { get; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **Data**
+
+```csharp
+public IDictionary Data { get; }
+```
+
+#### Property Value
+
+[IDictionary](https://docs.microsoft.com/en-us/dotnet/api/system.collections.idictionary)
+
+### **InnerException**
+
+```csharp
+public Exception InnerException { get; }
+```
+
+#### Property Value
+
+[Exception](https://docs.microsoft.com/en-us/dotnet/api/system.exception)
+
+### **HelpLink**
+
+```csharp
+public string HelpLink { get; set; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **Source**
+
+```csharp
+public string Source { get; set; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **HResult**
+
+```csharp
+public int HResult { get; set; }
+```
+
+#### Property Value
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **StackTrace**
+
+```csharp
+public string StackTrace { get; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
diff --git a/docs/xmldocs/llama.batched.batchedexecutor.md b/docs/xmldocs/llama.batched.batchedexecutor.md
new file mode 100644
index 00000000..0c3353ac
--- /dev/null
+++ b/docs/xmldocs/llama.batched.batchedexecutor.md
@@ -0,0 +1,151 @@
+# BatchedExecutor
+
+Namespace: LLama.Batched
+
+A batched executor that can infer multiple separate "conversations" simultaneously.
+
+```csharp
+public sealed class BatchedExecutor : System.IDisposable
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [BatchedExecutor](./llama.batched.batchedexecutor.md)
+Implements [IDisposable](https://docs.microsoft.com/en-us/dotnet/api/system.idisposable)
+
+## Properties
+
+### **Context**
+
+The [LLamaContext](./llama.llamacontext.md) this executor is using
+
+```csharp
+public LLamaContext Context { get; }
+```
+
+#### Property Value
+
+[LLamaContext](./llama.llamacontext.md)
+
+### **Model**
+
+The [LLamaWeights](./llama.llamaweights.md) this executor is using
+
+```csharp
+public LLamaWeights Model { get; }
+```
+
+#### Property Value
+
+[LLamaWeights](./llama.llamaweights.md)
+
+### **BatchedTokenCount**
+
+Get the number of tokens in the batch, waiting for [BatchedExecutor.Infer(CancellationToken)](./llama.batched.batchedexecutor.md#infercancellationtoken) to be called
+
+```csharp
+public int BatchedTokenCount { get; }
+```
+
+#### Property Value
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **IsDisposed**
+
+Check if this executor has been disposed.
+
+```csharp
+public bool IsDisposed { get; private set; }
+```
+
+#### Property Value
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+## Constructors
+
+### **BatchedExecutor(LLamaWeights, IContextParams)**
+
+Create a new batched executor
+
+```csharp
+public BatchedExecutor(LLamaWeights model, IContextParams contextParams)
+```
+
+#### Parameters
+
+`model` [LLamaWeights](./llama.llamaweights.md)
+The model to use
+
+`contextParams` [IContextParams](./llama.abstractions.icontextparams.md)
+Parameters to create a new context
+
+## Methods
+
+### **Prompt(String)**
+
+#### Caution
+
+Use BatchedExecutor.Create instead
+
+---
+
+Start a new [Conversation](./llama.batched.conversation.md) with the given prompt
+
+```csharp
+public Conversation Prompt(string prompt)
+```
+
+#### Parameters
+
+`prompt` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+#### Returns
+
+[Conversation](./llama.batched.conversation.md)
+
+### **Create()**
+
+Start a new [Conversation](./llama.batched.conversation.md)
+
+```csharp
+public Conversation Create()
+```
+
+#### Returns
+
+[Conversation](./llama.batched.conversation.md)
+
+### **Infer(CancellationToken)**
+
+Run inference for all conversations in the batch which have pending tokens.
+
+ If the result is `NoKvSlot` then there is not enough memory for inference; try disposing some conversation
+ threads and running inference again.
+
+```csharp
+public Task Infer(CancellationToken cancellation)
+```
+
+#### Parameters
+
+`cancellation` [CancellationToken](https://docs.microsoft.com/en-us/dotnet/api/system.threading.cancellationtoken)
+
+#### Returns
+
+[Task<DecodeResult>](https://docs.microsoft.com/en-us/dotnet/api/system.threading.tasks.task-1)
+
+### **Dispose()**
+
+```csharp
+public void Dispose()
+```
+
+### **GetNextSequenceId()**
+
+```csharp
+internal LLamaSeqId GetNextSequenceId()
+```
+
+#### Returns
+
+[LLamaSeqId](./llama.native.llamaseqid.md)
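+
+A rough, heavily hedged sketch of the intended flow (this API is experimental); the model path is a placeholder, and the logits returned by `Sample()` would still need a sampling step to become tokens.
+
+```csharp
+using System.Threading;
+using LLama;
+using LLama.Batched;
+using LLama.Common;
+
+var parameters = new ModelParams("path/to/model.gguf");
+using var model = LLamaWeights.LoadFromFile(parameters);
+using var executor = new BatchedExecutor(model, parameters);
+
+// Two independent conversations evaluated together in one batch.
+using var left = executor.Create();
+using var right = executor.Create();
+left.Prompt("Write a haiku about spring.");
+right.Prompt("Write a haiku about winter.");
+
+var status = await executor.Infer(CancellationToken.None);
+// If status indicates NoKvSlot there was not enough KV cache space (see above).
+// Otherwise left.Sample() / right.Sample() now expose the logits for each
+// conversation's next token.
+```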
diff --git a/docs/xmldocs/llama.batched.cannotforkwhilerequiresinferenceexception.md b/docs/xmldocs/llama.batched.cannotforkwhilerequiresinferenceexception.md
new file mode 100644
index 00000000..752f2410
--- /dev/null
+++ b/docs/xmldocs/llama.batched.cannotforkwhilerequiresinferenceexception.md
@@ -0,0 +1,94 @@
+# CannotForkWhileRequiresInferenceException
+
+Namespace: LLama.Batched
+
+This exception is thrown when [Conversation.Fork()](./llama.batched.conversation.md#fork) is called when [Conversation.RequiresInference](./llama.batched.conversation.md#requiresinference) = true
+
+```csharp
+public class CannotForkWhileRequiresInferenceException : ExperimentalBatchedExecutorException, System.Runtime.Serialization.ISerializable
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [Exception](https://docs.microsoft.com/en-us/dotnet/api/system.exception) → [ExperimentalBatchedExecutorException](./llama.batched.experimentalbatchedexecutorexception.md) → [CannotForkWhileRequiresInferenceException](./llama.batched.cannotforkwhilerequiresinferenceexception.md)
+Implements [ISerializable](https://docs.microsoft.com/en-us/dotnet/api/system.runtime.serialization.iserializable)
+
+## Properties
+
+### **TargetSite**
+
+```csharp
+public MethodBase TargetSite { get; }
+```
+
+#### Property Value
+
+[MethodBase](https://docs.microsoft.com/en-us/dotnet/api/system.reflection.methodbase)
+
+### **Message**
+
+```csharp
+public string Message { get; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **Data**
+
+```csharp
+public IDictionary Data { get; }
+```
+
+#### Property Value
+
+[IDictionary](https://docs.microsoft.com/en-us/dotnet/api/system.collections.idictionary)
+
+### **InnerException**
+
+```csharp
+public Exception InnerException { get; }
+```
+
+#### Property Value
+
+[Exception](https://docs.microsoft.com/en-us/dotnet/api/system.exception)
+
+### **HelpLink**
+
+```csharp
+public string HelpLink { get; set; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **Source**
+
+```csharp
+public string Source { get; set; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **HResult**
+
+```csharp
+public int HResult { get; set; }
+```
+
+#### Property Value
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **StackTrace**
+
+```csharp
+public string StackTrace { get; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
diff --git a/docs/xmldocs/llama.batched.cannotmodifywhilerequiresinferenceexception.md b/docs/xmldocs/llama.batched.cannotmodifywhilerequiresinferenceexception.md
new file mode 100644
index 00000000..09e20f8c
--- /dev/null
+++ b/docs/xmldocs/llama.batched.cannotmodifywhilerequiresinferenceexception.md
@@ -0,0 +1,94 @@
+# CannotModifyWhileRequiresInferenceException
+
+Namespace: LLama.Batched
+
+This exception is thrown when [Conversation.Modify(ModifyKvCache)](./llama.batched.conversation.md#modifymodifykvcache) is called when [Conversation.RequiresInference](./llama.batched.conversation.md#requiresinference) = true
+
+```csharp
+public class CannotModifyWhileRequiresInferenceException : ExperimentalBatchedExecutorException, System.Runtime.Serialization.ISerializable
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [Exception](https://docs.microsoft.com/en-us/dotnet/api/system.exception) → [ExperimentalBatchedExecutorException](./llama.batched.experimentalbatchedexecutorexception.md) → [CannotModifyWhileRequiresInferenceException](./llama.batched.cannotmodifywhilerequiresinferenceexception.md)
+Implements [ISerializable](https://docs.microsoft.com/en-us/dotnet/api/system.runtime.serialization.iserializable)
+
+## Properties
+
+### **TargetSite**
+
+```csharp
+public MethodBase TargetSite { get; }
+```
+
+#### Property Value
+
+[MethodBase](https://docs.microsoft.com/en-us/dotnet/api/system.reflection.methodbase)
+
+### **Message**
+
+```csharp
+public string Message { get; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **Data**
+
+```csharp
+public IDictionary Data { get; }
+```
+
+#### Property Value
+
+[IDictionary](https://docs.microsoft.com/en-us/dotnet/api/system.collections.idictionary)
+
+### **InnerException**
+
+```csharp
+public Exception InnerException { get; }
+```
+
+#### Property Value
+
+[Exception](https://docs.microsoft.com/en-us/dotnet/api/system.exception)
+
+### **HelpLink**
+
+```csharp
+public string HelpLink { get; set; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **Source**
+
+```csharp
+public string Source { get; set; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **HResult**
+
+```csharp
+public int HResult { get; set; }
+```
+
+#### Property Value
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **StackTrace**
+
+```csharp
+public string StackTrace { get; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
diff --git a/docs/xmldocs/llama.batched.cannotsamplerequiresinferenceexception.md b/docs/xmldocs/llama.batched.cannotsamplerequiresinferenceexception.md
new file mode 100644
index 00000000..4bda45d3
--- /dev/null
+++ b/docs/xmldocs/llama.batched.cannotsamplerequiresinferenceexception.md
@@ -0,0 +1,96 @@
+# CannotSampleRequiresInferenceException
+
+Namespace: LLama.Batched
+
+This exception is thrown when "Sample()" is called on a [Conversation](./llama.batched.conversation.md) which has
+ already been prompted and before "Infer()" has been called on the associated
+ [BatchedExecutor](./llama.batched.batchedexecutor.md).
+
+```csharp
+public class CannotSampleRequiresInferenceException : ExperimentalBatchedExecutorException, System.Runtime.Serialization.ISerializable
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [Exception](https://docs.microsoft.com/en-us/dotnet/api/system.exception) → [ExperimentalBatchedExecutorException](./llama.batched.experimentalbatchedexecutorexception.md) → [CannotSampleRequiresInferenceException](./llama.batched.cannotsamplerequiresinferenceexception.md)
+Implements [ISerializable](https://docs.microsoft.com/en-us/dotnet/api/system.runtime.serialization.iserializable)
+
+## Properties
+
+### **TargetSite**
+
+```csharp
+public MethodBase TargetSite { get; }
+```
+
+#### Property Value
+
+[MethodBase](https://docs.microsoft.com/en-us/dotnet/api/system.reflection.methodbase)
+
+### **Message**
+
+```csharp
+public string Message { get; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **Data**
+
+```csharp
+public IDictionary Data { get; }
+```
+
+#### Property Value
+
+[IDictionary](https://docs.microsoft.com/en-us/dotnet/api/system.collections.idictionary)
+
+### **InnerException**
+
+```csharp
+public Exception InnerException { get; }
+```
+
+#### Property Value
+
+[Exception](https://docs.microsoft.com/en-us/dotnet/api/system.exception)
+
+### **HelpLink**
+
+```csharp
+public string HelpLink { get; set; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **Source**
+
+```csharp
+public string Source { get; set; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **HResult**
+
+```csharp
+public int HResult { get; set; }
+```
+
+#### Property Value
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **StackTrace**
+
+```csharp
+public string StackTrace { get; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
diff --git a/docs/xmldocs/llama.batched.cannotsamplerequirespromptexception.md b/docs/xmldocs/llama.batched.cannotsamplerequirespromptexception.md
new file mode 100644
index 00000000..d3a72c7b
--- /dev/null
+++ b/docs/xmldocs/llama.batched.cannotsamplerequirespromptexception.md
@@ -0,0 +1,96 @@
+# CannotSampleRequiresPromptException
+
+Namespace: LLama.Batched
+
+This exception is thrown when "Sample()" is called on a [Conversation](./llama.batched.conversation.md) which was not
+ first prompted.
+
+```csharp
+public class CannotSampleRequiresPromptException : ExperimentalBatchedExecutorException, System.Runtime.Serialization.ISerializable
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [Exception](https://docs.microsoft.com/en-us/dotnet/api/system.exception) → [ExperimentalBatchedExecutorException](./llama.batched.experimentalbatchedexecutorexception.md) → [CannotSampleRequiresPromptException](./llama.batched.cannotsamplerequirespromptexception.md)
+Implements [ISerializable](https://docs.microsoft.com/en-us/dotnet/api/system.runtime.serialization.iserializable)
+
+## Properties
+
+### **TargetSite**
+
+```csharp
+public MethodBase TargetSite { get; }
+```
+
+#### Property Value
+
+[MethodBase](https://docs.microsoft.com/en-us/dotnet/api/system.reflection.methodbase)
+
+### **Message**
+
+```csharp
+public string Message { get; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **Data**
+
+```csharp
+public IDictionary Data { get; }
+```
+
+#### Property Value
+
+[IDictionary](https://docs.microsoft.com/en-us/dotnet/api/system.collections.idictionary)
+
+### **InnerException**
+
+```csharp
+public Exception InnerException { get; }
+```
+
+#### Property Value
+
+[Exception](https://docs.microsoft.com/en-us/dotnet/api/system.exception)
+
+### **HelpLink**
+
+```csharp
+public string HelpLink { get; set; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **Source**
+
+```csharp
+public string Source { get; set; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **HResult**
+
+```csharp
+public int HResult { get; set; }
+```
+
+#### Property Value
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **StackTrace**
+
+```csharp
+public string StackTrace { get; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
diff --git a/docs/xmldocs/llama.batched.conversation.md b/docs/xmldocs/llama.batched.conversation.md
new file mode 100644
index 00000000..115a95e4
--- /dev/null
+++ b/docs/xmldocs/llama.batched.conversation.md
@@ -0,0 +1,233 @@
+# Conversation
+
+Namespace: LLama.Batched
+
+A single conversation thread that can be prompted (adding tokens from the user) or inferred (extracting a token from the LLM)
+
+```csharp
+public sealed class Conversation : System.IDisposable
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [Conversation](./llama.batched.conversation.md)
+Implements [IDisposable](https://docs.microsoft.com/en-us/dotnet/api/system.idisposable)
+
+## Properties
+
+### **Executor**
+
+The executor which this conversation belongs to
+
+```csharp
+public BatchedExecutor Executor { get; }
+```
+
+#### Property Value
+
+[BatchedExecutor](./llama.batched.batchedexecutor.md)
+
+### **ConversationId**
+
+Unique ID for this conversation
+
+```csharp
+public LLamaSeqId ConversationId { get; }
+```
+
+#### Property Value
+
+[LLamaSeqId](./llama.native.llamaseqid.md)
+
+### **TokenCount**
+
+Total number of tokens in this conversation; this cannot exceed the context length.
+
+```csharp
+public int TokenCount { get; }
+```
+
+#### Property Value
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **IsDisposed**
+
+Indicates if this conversation has been disposed; nothing can be done with a disposed conversation
+
+```csharp
+public bool IsDisposed { get; }
+```
+
+#### Property Value
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **RequiresInference**
+
+Indicates if this conversation is waiting for inference to be run on the executor. "Prompt" and "Sample" cannot be called when this is true.
+
+```csharp
+public bool RequiresInference { get; }
+```
+
+#### Property Value
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **RequiresSampling**
+
+Indicates that this conversation should be sampled.
+
+```csharp
+public bool RequiresSampling { get; }
+```
+
+#### Property Value
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+## Methods
+
+### **Finalize()**
+
+Finalizer for Conversation
+
+```csharp
+protected void Finalize()
+```
+
+### **Dispose()**
+
+End this conversation, freeing all resources used by it
+
+```csharp
+public void Dispose()
+```
+
+#### Exceptions
+
+[ObjectDisposedException](https://docs.microsoft.com/en-us/dotnet/api/system.objectdisposedexception)
+
+### **Fork()**
+
+Create a copy of the current conversation
+
+```csharp
+public Conversation Fork()
+```
+
+#### Returns
+
+[Conversation](./llama.batched.conversation.md)
+
+#### Exceptions
+
+[ObjectDisposedException](https://docs.microsoft.com/en-us/dotnet/api/system.objectdisposedexception)
+
+**Remarks:**
+
+The copy shares internal state, so consumes very little extra memory.
+
+### **Sample()**
+
+Get the logits from this conversation, ready for sampling
+
+```csharp
+public Span Sample()
+```
+
+#### Returns
+
+[Span<Single>](https://docs.microsoft.com/en-us/dotnet/api/system.span-1)
+
+#### Exceptions
+
+[ObjectDisposedException](https://docs.microsoft.com/en-us/dotnet/api/system.objectdisposedexception)
+
+[CannotSampleRequiresPromptException](./llama.batched.cannotsamplerequirespromptexception.md)
+Thrown if this conversation was not prompted before the previous call to infer
+
+[CannotSampleRequiresInferenceException](./llama.batched.cannotsamplerequiresinferenceexception.md)
+Thrown if Infer() must be called on the executor
+
+### **Prompt(String)**
+
+Add tokens to this conversation
+
+```csharp
+public void Prompt(string input)
+```
+
+#### Parameters
+
+`input` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **Prompt(List<LLamaToken>)**
+
+Add tokens to this conversation
+
+```csharp
+public void Prompt(List tokens)
+```
+
+#### Parameters
+
+`tokens` [List<LLamaToken>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.list-1)
+
+#### Exceptions
+
+[ObjectDisposedException](https://docs.microsoft.com/en-us/dotnet/api/system.objectdisposedexception)
+
+[AlreadyPromptedConversationException](./llama.batched.alreadypromptedconversationexception.md)
+
+### **Prompt(ReadOnlySpan<LLamaToken>)**
+
+Add tokens to this conversation
+
+```csharp
+public void Prompt(ReadOnlySpan tokens)
+```
+
+#### Parameters
+
+`tokens` [ReadOnlySpan<LLamaToken>](https://docs.microsoft.com/en-us/dotnet/api/system.readonlyspan-1)
+
+#### Exceptions
+
+[ObjectDisposedException](https://docs.microsoft.com/en-us/dotnet/api/system.objectdisposedexception)
+
+[AlreadyPromptedConversationException](./llama.batched.alreadypromptedconversationexception.md)
+
+### **Prompt(LLamaToken)**
+
+Add a single token to this conversation
+
+```csharp
+public void Prompt(LLamaToken token)
+```
+
+#### Parameters
+
+`token` [LLamaToken](./llama.native.llamatoken.md)
+
+#### Exceptions
+
+[ObjectDisposedException](https://docs.microsoft.com/en-us/dotnet/api/system.objectdisposedexception)
+
+[AlreadyPromptedConversationException](./llama.batched.alreadypromptedconversationexception.md)
+
+### **Modify(ModifyKvCache)**
+
+Directly modify the KV cache of this conversation
+
+```csharp
+public void Modify(ModifyKvCache modifier)
+```
+
+#### Parameters
+
+`modifier` [ModifyKvCache](./llama.batched.conversation.modifykvcache.md)
+
+#### Exceptions
+
+[CannotModifyWhileRequiresInferenceException](./llama.batched.cannotmodifywhilerequiresinferenceexception.md)
+Thrown if this method is called while [Conversation.RequiresInference](./llama.batched.conversation.md#requiresinference) == true
diff --git a/docs/xmldocs/llama.batched.conversationextensions.md b/docs/xmldocs/llama.batched.conversationextensions.md
new file mode 100644
index 00000000..30cdfa2b
--- /dev/null
+++ b/docs/xmldocs/llama.batched.conversationextensions.md
@@ -0,0 +1,55 @@
+# ConversationExtensions
+
+Namespace: LLama.Batched
+
+Extension methods for [Conversation](./llama.batched.conversation.md)
+
+```csharp
+public static class ConversationExtensions
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ConversationExtensions](./llama.batched.conversationextensions.md)
+
+## Methods
+
+### **Rewind(Conversation, Int32)**
+
+Rewind a [Conversation](./llama.batched.conversation.md) back to an earlier state by removing tokens from the end
+
+```csharp
+public static void Rewind(Conversation conversation, int tokens)
+```
+
+#### Parameters
+
+`conversation` [Conversation](./llama.batched.conversation.md)
+The conversation to rewind
+
+`tokens` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+The number of tokens to rewind
+
+#### Exceptions
+
+[ArgumentOutOfRangeException](https://docs.microsoft.com/en-us/dotnet/api/system.argumentoutofrangeexception)
+Thrown if `tokens` parameter is larger than TokenCount
+
+### **ShiftLeft(Conversation, Int32, Int32)**
+
+Shift all tokens over to the left, removing "count" tokens from the start and shifting everything over.
+ Leaves "keep" tokens at the start completely untouched. This can be used to free up space when the context
+ gets full, keeping the prompt at the start intact.
+
+```csharp
+public static void ShiftLeft(Conversation conversation, int count, int keep)
+```
+
+#### Parameters
+
+`conversation` [Conversation](./llama.batched.conversation.md)
+The conversation to rewind
+
+`count` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+How much to shift tokens over by
+
+`keep` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+The number of tokens at the start which should not be shifted
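+
+A tiny sketch of both extension methods; the token counts are arbitrary, and the conversation passed in is assumed not to be awaiting inference.
+
+```csharp
+using LLama.Batched;
+
+static class ConversationTrimming
+{
+    public static void Trim(Conversation conversation)
+    {
+        // Undo the last 16 tokens, e.g. to retry generation from an earlier point.
+        conversation.Rewind(16);
+
+        // When the context is nearly full, drop 64 tokens but keep the first 8
+        // (typically the prompt) untouched.
+        conversation.ShiftLeft(64, 8);
+    }
+}
+```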
diff --git a/docs/xmldocs/llama.batched.experimentalbatchedexecutorexception.md b/docs/xmldocs/llama.batched.experimentalbatchedexecutorexception.md
new file mode 100644
index 00000000..35270bb0
--- /dev/null
+++ b/docs/xmldocs/llama.batched.experimentalbatchedexecutorexception.md
@@ -0,0 +1,94 @@
+# ExperimentalBatchedExecutorException
+
+Namespace: LLama.Batched
+
+Base class for exceptions thrown from [BatchedExecutor](./llama.batched.batchedexecutor.md)
+
+```csharp
+public class ExperimentalBatchedExecutorException : System.Exception, System.Runtime.Serialization.ISerializable
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [Exception](https://docs.microsoft.com/en-us/dotnet/api/system.exception) → [ExperimentalBatchedExecutorException](./llama.batched.experimentalbatchedexecutorexception.md)
+Implements [ISerializable](https://docs.microsoft.com/en-us/dotnet/api/system.runtime.serialization.iserializable)
+
+## Properties
+
+### **TargetSite**
+
+```csharp
+public MethodBase TargetSite { get; }
+```
+
+#### Property Value
+
+[MethodBase](https://docs.microsoft.com/en-us/dotnet/api/system.reflection.methodbase)
+
+### **Message**
+
+```csharp
+public string Message { get; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **Data**
+
+```csharp
+public IDictionary Data { get; }
+```
+
+#### Property Value
+
+[IDictionary](https://docs.microsoft.com/en-us/dotnet/api/system.collections.idictionary)
+
+### **InnerException**
+
+```csharp
+public Exception InnerException { get; }
+```
+
+#### Property Value
+
+[Exception](https://docs.microsoft.com/en-us/dotnet/api/system.exception)
+
+### **HelpLink**
+
+```csharp
+public string HelpLink { get; set; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **Source**
+
+```csharp
+public string Source { get; set; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **HResult**
+
+```csharp
+public int HResult { get; set; }
+```
+
+#### Property Value
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **StackTrace**
+
+```csharp
+public string StackTrace { get; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
diff --git a/docs/xmldocs/llama.oldversion.chatsession-1.md b/docs/xmldocs/llama.chatsession-1.md
similarity index 52%
rename from docs/xmldocs/llama.oldversion.chatsession-1.md
rename to docs/xmldocs/llama.chatsession-1.md
index 1c68d554..1f3cf67e 100644
--- a/docs/xmldocs/llama.oldversion.chatsession-1.md
+++ b/docs/xmldocs/llama.chatsession-1.md
@@ -1,12 +1,6 @@
# ChatSession<T>
-Namespace: LLama.OldVersion
-
-#### Caution
-
-The entire LLama.OldVersion namespace will be removed
-
----
+Namespace: LLama
```csharp
public class ChatSession
@@ -16,7 +10,7 @@ public class ChatSession
`T`
-Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ChatSession<T>](./llama.oldversion.chatsession-1.md)
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ChatSession<T>](./llama.chatsession-1.md)
## Constructors
@@ -32,10 +26,10 @@ public ChatSession(T model)
## Methods
-### **Chat(String, String, String)**
+### **Chat(String, String)**
```csharp
-public IEnumerable Chat(string text, string prompt, string encoding)
+public IEnumerable Chat(string text, string prompt)
```
#### Parameters
@@ -44,48 +38,40 @@ public IEnumerable Chat(string text, string prompt, string encoding)
`prompt` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-`encoding` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
#### Returns
[IEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
-### **WithPrompt(String, String)**
+### **WithPrompt(String)**
```csharp
-public ChatSession WithPrompt(string prompt, string encoding)
+public ChatSession WithPrompt(string prompt)
```
#### Parameters
`prompt` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-`encoding` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
#### Returns
-[ChatSession<T>](./llama.oldversion.chatsession-1.md)
+[ChatSession<T>](./llama.chatsession-1.md)
-### **WithPromptFile(String, String)**
+### **WithPromptFile(String)**
```csharp
-public ChatSession WithPromptFile(string promptFilename, string encoding)
+public ChatSession WithPromptFile(string promptFilename)
```
#### Parameters
`promptFilename` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-`encoding` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
#### Returns
-[ChatSession<T>](./llama.oldversion.chatsession-1.md)
+[ChatSession<T>](./llama.chatsession-1.md)
### **WithAntiprompt(String[])**
-Set the keywords to split the return value of chat AI.
-
```csharp
public ChatSession WithAntiprompt(String[] antiprompt)
```
@@ -96,4 +82,4 @@ public ChatSession WithAntiprompt(String[] antiprompt)
#### Returns
-[ChatSession<T>](./llama.oldversion.chatsession-1.md)
+[ChatSession<T>](./llama.chatsession-1.md)
diff --git a/docs/xmldocs/llama.chatsession.md b/docs/xmldocs/llama.chatsession.md
index dcd818b8..99c535cc 100644
--- a/docs/xmldocs/llama.chatsession.md
+++ b/docs/xmldocs/llama.chatsession.md
@@ -20,6 +20,54 @@ The output transform used in this session.
public ITextStreamTransform OutputTransform;
```
+### **MODEL_STATE_FILENAME**
+
+The filename for the serialized model state (KV cache, etc).
+
+```csharp
+public static string MODEL_STATE_FILENAME;
+```
+
+### **EXECUTOR_STATE_FILENAME**
+
+The filename for the serialized executor state.
+
+```csharp
+public static string EXECUTOR_STATE_FILENAME;
+```
+
+### **HISTORY_STATE_FILENAME**
+
+The filename for the serialized chat history.
+
+```csharp
+public static string HISTORY_STATE_FILENAME;
+```
+
+### **INPUT_TRANSFORM_FILENAME**
+
+The filename for the serialized input transform pipeline.
+
+```csharp
+public static string INPUT_TRANSFORM_FILENAME;
+```
+
+### **OUTPUT_TRANSFORM_FILENAME**
+
+The filename for the serialized output transform.
+
+```csharp
+public static string OUTPUT_TRANSFORM_FILENAME;
+```
+
+### **HISTORY_TRANSFORM_FILENAME**
+
+The filename for the serialized history transform.
+
+```csharp
+public static string HISTORY_TRANSFORM_FILENAME;
+```
+
## Properties
### **Executor**
@@ -27,7 +75,7 @@ public ITextStreamTransform OutputTransform;
The executor for this session.
```csharp
-public ILLamaExecutor Executor { get; }
+public ILLamaExecutor Executor { get; private set; }
```
#### Property Value
@@ -39,7 +87,7 @@ public ILLamaExecutor Executor { get; }
The chat history for this session.
```csharp
-public ChatHistory History { get; }
+public ChatHistory History { get; private set; }
```
#### Property Value
@@ -74,7 +122,7 @@ public List InputTransformPipeline { get; set; }
### **ChatSession(ILLamaExecutor)**
-
+Create a new chat session.
```csharp
public ChatSession(ILLamaExecutor executor)
@@ -85,8 +133,42 @@ public ChatSession(ILLamaExecutor executor)
`executor` [ILLamaExecutor](./llama.abstractions.illamaexecutor.md)
The executor for this session
+### **ChatSession(ILLamaExecutor, ChatHistory)**
+
+Create a new chat session with a custom history.
+
+```csharp
+public ChatSession(ILLamaExecutor executor, ChatHistory history)
+```
+
+#### Parameters
+
+`executor` [ILLamaExecutor](./llama.abstractions.illamaexecutor.md)
+
+`history` [ChatHistory](./llama.common.chathistory.md)
+
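+A minimal sketch of seeding a session with an existing history; the `executor` variable is assumed to be any `ILLamaExecutor` built with the usual weights/context setup:
+
+```csharp
+using LLama;
+using LLama.Common;
+
+// Build a history up-front, then hand it to the session.
+var history = new ChatHistory();
+history.AddMessage(AuthorRole.System, "You are a concise assistant.");
+history.AddMessage(AuthorRole.User, "Hello!");
+
+var session = new ChatSession(executor, history);
+```
+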
## Methods
+### **InitializeSessionFromHistoryAsync(ILLamaExecutor, ChatHistory)**
+
+Create a new chat session and preprocess history.
+
+```csharp
+public static Task InitializeSessionFromHistoryAsync(ILLamaExecutor executor, ChatHistory history)
+```
+
+#### Parameters
+
+`executor` [ILLamaExecutor](./llama.abstractions.illamaexecutor.md)
+The executor for this session
+
+`history` [ChatHistory](./llama.common.chathistory.md)
+History for this session
+
+#### Returns
+
+[Task<ChatSession>](https://docs.microsoft.com/en-us/dotnet/api/system.threading.tasks.task-1)
+
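+A minimal sketch of using this helper; unlike the constructor above, it also runs the supplied history through the executor so the KV cache is pre-filled (`executor` and `history` are assumed to exist already):
+
+```csharp
+ChatSession session = await ChatSession.InitializeSessionFromHistoryAsync(executor, history);
+```
+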
### **WithHistoryTransform(IHistoryTransform)**
Use a custom history transform.
@@ -137,7 +219,7 @@ public ChatSession WithOutputTransform(ITextStreamTransform transform)
### **SaveSession(String)**
-
+Save a session to a directory.
```csharp
public void SaveSession(string path)
@@ -146,32 +228,234 @@ public void SaveSession(string path)
#### Parameters
`path` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-The directory name to save the session. If the directory does not exist, a new directory will be created.
-### **LoadSession(String)**
+#### Exceptions
+
+[ArgumentException](https://docs.microsoft.com/en-us/dotnet/api/system.argumentexception)
+
+### **GetSessionState()**
+
+Get the session state.
+
+```csharp
+public SessionState GetSessionState()
+```
+
+#### Returns
+
+[SessionState](./llama.sessionstate.md)
+SessionState object representing session state in-memory
+
+### **LoadSession(SessionState, Boolean)**
+
+Load a session from a session state.
+
+```csharp
+public void LoadSession(SessionState state, bool loadTransforms)
+```
+
+#### Parameters
+
+`state` [SessionState](./llama.sessionstate.md)
+
+`loadTransforms` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+If true, loads the transforms saved in the session state.
+
+#### Exceptions
+
+[ArgumentException](https://docs.microsoft.com/en-us/dotnet/api/system.argumentexception)
+### **LoadSession(String, Boolean)**
+Load a session from a directory.
```csharp
-public void LoadSession(string path)
+public void LoadSession(string path, bool loadTransforms)
```
#### Parameters
`path` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-The directory name to load the session.
-### **Chat(ChatHistory, IInferenceParams, CancellationToken)**
+`loadTransforms` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+If true loads transforms saved in the session state.
-Get the response from the LLama model with chat histories.
+#### Exceptions
+
+[ArgumentException](https://docs.microsoft.com/en-us/dotnet/api/system.argumentexception)
+
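+A sketch of a save/load round trip; `SaveSession`/`LoadSession(String, Boolean)` work against a directory on disk, while `GetSessionState`/`LoadSession(SessionState, Boolean)` keep everything in memory (the `session` variable is assumed to exist):
+
+```csharp
+// Persist the whole session (model state, executor state, history, transforms) to a directory.
+session.SaveSession("./my-session");
+
+// Later, restore it, including the saved transforms.
+session.LoadSession("./my-session", loadTransforms: true);
+
+// Or keep the state in memory instead of on disk.
+SessionState snapshot = session.GetSessionState();
+session.LoadSession(snapshot, loadTransforms: false);
+```
+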
+### **AddMessage(Message)**
+
+Add a message to the chat history.
```csharp
-public IEnumerable Chat(ChatHistory history, IInferenceParams inferenceParams, CancellationToken cancellationToken)
+public ChatSession AddMessage(Message message)
```
#### Parameters
-`history` [ChatHistory](./llama.common.chathistory.md)
+`message` [Message](./llama.common.chathistory.message.md)
+
+#### Returns
+
+[ChatSession](./llama.chatsession.md)
+
+### **AddSystemMessage(String)**
+
+Add a system message to the chat history.
+
+```csharp
+public ChatSession AddSystemMessage(string content)
+```
+
+#### Parameters
+
+`content` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+#### Returns
+
+[ChatSession](./llama.chatsession.md)
+
+### **AddAssistantMessage(String)**
+
+Add an assistant message to the chat history.
+
+```csharp
+public ChatSession AddAssistantMessage(string content)
+```
+
+#### Parameters
+
+`content` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+#### Returns
+
+[ChatSession](./llama.chatsession.md)
+
+### **AddUserMessage(String)**
+
+Add a user message to the chat history.
+
+```csharp
+public ChatSession AddUserMessage(string content)
+```
+
+#### Parameters
+
+`content` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+#### Returns
+
+[ChatSession](./llama.chatsession.md)
+
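+Because these helpers return the session itself, a history can be built fluently. A short sketch (no inference is performed by these calls):
+
+```csharp
+session
+    .AddSystemMessage("You are a helpful assistant.")
+    .AddUserMessage("What is the capital of France?")
+    .AddAssistantMessage("The capital of France is Paris.");
+```
+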
+### **RemoveLastMessage()**
+
+Remove the last message from the chat history.
+
+```csharp
+public ChatSession RemoveLastMessage()
+```
+
+#### Returns
+
+[ChatSession](./llama.chatsession.md)
+
+### **AddAndProcessMessage(Message)**
+
+Compute KV cache for the message and add it to the chat history.
+
+```csharp
+public Task AddAndProcessMessage(Message message)
+```
+
+#### Parameters
+
+`message` [Message](./llama.common.chathistory.message.md)
+
+#### Returns
+
+[Task<ChatSession>](https://docs.microsoft.com/en-us/dotnet/api/system.threading.tasks.task-1)
+
+### **AddAndProcessSystemMessage(String)**
+
+Compute KV cache for the system message and add it to the chat history.
+
+```csharp
+public Task AddAndProcessSystemMessage(string content)
+```
+
+#### Parameters
+
+`content` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+#### Returns
+
+[Task<ChatSession>](https://docs.microsoft.com/en-us/dotnet/api/system.threading.tasks.task-1)
+
+### **AddAndProcessUserMessage(String)**
+
+Compute KV cache for the user message and add it to the chat history.
+
+```csharp
+public Task AddAndProcessUserMessage(string content)
+```
+
+#### Parameters
+
+`content` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+#### Returns
+
+[Task<ChatSession>](https://docs.microsoft.com/en-us/dotnet/api/system.threading.tasks.task-1)
+
+### **AddAndProcessAssistantMessage(String)**
+
+Compute KV cache for the assistant message and add it to the chat history.
+
+```csharp
+public Task AddAndProcessAssistantMessage(string content)
+```
+
+#### Parameters
+
+`content` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+#### Returns
+
+[Task<ChatSession>](https://docs.microsoft.com/en-us/dotnet/api/system.threading.tasks.task-1)
+
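+Unlike the plain `Add*Message` helpers above, these variants also evaluate the message through the executor (filling the KV cache), so they are awaited. A short sketch:
+
+```csharp
+await session.AddAndProcessSystemMessage("You are a helpful assistant.");
+await session.AddAndProcessUserMessage("Summarise the conversation so far.");
+```
+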
+### **ReplaceUserMessage(Message, Message)**
+
+Replace a user message with a new message and remove all messages after the new message.
+ This is useful when the user wants to edit a message and regenerate the response.
+
+```csharp
+public ChatSession ReplaceUserMessage(Message oldMessage, Message newMessage)
+```
+
+#### Parameters
+
+`oldMessage` [Message](./llama.common.chathistory.message.md)
+
+`newMessage` [Message](./llama.common.chathistory.message.md)
+
+#### Returns
+
+[ChatSession](./llama.chatsession.md)
+
+### **ChatAsync(Message, Boolean, IInferenceParams, CancellationToken)**
+
+Chat with the model.
+
+```csharp
+public IAsyncEnumerable ChatAsync(Message message, bool applyInputTransformPipeline, IInferenceParams inferenceParams, CancellationToken cancellationToken)
+```
+
+#### Parameters
+
+`message` [Message](./llama.common.chathistory.message.md)
+
+`applyInputTransformPipeline` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
`inferenceParams` [IInferenceParams](./llama.abstractions.iinferenceparams.md)
@@ -179,20 +463,23 @@ public IEnumerable Chat(ChatHistory history, IInferenceParams inferenceP
#### Returns
-[IEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
+[IAsyncEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.iasyncenumerable-1)
+
+#### Exceptions
-### **Chat(String, IInferenceParams, CancellationToken)**
+[ArgumentException](https://docs.microsoft.com/en-us/dotnet/api/system.argumentexception)
-Get the response from the LLama model. Note that prompt could not only be the preset words,
- but also the question you want to ask.
+### **ChatAsync(Message, IInferenceParams, CancellationToken)**
+
+Chat with the model.
```csharp
-public IEnumerable Chat(string prompt, IInferenceParams inferenceParams, CancellationToken cancellationToken)
+public IAsyncEnumerable ChatAsync(Message message, IInferenceParams inferenceParams, CancellationToken cancellationToken)
```
#### Parameters
-`prompt` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+`message` [Message](./llama.common.chathistory.message.md)
`inferenceParams` [IInferenceParams](./llama.abstractions.iinferenceparams.md)
@@ -200,20 +487,22 @@ public IEnumerable Chat(string prompt, IInferenceParams inferenceParams,
#### Returns
-[IEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
+[IAsyncEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.iasyncenumerable-1)
-### **ChatAsync(ChatHistory, IInferenceParams, CancellationToken)**
+### **ChatAsync(ChatHistory, Boolean, IInferenceParams, CancellationToken)**
-Get the response from the LLama model with chat histories.
+Chat with the model.
```csharp
-public IAsyncEnumerable ChatAsync(ChatHistory history, IInferenceParams inferenceParams, CancellationToken cancellationToken)
+public IAsyncEnumerable ChatAsync(ChatHistory history, bool applyInputTransformPipeline, IInferenceParams inferenceParams, CancellationToken cancellationToken)
```
#### Parameters
`history` [ChatHistory](./llama.common.chathistory.md)
+`applyInputTransformPipeline` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
`inferenceParams` [IInferenceParams](./llama.abstractions.iinferenceparams.md)
`cancellationToken` [CancellationToken](https://docs.microsoft.com/en-us/dotnet/api/system.threading.cancellationtoken)
@@ -222,17 +511,21 @@ public IAsyncEnumerable ChatAsync(ChatHistory history, IInferenceParams
[IAsyncEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.iasyncenumerable-1)
-### **ChatAsync(String, IInferenceParams, CancellationToken)**
+#### Exceptions
-Get the response from the LLama model with chat histories asynchronously.
+[ArgumentException](https://docs.microsoft.com/en-us/dotnet/api/system.argumentexception)
+
+### **ChatAsync(ChatHistory, IInferenceParams, CancellationToken)**
+
+Chat with the model.
```csharp
-public IAsyncEnumerable ChatAsync(string prompt, IInferenceParams inferenceParams, CancellationToken cancellationToken)
+public IAsyncEnumerable ChatAsync(ChatHistory history, IInferenceParams inferenceParams, CancellationToken cancellationToken)
```
#### Parameters
-`prompt` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+`history` [ChatHistory](./llama.common.chathistory.md)
`inferenceParams` [IInferenceParams](./llama.abstractions.iinferenceparams.md)
@@ -241,3 +534,25 @@ public IAsyncEnumerable ChatAsync(string prompt, IInferenceParams infere
#### Returns
[IAsyncEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.iasyncenumerable-1)
+
+### **RegenerateAssistantMessageAsync(InferenceParams, CancellationToken)**
+
+Regenerate the last assistant message.
+
+```csharp
+public IAsyncEnumerable RegenerateAssistantMessageAsync(InferenceParams inferenceParams, CancellationToken cancellationToken)
+```
+
+#### Parameters
+
+`inferenceParams` [InferenceParams](./llama.common.inferenceparams.md)
+
+`cancellationToken` [CancellationToken](https://docs.microsoft.com/en-us/dotnet/api/system.threading.cancellationtoken)
+
+#### Returns
+
+[IAsyncEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.iasyncenumerable-1)
+
+#### Exceptions
+
+[InvalidOperationException](https://docs.microsoft.com/en-us/dotnet/api/system.invalidoperationexception)
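+
+A sketch of streaming a reply and then regenerating it; the session setup is assumed and the `InferenceParams` values are illustrative only:
+
+```csharp
+var message = new ChatHistory.Message(AuthorRole.User, "Tell me a joke.");
+var inferenceParams = new InferenceParams { MaxTokens = 128, AntiPrompts = new[] { "User:" } };
+
+await foreach (var token in session.ChatAsync(message, inferenceParams, CancellationToken.None))
+    Console.Write(token);
+
+// Not happy with the answer? Regenerate the last assistant message.
+await foreach (var token in session.RegenerateAssistantMessageAsync(inferenceParams, CancellationToken.None))
+    Console.Write(token);
+```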
diff --git a/docs/xmldocs/llama.common.chathistory.md b/docs/xmldocs/llama.common.chathistory.md
index ec2b4af0..78b5ecd2 100644
--- a/docs/xmldocs/llama.common.chathistory.md
+++ b/docs/xmldocs/llama.common.chathistory.md
@@ -17,7 +17,7 @@ Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object)
List of messages in the chat
```csharp
-public List Messages { get; }
+public List Messages { get; set; }
```
#### Property Value
@@ -34,6 +34,18 @@ Create a new instance of the chat content class
public ChatHistory()
```
+### **ChatHistory(Message[])**
+
+Create a new instance of the chat history from an array of messages
+
+```csharp
+public ChatHistory(Message[] messageHistory)
+```
+
+#### Parameters
+
+`messageHistory` [Message[]](./llama.common.chathistory.message.md)
+
## Methods
### **AddMessage(AuthorRole, String)**
@@ -51,3 +63,31 @@ Role of the message author
`content` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
Message content
+
+### **ToJson()**
+
+Serialize the chat history to JSON
+
+```csharp
+public string ToJson()
+```
+
+#### Returns
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **FromJson(String)**
+
+Deserialize a chat history from JSON
+
+```csharp
+public static ChatHistory FromJson(string json)
+```
+
+#### Parameters
+
+`json` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+#### Returns
+
+[ChatHistory](./llama.common.chathistory.md)
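+
+A small sketch of round-tripping a history through JSON:
+
+```csharp
+var history = new ChatHistory();
+history.AddMessage(AuthorRole.User, "Hi there");
+
+string json = history.ToJson();
+ChatHistory restored = ChatHistory.FromJson(json);
+```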
diff --git a/docs/xmldocs/llama.common.fixedsizequeue-1.md b/docs/xmldocs/llama.common.fixedsizequeue-1.md
index 32ba6ecf..1bb79f27 100644
--- a/docs/xmldocs/llama.common.fixedsizequeue-1.md
+++ b/docs/xmldocs/llama.common.fixedsizequeue-1.md
@@ -6,7 +6,7 @@ A queue with fixed storage size.
Currently it's only a naive implementation and needs to be further optimized in the future.
```csharp
-public class FixedSizeQueue : , System.Collections.IEnumerable
+public class FixedSizeQueue : , , , System.Collections.IEnumerable
```
#### Type Parameters
@@ -14,10 +14,20 @@ public class FixedSizeQueue : , System.Collections.IEnumerable
`T`
Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [FixedSizeQueue<T>](./llama.common.fixedsizequeue-1.md)
-Implements IEnumerable<T>, [IEnumerable](https://docs.microsoft.com/en-us/dotnet/api/system.collections.ienumerable)
+Implements IReadOnlyList<T>, IReadOnlyCollection<T>, IEnumerable<T>, [IEnumerable](https://docs.microsoft.com/en-us/dotnet/api/system.collections.ienumerable)
## Properties
+### **Item**
+
+```csharp
+public T Item { get; }
+```
+
+#### Property Value
+
+T
+
### **Count**
Number of items in this queue
@@ -73,24 +83,6 @@ public FixedSizeQueue(int size, IEnumerable data)
## Methods
-### **FillWith(T)**
-
-Replace every item in the queue with the given value
-
-```csharp
-public FixedSizeQueue FillWith(T value)
-```
-
-#### Parameters
-
-`value` T
-The value to replace all items with
-
-#### Returns
-
-[FixedSizeQueue<T>](./llama.common.fixedsizequeue-1.md)
-returns this
-
### **Enqueue(T)**
Enqueue an element.
diff --git a/docs/xmldocs/llama.common.illamalogger.md b/docs/xmldocs/llama.common.illamalogger.md
deleted file mode 100644
index e35a9417..00000000
--- a/docs/xmldocs/llama.common.illamalogger.md
+++ /dev/null
@@ -1,30 +0,0 @@
-# ILLamaLogger
-
-Namespace: LLama.Common
-
-receives log messages from LLamaSharp
-
-```csharp
-public interface ILLamaLogger
-```
-
-## Methods
-
-### **Log(String, String, LogLevel)**
-
-Write the log in customized way
-
-```csharp
-void Log(string source, string message, LogLevel level)
-```
-
-#### Parameters
-
-`source` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-The source of the log. It may be a method name or class name.
-
-`message` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-The message.
-
-`level` [LogLevel](./llama.common.illamalogger.loglevel.md)
-The log level.
diff --git a/docs/xmldocs/llama.common.inferenceparams.md b/docs/xmldocs/llama.common.inferenceparams.md
index f8142332..f178331d 100644
--- a/docs/xmldocs/llama.common.inferenceparams.md
+++ b/docs/xmldocs/llama.common.inferenceparams.md
@@ -5,11 +5,11 @@ Namespace: LLama.Common
The parameters used for inference.
```csharp
-public class InferenceParams : LLama.Abstractions.IInferenceParams
+public class InferenceParams : LLama.Abstractions.IInferenceParams, System.IEquatable`1[[LLama.Common.InferenceParams, LLamaSharp, Version=0.0.0.0, Culture=neutral, PublicKeyToken=null]]
```
Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [InferenceParams](./llama.common.inferenceparams.md)
-Implements [IInferenceParams](./llama.abstractions.iinferenceparams.md)
+Implements [IInferenceParams](./llama.abstractions.iinferenceparams.md), [IEquatable<InferenceParams>](https://docs.microsoft.com/en-us/dotnet/api/system.iequatable-1)
## Properties
@@ -43,79 +43,49 @@ public int MaxTokens { get; set; }
logit bias for specific tokens
```csharp
-public Dictionary LogitBias { get; set; }
+public Dictionary LogitBias { get; set; }
```
#### Property Value
-[Dictionary<Int32, Single>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.dictionary-2)
+[Dictionary<LLamaToken, Single>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.dictionary-2)
### **AntiPrompts**
Sequences where the model will stop generating further tokens.
```csharp
-public IEnumerable AntiPrompts { get; set; }
+public IReadOnlyList AntiPrompts { get; set; }
```
#### Property Value
-[IEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
+[IReadOnlyList<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ireadonlylist-1)
-### **PathSession**
-
-path to file for saving/loading model eval state
-
-```csharp
-public string PathSession { get; set; }
-```
-
-#### Property Value
-
-[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **InputSuffix**
-
-string to suffix user inputs with
-
-```csharp
-public string InputSuffix { get; set; }
-```
-
-#### Property Value
-
-[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **InputPrefix**
-
-string to prefix user inputs with
+### **TopK**
```csharp
-public string InputPrefix { get; set; }
+public int TopK { get; set; }
```
#### Property Value
-[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **TopK**
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-0 or lower to use vocab size
+### **TopP**
```csharp
-public int TopK { get; set; }
+public float TopP { get; set; }
```
#### Property Value
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **TopP**
+[Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-1.0 = disabled
+### **MinP**
```csharp
-public float TopP { get; set; }
+public float MinP { get; set; }
```
#### Property Value
@@ -124,8 +94,6 @@ public float TopP { get; set; }
### **TfsZ**
-1.0 = disabled
-
```csharp
public float TfsZ { get; set; }
```
@@ -136,8 +104,6 @@ public float TfsZ { get; set; }
### **TypicalP**
-1.0 = disabled
-
```csharp
public float TypicalP { get; set; }
```
@@ -148,8 +114,6 @@ public float TypicalP { get; set; }
### **Temperature**
-1.0 = disabled
-
```csharp
public float Temperature { get; set; }
```
@@ -160,8 +124,6 @@ public float Temperature { get; set; }
### **RepeatPenalty**
-1.0 = disabled
-
```csharp
public float RepeatPenalty { get; set; }
```
@@ -172,8 +134,6 @@ public float RepeatPenalty { get; set; }
### **RepeatLastTokensCount**
-last n tokens to penalize (0 = disable penalty, -1 = context size) (repeat_last_n)
-
```csharp
public int RepeatLastTokensCount { get; set; }
```
@@ -184,9 +144,6 @@ public int RepeatLastTokensCount { get; set; }
### **FrequencyPenalty**
-frequency penalty coefficient
- 0.0 = disabled
-
```csharp
public float FrequencyPenalty { get; set; }
```
@@ -197,9 +154,6 @@ public float FrequencyPenalty { get; set; }
### **PresencePenalty**
-presence penalty coefficient
- 0.0 = disabled
-
```csharp
public float PresencePenalty { get; set; }
```
@@ -210,10 +164,6 @@ public float PresencePenalty { get; set; }
### **Mirostat**
-Mirostat uses tokens instead of words.
- algorithm described in the paper https://arxiv.org/abs/2007.14966.
- 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-
```csharp
public MirostatType Mirostat { get; set; }
```
@@ -224,8 +174,6 @@ public MirostatType Mirostat { get; set; }
### **MirostatTau**
-target entropy
-
```csharp
public float MirostatTau { get; set; }
```
@@ -236,8 +184,6 @@ public float MirostatTau { get; set; }
### **MirostatEta**
-learning rate
-
```csharp
public float MirostatEta { get; set; }
```
@@ -248,8 +194,6 @@ public float MirostatEta { get; set; }
### **PenalizeNL**
-consider newlines as a repeatable token (penalize_nl)
-
```csharp
public bool PenalizeNL { get; set; }
```
@@ -260,8 +204,6 @@ public bool PenalizeNL { get; set; }
### **Grammar**
-A grammar to constrain the possible tokens
-
```csharp
public SafeLLamaGrammarHandle Grammar { get; set; }
```
@@ -270,6 +212,16 @@ public SafeLLamaGrammarHandle Grammar { get; set; }
[SafeLLamaGrammarHandle](./llama.native.safellamagrammarhandle.md)
+### **SamplingPipeline**
+
+```csharp
+public ISamplingPipeline SamplingPipeline { get; set; }
+```
+
+#### Property Value
+
+[ISamplingPipeline](./llama.sampling.isamplingpipeline.md)
+
## Constructors
### **InferenceParams()**
@@ -277,3 +229,77 @@ public SafeLLamaGrammarHandle Grammar { get; set; }
```csharp
public InferenceParams()
```
+
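+A typical configuration is set with an object initializer; the values below are illustrative only, not recommended defaults:
+
+```csharp
+var inferenceParams = new InferenceParams
+{
+    MaxTokens = 256,
+    Temperature = 0.7f,
+    TopK = 40,
+    TopP = 0.95f,
+    RepeatPenalty = 1.1f,
+    AntiPrompts = new[] { "User:" }
+};
+```
+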
+## Methods
+
+### **ToString()**
+
+```csharp
+public string ToString()
+```
+
+#### Returns
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **PrintMembers(StringBuilder)**
+
+```csharp
+protected bool PrintMembers(StringBuilder builder)
+```
+
+#### Parameters
+
+`builder` [StringBuilder](https://docs.microsoft.com/en-us/dotnet/api/system.text.stringbuilder)
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **GetHashCode()**
+
+```csharp
+public int GetHashCode()
+```
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **Equals(Object)**
+
+```csharp
+public bool Equals(object obj)
+```
+
+#### Parameters
+
+`obj` [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object)
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **Equals(InferenceParams)**
+
+```csharp
+public bool Equals(InferenceParams other)
+```
+
+#### Parameters
+
+`other` [InferenceParams](./llama.common.inferenceparams.md)
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **<Clone>$()**
+
+```csharp
+public InferenceParams $()
+```
+
+#### Returns
+
+[InferenceParams](./llama.common.inferenceparams.md)
diff --git a/docs/xmldocs/llama.common.llamadefaultlogger.md b/docs/xmldocs/llama.common.llamadefaultlogger.md
deleted file mode 100644
index aeef13b0..00000000
--- a/docs/xmldocs/llama.common.llamadefaultlogger.md
+++ /dev/null
@@ -1,173 +0,0 @@
-# LLamaDefaultLogger
-
-Namespace: LLama.Common
-
-The default logger of LLamaSharp. On default it write to console. Use methods of `LLamaLogger.Default` to change the behavior.
- It's recommended to inherit `ILLamaLogger` to customize the behavior.
-
-```csharp
-public sealed class LLamaDefaultLogger : ILLamaLogger
-```
-
-Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) β [LLamaDefaultLogger](./llama.common.llamadefaultlogger.md)
-Implements [ILLamaLogger](./llama.common.illamalogger.md)
-
-## Properties
-
-### **Default**
-
-Get the default logger instance
-
-```csharp
-public static LLamaDefaultLogger Default { get; }
-```
-
-#### Property Value
-
-[LLamaDefaultLogger](./llama.common.llamadefaultlogger.md)
-
-## Methods
-
-### **EnableNative()**
-
-Enable logging output from llama.cpp
-
-```csharp
-public LLamaDefaultLogger EnableNative()
-```
-
-#### Returns
-
-[LLamaDefaultLogger](./llama.common.llamadefaultlogger.md)
-
-### **EnableConsole()**
-
-Enable writing log messages to console
-
-```csharp
-public LLamaDefaultLogger EnableConsole()
-```
-
-#### Returns
-
-[LLamaDefaultLogger](./llama.common.llamadefaultlogger.md)
-
-### **DisableConsole()**
-
-Disable writing messages to console
-
-```csharp
-public LLamaDefaultLogger DisableConsole()
-```
-
-#### Returns
-
-[LLamaDefaultLogger](./llama.common.llamadefaultlogger.md)
-
-### **EnableFile(String, FileMode)**
-
-Enable writing log messages to file
-
-```csharp
-public LLamaDefaultLogger EnableFile(string filename, FileMode mode)
-```
-
-#### Parameters
-
-`filename` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-`mode` [FileMode](https://docs.microsoft.com/en-us/dotnet/api/system.io.filemode)
-
-#### Returns
-
-[LLamaDefaultLogger](./llama.common.llamadefaultlogger.md)
-
-### **DisableFile(String)**
-
-#### Caution
-
-Use DisableFile method without 'filename' parameter
-
----
-
-Disable writing log messages to file
-
-```csharp
-public LLamaDefaultLogger DisableFile(string filename)
-```
-
-#### Parameters
-
-`filename` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-unused!
-
-#### Returns
-
-[LLamaDefaultLogger](./llama.common.llamadefaultlogger.md)
-
-### **DisableFile()**
-
-Disable writing log messages to file
-
-```csharp
-public LLamaDefaultLogger DisableFile()
-```
-
-#### Returns
-
-[LLamaDefaultLogger](./llama.common.llamadefaultlogger.md)
-
-### **Log(String, String, LogLevel)**
-
-Log a message
-
-```csharp
-public void Log(string source, string message, LogLevel level)
-```
-
-#### Parameters
-
-`source` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-The source of this message (e.g. class name)
-
-`message` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-The message to log
-
-`level` [LogLevel](./llama.common.illamalogger.loglevel.md)
-Severity level of this message
-
-### **Info(String)**
-
-Write a log message with "Info" severity
-
-```csharp
-public void Info(string message)
-```
-
-#### Parameters
-
-`message` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **Warn(String)**
-
-Write a log message with "Warn" severity
-
-```csharp
-public void Warn(string message)
-```
-
-#### Parameters
-
-`message` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **Error(String)**
-
-Write a log message with "Error" severity
-
-```csharp
-public void Error(string message)
-```
-
-#### Parameters
-
-`message` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
diff --git a/docs/xmldocs/llama.common.modelparams.md b/docs/xmldocs/llama.common.modelparams.md
index 85dc655e..a9af0a85 100644
--- a/docs/xmldocs/llama.common.modelparams.md
+++ b/docs/xmldocs/llama.common.modelparams.md
@@ -5,30 +5,26 @@ Namespace: LLama.Common
The parameters for initializing a LLama model.
```csharp
-public class ModelParams : LLama.Abstractions.IModelParams, System.IEquatable`1[[LLama.Common.ModelParams, LLamaSharp, Version=0.5.0.0, Culture=neutral, PublicKeyToken=null]]
+public class ModelParams : LLama.Abstractions.ILLamaParams, LLama.Abstractions.IModelParams, LLama.Abstractions.IContextParams, System.IEquatable`1[[LLama.Common.ModelParams, LLamaSharp, Version=0.0.0.0, Culture=neutral, PublicKeyToken=null]]
```
Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ModelParams](./llama.common.modelparams.md)
-Implements [IModelParams](./llama.abstractions.imodelparams.md), [IEquatable<ModelParams>](https://docs.microsoft.com/en-us/dotnet/api/system.iequatable-1)
+Implements [ILLamaParams](./llama.abstractions.illamaparams.md), [IModelParams](./llama.abstractions.imodelparams.md), [IContextParams](./llama.abstractions.icontextparams.md), [IEquatable<ModelParams>](https://docs.microsoft.com/en-us/dotnet/api/system.iequatable-1)
## Properties
### **ContextSize**
-Model context size (n_ctx)
-
```csharp
-public int ContextSize { get; set; }
+public Nullable ContextSize { get; set; }
```
#### Property Value
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+[Nullable<UInt32>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
### **MainGpu**
-the GPU that is used for scratch and small tensors
-
```csharp
public int MainGpu { get; set; }
```
@@ -37,22 +33,18 @@ public int MainGpu { get; set; }
[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-### **LowVram**
-
-if true, reduce VRAM usage at the cost of performance
+### **SplitMode**
```csharp
-public bool LowVram { get; set; }
+public GPUSplitMode SplitMode { get; set; }
```
#### Property Value
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+[GPUSplitMode](./llama.native.gpusplitmode.md)
### **GpuLayerCount**
-Number of layers to run in VRAM / GPU memory (n_gpu_layers)
-
```csharp
public int GpuLayerCount { get; set; }
```
@@ -63,305 +55,288 @@ public int GpuLayerCount { get; set; }
### **Seed**
-Seed for the random number generator (seed)
-
```csharp
-public int Seed { get; set; }
+public uint Seed { get; set; }
```
#### Property Value
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **UseFp16Memory**
+[UInt32](https://docs.microsoft.com/en-us/dotnet/api/system.uint32)
-Use f16 instead of f32 for memory kv (memory_f16)
+### **UseMemorymap**
```csharp
-public bool UseFp16Memory { get; set; }
+public bool UseMemorymap { get; set; }
```
#### Property Value
[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-### **UseMemorymap**
-
-Use mmap for faster loads (use_mmap)
+### **UseMemoryLock**
```csharp
-public bool UseMemorymap { get; set; }
+public bool UseMemoryLock { get; set; }
```
#### Property Value
[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-### **UseMemoryLock**
-
-Use mlock to keep model in memory (use_mlock)
+### **ModelPath**
```csharp
-public bool UseMemoryLock { get; set; }
+public string ModelPath { get; set; }
```
#### Property Value
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-
-### **Perplexity**
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-Compute perplexity over the prompt (perplexity)
+### **LoraAdapters**
```csharp
-public bool Perplexity { get; set; }
+public AdapterCollection LoraAdapters { get; set; }
```
#### Property Value
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-
-### **ModelPath**
+[AdapterCollection](./llama.abstractions.adaptercollection.md)
-Model path (model)
+### **LoraBase**
```csharp
-public string ModelPath { get; set; }
+public string LoraBase { get; set; }
```
#### Property Value
[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-### **ModelAlias**
-
-model alias
+### **Threads**
```csharp
-public string ModelAlias { get; set; }
+public Nullable Threads { get; set; }
```
#### Property Value
-[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **LoraAdapter**
+[Nullable<UInt32>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
-lora adapter path (lora_adapter)
+### **BatchThreads**
```csharp
-public string LoraAdapter { get; set; }
+public Nullable BatchThreads { get; set; }
```
#### Property Value
-[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **LoraBase**
+[Nullable<UInt32>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
-base model path for the lora adapter (lora_base)
+### **BatchSize**
```csharp
-public string LoraBase { get; set; }
+public uint BatchSize { get; set; }
```
#### Property Value
-[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **Threads**
+[UInt32](https://docs.microsoft.com/en-us/dotnet/api/system.uint32)
-Number of threads (-1 = autodetect) (n_threads)
+### **EmbeddingMode**
```csharp
-public int Threads { get; set; }
+public bool EmbeddingMode { get; set; }
```
#### Property Value
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **BatchSize**
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
+### **TensorSplits**
```csharp
-public int BatchSize { get; set; }
+public TensorSplitsCollection TensorSplits { get; set; }
```
#### Property Value
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **ConvertEosToNewLine**
+[TensorSplitsCollection](./llama.abstractions.tensorsplitscollection.md)
-Whether to convert eos to newline during the inference.
+### **MetadataOverrides**
```csharp
-public bool ConvertEosToNewLine { get; set; }
+public List MetadataOverrides { get; set; }
```
#### Property Value
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-
-### **EmbeddingMode**
+[List<MetadataOverride>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.list-1)
-Whether to use embedding mode. (embedding) Note that if this is set to true,
- The LLamaModel won't produce text response anymore.
+### **RopeFrequencyBase**
```csharp
-public bool EmbeddingMode { get; set; }
+public Nullable RopeFrequencyBase { get; set; }
```
#### Property Value
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-
-### **TensorSplits**
+[Nullable<Single>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
-how split tensors should be distributed across GPUs
+### **RopeFrequencyScale**
```csharp
-public Single[] TensorSplits { get; set; }
+public Nullable RopeFrequencyScale { get; set; }
```
#### Property Value
-[Single[]](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+[Nullable<Single>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
-### **RopeFrequencyBase**
-
-RoPE base frequency
+### **YarnExtrapolationFactor**
```csharp
-public float RopeFrequencyBase { get; set; }
+public Nullable YarnExtrapolationFactor { get; set; }
```
#### Property Value
-[Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+[Nullable<Single>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
-### **RopeFrequencyScale**
-
-RoPE frequency scaling factor
+### **YarnAttentionFactor**
```csharp
-public float RopeFrequencyScale { get; set; }
+public Nullable YarnAttentionFactor { get; set; }
```
#### Property Value
-[Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-
-### **MulMatQ**
+[Nullable<Single>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
-Use experimental mul_mat_q kernels
+### **YarnBetaFast**
```csharp
-public bool MulMatQ { get; set; }
+public Nullable YarnBetaFast { get; set; }
```
#### Property Value
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+[Nullable<Single>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
-### **Encoding**
+### **YarnBetaSlow**
+
+```csharp
+public Nullable YarnBetaSlow { get; set; }
+```
-The encoding to use to convert text for the model
+#### Property Value
+
+[Nullable<Single>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
+
+### **YarnOriginalContext**
```csharp
-public Encoding Encoding { get; set; }
+public Nullable YarnOriginalContext { get; set; }
```
#### Property Value
-[Encoding](https://docs.microsoft.com/en-us/dotnet/api/system.text.encoding)
+[Nullable<UInt32>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
-## Constructors
+### **YarnScalingType**
-### **ModelParams(String)**
+```csharp
+public Nullable YarnScalingType { get; set; }
+```
+
+#### Property Value
+[Nullable<RopeScalingType>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
+### **TypeK**
```csharp
-public ModelParams(string modelPath)
+public Nullable TypeK { get; set; }
```
-#### Parameters
+#### Property Value
-`modelPath` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-The model path.
+[Nullable<GGMLType>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
-### **ModelParams(String, Int32, Int32, Int32, Boolean, Boolean, Boolean, Boolean, String, String, Int32, Int32, Boolean, Boolean, Single, Single, Boolean, String)**
+### **TypeV**
-#### Caution
+```csharp
+public Nullable TypeV { get; set; }
+```
+
+#### Property Value
-Use object initializer to set all optional parameters
+[Nullable<GGMLType>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
----
+### **NoKqvOffload**
+```csharp
+public bool NoKqvOffload { get; set; }
+```
+#### Property Value
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **DefragThreshold**
```csharp
-public ModelParams(string modelPath, int contextSize, int gpuLayerCount, int seed, bool useFp16Memory, bool useMemorymap, bool useMemoryLock, bool perplexity, string loraAdapter, string loraBase, int threads, int batchSize, bool convertEosToNewLine, bool embeddingMode, float ropeFrequencyBase, float ropeFrequencyScale, bool mulMatQ, string encoding)
+public float DefragThreshold { get; set; }
```
-#### Parameters
+#### Property Value
-`modelPath` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-The model path.
+[Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-`contextSize` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-Model context size (n_ctx)
+### **DoPooling**
-`gpuLayerCount` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-Number of layers to run in VRAM / GPU memory (n_gpu_layers)
+```csharp
+public bool DoPooling { get; set; }
+```
-`seed` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-Seed for the random number generator (seed)
+#### Property Value
-`useFp16Memory` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-Whether to use f16 instead of f32 for memory kv (memory_f16)
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-`useMemorymap` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-Whether to use mmap for faster loads (use_mmap)
+### **VocabOnly**
-`useMemoryLock` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-Whether to use mlock to keep model in memory (use_mlock)
+```csharp
+public bool VocabOnly { get; set; }
+```
-`perplexity` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-Thether to compute perplexity over the prompt (perplexity)
+#### Property Value
-`loraAdapter` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-Lora adapter path (lora_adapter)
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-`loraBase` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-Base model path for the lora adapter (lora_base)
+### **Encoding**
-`threads` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-Number of threads (-1 = autodetect) (n_threads)
+```csharp
+public Encoding Encoding { get; set; }
+```
-`batchSize` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-Batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
+#### Property Value
-`convertEosToNewLine` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-Whether to convert eos to newline during the inference.
+[Encoding](https://docs.microsoft.com/en-us/dotnet/api/system.text.encoding)
-`embeddingMode` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-Whether to use embedding mode. (embedding) Note that if this is set to true, The LLamaModel won't produce text response anymore.
+## Constructors
-`ropeFrequencyBase` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-RoPE base frequency.
+### **ModelParams(String)**
-`ropeFrequencyScale` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-RoPE frequency scaling factor
-`mulMatQ` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-Use experimental mul_mat_q kernels
-`encoding` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-The encoding to use to convert text for the model
+```csharp
+public ModelParams(string modelPath)
+```
+
+#### Parameters
+
+`modelPath` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+The model path.
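+
+A minimal sketch of configuring the remaining parameters with an object initializer (the path and values are illustrative only):
+
+```csharp
+var modelParams = new ModelParams("path/to/model.gguf")
+{
+    ContextSize = 2048,
+    GpuLayerCount = 20,
+    Seed = 1337,
+    UseMemorymap = true
+};
+```
+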
## Methods
diff --git a/docs/xmldocs/llama.exceptions.llamadecodeerror.md b/docs/xmldocs/llama.exceptions.llamadecodeerror.md
new file mode 100644
index 00000000..12601c23
--- /dev/null
+++ b/docs/xmldocs/llama.exceptions.llamadecodeerror.md
@@ -0,0 +1,118 @@
+# LLamaDecodeError
+
+Namespace: LLama.Exceptions
+
+`llama_decode` returned a non-zero status code
+
+```csharp
+public class LLamaDecodeError : RuntimeError, System.Runtime.Serialization.ISerializable
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [Exception](https://docs.microsoft.com/en-us/dotnet/api/system.exception) → [RuntimeError](./llama.exceptions.runtimeerror.md) → [LLamaDecodeError](./llama.exceptions.llamadecodeerror.md)
+Implements [ISerializable](https://docs.microsoft.com/en-us/dotnet/api/system.runtime.serialization.iserializable)
+
+## Properties
+
+### **ReturnCode**
+
+The return status code
+
+```csharp
+public DecodeResult ReturnCode { get; }
+```
+
+#### Property Value
+
+[DecodeResult](./llama.native.decoderesult.md)
+
+### **TargetSite**
+
+```csharp
+public MethodBase TargetSite { get; }
+```
+
+#### Property Value
+
+[MethodBase](https://docs.microsoft.com/en-us/dotnet/api/system.reflection.methodbase)
+
+### **Message**
+
+```csharp
+public string Message { get; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **Data**
+
+```csharp
+public IDictionary Data { get; }
+```
+
+#### Property Value
+
+[IDictionary](https://docs.microsoft.com/en-us/dotnet/api/system.collections.idictionary)
+
+### **InnerException**
+
+```csharp
+public Exception InnerException { get; }
+```
+
+#### Property Value
+
+[Exception](https://docs.microsoft.com/en-us/dotnet/api/system.exception)
+
+### **HelpLink**
+
+```csharp
+public string HelpLink { get; set; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **Source**
+
+```csharp
+public string Source { get; set; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **HResult**
+
+```csharp
+public int HResult { get; set; }
+```
+
+#### Property Value
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **StackTrace**
+
+```csharp
+public string StackTrace { get; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+## Constructors
+
+### **LLamaDecodeError(DecodeResult)**
+
+```csharp
+public LLamaDecodeError(DecodeResult returnCode)
+```
+
+#### Parameters
+
+`returnCode` [DecodeResult](./llama.native.decoderesult.md)
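+
+A sketch of handling this error around a streaming inference call; the `executor`, `prompt` and `inferenceParams` variables are assumed to be set up as usual:
+
+```csharp
+try
+{
+    await foreach (var token in executor.InferAsync(prompt, inferenceParams))
+        Console.Write(token);
+}
+catch (LLamaDecodeError ex)
+{
+    // ReturnCode carries the raw status returned by llama_decode.
+    Console.WriteLine($"Decoding failed with status {ex.ReturnCode}");
+}
+```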
diff --git a/docs/xmldocs/llama.exceptions.loadweightsfailedexception.md b/docs/xmldocs/llama.exceptions.loadweightsfailedexception.md
new file mode 100644
index 00000000..e3ea6a5c
--- /dev/null
+++ b/docs/xmldocs/llama.exceptions.loadweightsfailedexception.md
@@ -0,0 +1,118 @@
+# LoadWeightsFailedException
+
+Namespace: LLama.Exceptions
+
+Loading model weights failed
+
+```csharp
+public class LoadWeightsFailedException : RuntimeError, System.Runtime.Serialization.ISerializable
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [Exception](https://docs.microsoft.com/en-us/dotnet/api/system.exception) → [RuntimeError](./llama.exceptions.runtimeerror.md) → [LoadWeightsFailedException](./llama.exceptions.loadweightsfailedexception.md)
+Implements [ISerializable](https://docs.microsoft.com/en-us/dotnet/api/system.runtime.serialization.iserializable)
+
+## Properties
+
+### **ModelPath**
+
+The model path which failed to load
+
+```csharp
+public string ModelPath { get; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **TargetSite**
+
+```csharp
+public MethodBase TargetSite { get; }
+```
+
+#### Property Value
+
+[MethodBase](https://docs.microsoft.com/en-us/dotnet/api/system.reflection.methodbase)
+
+### **Message**
+
+```csharp
+public string Message { get; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **Data**
+
+```csharp
+public IDictionary Data { get; }
+```
+
+#### Property Value
+
+[IDictionary](https://docs.microsoft.com/en-us/dotnet/api/system.collections.idictionary)
+
+### **InnerException**
+
+```csharp
+public Exception InnerException { get; }
+```
+
+#### Property Value
+
+[Exception](https://docs.microsoft.com/en-us/dotnet/api/system.exception)
+
+### **HelpLink**
+
+```csharp
+public string HelpLink { get; set; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **Source**
+
+```csharp
+public string Source { get; set; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **HResult**
+
+```csharp
+public int HResult { get; set; }
+```
+
+#### Property Value
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **StackTrace**
+
+```csharp
+public string StackTrace { get; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+## Constructors
+
+### **LoadWeightsFailedException(String)**
+
+```csharp
+public LoadWeightsFailedException(string modelPath)
+```
+
+#### Parameters
+
+`modelPath` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
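+
+A sketch of handling a failed model load, assuming the usual `LLamaWeights.LoadFromFile` entry point:
+
+```csharp
+var modelParams = new ModelParams("path/to/model.gguf");
+try
+{
+    using var weights = LLamaWeights.LoadFromFile(modelParams);
+}
+catch (LoadWeightsFailedException ex)
+{
+    Console.WriteLine($"Could not load weights from: {ex.ModelPath}");
+}
+```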
diff --git a/docs/xmldocs/llama.exceptions.runtimeerror.md b/docs/xmldocs/llama.exceptions.runtimeerror.md
index 7116015f..3b2f4446 100644
--- a/docs/xmldocs/llama.exceptions.runtimeerror.md
+++ b/docs/xmldocs/llama.exceptions.runtimeerror.md
@@ -2,6 +2,8 @@
Namespace: LLama.Exceptions
+Base class for LLamaSharp runtime errors (i.e. errors produced by llama.cpp, converted into exceptions)
+
```csharp
public class RuntimeError : System.Exception, System.Runtime.Serialization.ISerializable
```
@@ -93,14 +95,10 @@ public string StackTrace { get; }
## Constructors
-### **RuntimeError()**
-
-```csharp
-public RuntimeError()
-```
-
### **RuntimeError(String)**
+Create a new RuntimeError
+
```csharp
public RuntimeError(string message)
```
diff --git a/docs/xmldocs/llama.extensions.icontextparamsextensions.md b/docs/xmldocs/llama.extensions.icontextparamsextensions.md
new file mode 100644
index 00000000..3eb8de49
--- /dev/null
+++ b/docs/xmldocs/llama.extensions.icontextparamsextensions.md
@@ -0,0 +1,33 @@
+# IContextParamsExtensions
+
+Namespace: LLama.Extensions
+
+Extension methods to the IContextParams interface
+
+```csharp
+public static class IContextParamsExtensions
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [IContextParamsExtensions](./llama.extensions.icontextparamsextensions.md)
+
+## Methods
+
+### **ToLlamaContextParams(IContextParams, LLamaContextParams&)**
+
+Convert the given `IContextParams` into a `LLamaContextParams`
+
+```csharp
+public static void ToLlamaContextParams(IContextParams params, LLamaContextParams& result)
+```
+
+#### Parameters
+
+`params` [IContextParams](./llama.abstractions.icontextparams.md)
+
+`result` [LLamaContextParams&](./llama.native.llamacontextparams&.md)
+
+#### Exceptions
+
+[FileNotFoundException](https://docs.microsoft.com/en-us/dotnet/api/system.io.filenotfoundexception)
+
+[ArgumentException](https://docs.microsoft.com/en-us/dotnet/api/system.argumentexception)
diff --git a/docs/xmldocs/llama.extensions.imodelparamsextensions.md b/docs/xmldocs/llama.extensions.imodelparamsextensions.md
index 460be8f8..923f0f02 100644
--- a/docs/xmldocs/llama.extensions.imodelparamsextensions.md
+++ b/docs/xmldocs/llama.extensions.imodelparamsextensions.md
@@ -12,23 +12,23 @@ Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object)
## Methods
-### **ToLlamaContextParams(IModelParams, LLamaContextParams&)**
+### **ToLlamaModelParams(IModelParams, LLamaModelParams&)**
-Convert the given `IModelParams` into a `LLamaContextParams`
+Convert the given `IModelParams` into a `LLamaModelParams`
```csharp
-public static MemoryHandle ToLlamaContextParams(IModelParams params, LLamaContextParams& result)
+public static IDisposable ToLlamaModelParams(IModelParams params, LLamaModelParams& result)
```
#### Parameters
`params` [IModelParams](./llama.abstractions.imodelparams.md)
-`result` [LLamaContextParams&](./llama.native.llamacontextparams&.md)
+`result` [LLamaModelParams&](./llama.native.llamamodelparams&.md)
#### Returns
-[MemoryHandle](https://docs.microsoft.com/en-us/dotnet/api/system.buffers.memoryhandle)
+[IDisposable](https://docs.microsoft.com/en-us/dotnet/api/system.idisposable)
#### Exceptions
diff --git a/docs/xmldocs/llama.extensions.keyvaluepairextensions.md b/docs/xmldocs/llama.extensions.keyvaluepairextensions.md
deleted file mode 100644
index c72e1c7e..00000000
--- a/docs/xmldocs/llama.extensions.keyvaluepairextensions.md
+++ /dev/null
@@ -1,40 +0,0 @@
-# KeyValuePairExtensions
-
-Namespace: LLama.Extensions
-
-Extensions to the KeyValuePair struct
-
-```csharp
-public static class KeyValuePairExtensions
-```
-
-Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) β [KeyValuePairExtensions](./llama.extensions.keyvaluepairextensions.md)
-
-## Methods
-
-### **Deconstruct<TKey, TValue>(KeyValuePair<TKey, TValue>, TKey&, TValue&)**
-
-Deconstruct a KeyValuePair into it's constituent parts.
-
-```csharp
-public static void Deconstruct(KeyValuePair pair, TKey& first, TValue& second)
-```
-
-#### Type Parameters
-
-`TKey`
-Type of the Key
-
-`TValue`
-Type of the Value
-
-#### Parameters
-
-`pair` KeyValuePair<TKey, TValue>
-The KeyValuePair to deconstruct
-
-`first` TKey&
-First element, the Key
-
-`second` TValue&
-Second element, the Value
diff --git a/docs/xmldocs/llama.grammars.grammar.md b/docs/xmldocs/llama.grammars.grammar.md
index 3b794f45..da52c1bc 100644
--- a/docs/xmldocs/llama.grammars.grammar.md
+++ b/docs/xmldocs/llama.grammars.grammar.md
@@ -18,7 +18,7 @@ Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object)
Index of the initial rule to start from
```csharp
-public ulong StartRuleIndex { get; set; }
+public ulong StartRuleIndex { get; }
```
#### Property Value
diff --git a/docs/xmldocs/llama.grammars.grammarrule.md b/docs/xmldocs/llama.grammars.grammarrule.md
index 3cac47c8..9a6461eb 100644
--- a/docs/xmldocs/llama.grammars.grammarrule.md
+++ b/docs/xmldocs/llama.grammars.grammarrule.md
@@ -5,7 +5,7 @@ Namespace: LLama.Grammars
A single rule in a [Grammar](./llama.grammars.grammar.md)
```csharp
-public sealed class GrammarRule : System.IEquatable`1[[LLama.Grammars.GrammarRule, LLamaSharp, Version=0.5.0.0, Culture=neutral, PublicKeyToken=null]]
+public sealed class GrammarRule : System.IEquatable`1[[LLama.Grammars.GrammarRule, LLamaSharp, Version=0.0.0.0, Culture=neutral, PublicKeyToken=null]]
```
Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [GrammarRule](./llama.grammars.grammarrule.md)
diff --git a/docs/xmldocs/llama.oldversion.ichatmodel.md b/docs/xmldocs/llama.ichatmodel.md
similarity index 61%
rename from docs/xmldocs/llama.oldversion.ichatmodel.md
rename to docs/xmldocs/llama.ichatmodel.md
index 4d9a6d44..9f51ba11 100644
--- a/docs/xmldocs/llama.oldversion.ichatmodel.md
+++ b/docs/xmldocs/llama.ichatmodel.md
@@ -1,12 +1,6 @@
# IChatModel
-Namespace: LLama.OldVersion
-
-#### Caution
-
-The entire LLama.OldVersion namespace will be removed
-
----
+Namespace: LLama
```csharp
public interface IChatModel
@@ -26,10 +20,10 @@ public abstract string Name { get; }
## Methods
-### **Chat(String, String, String)**
+### **Chat(String, String)**
```csharp
-IEnumerable Chat(string text, string prompt, string encoding)
+IEnumerable Chat(string text, string prompt)
```
#### Parameters
@@ -38,26 +32,20 @@ IEnumerable Chat(string text, string prompt, string encoding)
`prompt` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-`encoding` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
#### Returns
[IEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
-### **InitChatPrompt(String, String)**
-
-Init a prompt for chat and automatically produce the next prompt during the chat.
+### **InitChatPrompt(String)**
```csharp
-void InitChatPrompt(string prompt, string encoding)
+void InitChatPrompt(string prompt)
```
#### Parameters
`prompt` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-`encoding` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
### **InitChatAntiprompt(String[])**
```csharp
diff --git a/docs/xmldocs/llama.instructexecutor.md b/docs/xmldocs/llama.instructexecutor.md
deleted file mode 100644
index 95a018eb..00000000
--- a/docs/xmldocs/llama.instructexecutor.md
+++ /dev/null
@@ -1,142 +0,0 @@
-# InstructExecutor
-
-Namespace: LLama
-
-The LLama executor for instruct mode.
-
-```csharp
-public class InstructExecutor : StatefulExecutorBase, LLama.Abstractions.ILLamaExecutor
-```
-
-Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) β [StatefulExecutorBase](./llama.statefulexecutorbase.md) β [InstructExecutor](./llama.instructexecutor.md)
-Implements [ILLamaExecutor](./llama.abstractions.illamaexecutor.md)
-
-## Properties
-
-### **Context**
-
-The context used by the executor.
-
-```csharp
-public LLamaContext Context { get; }
-```
-
-#### Property Value
-
-[LLamaContext](./llama.llamacontext.md)
-
-## Constructors
-
-### **InstructExecutor(LLamaContext, String, String)**
-
-
-
-```csharp
-public InstructExecutor(LLamaContext context, string instructionPrefix, string instructionSuffix)
-```
-
-#### Parameters
-
-`context` [LLamaContext](./llama.llamacontext.md)
-
-`instructionPrefix` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-`instructionSuffix` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-## Methods
-
-### **GetStateData()**
-
-```csharp
-public ExecutorBaseState GetStateData()
-```
-
-#### Returns
-
-[ExecutorBaseState](./llama.statefulexecutorbase.executorbasestate.md)
-
-### **LoadState(ExecutorBaseState)**
-
-```csharp
-public void LoadState(ExecutorBaseState data)
-```
-
-#### Parameters
-
-`data` [ExecutorBaseState](./llama.statefulexecutorbase.executorbasestate.md)
-
-### **SaveState(String)**
-
-```csharp
-public void SaveState(string filename)
-```
-
-#### Parameters
-
-`filename` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **LoadState(String)**
-
-```csharp
-public void LoadState(string filename)
-```
-
-#### Parameters
-
-`filename` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **GetLoopCondition(InferStateArgs)**
-
-```csharp
-protected bool GetLoopCondition(InferStateArgs args)
-```
-
-#### Parameters
-
-`args` [InferStateArgs](./llama.statefulexecutorbase.inferstateargs.md)
-
-#### Returns
-
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-
-### **PreprocessInputs(String, InferStateArgs)**
-
-```csharp
-protected void PreprocessInputs(string text, InferStateArgs args)
-```
-
-#### Parameters
-
-`text` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-`args` [InferStateArgs](./llama.statefulexecutorbase.inferstateargs.md)
-
-### **PostProcess(IInferenceParams, InferStateArgs, IEnumerable`1&)**
-
-```csharp
-protected bool PostProcess(IInferenceParams inferenceParams, InferStateArgs args, IEnumerable`1& extraOutputs)
-```
-
-#### Parameters
-
-`inferenceParams` [IInferenceParams](./llama.abstractions.iinferenceparams.md)
-
-`args` [InferStateArgs](./llama.statefulexecutorbase.inferstateargs.md)
-
-`extraOutputs` [IEnumerable`1&](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1&)
-
-#### Returns
-
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-
-### **InferInternal(IInferenceParams, InferStateArgs)**
-
-```csharp
-protected void InferInternal(IInferenceParams inferenceParams, InferStateArgs args)
-```
-
-#### Parameters
-
-`inferenceParams` [IInferenceParams](./llama.abstractions.iinferenceparams.md)
-
-`args` [InferStateArgs](./llama.statefulexecutorbase.inferstateargs.md)
diff --git a/docs/xmldocs/llama.interactiveexecutor.md b/docs/xmldocs/llama.interactiveexecutor.md
deleted file mode 100644
index 38134c40..00000000
--- a/docs/xmldocs/llama.interactiveexecutor.md
+++ /dev/null
@@ -1,142 +0,0 @@
-# InteractiveExecutor
-
-Namespace: LLama
-
-The LLama executor for interactive mode.
-
-```csharp
-public class InteractiveExecutor : StatefulExecutorBase, LLama.Abstractions.ILLamaExecutor
-```
-
-Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [StatefulExecutorBase](./llama.statefulexecutorbase.md) → [InteractiveExecutor](./llama.interactiveexecutor.md)
-Implements [ILLamaExecutor](./llama.abstractions.illamaexecutor.md)
-
-## Properties
-
-### **Context**
-
-The context used by the executor.
-
-```csharp
-public LLamaContext Context { get; }
-```
-
-#### Property Value
-
-[LLamaContext](./llama.llamacontext.md)
-
-## Constructors
-
-### **InteractiveExecutor(LLamaContext)**
-
-
-
-```csharp
-public InteractiveExecutor(LLamaContext context)
-```
-
-#### Parameters
-
-`context` [LLamaContext](./llama.llamacontext.md)
-
-## Methods
-
-### **GetStateData()**
-
-```csharp
-public ExecutorBaseState GetStateData()
-```
-
-#### Returns
-
-[ExecutorBaseState](./llama.statefulexecutorbase.executorbasestate.md)
-
-### **LoadState(ExecutorBaseState)**
-
-```csharp
-public void LoadState(ExecutorBaseState data)
-```
-
-#### Parameters
-
-`data` [ExecutorBaseState](./llama.statefulexecutorbase.executorbasestate.md)
-
-### **SaveState(String)**
-
-```csharp
-public void SaveState(string filename)
-```
-
-#### Parameters
-
-`filename` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **LoadState(String)**
-
-```csharp
-public void LoadState(string filename)
-```
-
-#### Parameters
-
-`filename` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **GetLoopCondition(InferStateArgs)**
-
-Define whether to continue the loop to generate responses.
-
-```csharp
-protected bool GetLoopCondition(InferStateArgs args)
-```
-
-#### Parameters
-
-`args` [InferStateArgs](./llama.statefulexecutorbase.inferstateargs.md)
-
-#### Returns
-
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-
-### **PreprocessInputs(String, InferStateArgs)**
-
-```csharp
-protected void PreprocessInputs(string text, InferStateArgs args)
-```
-
-#### Parameters
-
-`text` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-`args` [InferStateArgs](./llama.statefulexecutorbase.inferstateargs.md)
-
-### **PostProcess(IInferenceParams, InferStateArgs, IEnumerable`1&)**
-
-Return whether to break the generation.
-
-```csharp
-protected bool PostProcess(IInferenceParams inferenceParams, InferStateArgs args, IEnumerable`1& extraOutputs)
-```
-
-#### Parameters
-
-`inferenceParams` [IInferenceParams](./llama.abstractions.iinferenceparams.md)
-
-`args` [InferStateArgs](./llama.statefulexecutorbase.inferstateargs.md)
-
-`extraOutputs` [IEnumerable`1&](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1&)
-
-#### Returns
-
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-
-### **InferInternal(IInferenceParams, InferStateArgs)**
-
-```csharp
-protected void InferInternal(IInferenceParams inferenceParams, InferStateArgs args)
-```
-
-#### Parameters
-
-`inferenceParams` [IInferenceParams](./llama.abstractions.iinferenceparams.md)
-
-`args` [InferStateArgs](./llama.statefulexecutorbase.inferstateargs.md)
diff --git a/docs/xmldocs/llama.llamacache.md b/docs/xmldocs/llama.llamacache.md
new file mode 100644
index 00000000..c789224a
--- /dev/null
+++ b/docs/xmldocs/llama.llamacache.md
@@ -0,0 +1,59 @@
+# LLamaCache
+
+Namespace: LLama
+
+```csharp
+public class LLamaCache
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [LLamaCache](./llama.llamacache.md)
+
+## Properties
+
+### **CacheSize**
+
+```csharp
+public int CacheSize { get; }
+```
+
+#### Property Value
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **Item**
+
+```csharp
+public LLamaState Item { get; set; }
+```
+
+#### Property Value
+
+[LLamaState](./llama.llamastate.md)
+
+## Constructors
+
+### **LLamaCache(Int32)**
+
+```csharp
+public LLamaCache(int capacity)
+```
+
+#### Parameters
+
+`capacity` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+## Methods
+
+### **Contains(Int32[])**
+
+```csharp
+public bool Contains(Int32[] key)
+```
+
+#### Parameters
+
+`key` [Int32[]](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
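+
+The sketch below shows one plausible way to wire a `LLamaCache` to a `LLamaModelV1` via `SetCache` and query it afterwards. The helper name is made up, and the meaning of `capacity` (entry count vs. bytes) is an assumption, since it is not documented here.
+
+```csharp
+using System;
+using LLama;
+
+// Illustrative only: attach a state cache to an already constructed LLamaModelV1.
+public static class LLamaCacheExample
+{
+    public static void Attach(LLamaModelV1 model, int[] promptTokens)
+    {
+        // Assumption: capacity is a rough size limit; its exact unit is not specified above.
+        var cache = new LLamaCache(capacity: 8);
+        model.SetCache(cache);
+
+        // After some generation, the cache can be queried by token-sequence key.
+        Console.WriteLine($"Prompt cached: {cache.Contains(promptTokens)}, CacheSize: {cache.CacheSize}");
+    }
+}
+```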
diff --git a/docs/xmldocs/llama.llamacontext.md b/docs/xmldocs/llama.llamacontext.md
deleted file mode 100644
index 59494aee..00000000
--- a/docs/xmldocs/llama.llamacontext.md
+++ /dev/null
@@ -1,477 +0,0 @@
-# LLamaContext
-
-Namespace: LLama
-
-A llama_context, which holds all the context required to interact with a model
-
-```csharp
-public sealed class LLamaContext : System.IDisposable
-```
-
-Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [LLamaContext](./llama.llamacontext.md)
-Implements [IDisposable](https://docs.microsoft.com/en-us/dotnet/api/system.idisposable)
-
-## Properties
-
-### **VocabCount**
-
-Total number of tokens in vocabulary of this model
-
-```csharp
-public int VocabCount { get; }
-```
-
-#### Property Value
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **ContextSize**
-
-Total number of tokens in the context
-
-```csharp
-public int ContextSize { get; }
-```
-
-#### Property Value
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **EmbeddingSize**
-
-Dimension of embedding vectors
-
-```csharp
-public int EmbeddingSize { get; }
-```
-
-#### Property Value
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **Params**
-
-The model params set for this model.
-
-```csharp
-public IModelParams Params { get; set; }
-```
-
-#### Property Value
-
-[IModelParams](./llama.abstractions.imodelparams.md)
-
-### **NativeHandle**
-
-The native handle, which is used to be passed to the native APIs
-
-```csharp
-public SafeLLamaContextHandle NativeHandle { get; }
-```
-
-#### Property Value
-
-[SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-
-**Remarks:**
-
-Be careful how you use this!
-
-### **Encoding**
-
-The encoding set for this model to deal with text input.
-
-```csharp
-public Encoding Encoding { get; }
-```
-
-#### Property Value
-
-[Encoding](https://docs.microsoft.com/en-us/dotnet/api/system.text.encoding)
-
-### **EmbeddingLength**
-
-The embedding length of the model, also known as `n_embed`
-
-```csharp
-public int EmbeddingLength { get; }
-```
-
-#### Property Value
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-## Constructors
-
-### **LLamaContext(IModelParams, ILLamaLogger)**
-
-#### Caution
-
-Use the LLamaWeights.CreateContext instead
-
----
-
-
-
-```csharp
-public LLamaContext(IModelParams params, ILLamaLogger logger)
-```
-
-#### Parameters
-
-`params` [IModelParams](./llama.abstractions.imodelparams.md)
-Model params.
-
-`logger` [ILLamaLogger](./llama.common.illamalogger.md)
-The logger.
-
-### **LLamaContext(LLamaWeights, IModelParams, ILLamaLogger)**
-
-Create a new LLamaContext for the given LLamaWeights
-
-```csharp
-public LLamaContext(LLamaWeights model, IModelParams params, ILLamaLogger logger)
-```
-
-#### Parameters
-
-`model` [LLamaWeights](./llama.llamaweights.md)
-
-`params` [IModelParams](./llama.abstractions.imodelparams.md)
-
-`logger` [ILLamaLogger](./llama.common.illamalogger.md)
-
-#### Exceptions
-
-[ObjectDisposedException](https://docs.microsoft.com/en-us/dotnet/api/system.objectdisposedexception)
-
-## Methods
-
-### **Clone()**
-
-Create a copy of the current state of this context
-
-```csharp
-public LLamaContext Clone()
-```
-
-#### Returns
-
-[LLamaContext](./llama.llamacontext.md)
-
-### **Tokenize(String, Boolean)**
-
-Tokenize a string.
-
-```csharp
-public Int32[] Tokenize(string text, bool addBos)
-```
-
-#### Parameters
-
-`text` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-`addBos` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-Whether to add a bos to the text.
-
-#### Returns
-
-[Int32[]](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **DeTokenize(IEnumerable<Int32>)**
-
-Detokenize the tokens to text.
-
-```csharp
-public string DeTokenize(IEnumerable<int> tokens)
-```
-
-#### Parameters
-
-`tokens` [IEnumerable<Int32>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
-
-#### Returns
-
-[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **SaveState(String)**
-
-Save the state to specified path.
-
-```csharp
-public void SaveState(string filename)
-```
-
-#### Parameters
-
-`filename` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **GetStateData()**
-
-#### Caution
-
-Use `GetState` instead, this supports larger states (over 2GB)
-
----
-
-Get the state data as a byte array.
-
-```csharp
-public Byte[] GetStateData()
-```
-
-#### Returns
-
-[Byte[]](https://docs.microsoft.com/en-us/dotnet/api/system.byte)
-
-### **GetState()**
-
-Get the state data as an opaque handle
-
-```csharp
-public State GetState()
-```
-
-#### Returns
-
-[State](./llama.llamacontext.state.md)
-
-### **LoadState(String)**
-
-Load the state from specified path.
-
-```csharp
-public void LoadState(string filename)
-```
-
-#### Parameters
-
-`filename` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-#### Exceptions
-
-[RuntimeError](./llama.exceptions.runtimeerror.md)
-
-### **LoadState(Byte[])**
-
-Load the state from memory.
-
-```csharp
-public void LoadState(Byte[] stateData)
-```
-
-#### Parameters
-
-`stateData` [Byte[]](https://docs.microsoft.com/en-us/dotnet/api/system.byte)
-
-#### Exceptions
-
-[RuntimeError](./llama.exceptions.runtimeerror.md)
-
-### **LoadState(State)**
-
-Load the state from memory.
-
-```csharp
-public void LoadState(State state)
-```
-
-#### Parameters
-
-`state` [State](./llama.llamacontext.state.md)
-
-#### Exceptions
-
-[RuntimeError](./llama.exceptions.runtimeerror.md)
-
-### **Sample(LLamaTokenDataArray, Nullable`1&, Single, MirostatType, Single, Single, Int32, Single, Single, Single, SafeLLamaGrammarHandle)**
-
-Perform the sampling. Please don't use it unless you fully know what it does.
-
-```csharp
-public int Sample(LLamaTokenDataArray candidates, Nullable`1& mirostat_mu, float temperature, MirostatType mirostat, float mirostatTau, float mirostatEta, int topK, float topP, float tfsZ, float typicalP, SafeLLamaGrammarHandle grammar)
-```
-
-#### Parameters
-
-`candidates` [LLamaTokenDataArray](./llama.native.llamatokendataarray.md)
-
-`mirostat_mu` [Nullable`1&](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1&)
-
-`temperature` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-
-`mirostat` [MirostatType](./llama.common.mirostattype.md)
-
-`mirostatTau` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-
-`mirostatEta` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-
-`topK` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-`topP` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-
-`tfsZ` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-
-`typicalP` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-
-`grammar` [SafeLLamaGrammarHandle](./llama.native.safellamagrammarhandle.md)
-
-#### Returns
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **ApplyPenalty(IEnumerable<Int32>, Dictionary<Int32, Single>, Int32, Single, Single, Single, Boolean)**
-
-Apply the penalty for the tokens. Please don't use it unless you fully know what it does.
-
-```csharp
-public LLamaTokenDataArray ApplyPenalty(IEnumerable<int> lastTokens, Dictionary<int, float> logitBias, int repeatLastTokensCount, float repeatPenalty, float alphaFrequency, float alphaPresence, bool penalizeNL)
-```
-
-#### Parameters
-
-`lastTokens` [IEnumerable<Int32>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
-
-`logitBias` [Dictionary<Int32, Single>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.dictionary-2)
-
-`repeatLastTokensCount` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-`repeatPenalty` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-
-`alphaFrequency` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-
-`alphaPresence` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-
-`penalizeNL` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-
-#### Returns
-
-[LLamaTokenDataArray](./llama.native.llamatokendataarray.md)
-
-### **Eval(Int32[], Int32)**
-
-
-
-```csharp
-public int Eval(Int32[] tokens, int pastTokensCount)
-```
-
-#### Parameters
-
-`tokens` [Int32[]](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-`pastTokensCount` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-#### Returns
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-The updated `pastTokensCount`.
-
-#### Exceptions
-
-[RuntimeError](./llama.exceptions.runtimeerror.md)
-
-### **Eval(List<Int32>, Int32)**
-
-
-
-```csharp
-public int Eval(List<int> tokens, int pastTokensCount)
-```
-
-#### Parameters
-
-`tokens` [List<Int32>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.list-1)
-
-`pastTokensCount` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-#### Returns
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-The updated `pastTokensCount`.
-
-#### Exceptions
-
-[RuntimeError](./llama.exceptions.runtimeerror.md)
-
-### **Eval(ReadOnlyMemory<Int32>, Int32)**
-
-
-
-```csharp
-public int Eval(ReadOnlyMemory<int> tokens, int pastTokensCount)
-```
-
-#### Parameters
-
-`tokens` [ReadOnlyMemory<Int32>](https://docs.microsoft.com/en-us/dotnet/api/system.readonlymemory-1)
-
-`pastTokensCount` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-#### Returns
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-The updated `pastTokensCount`.
-
-#### Exceptions
-
-[RuntimeError](./llama.exceptions.runtimeerror.md)
-
-### **Eval(ReadOnlySpan<Int32>, Int32)**
-
-
-
-```csharp
-public int Eval(ReadOnlySpan<int> tokens, int pastTokensCount)
-```
-
-#### Parameters
-
-`tokens` [ReadOnlySpan<Int32>](https://docs.microsoft.com/en-us/dotnet/api/system.readonlyspan-1)
-
-`pastTokensCount` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-#### Returns
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-The updated `pastTokensCount`.
-
-#### Exceptions
-
-[RuntimeError](./llama.exceptions.runtimeerror.md)
-
-### **GenerateResult(IEnumerable<Int32>)**
-
-```csharp
-internal IEnumerable<string> GenerateResult(IEnumerable<int> ids)
-```
-
-#### Parameters
-
-`ids` [IEnumerable<Int32>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
-
-#### Returns
-
-[IEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
-
-### **TokenToString(Int32)**
-
-Convert a token into a string
-
-```csharp
-public string TokenToString(int token)
-```
-
-#### Parameters
-
-`token` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-#### Returns
-
-[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **Dispose()**
-
-```csharp
-public void Dispose()
-```
diff --git a/docs/xmldocs/llama.llamaembedder.md b/docs/xmldocs/llama.llamaembedder.md
index 77057207..333c856e 100644
--- a/docs/xmldocs/llama.llamaembedder.md
+++ b/docs/xmldocs/llama.llamaembedder.md
@@ -5,136 +5,39 @@ Namespace: LLama
The embedder for LLama, which supports getting embeddings from text.
```csharp
-public sealed class LLamaEmbedder : System.IDisposable
+public class LLamaEmbedder
```
-Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [LLamaEmbedder](./llama.llamaembedder.md)
-Implements [IDisposable](https://docs.microsoft.com/en-us/dotnet/api/system.idisposable)
-
-## Properties
-
-### **EmbeddingSize**
-
-Dimension of embedding vectors
-
-```csharp
-public int EmbeddingSize { get; }
-```
-
-#### Property Value
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [LLamaEmbedder](./llama.llamaembedder.md)
## Constructors
-### **LLamaEmbedder(IModelParams)**
-
-
+### **LLamaEmbedder(LLamaParams)**
```csharp
-public LLamaEmbedder(IModelParams params)
+public LLamaEmbedder(LLamaParams params)
```
#### Parameters
-`params` [IModelParams](./llama.abstractions.imodelparams.md)
-
-### **LLamaEmbedder(LLamaWeights, IModelParams)**
-
-```csharp
-public LLamaEmbedder(LLamaWeights weights, IModelParams params)
-```
-
-#### Parameters
-
-`weights` [LLamaWeights](./llama.llamaweights.md)
-
-`params` [IModelParams](./llama.abstractions.imodelparams.md)
+`params` [LLamaParams](./llama.llamaparams.md)
## Methods
-### **GetEmbeddings(String, Int32, Boolean, String)**
-
-#### Caution
-
-'threads' and 'encoding' parameters are no longer used
-
----
-
-Get the embeddings of the text.
-
-```csharp
-public Single[] GetEmbeddings(string text, int threads, bool addBos, string encoding)
-```
-
-#### Parameters
-
-`text` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-`threads` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-unused
-
-`addBos` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-Add bos to the text.
-
-`encoding` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-unused
-
-#### Returns
-
-[Single[]](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-
-#### Exceptions
-
-[RuntimeError](./llama.exceptions.runtimeerror.md)
-
-### **GetEmbeddings(String)**
-
-Get the embeddings of the text.
+### **GetEmbeddings(String, Int32, Boolean)**
```csharp
-public Single[] GetEmbeddings(string text)
+public Single[] GetEmbeddings(string text, int n_thread, bool add_bos)
```
#### Parameters
`text` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-#### Returns
-
-[Single[]](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-
-#### Exceptions
-
-[RuntimeError](./llama.exceptions.runtimeerror.md)
+`n_thread` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-### **GetEmbeddings(String, Boolean)**
-
-Get the embeddings of the text.
-
-```csharp
-public Single[] GetEmbeddings(string text, bool addBos)
-```
-
-#### Parameters
-
-`text` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-`addBos` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-Add bos to the text.
+`add_bos` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
#### Returns
[Single[]](https://docs.microsoft.com/en-us/dotnet/api/system.single)
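+
+A minimal sketch of the embedding call documented above. It assumes the `LLamaParams` constructor provides defaults so that only the model path has to be supplied; the thread count and helper name are illustrative.
+
+```csharp
+using LLama;
+
+public static class EmbedderExample
+{
+    public static float[] Embed(string modelPath, string text)
+    {
+        // Assumption: LLamaParams has default values for everything except the model path.
+        var embedder = new LLamaEmbedder(new LLamaParams(model: modelPath));
+
+        // n_thread sets the CPU thread count; add_bos prepends the BOS token to the input.
+        return embedder.GetEmbeddings(text, n_thread: 4, add_bos: true);
+    }
+}
+```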
-
-#### Exceptions
-
-[RuntimeError](./llama.exceptions.runtimeerror.md)
-
-### **Dispose()**
-
-
-
-```csharp
-public void Dispose()
-```
diff --git a/docs/xmldocs/llama.llamamodel.md b/docs/xmldocs/llama.llamamodel.md
new file mode 100644
index 00000000..4c927a24
--- /dev/null
+++ b/docs/xmldocs/llama.llamamodel.md
@@ -0,0 +1,226 @@
+# LLamaModel
+
+Namespace: LLama
+
+```csharp
+public class LLamaModel : IChatModel
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [LLamaModel](./llama.llamamodel.md)
+Implements [IChatModel](./llama.ichatmodel.md)
+
+## Properties
+
+### **Name**
+
+```csharp
+public string Name { get; set; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **NativeHandle**
+
+```csharp
+public SafeLLamaContextHandle NativeHandle { get; }
+```
+
+#### Property Value
+
+[SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+## Constructors
+
+### **LLamaModel(String, String, Boolean, Boolean, Int32, Int32, Int32, Int32, Int32, Int32, Int32, Dictionary<Int32, Single>, Int32, Single, Single, Single, Single, Single, Int32, Single, Single, Int32, Single, Single, String, String, String, String, List<String>, String, String, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean)**
+
+```csharp
+public LLamaModel(string model_path, string model_name, bool echo_input, bool verbose, int seed, int n_threads, int n_predict, int n_parts, int n_ctx, int n_batch, int n_keep, Dictionary<int, float> logit_bias, int top_k, float top_p, float tfs_z, float typical_p, float temp, float repeat_penalty, int repeat_last_n, float frequency_penalty, float presence_penalty, int mirostat, float mirostat_tau, float mirostat_eta, string prompt, string path_session, string input_prefix, string input_suffix, List<string> antiprompt, string lora_adapter, string lora_base, bool memory_f16, bool random_prompt, bool use_color, bool interactive, bool embedding, bool interactive_first, bool instruct, bool penalize_nl, bool perplexity, bool use_mmap, bool use_mlock, bool mem_test, bool verbose_prompt)
+```
+
+#### Parameters
+
+`model_path` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`model_name` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`echo_input` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`verbose` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`seed` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`n_threads` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`n_predict` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`n_parts` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`n_ctx` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`n_batch` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`n_keep` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`logit_bias` [Dictionary<Int32, Single>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.dictionary-2)
+
+`top_k` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`top_p` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`tfs_z` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`typical_p` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`temp` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`repeat_penalty` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`repeat_last_n` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`frequency_penalty` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`presence_penalty` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`mirostat` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`mirostat_tau` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`mirostat_eta` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`prompt` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`path_session` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`input_prefix` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`input_suffix` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`antiprompt` [List<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.list-1)
+
+`lora_adapter` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`lora_base` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`memory_f16` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`random_prompt` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`use_color` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`interactive` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`embedding` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`interactive_first` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`instruct` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`penalize_nl` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`perplexity` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`use_mmap` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`use_mlock` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`mem_test` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`verbose_prompt` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **LLamaModel(LLamaParams, String, Boolean, Boolean)**
+
+```csharp
+public LLamaModel(LLamaParams params, string name, bool echo_input, bool verbose)
+```
+
+#### Parameters
+
+`params` [LLamaParams](./llama.llamaparams.md)
+
+`name` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`echo_input` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`verbose` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+## Methods
+
+### **WithPrompt(String)**
+
+```csharp
+public LLamaModel WithPrompt(string prompt)
+```
+
+#### Parameters
+
+`prompt` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+#### Returns
+
+[LLamaModel](./llama.llamamodel.md)
+
+### **WithPromptFile(String)**
+
+```csharp
+public LLamaModel WithPromptFile(string promptFileName)
+```
+
+#### Parameters
+
+`promptFileName` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+#### Returns
+
+[LLamaModel](./llama.llamamodel.md)
+
+### **InitChatPrompt(String)**
+
+```csharp
+public void InitChatPrompt(string prompt)
+```
+
+#### Parameters
+
+`prompt` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **InitChatAntiprompt(String[])**
+
+```csharp
+public void InitChatAntiprompt(String[] antiprompt)
+```
+
+#### Parameters
+
+`antiprompt` [String[]](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **Chat(String, String)**
+
+```csharp
+public IEnumerable<string> Chat(string text, string prompt)
+```
+
+#### Parameters
+
+`text` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`prompt` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+#### Returns
+
+[IEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
+
+### **Call(String)**
+
+```csharp
+public IEnumerable<string> Call(string text)
+```
+
+#### Parameters
+
+`text` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+#### Returns
+
+[IEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
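+
+The following is a rough end-to-end sketch of the chat-style API documented above. It assumes the `LLamaParams` constructor has usable defaults beyond the model path; the prompt text and helper name are illustrative.
+
+```csharp
+using System;
+using LLama;
+
+public static class LLamaModelExample
+{
+    public static void Run(string modelPath)
+    {
+        // Assumption: LLamaParams defaults are sufficient apart from the model path.
+        var model = new LLamaModel(new LLamaParams(model: modelPath, interactive: true),
+                                   "assistant", echo_input: false, verbose: false)
+            .WithPrompt("Below is a conversation between a user and a helpful assistant.");
+
+        // Call(text) streams the model's reply piece by piece.
+        foreach (var piece in model.Call("What is a llama?"))
+            Console.Write(piece);
+    }
+}
+```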
diff --git a/docs/xmldocs/llama.llamamodelv1.md b/docs/xmldocs/llama.llamamodelv1.md
new file mode 100644
index 00000000..a4d02d00
--- /dev/null
+++ b/docs/xmldocs/llama.llamamodelv1.md
@@ -0,0 +1,369 @@
+# LLamaModelV1
+
+Namespace: LLama
+
+#### Caution
+
+This type is obsolete.
+
+---
+
+```csharp
+public class LLamaModelV1
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [LLamaModelV1](./llama.llamamodelv1.md)
+
+## Constructors
+
+### **LLamaModelV1(String, Int32, Int32, Int32, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean, Int32, Int32, Int32, String, String, Boolean)**
+
+```csharp
+public LLamaModelV1(string model_path, int n_ctx, int n_parts, int seed, bool f16_kv, bool logits_all, bool vocab_only, bool use_mmap, bool use_mlock, bool embedding, int n_threads, int n_batch, int last_n_tokens_size, string lora_base, string lora_path, bool verbose)
+```
+
+#### Parameters
+
+`model_path` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`n_ctx` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`n_parts` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`seed` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`f16_kv` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`logits_all` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`vocab_only` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`use_mmap` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`use_mlock` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`embedding` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`n_threads` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`n_batch` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`last_n_tokens_size` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`lora_base` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`lora_path` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`verbose` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **LLamaModelV1(LLamaModelV1)**
+
+```csharp
+public LLamaModelV1(LLamaModelV1 other)
+```
+
+#### Parameters
+
+`other` [LLamaModelV1](./llama.llamamodelv1.md)
+
+## Methods
+
+### **Tokenize(String)**
+
+```csharp
+public List<int> Tokenize(string text)
+```
+
+#### Parameters
+
+`text` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+#### Returns
+
+[List<Int32>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.list-1)
+
+### **DeTokenize(IEnumerable<Int32>)**
+
+```csharp
+public string DeTokenize(IEnumerable<int> tokens)
+```
+
+#### Parameters
+
+`tokens` [IEnumerable<Int32>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
+
+#### Returns
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **DeTokenize(Int32)**
+
+```csharp
+public string DeTokenize(int token)
+```
+
+#### Parameters
+
+`token` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+#### Returns
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **SetCache(LLamaCache)**
+
+```csharp
+public void SetCache(LLamaCache cache)
+```
+
+#### Parameters
+
+`cache` [LLamaCache](./llama.llamacache.md)
+
+### **Reset()**
+
+```csharp
+public void Reset()
+```
+
+### **Eval(List<Int32>)**
+
+```csharp
+public void Eval(List<int> tokens)
+```
+
+#### Parameters
+
+`tokens` [List<Int32>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.list-1)
+
+### **Sample(Int32, Single, Single, Single, Single, Single)**
+
+```csharp
+public int Sample(int top_k, float top_p, float temp, float repeat_penalty, float frequency_penalty, float presence_penalty)
+```
+
+#### Parameters
+
+`top_k` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`top_p` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`temp` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`repeat_penalty` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`frequency_penalty` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`presence_penalty` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **Generate(IEnumerable<Int32>, Int32, Single, Single, Single, Single, Single, Boolean)**
+
+```csharp
+public IEnumerable<int> Generate(IEnumerable<int> tokens, int top_k, float top_p, float temp, float repeat_penalty, float frequency_penalty, float presence_penalty, bool reset)
+```
+
+#### Parameters
+
+`tokens` [IEnumerable<Int32>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
+
+`top_k` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`top_p` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`temp` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`repeat_penalty` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`frequency_penalty` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`presence_penalty` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`reset` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+#### Returns
+
+[IEnumerable<Int32>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
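+
+A low-level sketch of the token pipeline documented above (`Tokenize` → `Generate` → `DeTokenize`). The sampling values are illustrative placeholders, not recommendations.
+
+```csharp
+using System;
+using LLama;
+
+public static class LLamaModelV1Example
+{
+    public static void Stream(LLamaModelV1 model, string prompt)
+    {
+        var tokens = model.Tokenize(prompt);
+
+        // Generate evaluates the prompt and then samples tokens one at a time.
+        var generated = model.Generate(tokens,
+            top_k: 40, top_p: 0.95f, temp: 0.8f,
+            repeat_penalty: 1.1f, frequency_penalty: 0f, presence_penalty: 0f,
+            reset: true);
+
+        foreach (var token in generated)
+            Console.Write(model.DeTokenize(token));
+    }
+}
+```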
+
+### **CreateEmbedding(String)**
+
+```csharp
+public Embedding CreateEmbedding(string input)
+```
+
+#### Parameters
+
+`input` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+#### Returns
+
+[Embedding](./llama.types.embedding.md)
+
+### **Embed(String)**
+
+```csharp
+public Single[] Embed(string input)
+```
+
+#### Parameters
+
+`input` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+#### Returns
+
+[Single[]](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+### **CreateCompletion(String, String, Int32, Single, Single, Int32, Boolean, String[], Single, Single, Single, Int32)**
+
+```csharp
+public IEnumerable<CompletionChunk> CreateCompletion(string prompt, string suffix, int max_tokens, float temperature, float top_p, int logprobs, bool echo, String[] stop, float frequency_penalty, float presence_penalty, float repeat_penalty, int top_k)
+```
+
+#### Parameters
+
+`prompt` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`suffix` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`max_tokens` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`temperature` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`top_p` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`logprobs` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`echo` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`stop` [String[]](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`frequency_penalty` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`presence_penalty` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`repeat_penalty` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`top_k` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+#### Returns
+
+[IEnumerable<CompletionChunk>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
+
+### **Call(String, String, Int32, Single, Single, Int32, Boolean, String[], Single, Single, Single, Int32)**
+
+```csharp
+public IEnumerable<CompletionChunk> Call(string prompt, string suffix, int max_tokens, float temperature, float top_p, int logprobs, bool echo, String[] stop, float frequency_penalty, float presence_penalty, float repeat_penalty, int top_k)
+```
+
+#### Parameters
+
+`prompt` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`suffix` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`max_tokens` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`temperature` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`top_p` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`logprobs` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`echo` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`stop` [String[]](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`frequency_penalty` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`presence_penalty` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`repeat_penalty` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`top_k` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+#### Returns
+
+[IEnumerable<CompletionChunk>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
+
+### **CreateChatCompletion(IEnumerable<ChatCompletionMessage>, Single, Single, Int32, String[], Int32, Single, Single, Single)**
+
+```csharp
+public IEnumerable<ChatCompletionChunk> CreateChatCompletion(IEnumerable<ChatCompletionMessage> messages, float temperature, float top_p, int top_k, String[] stop, int max_tokens, float presence_penalty, float frequency_penalty, float repeat_penalty)
+```
+
+#### Parameters
+
+`messages` [IEnumerable<ChatCompletionMessage>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
+
+`temperature` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`top_p` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`top_k` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`stop` [String[]](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`max_tokens` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`presence_penalty` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`frequency_penalty` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`repeat_penalty` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+#### Returns
+
+[IEnumerable<ChatCompletionChunk>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
+
+### **SaveState()**
+
+```csharp
+public LLamaState SaveState()
+```
+
+#### Returns
+
+[LLamaState](./llama.llamastate.md)
+
+### **LoadState(LLamaState)**
+
+```csharp
+public void LoadState(LLamaState state)
+```
+
+#### Parameters
+
+`state` [LLamaState](./llama.llamastate.md)
+
+### **LongestTokenPrefix(IEnumerable<Int32>, IEnumerable<Int32>)**
+
+```csharp
+internal static int LongestTokenPrefix(IEnumerable<int> a, IEnumerable<int> b)
+```
+
+#### Parameters
+
+`a` [IEnumerable<Int32>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
+
+`b` [IEnumerable<Int32>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **<CreateChatCompletion>g__GetRole|31_0(ChatCompletionMessage)**
+
+```csharp
+internal static string <CreateChatCompletion>g__GetRole|31_0(ChatCompletionMessage message)
+```
+
+#### Parameters
+
+`message` [ChatCompletionMessage](./llama.types.chatcompletionmessage.md)
+
+#### Returns
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
diff --git a/docs/xmldocs/llama.oldversion.llamaparams.md b/docs/xmldocs/llama.llamaparams.md
similarity index 83%
rename from docs/xmldocs/llama.oldversion.llamaparams.md
rename to docs/xmldocs/llama.llamaparams.md
index 911fa2d8..cb74af2a 100644
--- a/docs/xmldocs/llama.oldversion.llamaparams.md
+++ b/docs/xmldocs/llama.llamaparams.md
@@ -1,18 +1,12 @@
# LLamaParams
-Namespace: LLama.OldVersion
-
-#### Caution
-
-The entire LLama.OldVersion namespace will be removed
-
----
+Namespace: LLama
```csharp
public struct LLamaParams
```
-Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [LLamaParams](./llama.oldversion.llamaparams.md)
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [LLamaParams](./llama.llamaparams.md)
## Fields
@@ -34,6 +28,12 @@ public int n_threads;
public int n_predict;
```
+### **n_parts**
+
+```csharp
+public int n_parts;
+```
+
### **n_ctx**
```csharp
@@ -52,12 +52,6 @@ public int n_batch;
public int n_keep;
```
-### **n_gpu_layers**
-
-```csharp
-public int n_gpu_layers;
-```
-
### **logit_bias**
```csharp
@@ -208,12 +202,6 @@ public bool use_color;
public bool interactive;
```
-### **prompt_cache_all**
-
-```csharp
-public bool prompt_cache_all;
-```
-
### **embedding**
```csharp
@@ -270,10 +258,10 @@ public bool verbose_prompt;
## Constructors
-### **LLamaParams(Int32, Int32, Int32, Int32, Int32, Int32, Int32, Dictionary<Int32, Single>, Int32, Single, Single, Single, Single, Single, Int32, Single, Single, Int32, Single, Single, String, String, String, String, String, List<String>, String, String, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean)**
+### **LLamaParams(Int32, Int32, Int32, Int32, Int32, Int32, Int32, Dictionary<Int32, Single>, Int32, Single, Single, Single, Single, Single, Int32, Single, Single, Int32, Single, Single, String, String, String, String, String, List<String>, String, String, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean, Boolean)**
```csharp
-LLamaParams(int seed, int n_threads, int n_predict, int n_ctx, int n_batch, int n_keep, int n_gpu_layers, Dictionary<int, float> logit_bias, int top_k, float top_p, float tfs_z, float typical_p, float temp, float repeat_penalty, int repeat_last_n, float frequency_penalty, float presence_penalty, int mirostat, float mirostat_tau, float mirostat_eta, string model, string prompt, string path_session, string input_prefix, string input_suffix, List<string> antiprompt, string lora_adapter, string lora_base, bool memory_f16, bool random_prompt, bool use_color, bool interactive, bool prompt_cache_all, bool embedding, bool interactive_first, bool instruct, bool penalize_nl, bool perplexity, bool use_mmap, bool use_mlock, bool mem_test, bool verbose_prompt)
+LLamaParams(int seed, int n_threads, int n_predict, int n_parts, int n_ctx, int n_batch, int n_keep, Dictionary<int, float> logit_bias, int top_k, float top_p, float tfs_z, float typical_p, float temp, float repeat_penalty, int repeat_last_n, float frequency_penalty, float presence_penalty, int mirostat, float mirostat_tau, float mirostat_eta, string model, string prompt, string path_session, string input_prefix, string input_suffix, List<string> antiprompt, string lora_adapter, string lora_base, bool memory_f16, bool random_prompt, bool use_color, bool interactive, bool embedding, bool interactive_first, bool instruct, bool penalize_nl, bool perplexity, bool use_mmap, bool use_mlock, bool mem_test, bool verbose_prompt)
```
#### Parameters
@@ -284,14 +272,14 @@ LLamaParams(int seed, int n_threads, int n_predict, int n_ctx, int n_batch, int
`n_predict` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+`n_parts` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
`n_ctx` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
`n_batch` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
`n_keep` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-`n_gpu_layers` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
`logit_bias` [Dictionary<Int32, Single>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.dictionary-2)
`top_k` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
@@ -342,8 +330,6 @@ LLamaParams(int seed, int n_threads, int n_predict, int n_ctx, int n_batch, int
`interactive` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-`prompt_cache_all` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-
`embedding` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
`interactive_first` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
diff --git a/docs/xmldocs/llama.llamastate.md b/docs/xmldocs/llama.llamastate.md
new file mode 100644
index 00000000..4db410f0
--- /dev/null
+++ b/docs/xmldocs/llama.llamastate.md
@@ -0,0 +1,160 @@
+# LLamaState
+
+Namespace: LLama
+
+```csharp
+public class LLamaState : System.IEquatable`1[[LLama.LLamaState, LLamaSharp, Version=0.2.0.0, Culture=neutral, PublicKeyToken=null]]
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [LLamaState](./llama.llamastate.md)
+Implements [IEquatable<LLamaState>](https://docs.microsoft.com/en-us/dotnet/api/system.iequatable-1)
+
+## Properties
+
+### **EvalTokens**
+
+```csharp
+public Queue<int> EvalTokens { get; set; }
+```
+
+#### Property Value
+
+[Queue<Int32>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.queue-1)
+
+### **EvalLogits**
+
+```csharp
+public Queue<float[]> EvalLogits { get; set; }
+```
+
+#### Property Value
+
+[Queue<Single[]>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.queue-1)
+
+### **State**
+
+```csharp
+public Byte[] State { get; set; }
+```
+
+#### Property Value
+
+[Byte[]](https://docs.microsoft.com/en-us/dotnet/api/system.byte)
+
+### **Size**
+
+```csharp
+public int Size { get; set; }
+```
+
+#### Property Value
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+## Constructors
+
+### **LLamaState(Queue<Int32>, Queue<Single[]>, Byte[], Int32)**
+
+```csharp
+public LLamaState(Queue<int> EvalTokens, Queue<float[]> EvalLogits, Byte[] State, int Size)
+```
+
+#### Parameters
+
+`EvalTokens` [Queue<Int32>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.queue-1)
+
+`EvalLogits` [Queue<Single[]>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.queue-1)
+
+`State` [Byte[]](https://docs.microsoft.com/en-us/dotnet/api/system.byte)
+
+`Size` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+## Methods
+
+### **ToString()**
+
+```csharp
+public string ToString()
+```
+
+#### Returns
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **PrintMembers(StringBuilder)**
+
+```csharp
+protected bool PrintMembers(StringBuilder builder)
+```
+
+#### Parameters
+
+`builder` [StringBuilder](https://docs.microsoft.com/en-us/dotnet/api/system.text.stringbuilder)
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **GetHashCode()**
+
+```csharp
+public int GetHashCode()
+```
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **Equals(Object)**
+
+```csharp
+public bool Equals(object obj)
+```
+
+#### Parameters
+
+`obj` [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object)
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **Equals(LLamaState)**
+
+```csharp
+public bool Equals(LLamaState other)
+```
+
+#### Parameters
+
+`other` [LLamaState](./llama.llamastate.md)
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **<Clone>$()**
+
+```csharp
+public LLamaState <Clone>$()
+```
+
+#### Returns
+
+[LLamaState](./llama.llamastate.md)
+
+### **Deconstruct(Queue`1&, Queue`1&, Byte[]&, Int32&)**
+
+```csharp
+public void Deconstruct(Queue`1& EvalTokens, Queue`1& EvalLogits, Byte[]& State, Int32& Size)
+```
+
+#### Parameters
+
+`EvalTokens` [Queue`1&](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.queue-1&)
+
+`EvalLogits` [Queue`1&](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.queue-1&)
+
+`State` [Byte[]&](https://docs.microsoft.com/en-us/dotnet/api/system.byte&)
+
+`Size` [Int32&](https://docs.microsoft.com/en-us/dotnet/api/system.int32&)
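+
+A brief sketch of how `LLamaState` is used for a save/restore round trip with `LLamaModelV1`, based on the `SaveState`/`LoadState` methods documented earlier in this diff. The helper name is made up.
+
+```csharp
+using LLama;
+
+public static class LLamaStateExample
+{
+    public static void RoundTrip(LLamaModelV1 model)
+    {
+        // Capture the executor state (eval queues plus the raw native state bytes).
+        LLamaState snapshot = model.SaveState();
+
+        // ... run some inference here ...
+
+        // Restore the earlier snapshot.
+        model.LoadState(snapshot);
+    }
+}
+```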
diff --git a/docs/xmldocs/llama.llamaweights.md b/docs/xmldocs/llama.llamaweights.md
deleted file mode 100644
index 3b448c62..00000000
--- a/docs/xmldocs/llama.llamaweights.md
+++ /dev/null
@@ -1,118 +0,0 @@
-# LLamaWeights
-
-Namespace: LLama
-
-A set of model weights, loaded into memory.
-
-```csharp
-public sealed class LLamaWeights : System.IDisposable
-```
-
-Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [LLamaWeights](./llama.llamaweights.md)
-Implements [IDisposable](https://docs.microsoft.com/en-us/dotnet/api/system.idisposable)
-
-## Properties
-
-### **NativeHandle**
-
-The native handle, which is used in the native APIs
-
-```csharp
-public SafeLlamaModelHandle NativeHandle { get; }
-```
-
-#### Property Value
-
-[SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
-
-**Remarks:**
-
-Be careful how you use this!
-
-### **Encoding**
-
-Encoding to use to convert text into bytes for the model
-
-```csharp
-public Encoding Encoding { get; }
-```
-
-#### Property Value
-
-[Encoding](https://docs.microsoft.com/en-us/dotnet/api/system.text.encoding)
-
-### **VocabCount**
-
-Total number of tokens in vocabulary of this model
-
-```csharp
-public int VocabCount { get; }
-```
-
-#### Property Value
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **ContextSize**
-
-Total number of tokens in the context
-
-```csharp
-public int ContextSize { get; }
-```
-
-#### Property Value
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **EmbeddingSize**
-
-Dimension of embedding vectors
-
-```csharp
-public int EmbeddingSize { get; }
-```
-
-#### Property Value
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-## Methods
-
-### **LoadFromFile(IModelParams)**
-
-Load weights into memory
-
-```csharp
-public static LLamaWeights LoadFromFile(IModelParams params)
-```
-
-#### Parameters
-
-`params` [IModelParams](./llama.abstractions.imodelparams.md)
-
-#### Returns
-
-[LLamaWeights](./llama.llamaweights.md)
-
-### **Dispose()**
-
-```csharp
-public void Dispose()
-```
-
-### **CreateContext(IModelParams)**
-
-Create a llama_context using this model
-
-```csharp
-public LLamaContext CreateContext(IModelParams params)
-```
-
-#### Parameters
-
-`params` [IModelParams](./llama.abstractions.imodelparams.md)
-
-#### Returns
-
-[LLamaContext](./llama.llamacontext.md)
diff --git a/docs/xmldocs/llama.llavaweights.md b/docs/xmldocs/llama.llavaweights.md
new file mode 100644
index 00000000..c44fafea
--- /dev/null
+++ b/docs/xmldocs/llama.llavaweights.md
@@ -0,0 +1,119 @@
+# LLavaWeights
+
+Namespace: LLama
+
+A set of llava model weights (mmproj), loaded into memory.
+
+```csharp
+public sealed class LLavaWeights : System.IDisposable
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [LLavaWeights](./llama.llavaweights.md)
+Implements [IDisposable](https://docs.microsoft.com/en-us/dotnet/api/system.idisposable)
+
+## Properties
+
+### **NativeHandle**
+
+The native handle, which is used in the native APIs
+
+```csharp
+public SafeLlavaModelHandle NativeHandle { get; }
+```
+
+#### Property Value
+
+[SafeLlavaModelHandle](./llama.native.safellavamodelhandle.md)
+
+**Remarks:**
+
+Be careful how you use this!
+
+## Methods
+
+### **LoadFromFile(String)**
+
+Load weights into memory
+
+```csharp
+public static LLavaWeights LoadFromFile(string mmProject)
+```
+
+#### Parameters
+
+`mmProject` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+path to the "mmproj" model file
+
+#### Returns
+
+[LLavaWeights](./llama.llavaweights.md)
+
+### **CreateImageEmbeddings(LLamaContext, Byte[])**
+
+Create the Image Embeddings from the bytes of an image.
+
+```csharp
+public SafeLlavaImageEmbedHandle CreateImageEmbeddings(LLamaContext ctxLlama, Byte[] image)
+```
+
+#### Parameters
+
+`ctxLlama` [LLamaContext](./llama.llamacontext.md)
+
+`image` [Byte[]](https://docs.microsoft.com/en-us/dotnet/api/system.byte)
+Image bytes. Supported formats:
+ JPG, PNG, BMP and TGA
+
+#### Returns
+
+[SafeLlavaImageEmbedHandle](./llama.native.safellavaimageembedhandle.md)
+
+### **CreateImageEmbeddings(LLamaContext, String)**
+
+Create the Image Embeddings from an image file.
+
+```csharp
+public SafeLlavaImageEmbedHandle CreateImageEmbeddings(LLamaContext ctxLlama, string image)
+```
+
+#### Parameters
+
+`ctxLlama` [LLamaContext](./llama.llamacontext.md)
+
+`image` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+Path to the image file. Supported formats:
+ JPG, PNG, BMP and TGA
+
+#### Returns
+
+[SafeLlavaImageEmbedHandle](./llama.native.safellavaimageembedhandle.md)
+
+#### Exceptions
+
+[InvalidOperationException](https://docs.microsoft.com/en-us/dotnet/api/system.invalidoperationexception)
+
+### **EvalImageEmbed(LLamaContext, SafeLlavaImageEmbedHandle, Int32&)**
+
+Eval the image embeddings
+
+```csharp
+public bool EvalImageEmbed(LLamaContext ctxLlama, SafeLlavaImageEmbedHandle imageEmbed, Int32& n_past)
+```
+
+#### Parameters
+
+`ctxLlama` [LLamaContext](./llama.llamacontext.md)
+
+`imageEmbed` [SafeLlavaImageEmbedHandle](./llama.native.safellavaimageembedhandle.md)
+
+`n_past` [Int32&](https://docs.microsoft.com/en-us/dotnet/api/system.int32&)
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **Dispose()**
+
+```csharp
+public void Dispose()
+```
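+
+A tentative sketch of the llava flow described above. It assumes a `LLamaContext` for the language model already exists and that `n_past` tracks how many tokens have been evaluated so far; the helper name is illustrative.
+
+```csharp
+using LLama;
+
+public static class LlavaExample
+{
+    public static void EmbedImage(LLamaContext context, string mmprojPath, string imagePath, ref int n_past)
+    {
+        // Load the multimodal projection (mmproj) weights.
+        using var clip = LLavaWeights.LoadFromFile(mmprojPath);
+
+        // Turn the image into embeddings, then feed them into the language model context.
+        var embed = clip.CreateImageEmbeddings(context, imagePath);
+        clip.EvalImageEmbed(context, embed, ref n_past);
+    }
+}
+```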
diff --git a/docs/xmldocs/llama.native.decoderesult.md b/docs/xmldocs/llama.native.decoderesult.md
new file mode 100644
index 00000000..86ff26b8
--- /dev/null
+++ b/docs/xmldocs/llama.native.decoderesult.md
@@ -0,0 +1,20 @@
+# DecodeResult
+
+Namespace: LLama.Native
+
+Return codes from llama_decode
+
+```csharp
+public enum DecodeResult
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [Enum](https://docs.microsoft.com/en-us/dotnet/api/system.enum) → [DecodeResult](./llama.native.decoderesult.md)
+Implements [IComparable](https://docs.microsoft.com/en-us/dotnet/api/system.icomparable), [IFormattable](https://docs.microsoft.com/en-us/dotnet/api/system.iformattable), [IConvertible](https://docs.microsoft.com/en-us/dotnet/api/system.iconvertible)
+
+## Fields
+
+| Name | Value | Description |
+| --- | --: | --- |
+| Error | -1 | An unspecified error |
+| Ok | 0 | Ok. |
+| NoKvSlot | 1 | Could not find a KV slot for the batch (try reducing the size of the batch or increasing the context) |
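+
+The snippet below shows one reasonable way to act on a `DecodeResult` value; how the value is obtained from the native decode call is not shown, and the helper name is made up.
+
+```csharp
+using System;
+using LLama.Native;
+
+public static class DecodeResultExample
+{
+    public static void ThrowIfFailed(DecodeResult result)
+    {
+        switch (result)
+        {
+            case DecodeResult.Ok:
+                break; // success, nothing to do
+            case DecodeResult.NoKvSlot:
+                throw new InvalidOperationException("No KV slot available - reduce the batch size or enlarge the context.");
+            default:
+                throw new InvalidOperationException($"llama_decode failed: {result}");
+        }
+    }
+}
+```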
diff --git a/docs/xmldocs/llama.native.ggmltype.md b/docs/xmldocs/llama.native.ggmltype.md
new file mode 100644
index 00000000..2fa955d0
--- /dev/null
+++ b/docs/xmldocs/llama.native.ggmltype.md
@@ -0,0 +1,35 @@
+# GGMLType
+
+Namespace: LLama.Native
+
+Possible GGML quantisation types
+
+```csharp
+public enum GGMLType
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [Enum](https://docs.microsoft.com/en-us/dotnet/api/system.enum) → [GGMLType](./llama.native.ggmltype.md)
+Implements [IComparable](https://docs.microsoft.com/en-us/dotnet/api/system.icomparable), [IFormattable](https://docs.microsoft.com/en-us/dotnet/api/system.iformattable), [IConvertible](https://docs.microsoft.com/en-us/dotnet/api/system.iconvertible)
+
+## Fields
+
+| Name | Value | Description |
+| --- | --: | --- |
+| GGML_TYPE_F32 | 0 | Full 32 bit float |
+| GGML_TYPE_F16 | 1 | 16 bit float |
+| GGML_TYPE_Q4_0 | 2 | 4 bit float |
+| GGML_TYPE_Q4_1 | 3 | 4 bit float |
+| GGML_TYPE_Q5_0 | 6 | 5 bit float |
+| GGML_TYPE_Q5_1 | 7 | 5 bit float |
+| GGML_TYPE_Q8_0 | 8 | 8 bit float |
+| GGML_TYPE_Q8_1 | 9 | 8 bit float |
+| GGML_TYPE_Q2_K | 10 | "type-1" 2-bit quantization in super-blocks containing 16 blocks, each block having 16 weights. Block scales and mins are quantized with 4 bits. This ends up effectively using 2.5625 bits per weight (bpw) |
+| GGML_TYPE_Q3_K | 11 | "type-0" 3-bit quantization in super-blocks containing 16 blocks, each block having 16 weights. Scales are quantized with 6 bits. This ends up using 3.4375 bpw. |
+| GGML_TYPE_Q4_K | 12 | "type-1" 4-bit quantization in super-blocks containing 8 blocks, each block having 32 weights. Scales and mins are quantized with 6 bits. This ends up using 4.5 bpw. |
+| GGML_TYPE_Q5_K | 13 | "type-1" 5-bit quantization. Same super-block structure as GGML_TYPE_Q4_K resulting in 5.5 bpw |
+| GGML_TYPE_Q6_K | 14 | "type-0" 6-bit quantization. Super-blocks with 16 blocks, each block having 16 weights. Scales are quantized with 8 bits. This ends up using 6.5625 bpw |
+| GGML_TYPE_Q8_K | 15 | "type-0" 8-bit quantization. Only used for quantizing intermediate results. The difference to the existing Q8_0 is that the block size is 256. All 2-6 bit dot products are implemented for this quantization type. |
+| GGML_TYPE_I8 | 16 | Integer, 8 bit |
+| GGML_TYPE_I16 | 17 | Integer, 16 bit |
+| GGML_TYPE_I32 | 18 | Integer, 32 bit |
+| GGML_TYPE_COUNT | 19 | The value of this entry is the number of possible quant types. |
diff --git a/docs/xmldocs/llama.native.gpusplitmode.md b/docs/xmldocs/llama.native.gpusplitmode.md
new file mode 100644
index 00000000..756637be
--- /dev/null
+++ b/docs/xmldocs/llama.native.gpusplitmode.md
@@ -0,0 +1,24 @@
+# GPUSplitMode
+
+Namespace: LLama.Native
+
+
+
+```csharp
+public enum GPUSplitMode
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [Enum](https://docs.microsoft.com/en-us/dotnet/api/system.enum) → [GPUSplitMode](./llama.native.gpusplitmode.md)
+Implements [IComparable](https://docs.microsoft.com/en-us/dotnet/api/system.icomparable), [IFormattable](https://docs.microsoft.com/en-us/dotnet/api/system.iformattable), [IConvertible](https://docs.microsoft.com/en-us/dotnet/api/system.iconvertible)
+
+**Remarks:**
+
+llama_split_mode
+
+## Fields
+
+| Name | Value | Description |
+| --- | --: | --- |
+| None | 0 | Single GPU |
+| Layer | 1 | Split layers and KV across GPUs |
+| Row | 2 | Split rows across GPUs |
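+
+As a sketch of how this enum is typically consumed, the `split_mode` and `main_gpu` fields used below belong to the [LLamaModelParams](./llama.native.llamamodelparams.md) struct documented elsewhere in these docs; how that struct is obtained and passed to the native API is left to the caller.
+
+```csharp
+// Illustrative only: select layer-wise splitting on a model-params value.
+LLamaModelParams modelParams = default;
+modelParams.split_mode = GPUSplitMode.Layer;   // split layers (and KV) across GPUs
+modelParams.main_gpu = 0;                      // GPU used for scratch and small tensors
+```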
diff --git a/docs/xmldocs/llama.native.llamabatch.md b/docs/xmldocs/llama.native.llamabatch.md
new file mode 100644
index 00000000..72a97796
--- /dev/null
+++ b/docs/xmldocs/llama.native.llamabatch.md
@@ -0,0 +1,204 @@
+# LLamaBatch
+
+Namespace: LLama.Native
+
+A batch allows submitting multiple tokens to multiple sequences simultaneously
+
+```csharp
+public class LLamaBatch
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [LLamaBatch](./llama.native.llamabatch.md)
+
+## Properties
+
+### **TokenCount**
+
+The number of tokens in this batch
+
+```csharp
+public int TokenCount { get; private set; }
+```
+
+#### Property Value
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **SequenceCapacity**
+
+Maximum number of sequences a token can be assigned to (automatically grows if exceeded)
+
+```csharp
+public int SequenceCapacity { get; private set; }
+```
+
+#### Property Value
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+## Constructors
+
+### **LLamaBatch()**
+
+Create a new batch for submitting inputs to llama.cpp
+
+```csharp
+public LLamaBatch()
+```
+
+## Methods
+
+### **ToNativeBatch(LLamaNativeBatch&)**
+
+```csharp
+internal GroupDisposable ToNativeBatch(LLamaNativeBatch& batch)
+```
+
+#### Parameters
+
+`batch` [LLamaNativeBatch&](./llama.native.llamanativebatch&.md)
+
+#### Returns
+
+[GroupDisposable](./llama.native.groupdisposable.md)
+
+### **Add(LLamaToken, LLamaPos, ReadOnlySpan<LLamaSeqId>, Boolean)**
+
+Add a single token to the batch at the same position in several sequences
+
+```csharp
+public int Add(LLamaToken token, LLamaPos pos, ReadOnlySpan sequences, bool logits)
+```
+
+#### Parameters
+
+`token` [LLamaToken](./llama.native.llamatoken.md)
+The token to add
+
+`pos` [LLamaPos](./llama.native.llamapos.md)
+The position to add it at
+
+`sequences` [ReadOnlySpan<LLamaSeqId>](https://docs.microsoft.com/en-us/dotnet/api/system.readonlyspan-1)
+The set of sequences to add this token to
+
+`logits` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+The index that the token was added at. Use this for GetLogitsIth
+
+**Remarks:**
+
+https://github.com/ggerganov/llama.cpp/blob/ad939626577cd25b462e8026cc543efb71528472/common/common.cpp#L829C2-L829C2
+
+### **Add(LLamaToken, LLamaPos, List<LLamaSeqId>, Boolean)**
+
+Add a single token to the batch at the same position in several sequences
+
+```csharp
+public int Add(LLamaToken token, LLamaPos pos, List sequences, bool logits)
+```
+
+#### Parameters
+
+`token` [LLamaToken](./llama.native.llamatoken.md)
+The token to add
+
+`pos` [LLamaPos](./llama.native.llamapos.md)
+The position to add it at
+
+`sequences` [List<LLamaSeqId>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.list-1)
+The set of sequences to add this token to
+
+`logits` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+The index that the token was added at. Use this for GetLogitsIth
+
+**Remarks:**
+
+https://github.com/ggerganov/llama.cpp/blob/ad939626577cd25b462e8026cc543efb71528472/common/common.cpp#L829C2-L829C2
+
+### **Add(LLamaToken, LLamaPos, LLamaSeqId, Boolean)**
+
+Add a single token to the batch at a certain position for a single sequence
+
+```csharp
+public int Add(LLamaToken token, LLamaPos pos, LLamaSeqId sequence, bool logits)
+```
+
+#### Parameters
+
+`token` [LLamaToken](./llama.native.llamatoken.md)
+The token to add
+
+`pos` [LLamaPos](./llama.native.llamapos.md)
+The position to add it at
+
+`sequence` [LLamaSeqId](./llama.native.llamaseqid.md)
+The sequence to add this token to
+
+`logits` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+The index that the token was added at. Use this for GetLogitsIth
+
+**Remarks:**
+
+https://github.com/ggerganov/llama.cpp/blob/ad939626577cd25b462e8026cc543efb71528472/common/common.cpp#L829C2-L829C2
+
+### **AddRange(ReadOnlySpan<LLamaToken>, LLamaPos, LLamaSeqId, Boolean)**
+
+Add a range of tokens to a single sequence, starting at the given position.
+
+```csharp
+public int AddRange(ReadOnlySpan tokens, LLamaPos start, LLamaSeqId sequence, bool logitsLast)
+```
+
+#### Parameters
+
+`tokens` [ReadOnlySpan<LLamaToken>](https://docs.microsoft.com/en-us/dotnet/api/system.readonlyspan-1)
+The tokens to add
+
+`start` [LLamaPos](./llama.native.llamapos.md)
+The starting position to add tokens at
+
+`sequence` [LLamaSeqId](./llama.native.llamaseqid.md)
+The sequence to add these tokens to
+
+`logitsLast` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+Whether the final token should generate logits
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+The index that the final token was added at. Use this for GetLogitsIth
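+
+An illustrative sketch of filling a batch with a prompt; the `GetPromptTokens` helper is hypothetical and stands in for whatever tokenization the caller performs.
+
+```csharp
+// Hypothetical helper: GetPromptTokens() returns an already-tokenized prompt.
+LLamaToken[] promptTokens = GetPromptTokens();
+
+var batch = new LLamaBatch();
+
+// Add the whole prompt to sequence 0 starting at position 0, requesting logits for the final token only.
+LLamaPos start = default;
+int logitsIndex = batch.AddRange(promptTokens, start, LLamaSeqId.Zero, logitsLast: true);
+
+// ... submit the batch for decoding, sample using `logitsIndex`, then reuse the batch:
+batch.Clear();
+```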
+
+### **Clear()**
+
+Set TokenCount to zero for this batch
+
+```csharp
+public void Clear()
+```
+
+### **GetLogitPositions(Span<ValueTuple<LLamaSeqId, Int32>>)**
+
+Get the positions where logits can be sampled from
+
+```csharp
+internal Span> GetLogitPositions(Span> dest)
+```
+
+#### Parameters
+
+`dest` [Span<ValueTuple<LLamaSeqId, Int32>>](https://docs.microsoft.com/en-us/dotnet/api/system.span-1)
+
+#### Returns
+
+[Span<ValueTuple<LLamaSeqId, Int32>>](https://docs.microsoft.com/en-us/dotnet/api/system.span-1)
diff --git a/docs/xmldocs/llama.native.llamabeamsstate.md b/docs/xmldocs/llama.native.llamabeamsstate.md
new file mode 100644
index 00000000..5bdf73d4
--- /dev/null
+++ b/docs/xmldocs/llama.native.llamabeamsstate.md
@@ -0,0 +1,45 @@
+# LLamaBeamsState
+
+Namespace: LLama.Native
+
+Passed to beam_search_callback function.
+ Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
+ (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
+
+```csharp
+public struct LLamaBeamsState
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [LLamaBeamsState](./llama.native.llamabeamsstate.md)
+
+## Fields
+
+### **CommonPrefixLength**
+
+Current max length of prefix tokens shared by all beams.
+
+```csharp
+public ulong CommonPrefixLength;
+```
+
+### **LastCall**
+
+True iff this is the last callback invocation.
+
+```csharp
+public bool LastCall;
+```
+
+## Properties
+
+### **Beams**
+
+The current state of each beam
+
+```csharp
+public Span Beams { get; }
+```
+
+#### Property Value
+
+[Span<LLamaBeamView>](https://docs.microsoft.com/en-us/dotnet/api/system.span-1)
diff --git a/docs/xmldocs/llama.native.llamabeamview.md b/docs/xmldocs/llama.native.llamabeamview.md
new file mode 100644
index 00000000..f23eb95c
--- /dev/null
+++ b/docs/xmldocs/llama.native.llamabeamview.md
@@ -0,0 +1,43 @@
+# LLamaBeamView
+
+Namespace: LLama.Native
+
+Information about a single beam in a beam search
+
+```csharp
+public struct LLamaBeamView
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [LLamaBeamView](./llama.native.llamabeamview.md)
+
+## Fields
+
+### **CumulativeProbability**
+
+Cumulative beam probability (renormalized relative to all beams)
+
+```csharp
+public float CumulativeProbability;
+```
+
+### **EndOfBeam**
+
+Callback should set this to true when a beam is at end-of-beam.
+
+```csharp
+public bool EndOfBeam;
+```
+
+## Properties
+
+### **Tokens**
+
+Tokens in this beam
+
+```csharp
+public Span Tokens { get; }
+```
+
+#### Property Value
+
+[Span<LLamaToken>](https://docs.microsoft.com/en-us/dotnet/api/system.span-1)
diff --git a/docs/xmldocs/llama.native.llamachatmessage.md b/docs/xmldocs/llama.native.llamachatmessage.md
new file mode 100644
index 00000000..7b0c9035
--- /dev/null
+++ b/docs/xmldocs/llama.native.llamachatmessage.md
@@ -0,0 +1,29 @@
+# LLamaChatMessage
+
+Namespace: LLama.Native
+
+
+
+```csharp
+public struct LLamaChatMessage
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [LLamaChatMessage](./llama.native.llamachatmessage.md)
+
+**Remarks:**
+
+llama_chat_message
+
+## Fields
+
+### **role**
+
+```csharp
+public Byte* role;
+```
+
+### **content**
+
+```csharp
+public Byte* content;
+```
diff --git a/docs/xmldocs/llama.native.llamacontextparams.md b/docs/xmldocs/llama.native.llamacontextparams.md
index 0b9ba61e..2bb397c6 100644
--- a/docs/xmldocs/llama.native.llamacontextparams.md
+++ b/docs/xmldocs/llama.native.llamacontextparams.md
@@ -17,15 +17,15 @@ Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object)
RNG seed, -1 for random
```csharp
-public int seed;
+public uint seed;
```
### **n_ctx**
-text context
+text context, 0 = from model
```csharp
-public int n_ctx;
+public uint n_ctx;
```
### **n_batch**
@@ -33,37 +33,36 @@ public int n_ctx;
prompt processing batch size
```csharp
-public int n_batch;
+public uint n_batch;
```
-### **n_gpu_layers**
+### **n_threads**
-number of layers to store in VRAM
+number of threads to use for generation
```csharp
-public int n_gpu_layers;
+public uint n_threads;
```
-### **main_gpu**
+### **n_threads_batch**
-the GPU that is used for scratch and small tensors
+number of threads to use for batch processing
```csharp
-public int main_gpu;
+public uint n_threads_batch;
```
-### **tensor_split**
+### **rope_scaling_type**
-how to split layers across multiple GPUs
+RoPE scaling type, from `enum llama_rope_scaling_type`
```csharp
-public IntPtr tensor_split;
+public RopeScalingType rope_scaling_type;
```
### **rope_freq_base**
-ref: https://github.com/ggerganov/llama.cpp/pull/2054
- RoPE base frequency
+RoPE base frequency, 0 = from model
```csharp
public float rope_freq_base;
@@ -71,121 +70,124 @@ public float rope_freq_base;
### **rope_freq_scale**
-ref: https://github.com/ggerganov/llama.cpp/pull/2054
- RoPE frequency scaling factor
+RoPE frequency scaling factor, 0 = from model
```csharp
public float rope_freq_scale;
```
-### **progress_callback**
+### **yarn_ext_factor**
-called with a progress value between 0 and 1, pass NULL to disable
+YaRN extrapolation mix factor, negative = from model
```csharp
-public IntPtr progress_callback;
+public float yarn_ext_factor;
```
-### **progress_callback_user_data**
+### **yarn_attn_factor**
-context pointer passed to the progress callback
+YaRN magnitude scaling factor
```csharp
-public IntPtr progress_callback_user_data;
+public float yarn_attn_factor;
```
-## Properties
-
-### **low_vram**
+### **yarn_beta_fast**
-if true, reduce VRAM usage at the cost of performance
+YaRN low correction dim
```csharp
-public bool low_vram { get; set; }
+public float yarn_beta_fast;
```
-#### Property Value
-
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-
-### **mul_mat_q**
+### **yarn_beta_slow**
-if true, use experimental mul_mat_q kernels
+YaRN high correction dim
```csharp
-public bool mul_mat_q { get; set; }
+public float yarn_beta_slow;
```
-#### Property Value
+### **yarn_orig_ctx**
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+YaRN original context size
-### **f16_kv**
+```csharp
+public uint yarn_orig_ctx;
+```
-use fp16 for KV cache
+### **defrag_threshold**
+
+defragment the KV cache if holes/size > defrag_threshold; set to < 0 to disable (default)
```csharp
-public bool f16_kv { get; set; }
+public float defrag_threshold;
```
-#### Property Value
+### **cb_eval**
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+ggml_backend_sched_eval_callback
+
+```csharp
+public IntPtr cb_eval;
+```
-### **logits_all**
+### **cb_eval_user_data**
-the llama_eval() call computes all logits, not just the last one
+User data passed into cb_eval
```csharp
-public bool logits_all { get; set; }
+public IntPtr cb_eval_user_data;
```
-#### Property Value
+### **type_k**
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+data type for K cache
+
+```csharp
+public GGMLType type_k;
+```
-### **vocab_only**
+### **type_v**
-only load the vocabulary, no weights
+data type for V cache
```csharp
-public bool vocab_only { get; set; }
+public GGMLType type_v;
```
-#### Property Value
-
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+## Properties
-### **use_mmap**
+### **embedding**
-use mmap if possible
+embedding mode only
```csharp
-public bool use_mmap { get; set; }
+public bool embedding { get; set; }
```
#### Property Value
[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-### **use_mlock**
+### **offload_kqv**
-force system to keep model in RAM
+whether to offload the KQV ops (including the KV cache) to GPU
```csharp
-public bool use_mlock { get; set; }
+public bool offload_kqv { get; set; }
```
#### Property Value
[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-### **embedding**
+### **do_pooling**
-embedding mode only
+Whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
```csharp
-public bool embedding { get; set; }
+public bool do_pooling { get; set; }
```
#### Property Value
diff --git a/docs/xmldocs/llama.native.llamaftype.md b/docs/xmldocs/llama.native.llamaftype.md
index 7b98173d..6f982cf1 100644
--- a/docs/xmldocs/llama.native.llamaftype.md
+++ b/docs/xmldocs/llama.native.llamaftype.md
@@ -32,4 +32,16 @@ Implements [IComparable](https://docs.microsoft.com/en-us/dotnet/api/system.icom
| LLAMA_FTYPE_MOSTLY_Q5_K_S | 16 | K-Quant 5 bit (Small) |
| LLAMA_FTYPE_MOSTLY_Q5_K_M | 17 | K-Quant 5 bit (Medium) |
| LLAMA_FTYPE_MOSTLY_Q6_K | 18 | K-Quant 6 bit |
+| LLAMA_FTYPE_MOSTLY_IQ2_XXS | 19 | except 1d tensors |
+| LLAMA_FTYPE_MOSTLY_IQ2_XS | 20 | except 1d tensors |
+| LLAMA_FTYPE_MOSTLY_Q2_K_S | 21 | except 1d tensors |
+| LLAMA_FTYPE_MOSTLY_IQ3_K_XS | 22 | except 1d tensors |
+| LLAMA_FTYPE_MOSTLY_IQ3_XXS | 23 | except 1d tensors |
+| LLAMA_FTYPE_MOSTLY_IQ1_S | 24 | except 1d tensors |
+| LLAMA_FTYPE_MOSTLY_IQ4_NL | 25 | except 1d tensors |
+| LLAMA_FTYPE_MOSTLY_IQ3_S | 26 | except 1d tensors |
+| LLAMA_FTYPE_MOSTLY_IQ3_M | 27 | except 1d tensors |
+| LLAMA_FTYPE_MOSTLY_IQ2_S | 28 | except 1d tensors |
+| LLAMA_FTYPE_MOSTLY_IQ2_M | 29 | except 1d tensors |
+| LLAMA_FTYPE_MOSTLY_IQ4_XS | 30 | except 1d tensors |
| LLAMA_FTYPE_GUESSED | 1024 | File type was not specified |
diff --git a/docs/xmldocs/llama.native.llamagrammarelement.md b/docs/xmldocs/llama.native.llamagrammarelement.md
index c836c3cf..60bb882a 100644
--- a/docs/xmldocs/llama.native.llamagrammarelement.md
+++ b/docs/xmldocs/llama.native.llamagrammarelement.md
@@ -47,33 +47,25 @@ LLamaGrammarElement(LLamaGrammarElementType type, uint value)
## Methods
-### **Equals(LLamaGrammarElement)**
+### **IsCharElement()**
```csharp
-bool Equals(LLamaGrammarElement other)
+bool IsCharElement()
```
-#### Parameters
-
-`other` [LLamaGrammarElement](./llama.native.llamagrammarelement.md)
-
#### Returns
[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-### **Equals(Object)**
+### **ToString()**
```csharp
-bool Equals(object obj)
+string ToString()
```
-#### Parameters
-
-`obj` [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object)
-
#### Returns
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
### **GetHashCode()**
@@ -85,12 +77,30 @@ int GetHashCode()
[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-### **IsCharElement()**
+### **Equals(Object)**
```csharp
-bool IsCharElement()
+bool Equals(object obj)
```
+#### Parameters
+
+`obj` [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object)
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **Equals(LLamaGrammarElement)**
+
+```csharp
+bool Equals(LLamaGrammarElement other)
+```
+
+#### Parameters
+
+`other` [LLamaGrammarElement](./llama.native.llamagrammarelement.md)
+
#### Returns
[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
diff --git a/docs/xmldocs/llama.native.llamakvcacheview.md b/docs/xmldocs/llama.native.llamakvcacheview.md
new file mode 100644
index 00000000..d77f7216
--- /dev/null
+++ b/docs/xmldocs/llama.native.llamakvcacheview.md
@@ -0,0 +1,11 @@
+# LLamaKvCacheView
+
+Namespace: LLama.Native
+
+An updateable view of the KV cache (llama_kv_cache_view)
+
+```csharp
+public struct LLamaKvCacheView
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [LLamaKvCacheView](./llama.native.llamakvcacheview.md)
diff --git a/docs/xmldocs/llama.native.llamakvcacheviewcell.md b/docs/xmldocs/llama.native.llamakvcacheviewcell.md
new file mode 100644
index 00000000..599de961
--- /dev/null
+++ b/docs/xmldocs/llama.native.llamakvcacheviewcell.md
@@ -0,0 +1,22 @@
+# LLamaKvCacheViewCell
+
+Namespace: LLama.Native
+
+Information associated with an individual cell in the KV cache view (llama_kv_cache_view_cell)
+
+```csharp
+public struct LLamaKvCacheViewCell
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [LLamaKvCacheViewCell](./llama.native.llamakvcacheviewcell.md)
+
+## Fields
+
+### **pos**
+
+The position for this cell. Takes KV cache shifts into account.
+ May be negative if the cell is not populated.
+
+```csharp
+public LLamaPos pos;
+```
diff --git a/docs/xmldocs/llama.native.llamakvcacheviewsafehandle.md b/docs/xmldocs/llama.native.llamakvcacheviewsafehandle.md
new file mode 100644
index 00000000..55ce1188
--- /dev/null
+++ b/docs/xmldocs/llama.native.llamakvcacheviewsafehandle.md
@@ -0,0 +1,101 @@
+# LLamaKvCacheViewSafeHandle
+
+Namespace: LLama.Native
+
+A safe handle for a LLamaKvCacheView
+
+```csharp
+public class LLamaKvCacheViewSafeHandle : SafeLLamaHandleBase, System.IDisposable
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [CriticalFinalizerObject](https://docs.microsoft.com/en-us/dotnet/api/system.runtime.constrainedexecution.criticalfinalizerobject) → [SafeHandle](https://docs.microsoft.com/en-us/dotnet/api/system.runtime.interopservices.safehandle) → [SafeLLamaHandleBase](./llama.native.safellamahandlebase.md) → [LLamaKvCacheViewSafeHandle](./llama.native.llamakvcacheviewsafehandle.md)
+Implements [IDisposable](https://docs.microsoft.com/en-us/dotnet/api/system.idisposable)
+
+## Properties
+
+### **IsInvalid**
+
+```csharp
+public bool IsInvalid { get; }
+```
+
+#### Property Value
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **IsClosed**
+
+```csharp
+public bool IsClosed { get; }
+```
+
+#### Property Value
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+## Constructors
+
+### **LLamaKvCacheViewSafeHandle(SafeLLamaContextHandle, LLamaKvCacheView)**
+
+Initialize a LLamaKvCacheViewSafeHandle which will call `llama_kv_cache_view_free` when disposed
+
+```csharp
+public LLamaKvCacheViewSafeHandle(SafeLLamaContextHandle ctx, LLamaKvCacheView view)
+```
+
+#### Parameters
+
+`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+`view` [LLamaKvCacheView](./llama.native.llamakvcacheview.md)
+
+## Methods
+
+### **Allocate(SafeLLamaContextHandle, Int32)**
+
+Allocate a new KV cache view which can be used to inspect the KV cache
+
+```csharp
+public static LLamaKvCacheViewSafeHandle Allocate(SafeLLamaContextHandle ctx, int maxSequences)
+```
+
+#### Parameters
+
+`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+`maxSequences` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+The maximum number of sequences visible in this view per cell
+
+#### Returns
+
+[LLamaKvCacheViewSafeHandle](./llama.native.llamakvcacheviewsafehandle.md)
+
+### **ReleaseHandle()**
+
+```csharp
+protected bool ReleaseHandle()
+```
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **Update()**
+
+Update this view
+
+```csharp
+public void Update()
+```
+
+### **GetView()**
+
+Get the raw KV cache view
+
+```csharp
+public LLamaKvCacheView& GetView()
+```
+
+#### Returns
+
+[LLamaKvCacheView&](./llama.native.llamakvcacheview&.md)
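+
+A short usage sketch, assuming a valid [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md) named `ctx` is already available.
+
+```csharp
+// Allocate a view, refresh it, and (optionally) inspect the raw native struct.
+using (LLamaKvCacheViewSafeHandle view = LLamaKvCacheViewSafeHandle.Allocate(ctx, maxSequences: 1))
+{
+    view.Update();                                   // pull the current KV cache state into the view
+    ref LLamaKvCacheView raw = ref view.GetView();   // raw llama_kv_cache_view data, if needed
+}
+```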
diff --git a/docs/xmldocs/llama.native.llamaloglevel.md b/docs/xmldocs/llama.native.llamaloglevel.md
new file mode 100644
index 00000000..5c54507d
--- /dev/null
+++ b/docs/xmldocs/llama.native.llamaloglevel.md
@@ -0,0 +1,21 @@
+# LLamaLogLevel
+
+Namespace: LLama.Native
+
+Severity level of a log message
+
+```csharp
+public enum LLamaLogLevel
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [Enum](https://docs.microsoft.com/en-us/dotnet/api/system.enum) → [LLamaLogLevel](./llama.native.llamaloglevel.md)
+Implements [IComparable](https://docs.microsoft.com/en-us/dotnet/api/system.icomparable), [IFormattable](https://docs.microsoft.com/en-us/dotnet/api/system.iformattable), [IConvertible](https://docs.microsoft.com/en-us/dotnet/api/system.iconvertible)
+
+## Fields
+
+| Name | Value | Description |
+| --- | --: | --- |
+| Error | 2 | Logs that highlight when the current flow of execution is stopped due to a failure. |
+| Warning | 3 | Logs that highlight an abnormal or unexpected event in the application flow, but do not otherwise cause the application execution to stop. |
+| Info | 4 | Logs that track the general flow of the application. |
+| Debug | 5 | Logs that are used for interactive investigation during development. |
diff --git a/docs/xmldocs/llama.native.llamamodelkvoverridetype.md b/docs/xmldocs/llama.native.llamamodelkvoverridetype.md
new file mode 100644
index 00000000..43bf1397
--- /dev/null
+++ b/docs/xmldocs/llama.native.llamamodelkvoverridetype.md
@@ -0,0 +1,24 @@
+# LLamaModelKvOverrideType
+
+Namespace: LLama.Native
+
+Specifies what type of value is being overridden by LLamaModelKvOverride
+
+```csharp
+public enum LLamaModelKvOverrideType
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [Enum](https://docs.microsoft.com/en-us/dotnet/api/system.enum) → [LLamaModelKvOverrideType](./llama.native.llamamodelkvoverridetype.md)
+Implements [IComparable](https://docs.microsoft.com/en-us/dotnet/api/system.icomparable), [IFormattable](https://docs.microsoft.com/en-us/dotnet/api/system.iformattable), [IConvertible](https://docs.microsoft.com/en-us/dotnet/api/system.iconvertible)
+
+**Remarks:**
+
+llama_model_kv_override_type
+
+## Fields
+
+| Name | Value | Description |
+| --- | --: | --- |
+| Int | 0 | Overriding an int value |
+| Float | 1 | Overriding a float value |
+| Bool | 2 | Overriding a bool value |
diff --git a/docs/xmldocs/llama.native.llamamodelmetadataoverride.md b/docs/xmldocs/llama.native.llamamodelmetadataoverride.md
new file mode 100644
index 00000000..4b069e63
--- /dev/null
+++ b/docs/xmldocs/llama.native.llamamodelmetadataoverride.md
@@ -0,0 +1,53 @@
+# LLamaModelMetadataOverride
+
+Namespace: LLama.Native
+
+Override a key/value pair in the llama model metadata (llama_model_kv_override)
+
+```csharp
+public struct LLamaModelMetadataOverride
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [LLamaModelMetadataOverride](./llama.native.llamamodelmetadataoverride.md)
+
+## Fields
+
+### **key**
+
+Key to override
+
+```csharp
+public e__FixedBuffer key;
+```
+
+### **Tag**
+
+Type of value
+
+```csharp
+public LLamaModelKvOverrideType Tag;
+```
+
+### **IntValue**
+
+Value, **must** only be used if Tag == LLAMA_KV_OVERRIDE_INT
+
+```csharp
+public long IntValue;
+```
+
+### **FloatValue**
+
+Value, **must** only be used if Tag == LLAMA_KV_OVERRIDE_FLOAT
+
+```csharp
+public double FloatValue;
+```
+
+### **BoolValue**
+
+Value, **must** only be used if Tag == LLAMA_KV_OVERRIDE_BOOL
+
+```csharp
+public long BoolValue;
+```
diff --git a/docs/xmldocs/llama.native.llamamodelparams.md b/docs/xmldocs/llama.native.llamamodelparams.md
new file mode 100644
index 00000000..ca9a2982
--- /dev/null
+++ b/docs/xmldocs/llama.native.llamamodelparams.md
@@ -0,0 +1,108 @@
+# LLamaModelParams
+
+Namespace: LLama.Native
+
+A C# representation of the llama.cpp `llama_model_params` struct
+
+```csharp
+public struct LLamaModelParams
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [LLamaModelParams](./llama.native.llamamodelparams.md)
+
+## Fields
+
+### **n_gpu_layers**
+
+number of layers to store in VRAM
+
+```csharp
+public int n_gpu_layers;
+```
+
+### **split_mode**
+
+how to split the model across multiple GPUs
+
+```csharp
+public GPUSplitMode split_mode;
+```
+
+### **main_gpu**
+
+the GPU that is used for scratch and small tensors
+
+```csharp
+public int main_gpu;
+```
+
+### **tensor_split**
+
+how to split layers across multiple GPUs (size: [NativeApi.llama_max_devices()](./llama.native.nativeapi.md#llama_max_devices))
+
+```csharp
+public Single* tensor_split;
+```
+
+### **progress_callback**
+
+called with a progress value between 0 and 1, pass NULL to disable. If the provided progress_callback
+ returns true, model loading continues. If it returns false, model loading is immediately aborted.
+
+```csharp
+public LlamaProgressCallback progress_callback;
+```
+
+### **progress_callback_user_data**
+
+context pointer passed to the progress callback
+
+```csharp
+public Void* progress_callback_user_data;
+```
+
+### **kv_overrides**
+
+override key-value pairs of the model meta data
+
+```csharp
+public LLamaModelMetadataOverride* kv_overrides;
+```
+
+## Properties
+
+### **vocab_only**
+
+only load the vocabulary, no weights
+
+```csharp
+public bool vocab_only { get; set; }
+```
+
+#### Property Value
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **use_mmap**
+
+use mmap if possible
+
+```csharp
+public bool use_mmap { get; set; }
+```
+
+#### Property Value
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **use_mlock**
+
+force system to keep model in RAM
+
+```csharp
+public bool use_mlock { get; set; }
+```
+
+#### Property Value
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
diff --git a/docs/xmldocs/llama.native.llamamodelquantizeparams.md b/docs/xmldocs/llama.native.llamamodelquantizeparams.md
index 03d6f630..36d7e356 100644
--- a/docs/xmldocs/llama.native.llamamodelquantizeparams.md
+++ b/docs/xmldocs/llama.native.llamamodelquantizeparams.md
@@ -10,6 +10,10 @@ public struct LLamaModelQuantizeParams
Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) β [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) β [LLamaModelQuantizeParams](./llama.native.llamamodelquantizeparams.md)
+**Remarks:**
+
+llama_model_quantize_params
+
## Fields
### **nthread**
@@ -28,6 +32,14 @@ quantize to this llama_ftype
public LLamaFtype ftype;
```
+### **imatrix**
+
+pointer to importance matrix data
+
+```csharp
+public IntPtr imatrix;
+```
+
## Properties
### **allow_requantize**
@@ -53,3 +65,27 @@ public bool quantize_output_tensor { get; set; }
#### Property Value
[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **only_copy**
+
+only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+
+```csharp
+public bool only_copy { get; set; }
+```
+
+#### Property Value
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **pure**
+
+disable k-quant mixtures and quantize all tensors to the same type
+
+```csharp
+public bool pure { get; set; }
+```
+
+#### Property Value
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
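+
+For illustration, a sketch of filling these settings; how the struct is created and handed to the native quantization entry point is an assumption about the surrounding code.
+
+```csharp
+// Illustrative only: pick a target ftype and leave the newer flags at conservative values.
+LLamaModelQuantizeParams qparams = default;
+qparams.nthread = 4;                                  // worker threads used while quantizing
+qparams.ftype = LLamaFtype.LLAMA_FTYPE_MOSTLY_Q6_K;   // quantize to K-Quant 6 bit
+qparams.quantize_output_tensor = true;
+qparams.only_copy = false;                            // actually quantize instead of just copying tensors
+qparams.pure = false;                                 // keep k-quant mixtures enabled
+```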
diff --git a/docs/xmldocs/llama.native.llamanativebatch.md b/docs/xmldocs/llama.native.llamanativebatch.md
new file mode 100644
index 00000000..56a8ab9c
--- /dev/null
+++ b/docs/xmldocs/llama.native.llamanativebatch.md
@@ -0,0 +1,71 @@
+# LLamaNativeBatch
+
+Namespace: LLama.Native
+
+Input data for llama_decode
+ A llama_batch object can contain input about one or many sequences
+ The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
+
+```csharp
+public struct LLamaNativeBatch
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [LLamaNativeBatch](./llama.native.llamanativebatch.md)
+
+## Fields
+
+### **n_tokens**
+
+The number of items pointed at by pos, seq_id and logits.
+
+```csharp
+public int n_tokens;
+```
+
+### **tokens**
+
+Either an array of `n_tokens` `llama_token` values, or `NULL`, depending on how this batch was created
+
+```csharp
+public LLamaToken* tokens;
+```
+
+### **embd**
+
+Either `n_tokens * embd * sizeof(float)` or `NULL`, depending on how this batch was created
+
+```csharp
+public Single* embd;
+```
+
+### **pos**
+
+the positions of the respective token in the sequence
+
+```csharp
+public LLamaPos* pos;
+```
+
+### **n_seq_id**
+
+https://github.com/ggerganov/llama.cpp/blob/master/llama.h#L139 ???
+
+```csharp
+public Int32* n_seq_id;
+```
+
+### **seq_id**
+
+the sequence to which the respective token belongs
+
+```csharp
+public LLamaSeqId** seq_id;
+```
+
+### **logits**
+
+if zero, the logits for the respective token will not be output
+
+```csharp
+public Byte* logits;
+```
diff --git a/docs/xmldocs/llama.native.llamapoolingtype.md b/docs/xmldocs/llama.native.llamapoolingtype.md
new file mode 100644
index 00000000..6e26cd24
--- /dev/null
+++ b/docs/xmldocs/llama.native.llamapoolingtype.md
@@ -0,0 +1,21 @@
+# LLamaPoolingType
+
+Namespace: LLama.Native
+
+
+
+```csharp
+public enum LLamaPoolingType
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [Enum](https://docs.microsoft.com/en-us/dotnet/api/system.enum) → [LLamaPoolingType](./llama.native.llamapoolingtype.md)
+Implements [IComparable](https://docs.microsoft.com/en-us/dotnet/api/system.icomparable), [IFormattable](https://docs.microsoft.com/en-us/dotnet/api/system.iformattable), [IConvertible](https://docs.microsoft.com/en-us/dotnet/api/system.iconvertible)
+
+**Remarks:**
+
+llama_pooling_type
+
+## Fields
+
+| Name | Value | Description |
+| --- | --: | --- |
diff --git a/docs/xmldocs/llama.native.llamapos.md b/docs/xmldocs/llama.native.llamapos.md
new file mode 100644
index 00000000..2aaccbba
--- /dev/null
+++ b/docs/xmldocs/llama.native.llamapos.md
@@ -0,0 +1,72 @@
+# LLamaPos
+
+Namespace: LLama.Native
+
+Indicates position in a sequence
+
+```csharp
+public struct LLamaPos
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [LLamaPos](./llama.native.llamapos.md)
+Implements [IEquatable<LLamaPos>](https://docs.microsoft.com/en-us/dotnet/api/system.iequatable-1)
+
+## Fields
+
+### **Value**
+
+The raw value
+
+```csharp
+public int Value;
+```
+
+## Methods
+
+### **ToString()**
+
+```csharp
+string ToString()
+```
+
+#### Returns
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **GetHashCode()**
+
+```csharp
+int GetHashCode()
+```
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **Equals(Object)**
+
+```csharp
+bool Equals(object obj)
+```
+
+#### Parameters
+
+`obj` [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object)
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **Equals(LLamaPos)**
+
+```csharp
+bool Equals(LLamaPos other)
+```
+
+#### Parameters
+
+`other` [LLamaPos](./llama.native.llamapos.md)
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
diff --git a/docs/xmldocs/llama.native.llamaropetype.md b/docs/xmldocs/llama.native.llamaropetype.md
new file mode 100644
index 00000000..d2b528a7
--- /dev/null
+++ b/docs/xmldocs/llama.native.llamaropetype.md
@@ -0,0 +1,15 @@
+# LLamaRopeType
+
+Namespace: LLama.Native
+
+```csharp
+public enum LLamaRopeType
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [Enum](https://docs.microsoft.com/en-us/dotnet/api/system.enum) → [LLamaRopeType](./llama.native.llamaropetype.md)
+Implements [IComparable](https://docs.microsoft.com/en-us/dotnet/api/system.icomparable), [IFormattable](https://docs.microsoft.com/en-us/dotnet/api/system.iformattable), [IConvertible](https://docs.microsoft.com/en-us/dotnet/api/system.iconvertible)
+
+## Fields
+
+| Name | Value | Description |
+| --- | --: | --- |
diff --git a/docs/xmldocs/llama.native.llamaseqid.md b/docs/xmldocs/llama.native.llamaseqid.md
new file mode 100644
index 00000000..eb34f461
--- /dev/null
+++ b/docs/xmldocs/llama.native.llamaseqid.md
@@ -0,0 +1,80 @@
+# LLamaSeqId
+
+Namespace: LLama.Native
+
+ID for a sequence in a batch
+
+```csharp
+public struct LLamaSeqId
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [LLamaSeqId](./llama.native.llamaseqid.md)
+Implements [IEquatable<LLamaSeqId>](https://docs.microsoft.com/en-us/dotnet/api/system.iequatable-1)
+
+## Fields
+
+### **Value**
+
+The raw value
+
+```csharp
+public int Value;
+```
+
+### **Zero**
+
+LLamaSeqId with value 0
+
+```csharp
+public static LLamaSeqId Zero;
+```
+
+## Methods
+
+### **ToString()**
+
+```csharp
+string ToString()
+```
+
+#### Returns
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **GetHashCode()**
+
+```csharp
+int GetHashCode()
+```
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **Equals(Object)**
+
+```csharp
+bool Equals(object obj)
+```
+
+#### Parameters
+
+`obj` [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object)
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **Equals(LLamaSeqId)**
+
+```csharp
+bool Equals(LLamaSeqId other)
+```
+
+#### Parameters
+
+`other` [LLamaSeqId](./llama.native.llamaseqid.md)
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
diff --git a/docs/xmldocs/llama.native.llamatoken.md b/docs/xmldocs/llama.native.llamatoken.md
new file mode 100644
index 00000000..8282f92c
--- /dev/null
+++ b/docs/xmldocs/llama.native.llamatoken.md
@@ -0,0 +1,62 @@
+# LLamaToken
+
+Namespace: LLama.Native
+
+A single token
+
+```csharp
+public struct LLamaToken
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [LLamaToken](./llama.native.llamatoken.md)
+Implements [IEquatable<LLamaToken>](https://docs.microsoft.com/en-us/dotnet/api/system.iequatable-1)
+
+## Methods
+
+### **ToString()**
+
+```csharp
+string ToString()
+```
+
+#### Returns
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **GetHashCode()**
+
+```csharp
+int GetHashCode()
+```
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **Equals(Object)**
+
+```csharp
+bool Equals(object obj)
+```
+
+#### Parameters
+
+`obj` [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object)
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **Equals(LLamaToken)**
+
+```csharp
+bool Equals(LLamaToken other)
+```
+
+#### Parameters
+
+`other` [LLamaToken](./llama.native.llamatoken.md)
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
diff --git a/docs/xmldocs/llama.native.llamatokendata.md b/docs/xmldocs/llama.native.llamatokendata.md
index 8632f702..29151a25 100644
--- a/docs/xmldocs/llama.native.llamatokendata.md
+++ b/docs/xmldocs/llama.native.llamatokendata.md
@@ -2,6 +2,8 @@
Namespace: LLama.Native
+A single token along with the probability of this token being selected
+
```csharp
public struct LLamaTokenData
```
@@ -15,7 +17,7 @@ Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object)
token id
```csharp
-public int id;
+public LLamaToken id;
```
### **logit**
@@ -36,15 +38,17 @@ public float p;
## Constructors
-### **LLamaTokenData(Int32, Single, Single)**
+### **LLamaTokenData(LLamaToken, Single, Single)**
+
+Create a new LLamaTokenData
```csharp
-LLamaTokenData(int id, float logit, float p)
+LLamaTokenData(LLamaToken id, float logit, float p)
```
#### Parameters
-`id` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+`id` [LLamaToken](./llama.native.llamatoken.md)
`logit` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
diff --git a/docs/xmldocs/llama.native.llamatokendataarray.md b/docs/xmldocs/llama.native.llamatokendataarray.md
index b5ba8e5a..7ebcd206 100644
--- a/docs/xmldocs/llama.native.llamatokendataarray.md
+++ b/docs/xmldocs/llama.native.llamatokendataarray.md
@@ -61,3 +61,265 @@ LLamaTokenDataArray Create(ReadOnlySpan logits)
#### Returns
[LLamaTokenDataArray](./llama.native.llamatokendataarray.md)
+
+### **OverwriteLogits(ReadOnlySpan<ValueTuple<LLamaToken, Single>>)**
+
+Overwrite the logit values for all given tokens
+
+```csharp
+void OverwriteLogits(ReadOnlySpan> values)
+```
+
+#### Parameters
+
+`values` [ReadOnlySpan<ValueTuple<LLamaToken, Single>>](https://docs.microsoft.com/en-us/dotnet/api/system.readonlyspan-1)
+tuples of token and logit value to overwrite
+
+### **ApplyGrammar(SafeLLamaContextHandle, SafeLLamaGrammarHandle)**
+
+Apply grammar rules to candidate tokens
+
+```csharp
+void ApplyGrammar(SafeLLamaContextHandle ctx, SafeLLamaGrammarHandle grammar)
+```
+
+#### Parameters
+
+`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+`grammar` [SafeLLamaGrammarHandle](./llama.native.safellamagrammarhandle.md)
+
+### **TopK(SafeLLamaContextHandle, Int32, UInt64)**
+
+Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+
+```csharp
+void TopK(SafeLLamaContextHandle context, int k, ulong minKeep)
+```
+
+#### Parameters
+
+`context` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+`k` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+Number of tokens to keep
+
+`minKeep` [UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
+Minimum number to keep
+
+### **TopP(SafeLLamaContextHandle, Single, UInt64)**
+
+Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+
+```csharp
+void TopP(SafeLLamaContextHandle context, float p, ulong minKeep)
+```
+
+#### Parameters
+
+`context` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+`p` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`minKeep` [UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
+
+### **MinP(SafeLLamaContextHandle, Single, UInt64)**
+
+Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+
+```csharp
+void MinP(SafeLLamaContextHandle context, float p, ulong minKeep)
+```
+
+#### Parameters
+
+`context` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+`p` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+All tokens with probability greater than this will be kept
+
+`minKeep` [UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
+
+### **TailFree(SafeLLamaContextHandle, Single, UInt64)**
+
+Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
+
+```csharp
+void TailFree(SafeLLamaContextHandle context, float z, ulong min_keep)
+```
+
+#### Parameters
+
+`context` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+`z` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`min_keep` [UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
+
+### **LocallyTypical(SafeLLamaContextHandle, Single, UInt64)**
+
+Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
+
+```csharp
+void LocallyTypical(SafeLLamaContextHandle context, float p, ulong min_keep)
+```
+
+#### Parameters
+
+`context` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+`p` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`min_keep` [UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
+
+### **RepetitionPenalty(SafeLLamaContextHandle, ReadOnlySpan<LLamaToken>, Single, Single, Single)**
+
+Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
+ Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+
+```csharp
+void RepetitionPenalty(SafeLLamaContextHandle context, ReadOnlySpan last_tokens, float penalty_repeat, float penalty_freq, float penalty_present)
+```
+
+#### Parameters
+
+`context` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+`last_tokens` [ReadOnlySpan<LLamaToken>](https://docs.microsoft.com/en-us/dotnet/api/system.readonlyspan-1)
+
+`penalty_repeat` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`penalty_freq` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`penalty_present` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+### **Guidance(SafeLLamaContextHandle, ReadOnlySpan<Single>, Single)**
+
+Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+
+```csharp
+void Guidance(SafeLLamaContextHandle context, ReadOnlySpan guidanceLogits, float guidance)
+```
+
+#### Parameters
+
+`context` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+`guidanceLogits` [ReadOnlySpan<Single>](https://docs.microsoft.com/en-us/dotnet/api/system.readonlyspan-1)
+Logits extracted from a separate context from the same model.
+ Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+
+`guidance` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+Guidance strength. 0 means no guidance; higher values apply stronger guidance
+
+### **Temperature(SafeLLamaContextHandle, Single)**
+
+Sample with temperature.
+ As temperature increases, the prediction becomes more diverse but also vulnerable to hallucinations -- generating tokens that are sensible but not factual
+
+```csharp
+void Temperature(SafeLLamaContextHandle context, float temp)
+```
+
+#### Parameters
+
+`context` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+`temp` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+### **Softmax(SafeLLamaContextHandle)**
+
+Sorts candidate tokens by their logits in descending order and calculates probabilities based on the logits.
+
+```csharp
+void Softmax(SafeLLamaContextHandle context)
+```
+
+#### Parameters
+
+`context` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+### **SampleToken(SafeLLamaContextHandle)**
+
+Randomly selects a token from the candidates based on their probabilities.
+
+```csharp
+LLamaToken SampleToken(SafeLLamaContextHandle context)
+```
+
+#### Parameters
+
+`context` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+#### Returns
+
+[LLamaToken](./llama.native.llamatoken.md)
+
+### **SampleTokenGreedy(SafeLLamaContextHandle)**
+
+Selects the token with the highest probability.
+
+```csharp
+LLamaToken SampleTokenGreedy(SafeLLamaContextHandle context)
+```
+
+#### Parameters
+
+`context` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+#### Returns
+
+[LLamaToken](./llama.native.llamatoken.md)
+
+### **SampleTokenMirostat(SafeLLamaContextHandle, Single, Single, Int32, Single&)**
+
+Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+
+```csharp
+LLamaToken SampleTokenMirostat(SafeLLamaContextHandle context, float tau, float eta, int m, Single& mu)
+```
+
+#### Parameters
+
+`context` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+`tau` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+
+`eta` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+
+`m` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
+
+`mu` [Single&](https://docs.microsoft.com/en-us/dotnet/api/system.single&)
+Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+
+#### Returns
+
+[LLamaToken](./llama.native.llamatoken.md)
+
+### **SampleTokenMirostat2(SafeLLamaContextHandle, Single, Single, Single&)**
+
+Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+
+```csharp
+LLamaToken SampleTokenMirostat2(SafeLLamaContextHandle context, float tau, float eta, Single& mu)
+```
+
+#### Parameters
+
+`context` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+`tau` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+
+`eta` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+
+`mu` [Single&](https://docs.microsoft.com/en-us/dotnet/api/system.single&)
+Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+
+#### Returns
+
+[LLamaToken](./llama.native.llamatoken.md)
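+
+To show how these methods fit together, a hedged sketch of a conventional sampling pipeline: the `ctx` handle and `logits` span are assumed to come from a completed decode step, and the repetition-penalty history is left empty for brevity.
+
+```csharp
+// Assumed inputs: `ctx` (SafeLLamaContextHandle) and `logits` (ReadOnlySpan<float>) from the last decode.
+LLamaTokenDataArray candidates = LLamaTokenDataArray.Create(logits);
+
+ReadOnlySpan<LLamaToken> lastTokens = ReadOnlySpan<LLamaToken>.Empty;   // recent token history
+candidates.RepetitionPenalty(ctx, lastTokens, penalty_repeat: 1.1f, penalty_freq: 0f, penalty_present: 0f);
+
+candidates.TopK(ctx, k: 40, minKeep: 1);
+candidates.TopP(ctx, p: 0.95f, minKeep: 1);
+candidates.Temperature(ctx, temp: 0.8f);
+
+LLamaToken next = candidates.SampleToken(ctx);
+```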
diff --git a/docs/xmldocs/llama.native.llamatokentype.md b/docs/xmldocs/llama.native.llamatokentype.md
new file mode 100644
index 00000000..6edad50f
--- /dev/null
+++ b/docs/xmldocs/llama.native.llamatokentype.md
@@ -0,0 +1,25 @@
+# LLamaTokenType
+
+Namespace: LLama.Native
+
+Token Types
+
+```csharp
+public enum LLamaTokenType
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [Enum](https://docs.microsoft.com/en-us/dotnet/api/system.enum) → [LLamaTokenType](./llama.native.llamatokentype.md)
+Implements [IComparable](https://docs.microsoft.com/en-us/dotnet/api/system.icomparable), [IFormattable](https://docs.microsoft.com/en-us/dotnet/api/system.iformattable), [IConvertible](https://docs.microsoft.com/en-us/dotnet/api/system.iconvertible)
+
+**Remarks:**
+
+C# equivalent of llama_token_get_type
+
+## Fields
+
+| Name | Value | Description |
+| --- | --: | --- |
+| LLAMA_TOKEN_TYPE_UNDEFINED | 0 | No specific type has been set for this token |
+| LLAMA_TOKEN_TYPE_NORMAL | 1 | This is a "normal" token |
+| LLAMA_TOKEN_TYPE_UNKNOWN | 2 | An "unknown" character/text token e.g. <unk> |
+| LLAMA_TOKEN_TYPE_CONTROL | 3 | A special control token e.g. </s> |
diff --git a/docs/xmldocs/llama.native.llamavocabtype.md b/docs/xmldocs/llama.native.llamavocabtype.md
new file mode 100644
index 00000000..d24b3d53
--- /dev/null
+++ b/docs/xmldocs/llama.native.llamavocabtype.md
@@ -0,0 +1,21 @@
+# LLamaVocabType
+
+Namespace: LLama.Native
+
+
+
+```csharp
+public enum LLamaVocabType
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [Enum](https://docs.microsoft.com/en-us/dotnet/api/system.enum) → [LLamaVocabType](./llama.native.llamavocabtype.md)
+Implements [IComparable](https://docs.microsoft.com/en-us/dotnet/api/system.icomparable), [IFormattable](https://docs.microsoft.com/en-us/dotnet/api/system.iformattable), [IConvertible](https://docs.microsoft.com/en-us/dotnet/api/system.iconvertible)
+
+**Remarks:**
+
+llama_vocab_type
+
+## Fields
+
+| Name | Value | Description |
+| --- | --: | --- |
diff --git a/docs/xmldocs/llama.native.llavaimageembed.md b/docs/xmldocs/llama.native.llavaimageembed.md
new file mode 100644
index 00000000..be6346cb
--- /dev/null
+++ b/docs/xmldocs/llama.native.llavaimageembed.md
@@ -0,0 +1,25 @@
+# LLavaImageEmbed
+
+Namespace: LLama.Native
+
+LLaVa Image embeddings
+
+```csharp
+public struct LLavaImageEmbed
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [LLavaImageEmbed](./llama.native.llavaimageembed.md)
+
+## Fields
+
+### **embed**
+
+```csharp
+public Single* embed;
+```
+
+### **n_image_pos**
+
+```csharp
+public int n_image_pos;
+```
diff --git a/docs/xmldocs/llama.native.nativeapi.md b/docs/xmldocs/llama.native.nativeapi.md
index 764a9ff8..bafd6775 100644
--- a/docs/xmldocs/llama.native.nativeapi.md
+++ b/docs/xmldocs/llama.native.nativeapi.md
@@ -5,19 +5,11 @@ Namespace: LLama.Native
Direct translation of the llama.cpp API
```csharp
-public class NativeApi
+public static class NativeApi
```
Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) β [NativeApi](./llama.native.nativeapi.md)
-## Constructors
-
-### **NativeApi()**
-
-```csharp
-public NativeApi()
-```
-
## Methods
### **llama_sample_token_mirostat(SafeLLamaContextHandle, LLamaTokenDataArrayNative&, Single, Single, Int32, Single&)**
@@ -25,7 +17,7 @@ public NativeApi()
Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
```csharp
-public static int llama_sample_token_mirostat(SafeLLamaContextHandle ctx, LLamaTokenDataArrayNative& candidates, float tau, float eta, int m, Single& mu)
+public static LLamaToken llama_sample_token_mirostat(SafeLLamaContextHandle ctx, LLamaTokenDataArrayNative& candidates, float tau, float eta, int m, Single& mu)
```
#### Parameters
@@ -49,14 +41,14 @@ Maximum cross-entropy. This value is initialized to be twice the target cross-en
#### Returns
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+[LLamaToken](./llama.native.llamatoken.md)
### **llama_sample_token_mirostat_v2(SafeLLamaContextHandle, LLamaTokenDataArrayNative&, Single, Single, Single&)**
Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
```csharp
-public static int llama_sample_token_mirostat_v2(SafeLLamaContextHandle ctx, LLamaTokenDataArrayNative& candidates, float tau, float eta, Single& mu)
+public static LLamaToken llama_sample_token_mirostat_v2(SafeLLamaContextHandle ctx, LLamaTokenDataArrayNative& candidates, float tau, float eta, Single& mu)
```
#### Parameters
@@ -77,14 +69,14 @@ Maximum cross-entropy. This value is initialized to be twice the target cross-en
#### Returns
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+[LLamaToken](./llama.native.llamatoken.md)
### **llama_sample_token_greedy(SafeLLamaContextHandle, LLamaTokenDataArrayNative&)**
Selects the token with the highest probability.
```csharp
-public static int llama_sample_token_greedy(SafeLLamaContextHandle ctx, LLamaTokenDataArrayNative& candidates)
+public static LLamaToken llama_sample_token_greedy(SafeLLamaContextHandle ctx, LLamaTokenDataArrayNative& candidates)
```
#### Parameters
@@ -96,14 +88,14 @@ Pointer to LLamaTokenDataArray
#### Returns
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+[LLamaToken](./llama.native.llamatoken.md)
### **llama_sample_token(SafeLLamaContextHandle, LLamaTokenDataArrayNative&)**
Randomly selects a token from the candidates based on their probabilities.
```csharp
-public static int llama_sample_token(SafeLLamaContextHandle ctx, LLamaTokenDataArrayNative& candidates)
+public static LLamaToken llama_sample_token(SafeLLamaContextHandle ctx, LLamaTokenDataArrayNative& candidates)
```
#### Parameters
@@ -115,117 +107,108 @@ Pointer to LLamaTokenDataArray
#### Returns
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **llama_token_to_str(SafeLLamaContextHandle, Int32)**
+[LLamaToken](./llama.native.llamatoken.md)
-Token Id -> String. Uses the vocabulary in the provided context
+### **<llama_get_embeddings>g__llama_get_embeddings_native|30_0(SafeLLamaContextHandle)**
```csharp
-public static IntPtr llama_token_to_str(SafeLLamaContextHandle ctx, int token)
+internal static Single* g__llama_get_embeddings_native|30_0(SafeLLamaContextHandle ctx)
```
#### Parameters
`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-`token` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
#### Returns
-[IntPtr](https://docs.microsoft.com/en-us/dotnet/api/system.intptr)
-Pointer to a string.
-
-### **llama_token_bos(SafeLLamaContextHandle)**
+[Single*](https://docs.microsoft.com/en-us/dotnet/api/system.single*)
-Get the "Beginning of sentence" token
+### **<llama_token_to_piece>g__llama_token_to_piece_native|44_0(SafeLlamaModelHandle, LLamaToken, Byte*, Int32)**
```csharp
-public static int llama_token_bos(SafeLLamaContextHandle ctx)
+internal static int g__llama_token_to_piece_native|44_0(SafeLlamaModelHandle model, LLamaToken llamaToken, Byte* buffer, int length)
```
#### Parameters
-`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
+
+`llamaToken` [LLamaToken](./llama.native.llamatoken.md)
+
+`buffer` [Byte*](https://docs.microsoft.com/en-us/dotnet/api/system.byte*)
+
+`length` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
#### Returns
[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-### **llama_token_eos(SafeLLamaContextHandle)**
-
-Get the "End of sentence" token
+### **<TryLoadLibraries>g__TryLoad|84_0(String)**
```csharp
-public static int llama_token_eos(SafeLLamaContextHandle ctx)
+internal static IntPtr g__TryLoad|84_0(string path)
```
#### Parameters
-`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+`path` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
#### Returns
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-### **llama_token_nl(SafeLLamaContextHandle)**
+[IntPtr](https://docs.microsoft.com/en-us/dotnet/api/system.intptr)
-Get the "new line" token
+### **<TryLoadLibraries>g__TryFindPath|84_1(String, <>c__DisplayClass84_0&)**
```csharp
-public static int llama_token_nl(SafeLLamaContextHandle ctx)
+internal static string g__TryFindPath|84_1(string filename, <>c__DisplayClass84_0& )
```
#### Parameters
-`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+`filename` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`` [<>c__DisplayClass84_0&](./llama.native.nativeapi.<>c__displayclass84_0&.md)
#### Returns
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-### **llama_print_timings(SafeLLamaContextHandle)**
+### **llama_set_n_threads(SafeLLamaContextHandle, UInt32, UInt32)**
-Print out timing information for this context
+Set the number of threads used for decoding
```csharp
-public static void llama_print_timings(SafeLLamaContextHandle ctx)
+public static void llama_set_n_threads(SafeLLamaContextHandle ctx, uint n_threads, uint n_threads_batch)
```
#### Parameters
`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-### **llama_reset_timings(SafeLLamaContextHandle)**
+`n_threads` [UInt32](https://docs.microsoft.com/en-us/dotnet/api/system.uint32)
+n_threads is the number of threads used for generation (single token)
-Reset all collected timing information for this context
+`n_threads_batch` [UInt32](https://docs.microsoft.com/en-us/dotnet/api/system.uint32)
+n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
+
+### **llama_vocab_type(SafeLlamaModelHandle)**
```csharp
-public static void llama_reset_timings(SafeLLamaContextHandle ctx)
+public static LLamaVocabType llama_vocab_type(SafeLlamaModelHandle model)
```
#### Parameters
-`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-
-### **llama_print_system_info()**
-
-Print system information
-
-```csharp
-public static IntPtr llama_print_system_info()
-```
+`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
#### Returns
-[IntPtr](https://docs.microsoft.com/en-us/dotnet/api/system.intptr)
-
-### **llama_model_n_vocab(SafeLlamaModelHandle)**
+[LLamaVocabType](./llama.native.llamavocabtype.md)
-Get the number of tokens in the model vocabulary
+### **llama_rope_type(SafeLlamaModelHandle)**
```csharp
-public static int llama_model_n_vocab(SafeLlamaModelHandle model)
+public static LLamaRopeType llama_rope_type(SafeLlamaModelHandle model)
```
#### Parameters
@@ -234,219 +217,227 @@ public static int llama_model_n_vocab(SafeLlamaModelHandle model)
#### Returns
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+[LLamaRopeType](./llama.native.llamaropetype.md)
-### **llama_model_n_ctx(SafeLlamaModelHandle)**
+### **llama_grammar_init(LLamaGrammarElement**, UInt64, UInt64)**
-Get the size of the context window for the model
+Create a new grammar from the given set of grammar rules
```csharp
-public static int llama_model_n_ctx(SafeLlamaModelHandle model)
+public static IntPtr llama_grammar_init(LLamaGrammarElement** rules, ulong n_rules, ulong start_rule_index)
```
#### Parameters
-`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
+`rules` [LLamaGrammarElement**](./llama.native.llamagrammarelement**.md)
+
+`n_rules` [UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
+
+`start_rule_index` [UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
#### Returns
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+[IntPtr](https://docs.microsoft.com/en-us/dotnet/api/system.intptr)
-### **llama_model_n_embd(SafeLlamaModelHandle)**
+### **llama_grammar_free(IntPtr)**
-Get the dimension of embedding vectors from this model
+Free all memory from the given SafeLLamaGrammarHandle
```csharp
-public static int llama_model_n_embd(SafeLlamaModelHandle model)
+public static void llama_grammar_free(IntPtr grammar)
```
#### Parameters
-`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
-
-#### Returns
-
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+`grammar` [IntPtr](https://docs.microsoft.com/en-us/dotnet/api/system.intptr)
-### **llama_token_to_piece_with_model(SafeLlamaModelHandle, Int32, Byte*, Int32)**
+### **llama_grammar_copy(SafeLLamaGrammarHandle)**
-Convert a single token into text
+Create a copy of an existing grammar instance
```csharp
-public static int llama_token_to_piece_with_model(SafeLlamaModelHandle model, int llamaToken, Byte* buffer, int length)
+public static IntPtr llama_grammar_copy(SafeLLamaGrammarHandle grammar)
```
#### Parameters
-`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
-
-`llamaToken` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-`buffer` [Byte*](https://docs.microsoft.com/en-us/dotnet/api/system.byte*)
-buffer to write string into
-
-`length` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-size of the buffer
+`grammar` [SafeLLamaGrammarHandle](./llama.native.safellamagrammarhandle.md)
#### Returns
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-The length writte, or if the buffer is too small a negative that indicates the length required
+[IntPtr](https://docs.microsoft.com/en-us/dotnet/api/system.intptr)
-### **llama_tokenize_with_model(SafeLlamaModelHandle, Byte*, Int32*, Int32, Boolean)**
+### **llama_sample_grammar(SafeLLamaContextHandle, LLamaTokenDataArrayNative&, SafeLLamaGrammarHandle)**
-Convert text into tokens
+Apply constraints from grammar
```csharp
-public static int llama_tokenize_with_model(SafeLlamaModelHandle model, Byte* text, Int32* tokens, int n_max_tokens, bool add_bos)
+public static void llama_sample_grammar(SafeLLamaContextHandle ctx, LLamaTokenDataArrayNative& candidates, SafeLLamaGrammarHandle grammar)
```
#### Parameters
-`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
-
-`text` [Byte*](https://docs.microsoft.com/en-us/dotnet/api/system.byte*)
-
-`tokens` [Int32*](https://docs.microsoft.com/en-us/dotnet/api/system.int32*)
-
-`n_max_tokens` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-
-`add_bos` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-#### Returns
+`candidates` [LLamaTokenDataArrayNative&](./llama.native.llamatokendataarraynative&.md)
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-Returns the number of tokens on success, no more than n_max_tokens.
- Returns a negative number on failure - the number of tokens that would have been returned
+`grammar` [SafeLLamaGrammarHandle](./llama.native.safellamagrammarhandle.md)
-### **llama_log_set(LLamaLogCallback)**
+### **llama_grammar_accept_token(SafeLLamaContextHandle, SafeLLamaGrammarHandle, LLamaToken)**
-Register a callback to receive llama log messages
+Accepts the sampled token into the grammar
```csharp
-public static void llama_log_set(LLamaLogCallback logCallback)
+public static void llama_grammar_accept_token(SafeLLamaContextHandle ctx, SafeLLamaGrammarHandle grammar, LLamaToken token)
```
#### Parameters
-`logCallback` [LLamaLogCallback](./llama.native.llamalogcallback.md)
+`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-### **llama_grammar_init(LLamaGrammarElement**, UInt64, UInt64)**
+`grammar` [SafeLLamaGrammarHandle](./llama.native.safellamagrammarhandle.md)
-Create a new grammar from the given set of grammar rules
+`token` [LLamaToken](./llama.native.llamatoken.md)
+
+### **llava_validate_embed_size(SafeLLamaContextHandle, SafeLlavaModelHandle)**
+
+Sanity check for clip <-> llava embed size match
```csharp
-public static IntPtr llama_grammar_init(LLamaGrammarElement** rules, ulong n_rules, ulong start_rule_index)
+public static bool llava_validate_embed_size(SafeLLamaContextHandle ctxLlama, SafeLlavaModelHandle ctxClip)
```
#### Parameters
-`rules` [LLamaGrammarElement**](./llama.native.llamagrammarelement**.md)
-
-`n_rules` [UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
+`ctxLlama` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+LLama Context
-`start_rule_index` [UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
+`ctxClip` [SafeLlavaModelHandle](./llama.native.safellavamodelhandle.md)
+Llava Model
#### Returns
-[IntPtr](https://docs.microsoft.com/en-us/dotnet/api/system.intptr)
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+True if validated successfully
-### **llama_grammar_free(IntPtr)**
+### **llava_image_embed_make_with_bytes(SafeLlavaModelHandle, Int32, Byte[], Int32)**
-Free all memory from the given SafeLLamaGrammarHandle
+Build an image embed from image file bytes
```csharp
-public static void llama_grammar_free(IntPtr grammar)
+public static SafeLlavaImageEmbedHandle llava_image_embed_make_with_bytes(SafeLlavaModelHandle ctx_clip, int n_threads, Byte[] image_bytes, int image_bytes_length)
```
#### Parameters
-`grammar` [IntPtr](https://docs.microsoft.com/en-us/dotnet/api/system.intptr)
+`ctx_clip` [SafeLlavaModelHandle](./llama.native.safellavamodelhandle.md)
+SafeHandle to the Clip Model
-### **llama_sample_grammar(SafeLLamaContextHandle, LLamaTokenDataArrayNative&, SafeLLamaGrammarHandle)**
+`n_threads` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+Number of threads
-Apply constraints from grammar
+`image_bytes` [Byte[]](https://docs.microsoft.com/en-us/dotnet/api/system.byte)
+Binary image in jpeg format
+
+`image_bytes_length` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+Byte length of the image
+
+#### Returns
+
+[SafeLlavaImageEmbedHandle](./llama.native.safellavaimageembedhandle.md)
+SafeHandle to the Embeddings
+
+### **llava_image_embed_make_with_filename(SafeLlavaModelHandle, Int32, String)**
+
+Build an image embed from a path to an image filename
```csharp
-public static void llama_sample_grammar(SafeLLamaContextHandle ctx, LLamaTokenDataArrayNative& candidates, SafeLLamaGrammarHandle grammar)
+public static SafeLlavaImageEmbedHandle llava_image_embed_make_with_filename(SafeLlavaModelHandle ctx_clip, int n_threads, string image_path)
```
#### Parameters
-`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+`ctx_clip` [SafeLlavaModelHandle](./llama.native.safellavamodelhandle.md)
+SafeHandle to the Clip Model
-`candidates` [LLamaTokenDataArrayNative&](./llama.native.llamatokendataarraynative&.md)
+`n_threads` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+Number of threads
-`grammar` [SafeLLamaGrammarHandle](./llama.native.safellamagrammarhandle.md)
+`image_path` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+Image filename (jpeg) to generate embeddings from
-### **llama_grammar_accept_token(SafeLLamaContextHandle, SafeLLamaGrammarHandle, Int32)**
+#### Returns
-Accepts the sampled token into the grammar
+[SafeLlavaImageEmbedHandle](./llama.native.safellavaimageembedhandle.md)
+SafeHandle to the embeddings
+
+### **llava_image_embed_free(IntPtr)**
+
+Free an embedding made with llava_image_embed_make_*
```csharp
-public static void llama_grammar_accept_token(SafeLLamaContextHandle ctx, SafeLLamaGrammarHandle grammar, int token)
+public static void llava_image_embed_free(IntPtr embed)
```
#### Parameters
-`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-
-`grammar` [SafeLLamaGrammarHandle](./llama.native.safellamagrammarhandle.md)
-
-`token` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+`embed` [IntPtr](https://docs.microsoft.com/en-us/dotnet/api/system.intptr)
+Embeddings to release
-### **llama_model_quantize(String, String, LLamaModelQuantizeParams*)**
+### **llava_eval_image_embed(SafeLLamaContextHandle, SafeLlavaImageEmbedHandle, Int32, Int32&)**
-Returns 0 on success
+Write the image represented by embed into the llama context with batch size n_batch, starting at context
+ pos n_past. On completion, n_past points to the next position in the context after the image embed.
```csharp
-public static int llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param)
+public static bool llava_eval_image_embed(SafeLLamaContextHandle ctx_llama, SafeLlavaImageEmbedHandle embed, int n_batch, Int32& n_past)
```
#### Parameters
-`fname_inp` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-`fname_out` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+`ctx_llama` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+Llama Context
-`param` [LLamaModelQuantizeParams*](./llama.native.llamamodelquantizeparams*.md)
+`embed` [SafeLlavaImageEmbedHandle](./llama.native.safellavaimageembedhandle.md)
+Embedding handle
-#### Returns
+`n_batch` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-Returns 0 on success
+`n_past` [Int32&](https://docs.microsoft.com/en-us/dotnet/api/system.int32&)
-**Remarks:**
+#### Returns
-not great API - very likely to change
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+True on success
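+
+A minimal usage sketch combining the three llava entry points above (the `ctxLlama`/`ctxClip` handles and the image path are placeholders created elsewhere, not part of this API):
+
+```csharp
+// Sanity-check the two models, build an embed from an image file and feed it into the context.
+if (!NativeApi.llava_validate_embed_size(ctxLlama, ctxClip))
+    throw new InvalidOperationException("clip <-> llava embedding sizes do not match");
+
+using var embed = NativeApi.llava_image_embed_make_with_filename(ctxClip, 4 /* n_threads */, "photo.jpg");
+
+int n_past = 0;
+if (!NativeApi.llava_eval_image_embed(ctxLlama, embed, 512 /* n_batch */, ref n_past))
+    throw new InvalidOperationException("failed to evaluate the image embedding");
+```
+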
-### **llama_sample_classifier_free_guidance(SafeLLamaContextHandle, LLamaTokenDataArrayNative, SafeLLamaContextHandle, Single)**
+### **llama_model_quantize(String, String, LLamaModelQuantizeParams*)**
-Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+Returns 0 on success
```csharp
-public static void llama_sample_classifier_free_guidance(SafeLLamaContextHandle ctx, LLamaTokenDataArrayNative candidates, SafeLLamaContextHandle guidanceCtx, float scale)
+public static uint llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param)
```
#### Parameters
-`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+`fname_inp` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`fname_out` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-`candidates` [LLamaTokenDataArrayNative](./llama.native.llamatokendataarraynative.md)
-A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+`param` [LLamaModelQuantizeParams*](./llama.native.llamamodelquantizeparams*.md)
-`guidanceCtx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+#### Returns
-`scale` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+[UInt32](https://docs.microsoft.com/en-us/dotnet/api/system.uint32)
+Returns 0 on success
-### **llama_sample_repetition_penalty(SafeLLamaContextHandle, LLamaTokenDataArrayNative&, Int32*, UInt64, Single)**
+### **llama_sample_repetition_penalties(SafeLLamaContextHandle, LLamaTokenDataArrayNative&, LLamaToken*, UInt64, Single, Single, Single)**
Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
+ Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
```csharp
-public static void llama_sample_repetition_penalty(SafeLLamaContextHandle ctx, LLamaTokenDataArrayNative& candidates, Int32* last_tokens, ulong last_tokens_size, float penalty)
+public static void llama_sample_repetition_penalties(SafeLLamaContextHandle ctx, LLamaTokenDataArrayNative& candidates, LLamaToken* last_tokens, ulong last_tokens_size, float penalty_repeat, float penalty_freq, float penalty_present)
```
#### Parameters
@@ -456,52 +447,59 @@ public static void llama_sample_repetition_penalty(SafeLLamaContextHandle ctx, L
`candidates` [LLamaTokenDataArrayNative&](./llama.native.llamatokendataarraynative&.md)
Pointer to LLamaTokenDataArray
-`last_tokens` [Int32*](https://docs.microsoft.com/en-us/dotnet/api/system.int32*)
+`last_tokens` [LLamaToken*](./llama.native.llamatoken*.md)
`last_tokens_size` [UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
-`penalty` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+`penalty_repeat` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-### **llama_sample_frequency_and_presence_penalties(SafeLLamaContextHandle, LLamaTokenDataArrayNative&, Int32*, UInt64, Single, Single)**
+`penalty_freq` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+`penalty_present` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+### **llama_sample_apply_guidance(SafeLLamaContextHandle, Span<Single>, ReadOnlySpan<Single>, Single)**
+
+Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+
```csharp
-public static void llama_sample_frequency_and_presence_penalties(SafeLLamaContextHandle ctx, LLamaTokenDataArrayNative& candidates, Int32* last_tokens, ulong last_tokens_size, float alpha_frequency, float alpha_presence)
+public static void llama_sample_apply_guidance(SafeLLamaContextHandle ctx, Span logits, ReadOnlySpan logits_guidance, float scale)
```
#### Parameters
`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-`candidates` [LLamaTokenDataArrayNative&](./llama.native.llamatokendataarraynative&.md)
-Pointer to LLamaTokenDataArray
-
-`last_tokens` [Int32*](https://docs.microsoft.com/en-us/dotnet/api/system.int32*)
-
-`last_tokens_size` [UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
+`logits` [Span<Single>](https://docs.microsoft.com/en-us/dotnet/api/system.span-1)
+Logits extracted from the original generation context.
-`alpha_frequency` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+`logits_guidance` [ReadOnlySpan<Single>](https://docs.microsoft.com/en-us/dotnet/api/system.readonlyspan-1)
+Logits extracted from a separate context from the same model.
+ Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
-`alpha_presence` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+`scale` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-### **llama_sample_classifier_free_guidance(SafeLLamaContextHandle, LLamaTokenDataArrayNative&, SafeLLamaContextHandle, Single)**
+### **llama_sample_apply_guidance(SafeLLamaContextHandle, Single*, Single*, Single)**
Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
```csharp
-public static void llama_sample_classifier_free_guidance(SafeLLamaContextHandle ctx, LLamaTokenDataArrayNative& candidates, SafeLLamaContextHandle guidance_ctx, float scale)
+public static void llama_sample_apply_guidance(SafeLLamaContextHandle ctx, Single* logits, Single* logits_guidance, float scale)
```
#### Parameters
`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-`candidates` [LLamaTokenDataArrayNative&](./llama.native.llamatokendataarraynative&.md)
-A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+`logits` [Single*](https://docs.microsoft.com/en-us/dotnet/api/system.single*)
+Logits extracted from the original generation context.
-`guidance_ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+`logits_guidance` [Single*](https://docs.microsoft.com/en-us/dotnet/api/system.single*)
+Logits extracted from a separate context from the same model.
+ Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
`scale` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
@@ -559,6 +557,25 @@ Pointer to LLamaTokenDataArray
`min_keep` [UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
+### **llama_sample_min_p(SafeLLamaContextHandle, LLamaTokenDataArrayNative&, Single, UInt64)**
+
+Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+
+```csharp
+public static void llama_sample_min_p(SafeLLamaContextHandle ctx, LLamaTokenDataArrayNative& candidates, float p, ulong min_keep)
+```
+
+#### Parameters
+
+`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+`candidates` [LLamaTokenDataArrayNative&](./llama.native.llamatokendataarraynative&.md)
+Pointer to LLamaTokenDataArray
+
+`p` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`min_keep` [UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
+
### **llama_sample_tail_free(SafeLLamaContextHandle, LLamaTokenDataArrayNative&, Single, UInt64)**
Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
@@ -597,12 +614,12 @@ Pointer to LLamaTokenDataArray
`min_keep` [UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
-### **llama_sample_temperature(SafeLLamaContextHandle, LLamaTokenDataArrayNative&, Single)**
+### **llama_sample_typical(SafeLLamaContextHandle, LLamaTokenDataArrayNative&, Single, Single, Single)**
-Modify logits by temperature
+Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
```csharp
-public static void llama_sample_temperature(SafeLLamaContextHandle ctx, LLamaTokenDataArrayNative& candidates, float temp)
+public static void llama_sample_typical(SafeLLamaContextHandle ctx, LLamaTokenDataArrayNative& candidates, float min_temp, float max_temp, float exponent_val)
```
#### Parameters
@@ -610,495 +627,922 @@ public static void llama_sample_temperature(SafeLLamaContextHandle ctx, LLamaTok
`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
`candidates` [LLamaTokenDataArrayNative&](./llama.native.llamatokendataarraynative&.md)
+Pointer to LLamaTokenDataArray
-`temp` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+`min_temp` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-### **llama_empty_call()**
+`max_temp` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-A method that does nothing. This is a native method, calling it will force the llama native dependencies to be loaded.
+`exponent_val` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+### **llama_sample_temp(SafeLLamaContextHandle, LLamaTokenDataArrayNative&, Single)**
+
+Modify logits by temperature
```csharp
-public static bool llama_empty_call()
+public static void llama_sample_temp(SafeLLamaContextHandle ctx, LLamaTokenDataArrayNative& candidates, float temp)
```
-#### Returns
+#### Parameters
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-### **llama_context_default_params()**
+`candidates` [LLamaTokenDataArrayNative&](./llama.native.llamatokendataarraynative&.md)
-Create a LLamaContextParams with default values
+`temp` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+### **llama_get_embeddings(SafeLLamaContextHandle)**
+
+Get the embeddings for the input
```csharp
-public static LLamaContextParams llama_context_default_params()
+public static Span llama_get_embeddings(SafeLLamaContextHandle ctx)
```
+#### Parameters
+
+`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
#### Returns
-[LLamaContextParams](./llama.native.llamacontextparams.md)
+[Span<Single>](https://docs.microsoft.com/en-us/dotnet/api/system.span-1)
-### **llama_model_quantize_default_params()**
+### **llama_chat_apply_template(SafeLlamaModelHandle, Char*, LLamaChatMessage*, IntPtr, Boolean, Char*, Int32)**
-Create a LLamaModelQuantizeParams with default values
+Apply chat template. Inspired by hf apply_chat_template() on python.
+ Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
+ NOTE: This function does not use a jinja parser. It only supports a pre-defined list of templates. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
```csharp
-public static LLamaModelQuantizeParams llama_model_quantize_default_params()
+public static int llama_chat_apply_template(SafeLlamaModelHandle model, Char* tmpl, LLamaChatMessage* chat, IntPtr n_msg, bool add_ass, Char* buf, int length)
```
-#### Returns
-
-[LLamaModelQuantizeParams](./llama.native.llamamodelquantizeparams.md)
-
-### **llama_mmap_supported()**
+#### Parameters
-Check if memory mapping is supported
+`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
-```csharp
-public static bool llama_mmap_supported()
-```
+`tmpl` [Char*](https://docs.microsoft.com/en-us/dotnet/api/system.char*)
+A Jinja template to use for this chat. If this is nullptr, the model's default chat template will be used instead.
-#### Returns
+`chat` [LLamaChatMessage*](./llama.native.llamachatmessage*.md)
+Pointer to a list of multiple llama_chat_message
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+`n_msg` [IntPtr](https://docs.microsoft.com/en-us/dotnet/api/system.intptr)
+Number of llama_chat_message in this chat
-### **llama_mlock_supported()**
+`add_ass` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+Whether to end the prompt with the token(s) that indicate the start of an assistant message.
-Check if memory lockingis supported
+`buf` [Char*](https://docs.microsoft.com/en-us/dotnet/api/system.char*)
+A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
-```csharp
-public static bool llama_mlock_supported()
-```
+`length` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+The size of the allocated buffer
#### Returns
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+The total number of bytes of the formatted prompt. If it is larger than the size of the buffer, you may need to re-allocate it and then re-apply the template.
-### **llama_eval_export(SafeLLamaContextHandle, String)**
+### **llama_token_bos(SafeLlamaModelHandle)**
-Export a static computation graph for context of 511 and batch size of 1
- NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
- parameters here to keep things simple
- IMPORTANT: do not use for anything else other than debugging and testing!
+Get the "Beginning of sentence" token
```csharp
-public static int llama_eval_export(SafeLLamaContextHandle ctx, string fname)
+public static LLamaToken llama_token_bos(SafeLlamaModelHandle model)
```
#### Parameters
-`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-
-`fname` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
#### Returns
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+[LLamaToken](./llama.native.llamatoken.md)
-### **llama_load_model_from_file(String, LLamaContextParams)**
+### **llama_token_eos(SafeLlamaModelHandle)**
-Various functions for loading a ggml llama model.
- Allocate (almost) all memory needed for the model.
- Return NULL on failure
+Get the "End of sentence" token
```csharp
-public static IntPtr llama_load_model_from_file(string path_model, LLamaContextParams params)
+public static LLamaToken llama_token_eos(SafeLlamaModelHandle model)
```
#### Parameters
-`path_model` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-`params` [LLamaContextParams](./llama.native.llamacontextparams.md)
+`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
#### Returns
-[IntPtr](https://docs.microsoft.com/en-us/dotnet/api/system.intptr)
+[LLamaToken](./llama.native.llamatoken.md)
-### **llama_new_context_with_model(SafeLlamaModelHandle, LLamaContextParams)**
+### **llama_token_nl(SafeLlamaModelHandle)**
-Create a new llama_context with the given model.
- Return value should always be wrapped in SafeLLamaContextHandle!
+Get the "new line" token
```csharp
-public static IntPtr llama_new_context_with_model(SafeLlamaModelHandle model, LLamaContextParams params)
+public static LLamaToken llama_token_nl(SafeLlamaModelHandle model)
```
#### Parameters
`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
-`params` [LLamaContextParams](./llama.native.llamacontextparams.md)
-
#### Returns
-[IntPtr](https://docs.microsoft.com/en-us/dotnet/api/system.intptr)
+[LLamaToken](./llama.native.llamatoken.md)
-### **llama_backend_init(Boolean)**
+### **llama_add_bos_token(SafeLlamaModelHandle)**
-not great API - very likely to change.
- Initialize the llama + ggml backend
- Call once at the start of the program
+Returns -1 if unknown, 1 for true or 0 for false.
```csharp
-public static void llama_backend_init(bool numa)
+public static int llama_add_bos_token(SafeLlamaModelHandle model)
```
#### Parameters
-`numa` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
-### **llama_free(IntPtr)**
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **llama_add_eos_token(SafeLlamaModelHandle)**
-Frees all allocated memory in the given llama_context
+Returns -1 if unknown, 1 for true or 0 for false.
```csharp
-public static void llama_free(IntPtr ctx)
+public static int llama_add_eos_token(SafeLlamaModelHandle model)
```
#### Parameters
-`ctx` [IntPtr](https://docs.microsoft.com/en-us/dotnet/api/system.intptr)
+`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
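+
+The -1/0/1 return convention maps naturally onto a nullable boolean; a small sketch (assuming an already-loaded `SafeLlamaModelHandle model`):
+
+```csharp
+// -1 = unknown -> null, 0 = false, 1 = true
+int addBosRaw = NativeApi.llama_add_bos_token(model);
+bool? addBos = addBosRaw < 0 ? (bool?)null : addBosRaw != 0;
+```
+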
-### **llama_free_model(IntPtr)**
+### **llama_token_prefix(SafeLlamaModelHandle)**
-Frees all allocated memory associated with a model
+codellama infill tokens, Beginning of infill prefix
```csharp
-public static void llama_free_model(IntPtr model)
+public static int llama_token_prefix(SafeLlamaModelHandle model)
```
#### Parameters
-`model` [IntPtr](https://docs.microsoft.com/en-us/dotnet/api/system.intptr)
+`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-### **llama_model_apply_lora_from_file(SafeLlamaModelHandle, String, String, Int32)**
+### **llama_token_middle(SafeLlamaModelHandle)**
-Apply a LoRA adapter to a loaded model
- path_base_model is the path to a higher quality model to use as a base for
- the layers modified by the adapter. Can be NULL to use the current loaded model.
- The model needs to be reloaded before applying a new adapter, otherwise the adapter
- will be applied on top of the previous one
+codellama infill tokens, Beginning of infill middle
```csharp
-public static int llama_model_apply_lora_from_file(SafeLlamaModelHandle model_ptr, string path_lora, string path_base_model, int n_threads)
+public static int llama_token_middle(SafeLlamaModelHandle model)
```
#### Parameters
-`model_ptr` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
+`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
-`path_lora` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+#### Returns
-`path_base_model` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-`n_threads` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+### **llama_token_suffix(SafeLlamaModelHandle)**
+
+codellama infill tokens, Beginning of infill suffix
+
+```csharp
+public static int llama_token_suffix(SafeLlamaModelHandle model)
+```
+
+#### Parameters
+
+`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
#### Returns
[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-Returns 0 on success
-### **llama_get_kv_cache_token_count(SafeLLamaContextHandle)**
+### **llama_token_eot(SafeLlamaModelHandle)**
-Returns the number of tokens in the KV cache
+codellama infill tokens, End of infill middle
```csharp
-public static int llama_get_kv_cache_token_count(SafeLLamaContextHandle ctx)
+public static int llama_token_eot(SafeLlamaModelHandle model)
+```
+
+#### Parameters
+
+`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **llama_print_timings(SafeLLamaContextHandle)**
+
+Print out timing information for this context
+
+```csharp
+public static void llama_print_timings(SafeLLamaContextHandle ctx)
+```
+
+#### Parameters
+
+`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+### **llama_reset_timings(SafeLLamaContextHandle)**
+
+Reset all collected timing information for this context
+
+```csharp
+public static void llama_reset_timings(SafeLLamaContextHandle ctx)
```
#### Parameters
`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+### **llama_print_system_info()**
+
+Print system information
+
+```csharp
+public static IntPtr llama_print_system_info()
+```
+
+#### Returns
+
+[IntPtr](https://docs.microsoft.com/en-us/dotnet/api/system.intptr)
+
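+A short sketch of the timing/system-info helpers (the `ctx` handle is assumed to exist already):
+
+```csharp
+NativeApi.llama_print_timings(ctx);   // print timing statistics for this context
+NativeApi.llama_reset_timings(ctx);   // start a fresh measurement window
+
+// llama_print_system_info returns a pointer to a native string:
+string? info = System.Runtime.InteropServices.Marshal.PtrToStringAnsi(NativeApi.llama_print_system_info());
+```
+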
+### **llama_token_to_piece(SafeLlamaModelHandle, LLamaToken, Span<Byte>)**
+
+Convert a single token into text
+
+```csharp
+public static int llama_token_to_piece(SafeLlamaModelHandle model, LLamaToken llamaToken, Span buffer)
+```
+
+#### Parameters
+
+`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
+
+`llamaToken` [LLamaToken](./llama.native.llamatoken.md)
+
+`buffer` [Span<Byte>](https://docs.microsoft.com/en-us/dotnet/api/system.span-1)
+buffer to write string into
+
#### Returns
[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+The length written, or, if the buffer is too small, a negative value that indicates the length required
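+
+A sketch of the grow-and-retry pattern implied by the return value (`model` and `token` are assumed to exist; the initial buffer size is arbitrary):
+
+```csharp
+var buffer = new byte[8];
+int written = NativeApi.llama_token_to_piece(model, token, buffer);
+if (written < 0)
+{
+    buffer = new byte[-written];   // negative return value = required length
+    written = NativeApi.llama_token_to_piece(model, token, buffer);
+}
+string piece = System.Text.Encoding.UTF8.GetString(buffer, 0, written);
+```
+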
-### **llama_set_rng_seed(SafeLLamaContextHandle, Int32)**
+### **llama_tokenize(SafeLlamaModelHandle, Byte*, Int32, LLamaToken*, Int32, Boolean, Boolean)**
-Sets the current rng seed.
+Convert text into tokens
+
+```csharp
+public static int llama_tokenize(SafeLlamaModelHandle model, Byte* text, int text_len, LLamaToken* tokens, int n_max_tokens, bool add_bos, bool special)
+```
+
+#### Parameters
+
+`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
+
+`text` [Byte*](https://docs.microsoft.com/en-us/dotnet/api/system.byte*)
+
+`text_len` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`tokens` [LLamaToken*](./llama.native.llamatoken*.md)
+
+`n_max_tokens` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`add_bos` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`special` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. Does not insert a leading space.
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+Returns the number of tokens on success, no more than n_max_tokens.
+ Returns a negative number on failure - the number of tokens that would have been returned
+
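+A minimal tokenization sketch using the raw pointer overload (`model` is assumed to be a loaded SafeLlamaModelHandle; buffer sizing and error handling are kept deliberately simple):
+
+```csharp
+var bytes = System.Text.Encoding.UTF8.GetBytes("Hello, world!");
+var tokens = new LLamaToken[bytes.Length + 8];
+unsafe
+{
+    fixed (byte* textPtr = bytes)
+    fixed (LLamaToken* tokenPtr = tokens)
+    {
+        // model, text, text_len, tokens, n_max_tokens, add_bos, special
+        int count = NativeApi.llama_tokenize(model, textPtr, bytes.Length, tokenPtr, tokens.Length, true, false);
+        if (count < 0)
+            throw new InvalidOperationException($"token buffer too small, {-count} tokens required");
+    }
+}
+```
+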
+### **llama_log_set(LLamaLogCallback)**
+
+Register a callback to receive llama log messages
+
+```csharp
+public static void llama_log_set(LLamaLogCallback logCallback)
+```
+
+#### Parameters
+
+`logCallback` [LLamaLogCallback](./llama.native.llamalogcallback.md)
+
+### **llama_kv_cache_clear(SafeLLamaContextHandle)**
+
+Clear the KV cache
```csharp
-public static void llama_set_rng_seed(SafeLLamaContextHandle ctx, int seed)
+public static void llama_kv_cache_clear(SafeLLamaContextHandle ctx)
```
#### Parameters
`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-`seed` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+### **llama_kv_cache_seq_rm(SafeLLamaContextHandle, LLamaSeqId, LLamaPos, LLamaPos)**
-### **llama_get_state_size(SafeLLamaContextHandle)**
+Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-Returns the maximum size in bytes of the state (rng, logits, embedding
- and kv_cache) - will often be smaller after compacting tokens
+```csharp
+public static void llama_kv_cache_seq_rm(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1)
+```
+
+#### Parameters
+
+`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+`seq` [LLamaSeqId](./llama.native.llamaseqid.md)
+
+`p0` [LLamaPos](./llama.native.llamapos.md)
+
+`p1` [LLamaPos](./llama.native.llamapos.md)
+
+### **llama_kv_cache_seq_cp(SafeLLamaContextHandle, LLamaSeqId, LLamaSeqId, LLamaPos, LLamaPos)**
+
+Copy all tokens that belong to the specified sequence to another sequence
+ Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
```csharp
-public static ulong llama_get_state_size(SafeLLamaContextHandle ctx)
+public static void llama_kv_cache_seq_cp(SafeLLamaContextHandle ctx, LLamaSeqId src, LLamaSeqId dest, LLamaPos p0, LLamaPos p1)
```
#### Parameters
`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-#### Returns
+`src` [LLamaSeqId](./llama.native.llamaseqid.md)
-[UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
+`dest` [LLamaSeqId](./llama.native.llamaseqid.md)
-### **llama_copy_state_data(SafeLLamaContextHandle, Byte*)**
+`p0` [LLamaPos](./llama.native.llamapos.md)
-Copies the state to the specified destination address.
- Destination needs to have allocated enough memory.
+`p1` [LLamaPos](./llama.native.llamapos.md)
+
+### **llama_kv_cache_seq_keep(SafeLLamaContextHandle, LLamaSeqId)**
+
+Removes all tokens that do not belong to the specified sequence
```csharp
-public static ulong llama_copy_state_data(SafeLLamaContextHandle ctx, Byte* dest)
+public static void llama_kv_cache_seq_keep(SafeLLamaContextHandle ctx, LLamaSeqId seq)
```
#### Parameters
`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-`dest` [Byte*](https://docs.microsoft.com/en-us/dotnet/api/system.byte*)
+`seq` [LLamaSeqId](./llama.native.llamaseqid.md)
-#### Returns
+### **llama_kv_cache_seq_add(SafeLLamaContextHandle, LLamaSeqId, LLamaPos, LLamaPos, Int32)**
-[UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
-the number of bytes copied
+Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
+ If the KV cache is RoPEd, the KV data is updated accordingly:
+ - lazily on next llama_decode()
+ - explicitly with llama_kv_cache_update()
-### **llama_copy_state_data(SafeLLamaContextHandle, Byte[])**
+```csharp
+public static void llama_kv_cache_seq_add(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int delta)
+```
-Copies the state to the specified destination address.
- Destination needs to have allocated enough memory (see llama_get_state_size)
+#### Parameters
+
+`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+`seq` [LLamaSeqId](./llama.native.llamaseqid.md)
+
+`p0` [LLamaPos](./llama.native.llamapos.md)
+
+`p1` [LLamaPos](./llama.native.llamapos.md)
+
+`delta` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **llama_kv_cache_seq_div(SafeLLamaContextHandle, LLamaSeqId, LLamaPos, LLamaPos, Int32)**
+
+Integer division of the positions by a factor of `d > 1`
+ If the KV cache is RoPEd, the KV data is updated accordingly:
+ - lazily on next llama_decode()
+ - explicitly with llama_kv_cache_update()
+
+ p0 < 0 : [0, p1]
+
+ p1 < 0 : [p0, inf)
```csharp
-public static ulong llama_copy_state_data(SafeLLamaContextHandle ctx, Byte[] dest)
+public static void llama_kv_cache_seq_div(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int d)
```
#### Parameters
`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-`dest` [Byte[]](https://docs.microsoft.com/en-us/dotnet/api/system.byte)
+`seq` [LLamaSeqId](./llama.native.llamaseqid.md)
-#### Returns
+`p0` [LLamaPos](./llama.native.llamapos.md)
-[UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
-the number of bytes copied
+`p1` [LLamaPos](./llama.native.llamapos.md)
-### **llama_set_state_data(SafeLLamaContextHandle, Byte*)**
+`d` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-Set the state reading from the specified address
+### **llama_kv_cache_seq_pos_max(SafeLLamaContextHandle, LLamaSeqId)**
+
+Returns the largest position present in the KV cache for the specified sequence
```csharp
-public static ulong llama_set_state_data(SafeLLamaContextHandle ctx, Byte* src)
+public static LLamaPos llama_kv_cache_seq_pos_max(SafeLLamaContextHandle ctx, LLamaSeqId seq)
```
#### Parameters
`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-`src` [Byte*](https://docs.microsoft.com/en-us/dotnet/api/system.byte*)
+`seq` [LLamaSeqId](./llama.native.llamaseqid.md)
#### Returns
-[UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
-the number of bytes read
+[LLamaPos](./llama.native.llamapos.md)
-### **llama_set_state_data(SafeLLamaContextHandle, Byte[])**
+### **llama_kv_cache_defrag(SafeLLamaContextHandle)**
-Set the state reading from the specified address
+Defragment the KV cache. This will be applied:
+ - lazily on next llama_decode()
+ - explicitly with llama_kv_cache_update()
```csharp
-public static ulong llama_set_state_data(SafeLLamaContextHandle ctx, Byte[] src)
+public static LLamaPos llama_kv_cache_defrag(SafeLLamaContextHandle ctx)
```
#### Parameters
`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-`src` [Byte[]](https://docs.microsoft.com/en-us/dotnet/api/system.byte)
-
#### Returns
-[UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
-the number of bytes read
+[LLamaPos](./llama.native.llamapos.md)
-### **llama_load_session_file(SafeLLamaContextHandle, String, Int32[], UInt64, UInt64*)**
+### **llama_kv_cache_update(SafeLLamaContextHandle)**
-Load session file
+Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
```csharp
-public static bool llama_load_session_file(SafeLLamaContextHandle ctx, string path_session, Int32[] tokens_out, ulong n_token_capacity, UInt64* n_token_count_out)
+public static void llama_kv_cache_update(SafeLLamaContextHandle ctx)
```
#### Parameters
`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-`path_session` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
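+
+A minimal sketch of requesting cache maintenance explicitly instead of waiting for the next decode (`ctx` is assumed to be an existing SafeLLamaContextHandle):
+
+```csharp
+NativeApi.llama_kv_cache_defrag(ctx);   // schedule defragmentation
+NativeApi.llama_kv_cache_update(ctx);   // apply pending K-shifts / defragmentation now
+```
+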
+### **llama_batch_init(Int32, Int32, Int32)**
-`tokens_out` [Int32[]](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+Allocates a batch of tokens on the heap
+ Each token can be assigned up to n_seq_max sequence ids
+ The batch has to be freed with llama_batch_free()
+ If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
+ Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
+ The rest of the llama_batch members are allocated with size n_tokens
+ All members are left uninitialized
-`n_token_capacity` [UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
+```csharp
+public static LLamaNativeBatch llama_batch_init(int n_tokens, int embd, int n_seq_max)
+```
+
+#### Parameters
+
+`n_tokens` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-`n_token_count_out` [UInt64*](https://docs.microsoft.com/en-us/dotnet/api/system.uint64*)
+`embd` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`n_seq_max` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+Each token can be assigned up to n_seq_max sequence ids
#### Returns
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+[LLamaNativeBatch](./llama.native.llamanativebatch.md)
+
+### **llama_batch_free(LLamaNativeBatch)**
+
+Frees a batch of tokens allocated with llama_batch_init()
+
+```csharp
+public static void llama_batch_free(LLamaNativeBatch batch)
+```
+
+#### Parameters
+
+`batch` [LLamaNativeBatch](./llama.native.llamanativebatch.md)
+
+### **llama_decode(SafeLLamaContextHandle, LLamaNativeBatch)**
-### **llama_save_session_file(SafeLLamaContextHandle, String, Int32[], UInt64)**
-Save session file
```csharp
-public static bool llama_save_session_file(SafeLLamaContextHandle ctx, string path_session, Int32[] tokens, ulong n_token_count)
+public static int llama_decode(SafeLLamaContextHandle ctx, LLamaNativeBatch batch)
```
#### Parameters
`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-`path_session` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+`batch` [LLamaNativeBatch](./llama.native.llamanativebatch.md)
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+Positive return values does not mean a fatal error, but rather a warning:
+ - 0: success
+ - 1: could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
+ - < 0: error
-`tokens` [Int32[]](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
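+
+A sketch of the batch lifecycle around llama_decode; populating the batch members (tokens, positions, sequence ids) is omitted here and is normally done by higher-level helpers. `ctx` is assumed to be an existing SafeLLamaContextHandle:
+
+```csharp
+// capacity of 512 tokens, no embeddings (embd = 0), one sequence id per token
+LLamaNativeBatch batch = NativeApi.llama_batch_init(512, 0, 1);
+try
+{
+    // ... fill in the batch tokens / positions / sequence ids here ...
+    int rc = NativeApi.llama_decode(ctx, batch);
+    if (rc == 1)
+        Console.WriteLine("no KV slot found - reduce the batch size or enlarge the context");
+    else if (rc < 0)
+        throw new InvalidOperationException($"llama_decode failed ({rc})");
+}
+finally
+{
+    NativeApi.llama_batch_free(batch);
+}
+```
+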
+### **llama_kv_cache_view_init(SafeLLamaContextHandle, Int32)**
-`n_token_count` [UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
+Create an empty KV cache view. (use only for debugging purposes)
+
+```csharp
+public static LLamaKvCacheView llama_kv_cache_view_init(SafeLLamaContextHandle ctx, int n_max_seq)
+```
+
+#### Parameters
+
+`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+`n_max_seq` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
#### Returns
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+[LLamaKvCacheView](./llama.native.llamakvcacheview.md)
+
+### **llama_kv_cache_view_free(LLamaKvCacheView&)**
-### **llama_eval(SafeLLamaContextHandle, Int32[], Int32, Int32, Int32)**
+Free a KV cache view. (use only for debugging purposes)
-Run the llama inference to obtain the logits and probabilities for the next token.
- tokens + n_tokens is the provided batch of new tokens to process
- n_past is the number of tokens to use from previous eval calls
+```csharp
+public static void llama_kv_cache_view_free(LLamaKvCacheView& view)
+```
+
+#### Parameters
+
+`view` [LLamaKvCacheView&](./llama.native.llamakvcacheview&.md)
+
+### **llama_kv_cache_view_update(SafeLLamaContextHandle, LLamaKvCacheView&)**
+
+Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
```csharp
-public static int llama_eval(SafeLLamaContextHandle ctx, Int32[] tokens, int n_tokens, int n_past, int n_threads)
+public static void llama_kv_cache_view_update(SafeLLamaContextHandle ctx, LLamaKvCacheView& view)
```
#### Parameters
`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-`tokens` [Int32[]](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+`view` [LLamaKvCacheView&](./llama.native.llamakvcacheview&.md)
-`n_tokens` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+### **llama_get_kv_cache_token_count(SafeLLamaContextHandle)**
-`n_past` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+Returns the number of tokens in the KV cache (slow, use only for debug)
+ If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-`n_threads` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+```csharp
+public static int llama_get_kv_cache_token_count(SafeLLamaContextHandle ctx)
+```
+
+#### Parameters
+
+`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
#### Returns
[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-Returns 0 on success
-### **llama_eval_with_pointer(SafeLLamaContextHandle, Int32*, Int32, Int32, Int32)**
+### **llama_get_kv_cache_used_cells(SafeLLamaContextHandle)**
-Run the llama inference to obtain the logits and probabilities for the next token.
- tokens + n_tokens is the provided batch of new tokens to process
- n_past is the number of tokens to use from previous eval calls
+Returns the number of used KV cells (i.e. cells that have at least one sequence assigned to them)
```csharp
-public static int llama_eval_with_pointer(SafeLLamaContextHandle ctx, Int32* tokens, int n_tokens, int n_past, int n_threads)
+public static int llama_get_kv_cache_used_cells(SafeLLamaContextHandle ctx)
```
#### Parameters
`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-`tokens` [Int32*](https://docs.microsoft.com/en-us/dotnet/api/system.int32*)
+#### Returns
-`n_tokens` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **llama_beam_search(SafeLLamaContextHandle, LLamaBeamSearchCallback, IntPtr, UInt64, Int32, Int32, Int32)**
+
+Deterministically returns the entire sentence constructed by a beam search.
+
+```csharp
+public static void llama_beam_search(SafeLLamaContextHandle ctx, LLamaBeamSearchCallback callback, IntPtr callback_data, ulong n_beams, int n_past, int n_predict, int n_threads)
+```
+
+#### Parameters
+
+`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
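+A one-line sketch of the two thread counts (`ctx` assumed to exist); prompt/batch processing can typically use more threads than single-token generation:
+
+```csharp
+NativeApi.llama_set_n_threads(ctx, 8, 16);   // 8 threads for generation, 16 for prompt/batch processing
+```
+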
+Pointer to the llama_context.
+
+`callback` [LLamaBeamSearchCallback](./llama.native.nativeapi.llamabeamsearchcallback.md)
+Invoked for each iteration of the beam_search loop, passing in beams_state.
+
+`callback_data` [IntPtr](https://docs.microsoft.com/en-us/dotnet/api/system.intptr)
+A pointer that is simply passed back to callback.
+
+`n_beams` [UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
+Number of beams to use.
`n_past` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+Number of tokens already evaluated.
+
+`n_predict` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+Maximum number of tokens to predict. EOS may occur earlier.
`n_threads` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+Number of threads.
+
+### **llama_empty_call()**
+
+A method that does nothing. This is a native method; calling it will force the llama native dependencies to be loaded.
+
+```csharp
+public static void llama_empty_call()
+```
+
+### **llama_max_devices()**
+
+Get the maximum number of devices supported by llama.cpp
+
+```csharp
+public static long llama_max_devices()
+```
#### Returns
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-Returns 0 on success
+[Int64](https://docs.microsoft.com/en-us/dotnet/api/system.int64)
+
+### **llama_model_default_params()**
+
+Create a LLamaModelParams with default values
+
+```csharp
+public static LLamaModelParams llama_model_default_params()
+```
+
+#### Returns
+
+[LLamaModelParams](./llama.native.llamamodelparams.md)
+
+### **llama_context_default_params()**
+
+Create a LLamaContextParams with default values
+
+```csharp
+public static LLamaContextParams llama_context_default_params()
+```
+
+#### Returns
+
+[LLamaContextParams](./llama.native.llamacontextparams.md)
+
+### **llama_model_quantize_default_params()**
+
+Create a LLamaModelQuantizeParams with default values
+
+```csharp
+public static LLamaModelQuantizeParams llama_model_quantize_default_params()
+```
+
+#### Returns
+
+[LLamaModelQuantizeParams](./llama.native.llamamodelquantizeparams.md)
+
+### **llama_supports_mmap()**
+
+Check if memory mapping is supported
+
+```csharp
+public static bool llama_supports_mmap()
+```
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **llama_supports_mlock()**
+
+Check if memory locking is supported
+
+```csharp
+public static bool llama_supports_mlock()
+```
+
+#### Returns
-### **llama_tokenize(SafeLLamaContextHandle, String, Encoding, Int32[], Int32, Boolean)**
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **llama_supports_gpu_offload()**
+
+Check if GPU offload is supported
+
+```csharp
+public static bool llama_supports_gpu_offload()
+```
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
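+
+A small sketch querying the three capability flags before deciding on model/context parameters:
+
+```csharp
+bool mmap = NativeApi.llama_supports_mmap();
+bool mlock = NativeApi.llama_supports_mlock();
+bool gpu = NativeApi.llama_supports_gpu_offload();
+Console.WriteLine($"mmap={mmap}, mlock={mlock}, gpu offload={gpu}");
+```
+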
-Convert the provided text into tokens.
+### **llama_set_rng_seed(SafeLLamaContextHandle, UInt32)**
+
+Sets the current rng seed.
```csharp
-public static int llama_tokenize(SafeLLamaContextHandle ctx, string text, Encoding encoding, Int32[] tokens, int n_max_tokens, bool add_bos)
+public static void llama_set_rng_seed(SafeLLamaContextHandle ctx, uint seed)
```
#### Parameters
`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-`text` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+`seed` [UInt32](https://docs.microsoft.com/en-us/dotnet/api/system.uint32)
+
+### **llama_get_state_size(SafeLLamaContextHandle)**
-`encoding` [Encoding](https://docs.microsoft.com/en-us/dotnet/api/system.text.encoding)
+Returns the maximum size in bytes of the state (rng, logits, embedding
+ and kv_cache) - will often be smaller after compacting tokens
-`tokens` [Int32[]](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+```csharp
+public static ulong llama_get_state_size(SafeLLamaContextHandle ctx)
+```
-`n_max_tokens` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+#### Parameters
-`add_bos` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
#### Returns
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-Returns the number of tokens on success, no more than n_max_tokens.
- Returns a negative number on failure - the number of tokens that would have been returned
+[UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
-### **llama_tokenize_native(SafeLLamaContextHandle, Byte*, Int32*, Int32, Boolean)**
+### **llama_copy_state_data(SafeLLamaContextHandle, Byte*)**
-Convert the provided text into tokens.
+Copies the state to the specified destination address.
+ Destination needs to have allocated enough memory.
```csharp
-public static int llama_tokenize_native(SafeLLamaContextHandle ctx, Byte* text, Int32* tokens, int n_max_tokens, bool add_bos)
+public static ulong llama_copy_state_data(SafeLLamaContextHandle ctx, Byte* dest)
```
#### Parameters
`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-`text` [Byte*](https://docs.microsoft.com/en-us/dotnet/api/system.byte*)
+`dest` [Byte*](https://docs.microsoft.com/en-us/dotnet/api/system.byte*)
-`tokens` [Int32*](https://docs.microsoft.com/en-us/dotnet/api/system.int32*)
+#### Returns
-`n_max_tokens` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+[UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
+the number of bytes copied
-`add_bos` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+### **llama_set_state_data(SafeLLamaContextHandle, Byte*)**
+
+Set the state reading from the specified address
+
+```csharp
+public static ulong llama_set_state_data(SafeLLamaContextHandle ctx, Byte* src)
+```
+
+#### Parameters
+
+`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+`src` [Byte*](https://docs.microsoft.com/en-us/dotnet/api/system.byte*)
#### Returns
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-Returns the number of tokens on success, no more than n_max_tokens.
- Returns a negative number on failure - the number of tokens that would have been returned
+[UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
+the number of bytes read
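+
+A hedged sketch of using the three state functions together to snapshot and restore a context. It assumes `ctx` is a valid [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md) and that the functions are static members of `NativeApi`:
+
+```csharp
+using LLama.Native;
+
+ulong size = NativeApi.llama_get_state_size(ctx);
+byte[] buffer = new byte[size];
+
+unsafe
+{
+    fixed (byte* ptr = buffer)
+    {
+        // Save the current state into the managed buffer...
+        NativeApi.llama_copy_state_data(ctx, ptr);
+
+        // ...run some inference that mutates the context...
+
+        // ...and later restore the saved state.
+        NativeApi.llama_set_state_data(ctx, ptr);
+    }
+}
+```
+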
-### **llama_n_vocab(SafeLLamaContextHandle)**
+### **llama_load_session_file(SafeLLamaContextHandle, String, LLamaToken[], UInt64, UInt64&)**
-Get the number of tokens in the model vocabulary for this context
+Load session file
```csharp
-public static int llama_n_vocab(SafeLLamaContextHandle ctx)
+public static bool llama_load_session_file(SafeLLamaContextHandle ctx, string path_session, LLamaToken[] tokens_out, ulong n_token_capacity, UInt64& n_token_count_out)
```
#### Parameters
`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+`path_session` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`tokens_out` [LLamaToken[]](./llama.native.llamatoken.md)
+
+`n_token_capacity` [UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
+
+`n_token_count_out` [UInt64&](https://docs.microsoft.com/en-us/dotnet/api/system.uint64&)
+
#### Returns
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **llama_save_session_file(SafeLLamaContextHandle, String, LLamaToken[], UInt64)**
+
+Save session file
+
+```csharp
+public static bool llama_save_session_file(SafeLLamaContextHandle ctx, string path_session, LLamaToken[] tokens, ulong n_token_count)
+```
+
+#### Parameters
+
+`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+`path_session` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`tokens` [LLamaToken[]](./llama.native.llamatoken.md)
+
+`n_token_count` [UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
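+
+A sketch of persisting a conversation with the two session-file functions. It assumes `ctx` is a valid context handle, `tokens` holds the tokens evaluated so far, and that the trailing `UInt64&` parameter maps to a C# `out` parameter:
+
+```csharp
+using LLama.Native;
+
+// Save the evaluated tokens alongside the context state.
+NativeApi.llama_save_session_file(ctx, "chat.session", tokens, (ulong)tokens.Length);
+
+// Later: load them back into a fresh context.
+var restored = new LLamaToken[4096];
+bool ok = NativeApi.llama_load_session_file(ctx, "chat.session", restored, (ulong)restored.Length, out ulong count);
+```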
+
+### **llama_token_get_text(SafeLlamaModelHandle, LLamaToken)**
+
+```csharp
+public static Byte* llama_token_get_text(SafeLlamaModelHandle model, LLamaToken token)
+```
+
+#### Parameters
+
+`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
+
+`token` [LLamaToken](./llama.native.llamatoken.md)
+
+#### Returns
+
+[Byte*](https://docs.microsoft.com/en-us/dotnet/api/system.byte*)
+
+### **llama_token_get_score(SafeLlamaModelHandle, LLamaToken)**
+
+```csharp
+public static float llama_token_get_score(SafeLlamaModelHandle model, LLamaToken token)
+```
+
+#### Parameters
+
+`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
+
+`token` [LLamaToken](./llama.native.llamatoken.md)
+
+#### Returns
+
+[Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+### **llama_token_get_type(SafeLlamaModelHandle, LLamaToken)**
+
+```csharp
+public static LLamaTokenType llama_token_get_type(SafeLlamaModelHandle model, LLamaToken token)
+```
+
+#### Parameters
+
+`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
+
+`token` [LLamaToken](./llama.native.llamatoken.md)
+
+#### Returns
+
+[LLamaTokenType](./llama.native.llamatokentype.md)
### **llama_n_ctx(SafeLLamaContextHandle)**
Get the size of the context window for the model for this context
```csharp
-public static int llama_n_ctx(SafeLLamaContextHandle ctx)
+public static uint llama_n_ctx(SafeLLamaContextHandle ctx)
```
#### Parameters
@@ -1107,14 +1551,14 @@ public static int llama_n_ctx(SafeLLamaContextHandle ctx)
#### Returns
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+[UInt32](https://docs.microsoft.com/en-us/dotnet/api/system.uint32)
-### **llama_n_embd(SafeLLamaContextHandle)**
+### **llama_n_batch(SafeLLamaContextHandle)**
-Get the dimension of embedding vectors from the model for this context
+Get the batch size for this context
```csharp
-public static int llama_n_embd(SafeLLamaContextHandle ctx)
+public static uint llama_n_batch(SafeLLamaContextHandle ctx)
```
#### Parameters
@@ -1123,11 +1567,11 @@ public static int llama_n_embd(SafeLLamaContextHandle ctx)
#### Returns
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+[UInt32](https://docs.microsoft.com/en-us/dotnet/api/system.uint32)
### **llama_get_logits(SafeLLamaContextHandle)**
-Token logits obtained from the last call to llama_eval()
+Token logits obtained from the last call to llama_decode
The logits for the last token are stored in the last row
Can be mutated in order to change the probabilities of the next token.
Rows: n_tokens
@@ -1145,19 +1589,38 @@ public static Single* llama_get_logits(SafeLLamaContextHandle ctx)
[Single*](https://docs.microsoft.com/en-us/dotnet/api/system.single*)
-### **llama_get_embeddings(SafeLLamaContextHandle)**
+### **llama_get_logits_ith(SafeLLamaContextHandle, Int32)**
-Get the embeddings for the input
- shape: [n_embd] (1-dimensional)
+Logits for the ith token. Equivalent to: llama_get_logits(ctx) + i*n_vocab
```csharp
-public static Single* llama_get_embeddings(SafeLLamaContextHandle ctx)
+public static Single* llama_get_logits_ith(SafeLLamaContextHandle ctx, int i)
```
#### Parameters
`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+`i` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+#### Returns
+
+[Single*](https://docs.microsoft.com/en-us/dotnet/api/system.single*)
+
+### **llama_get_embeddings_ith(SafeLLamaContextHandle, Int32)**
+
+Get the embeddings for the ith sequence. Equivalent to: llama_get_embeddings(ctx) + i*n_embd
+
+```csharp
+public static Single* llama_get_embeddings_ith(SafeLLamaContextHandle ctx, int i)
+```
+
+#### Parameters
+
+`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+`i` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
#### Returns
[Single*](https://docs.microsoft.com/en-us/dotnet/api/system.single*)
diff --git a/docs/xmldocs/llama.native.nativelibraryconfig.md b/docs/xmldocs/llama.native.nativelibraryconfig.md
new file mode 100644
index 00000000..7cd99de5
--- /dev/null
+++ b/docs/xmldocs/llama.native.nativelibraryconfig.md
@@ -0,0 +1,258 @@
+# NativeLibraryConfig
+
+Namespace: LLama.Native
+
+Allows configuration of the native llama.cpp libraries to load and use.
+ All configuration must be done before using **any** other LLamaSharp methods!
+
+```csharp
+public sealed class NativeLibraryConfig
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [NativeLibraryConfig](./llama.native.nativelibraryconfig.md)
+
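+A typical configuration is a single fluent chain executed before any model is loaded. This is only a sketch; the `AvxLevel.Avx2` member name is an assumption, see the `WithAvx` documentation below:
+
+```csharp
+using LLama.Native;
+
+// Must run before any other LLamaSharp call.
+NativeLibraryConfig.Instance
+    .WithCuda(true)                             // prefer a CUDA build if one is available
+    .WithAvx(NativeLibraryConfig.AvxLevel.Avx2) // preferred AVX level (member name assumed)
+    .WithAutoFallback(true)                     // fall back to other builds when needed
+    .WithLogs(true);                            // log which native library gets loaded
+```
+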
+## Properties
+
+### **Instance**
+
+Get the config instance
+
+```csharp
+public static NativeLibraryConfig Instance { get; }
+```
+
+#### Property Value
+
+[NativeLibraryConfig](./llama.native.nativelibraryconfig.md)
+
+### **LibraryHasLoaded**
+
+Check if the native library has already been loaded. Configuration cannot be modified if this is true.
+
+```csharp
+public static bool LibraryHasLoaded { get; internal set; }
+```
+
+#### Property Value
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+## Methods
+
+### **WithLibrary(String, String)**
+
+Load a specified native library as backend for LLamaSharp.
+ When this method is called, all the other configurations will be ignored.
+
+```csharp
+public NativeLibraryConfig WithLibrary(string llamaPath, string llavaPath)
+```
+
+#### Parameters
+
+`llamaPath` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+The full path to the llama library to load.
+
+`llavaPath` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+The full path to the llava library to load.
+
+#### Returns
+
+[NativeLibraryConfig](./llama.native.nativelibraryconfig.md)
+
+#### Exceptions
+
+[InvalidOperationException](https://docs.microsoft.com/en-us/dotnet/api/system.invalidoperationexception)
+Thrown if `LibraryHasLoaded` is true.
+
+### **WithCuda(Boolean)**
+
+Configure whether to use cuda backend if possible.
+
+```csharp
+public NativeLibraryConfig WithCuda(bool enable)
+```
+
+#### Parameters
+
+`enable` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+#### Returns
+
+[NativeLibraryConfig](./llama.native.nativelibraryconfig.md)
+
+#### Exceptions
+
+[InvalidOperationException](https://docs.microsoft.com/en-us/dotnet/api/system.invalidoperationexception)
+Thrown if `LibraryHasLoaded` is true.
+
+### **WithAvx(AvxLevel)**
+
+Configure the preferred AVX support level of the backend.
+
+```csharp
+public NativeLibraryConfig WithAvx(AvxLevel level)
+```
+
+#### Parameters
+
+`level` [AvxLevel](./llama.native.nativelibraryconfig.avxlevel.md)
+
+#### Returns
+
+[NativeLibraryConfig](./llama.native.nativelibraryconfig.md)
+
+#### Exceptions
+
+[InvalidOperationException](https://docs.microsoft.com/en-us/dotnet/api/system.invalidoperationexception)
+Thrown if `LibraryHasLoaded` is true.
+
+### **WithAutoFallback(Boolean)**
+
+Configure whether to allow fallback when there's no match for preferred settings.
+
+```csharp
+public NativeLibraryConfig WithAutoFallback(bool enable)
+```
+
+#### Parameters
+
+`enable` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+#### Returns
+
+[NativeLibraryConfig](./llama.native.nativelibraryconfig.md)
+
+#### Exceptions
+
+[InvalidOperationException](https://docs.microsoft.com/en-us/dotnet/api/system.invalidoperationexception)
+Thrown if `LibraryHasLoaded` is true.
+
+### **SkipCheck(Boolean)**
+
+Whether to skip the check when you don't allow fallback. This option
+ may be useful under some complex conditions. For example, you're sure
+ your cuBLAS backend is configured correctly, but LLamaSharp mistakenly treats it as invalid.
+
+```csharp
+public NativeLibraryConfig SkipCheck(bool enable)
+```
+
+#### Parameters
+
+`enable` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+#### Returns
+
+[NativeLibraryConfig](./llama.native.nativelibraryconfig.md)
+
+#### Exceptions
+
+[InvalidOperationException](https://docs.microsoft.com/en-us/dotnet/api/system.invalidoperationexception)
+Thrown if `LibraryHasLoaded` is true.
+
+### **WithLogs(Boolean)**
+
+Whether to output the logs to console when loading the native library with your configuration.
+
+```csharp
+public NativeLibraryConfig WithLogs(bool enable)
+```
+
+#### Parameters
+
+`enable` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+#### Returns
+
+[NativeLibraryConfig](./llama.native.nativelibraryconfig.md)
+
+#### Exceptions
+
+[InvalidOperationException](https://docs.microsoft.com/en-us/dotnet/api/system.invalidoperationexception)
+Thrown if `LibraryHasLoaded` is true.
+
+### **WithLogs(LLamaLogLevel)**
+
+Enable console logging with the specified log level.
+
+```csharp
+public NativeLibraryConfig WithLogs(LLamaLogLevel logLevel)
+```
+
+#### Parameters
+
+`logLevel` [LLamaLogLevel](./llama.native.llamaloglevel.md)
+
+#### Returns
+
+[NativeLibraryConfig](./llama.native.nativelibraryconfig.md)
+
+#### Exceptions
+
+[InvalidOperationException](https://docs.microsoft.com/en-us/dotnet/api/system.invalidoperationexception)
+Thrown if `LibraryHasLoaded` is true.
+
+### **WithSearchDirectories(IEnumerable<String>)**
+
+Add self-defined search directories. Note that the file structure of the added
+ directories must be the same as the default directory. The directories
+ won't be searched recursively.
+
+```csharp
+public NativeLibraryConfig WithSearchDirectories(IEnumerable<string> directories)
+```
+
+#### Parameters
+
+`directories` [IEnumerable<String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ienumerable-1)
+
+#### Returns
+
+[NativeLibraryConfig](./llama.native.nativelibraryconfig.md)
+
+### **WithSearchDirectory(String)**
+
+Add a self-defined search directory. Note that the file structure of the added
+ directory must be the same as the default directory. The directory
+ won't be searched recursively.
+
+```csharp
+public NativeLibraryConfig WithSearchDirectory(string directory)
+```
+
+#### Parameters
+
+`directory` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+#### Returns
+
+[NativeLibraryConfig](./llama.native.nativelibraryconfig.md)
+
+### **CheckAndGatherDescription(LibraryName)**
+
+```csharp
+internal static Description CheckAndGatherDescription(LibraryName library)
+```
+
+#### Parameters
+
+`library` [LibraryName](./llama.native.libraryname.md)
+
+#### Returns
+
+[Description](./llama.native.nativelibraryconfig.description.md)
+
+### **AvxLevelToString(AvxLevel)**
+
+```csharp
+internal static string AvxLevelToString(AvxLevel level)
+```
+
+#### Parameters
+
+`level` [AvxLevel](./llama.native.nativelibraryconfig.avxlevel.md)
+
+#### Returns
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
diff --git a/docs/xmldocs/llama.native.ropescalingtype.md b/docs/xmldocs/llama.native.ropescalingtype.md
new file mode 100644
index 00000000..928627b4
--- /dev/null
+++ b/docs/xmldocs/llama.native.ropescalingtype.md
@@ -0,0 +1,25 @@
+# RopeScalingType
+
+Namespace: LLama.Native
+
+RoPE scaling type.
+
+```csharp
+public enum RopeScalingType
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [ValueType](https://docs.microsoft.com/en-us/dotnet/api/system.valuetype) → [Enum](https://docs.microsoft.com/en-us/dotnet/api/system.enum) → [RopeScalingType](./llama.native.ropescalingtype.md)
+Implements [IComparable](https://docs.microsoft.com/en-us/dotnet/api/system.icomparable), [IFormattable](https://docs.microsoft.com/en-us/dotnet/api/system.iformattable), [IConvertible](https://docs.microsoft.com/en-us/dotnet/api/system.iconvertible)
+
+**Remarks:**
+
+C# equivalent of llama_rope_scaling_type
+
+## Fields
+
+| Name | Value | Description |
+| --- | --: | --- |
+| Unspecified | -1 | No particular scaling type has been specified |
+| None | 0 | Do not apply any RoPE scaling |
+| Linear | 1 | Positional linear interpolation, as described by kaiokendev: https://kaiokendev.github.io/til#extending-context-to-8k |
+| Yarn | 2 | YaRN scaling: https://arxiv.org/pdf/2309.00071.pdf |
diff --git a/docs/xmldocs/llama.native.safellamacontexthandle.md b/docs/xmldocs/llama.native.safellamacontexthandle.md
index 0fe73571..af07938e 100644
--- a/docs/xmldocs/llama.native.safellamacontexthandle.md
+++ b/docs/xmldocs/llama.native.safellamacontexthandle.md
@@ -30,12 +30,12 @@ public int VocabCount { get; }
Total number of tokens in the context
```csharp
-public int ContextSize { get; }
+public uint ContextSize { get; }
```
#### Property Value
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+[UInt32](https://docs.microsoft.com/en-us/dotnet/api/system.uint32)
### **EmbeddingSize**
@@ -49,6 +49,18 @@ public int EmbeddingSize { get; }
[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+### **BatchSize**
+
+Get the maximum batch size for this context
+
+```csharp
+public uint BatchSize { get; }
+```
+
+#### Property Value
+
+[UInt32](https://docs.microsoft.com/en-us/dotnet/api/system.uint32)
+
### **ModelHandle**
Get the model which this context is using
@@ -83,22 +95,12 @@ public bool IsClosed { get; }
## Constructors
-### **SafeLLamaContextHandle(IntPtr, SafeLlamaModelHandle)**
-
-Create a new SafeLLamaContextHandle
+### **SafeLLamaContextHandle()**
```csharp
-public SafeLLamaContextHandle(IntPtr handle, SafeLlamaModelHandle model)
+public SafeLLamaContextHandle()
```
-#### Parameters
-
-`handle` [IntPtr](https://docs.microsoft.com/en-us/dotnet/api/system.intptr)
-pointer to an allocated llama_context
-
-`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
-the model which this context was created from
-
## Methods
### **ReleaseHandle()**
@@ -133,28 +135,44 @@ public static SafeLLamaContextHandle Create(SafeLlamaModelHandle model, LLamaCon
[RuntimeError](./llama.exceptions.runtimeerror.md)
-### **Clone(LLamaContextParams)**
+### **GetLogits()**
-Create a new llama context with a clone of the current llama context state
+Token logits obtained from the last call to llama_decode
+ The logits for the last token are stored in the last row
+ Can be mutated in order to change the probabilities of the next token.
+ Rows: n_tokens
+ Cols: n_vocab
```csharp
-public SafeLLamaContextHandle Clone(LLamaContextParams lparams)
+public Span<float> GetLogits()
+```
+
+#### Returns
+
+[Span<Single>](https://docs.microsoft.com/en-us/dotnet/api/system.span-1)
+
+### **GetLogitsIth(Int32)**
+
+Logits for the ith token. Equivalent to: llama_get_logits(ctx) + i*n_vocab
+
+```csharp
+public Span<float> GetLogitsIth(int i)
```
#### Parameters
-`lparams` [LLamaContextParams](./llama.native.llamacontextparams.md)
+`i` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
#### Returns
-[SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+[Span<Single>](https://docs.microsoft.com/en-us/dotnet/api/system.span-1)
-### **Tokenize(String, Boolean, Encoding)**
+### **Tokenize(String, Boolean, Boolean, Encoding)**
Convert the given text into tokens
```csharp
-public Int32[] Tokenize(string text, bool add_bos, Encoding encoding)
+public LLamaToken[] Tokenize(string text, bool add_bos, bool special, Encoding encoding)
```
#### Parameters
@@ -165,113 +183,83 @@ The text to tokenize
`add_bos` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
Whether the "BOS" token should be added
+`special` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
+
`encoding` [Encoding](https://docs.microsoft.com/en-us/dotnet/api/system.text.encoding)
Encoding to use for the text
#### Returns
-[Int32[]](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+[LLamaToken[]](./llama.native.llamatoken.md)
#### Exceptions
[RuntimeError](./llama.exceptions.runtimeerror.md)
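+
+For example (a sketch assuming `ctx` is a valid handle), a prompt can be tokenized directly through the handle:
+
+```csharp
+using System.Text;
+using LLama.Native;
+
+LLamaToken[] tokens = ctx.Tokenize("Hello, world!", add_bos: true, special: false, Encoding.UTF8);
+```
+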
-### **GetLogits()**
-
-Token logits obtained from the last call to llama_eval()
- The logits for the last token are stored in the last row
- Can be mutated in order to change the probabilities of the next token.
- Rows: n_tokens
- Cols: n_vocab
+### **TokenToSpan(LLamaToken, Span<Byte>)**
-```csharp
-public Span GetLogits()
-```
-
-#### Returns
-
-[Span<Single>](https://docs.microsoft.com/en-us/dotnet/api/system.span-1)
-
-### **TokenToString(Int32, Encoding)**
-
-Convert a token into a string
+Convert a single llama token into bytes
```csharp
-public string TokenToString(int token, Encoding encoding)
+public uint TokenToSpan(LLamaToken token, Span<byte> dest)
```
#### Parameters
-`token` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-Token to decode into a string
+`token` [LLamaToken](./llama.native.llamatoken.md)
+Token to decode
-`encoding` [Encoding](https://docs.microsoft.com/en-us/dotnet/api/system.text.encoding)
+`dest` [Span<Byte>](https://docs.microsoft.com/en-us/dotnet/api/system.span-1)
+A span to attempt to write into. If this is too small nothing will be written
#### Returns
-[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-
-### **TokenToString(Int32, Encoding, StringBuilder)**
-
-Append a single llama token to a string builder
-
-```csharp
-public void TokenToString(int token, Encoding encoding, StringBuilder dest)
-```
-
-#### Parameters
-
-`token` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-Token to decode
-
-`encoding` [Encoding](https://docs.microsoft.com/en-us/dotnet/api/system.text.encoding)
+[UInt32](https://docs.microsoft.com/en-us/dotnet/api/system.uint32)
+The size of this token. **nothing will be written** if this is larger than `dest`
-`dest` [StringBuilder](https://docs.microsoft.com/en-us/dotnet/api/system.text.stringbuilder)
-string builder to append the result to
+### **Decode(LLamaBatch)**
-### **TokenToSpan(Int32, Span<Byte>)**
-Convert a single llama token into bytes
```csharp
-public int TokenToSpan(int token, Span dest)
+public DecodeResult Decode(LLamaBatch batch)
```
#### Parameters
-`token` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-Token to decode
-
-`dest` [Span<Byte>](https://docs.microsoft.com/en-us/dotnet/api/system.span-1)
-A span to attempt to write into. If this is too small nothing will be written
+`batch` [LLamaBatch](./llama.native.llamabatch.md)
#### Returns
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-The size of this token. **nothing will be written** if this is larger than `dest`
+[DecodeResult](./llama.native.decoderesult.md)
+Positive return values do not mean a fatal error, but rather a warning:
+ - 0: success
+ - 1: could not find a KV slot for the batch (try reducing the size of the batch or increasing the context)
+ - < 0: error
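+
+A sketch of checking the result after decoding a batch (the `DecodeResult.Ok` member name is an assumption):
+
+```csharp
+using System;
+using LLama.Native;
+
+DecodeResult result = ctx.Decode(batch);
+if (result != DecodeResult.Ok)
+{
+    // Non-zero results are warnings (e.g. no free KV slot) or errors.
+    throw new InvalidOperationException($"llama_decode failed: {result}");
+}
+```
+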
-### **Eval(ReadOnlySpan<Int32>, Int32, Int32)**
+### **Decode(List<LLamaToken>, LLamaSeqId, LLamaBatch, Int32&)**
-Run the llama inference to obtain the logits and probabilities for the next token.
+Decode a set of tokens in batch-size chunks.
```csharp
-public bool Eval(ReadOnlySpan tokens, int n_past, int n_threads)
+internal ValueTuple<DecodeResult, int> Decode(List<LLamaToken> tokens, LLamaSeqId id, LLamaBatch batch, Int32& n_past)
```
#### Parameters
-`tokens` [ReadOnlySpan<Int32>](https://docs.microsoft.com/en-us/dotnet/api/system.readonlyspan-1)
-The provided batch of new tokens to process
+`tokens` [List<LLamaToken>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.list-1)
-`n_past` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-the number of tokens to use from previous eval calls
+`id` [LLamaSeqId](./llama.native.llamaseqid.md)
-`n_threads` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+`batch` [LLamaBatch](./llama.native.llamabatch.md)
+
+`n_past` [Int32&](https://docs.microsoft.com/en-us/dotnet/api/system.int32&)
#### Returns
-[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-Returns true on success
+[ValueTuple<DecodeResult, Int32>](https://docs.microsoft.com/en-us/dotnet/api/system.valuetuple-2)
+A tuple containing the decode result and the number of tokens that have not been decoded yet.
### **GetStateSize()**
@@ -372,3 +360,169 @@ The pointer to read the state from
[UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
Number of bytes read from the src pointer
+
+### **SetSeed(UInt32)**
+
+Set the RNG seed
+
+```csharp
+public void SetSeed(uint seed)
+```
+
+#### Parameters
+
+`seed` [UInt32](https://docs.microsoft.com/en-us/dotnet/api/system.uint32)
+
+### **SetThreads(UInt32, UInt32)**
+
+Set the number of threads used for decoding
+
+```csharp
+public void SetThreads(uint threads, uint threadsBatch)
+```
+
+#### Parameters
+
+`threads` [UInt32](https://docs.microsoft.com/en-us/dotnet/api/system.uint32)
+n_threads is the number of threads used for generation (single token)
+
+`threadsBatch` [UInt32](https://docs.microsoft.com/en-us/dotnet/api/system.uint32)
+n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
+
+### **KvCacheGetDebugView(Int32)**
+
+Get a new KV cache view that can be used to debug the KV cache
+
+```csharp
+public LLamaKvCacheViewSafeHandle KvCacheGetDebugView(int maxSequences)
+```
+
+#### Parameters
+
+`maxSequences` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+#### Returns
+
+[LLamaKvCacheViewSafeHandle](./llama.native.llamakvcacheviewsafehandle.md)
+
+### **KvCacheCountCells()**
+
+Count the number of used cells in the KV cache (i.e. cells that have at least one sequence assigned to them)
+
+```csharp
+public int KvCacheCountCells()
+```
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **KvCacheCountTokens()**
+
+Returns the number of tokens in the KV cache (slow, use only for debug)
+ If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+
+```csharp
+public int KvCacheCountTokens()
+```
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **KvCacheClear()**
+
+Clear the KV cache
+
+```csharp
+public void KvCacheClear()
+```
+
+### **KvCacheRemove(LLamaSeqId, LLamaPos, LLamaPos)**
+
+Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+
+```csharp
+public void KvCacheRemove(LLamaSeqId seq, LLamaPos p0, LLamaPos p1)
+```
+
+#### Parameters
+
+`seq` [LLamaSeqId](./llama.native.llamaseqid.md)
+
+`p0` [LLamaPos](./llama.native.llamapos.md)
+
+`p1` [LLamaPos](./llama.native.llamapos.md)
+
+### **KvCacheSequenceCopy(LLamaSeqId, LLamaSeqId, LLamaPos, LLamaPos)**
+
+Copy all tokens that belong to the specified sequence to another sequence. Note that
+ this does not allocate extra KV cache memory - it simply assigns the tokens to the
+ new sequence
+
+```csharp
+public void KvCacheSequenceCopy(LLamaSeqId src, LLamaSeqId dest, LLamaPos p0, LLamaPos p1)
+```
+
+#### Parameters
+
+`src` [LLamaSeqId](./llama.native.llamaseqid.md)
+
+`dest` [LLamaSeqId](./llama.native.llamaseqid.md)
+
+`p0` [LLamaPos](./llama.native.llamapos.md)
+
+`p1` [LLamaPos](./llama.native.llamapos.md)
+
+### **KvCacheSequenceKeep(LLamaSeqId)**
+
+Removes all tokens that do not belong to the specified sequence
+
+```csharp
+public void KvCacheSequenceKeep(LLamaSeqId seq)
+```
+
+#### Parameters
+
+`seq` [LLamaSeqId](./llama.native.llamaseqid.md)
+
+### **KvCacheSequenceAdd(LLamaSeqId, LLamaPos, LLamaPos, Int32)**
+
+Adds relative position "delta" to all tokens that belong to the specified sequence
+ and have positions in [p0, p1). If the KV cache is RoPEd, the KV data is updated
+ accordingly
+
+```csharp
+public void KvCacheSequenceAdd(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int delta)
+```
+
+#### Parameters
+
+`seq` [LLamaSeqId](./llama.native.llamaseqid.md)
+
+`p0` [LLamaPos](./llama.native.llamapos.md)
+
+`p1` [LLamaPos](./llama.native.llamapos.md)
+
+`delta` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **KvCacheSequenceDivide(LLamaSeqId, LLamaPos, LLamaPos, Int32)**
+
+Integer division of the positions by a factor of `d > 1`.
+ If the KV cache is RoPEd, the KV data is updated accordingly.
+ p0 < 0 : [0, p1]
+ p1 < 0 : [p0, inf)
+
+```csharp
+public void KvCacheSequenceDivide(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int divisor)
+```
+
+#### Parameters
+
+`seq` [LLamaSeqId](./llama.native.llamaseqid.md)
+
+`p0` [LLamaPos](./llama.native.llamapos.md)
+
+`p1` [LLamaPos](./llama.native.llamapos.md)
+
+`divisor` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
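+
+A small sketch combining the KV cache helpers above to inspect and reset the cache of a context handle `ctx`:
+
+```csharp
+using System;
+using LLama.Native;
+
+Console.WriteLine($"used cells:  {ctx.KvCacheCountCells()}");
+Console.WriteLine($"used tokens: {ctx.KvCacheCountTokens()}");
+
+// Wipe the cache before starting an unrelated conversation.
+ctx.KvCacheClear();
+```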
diff --git a/docs/xmldocs/llama.native.safellamagrammarhandle.md b/docs/xmldocs/llama.native.safellamagrammarhandle.md
index 653f0a36..0a08687d 100644
--- a/docs/xmldocs/llama.native.safellamagrammarhandle.md
+++ b/docs/xmldocs/llama.native.safellamagrammarhandle.md
@@ -95,3 +95,29 @@ index of the start rule of the grammar
#### Exceptions
[RuntimeError](./llama.exceptions.runtimeerror.md)
+
+### **Clone()**
+
+Create a copy of this grammar instance
+
+```csharp
+public SafeLLamaGrammarHandle Clone()
+```
+
+#### Returns
+
+[SafeLLamaGrammarHandle](./llama.native.safellamagrammarhandle.md)
+
+### **AcceptToken(SafeLLamaContextHandle, LLamaToken)**
+
+Accepts the sampled token into the grammar
+
+```csharp
+public void AcceptToken(SafeLLamaContextHandle ctx, LLamaToken token)
+```
+
+#### Parameters
+
+`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+`token` [LLamaToken](./llama.native.llamatoken.md)
diff --git a/docs/xmldocs/llama.native.safellamamodelhandle.md b/docs/xmldocs/llama.native.safellamamodelhandle.md
index 831ab0c4..e6dd6e64 100644
--- a/docs/xmldocs/llama.native.safellamamodelhandle.md
+++ b/docs/xmldocs/llama.native.safellamamodelhandle.md
@@ -37,6 +37,18 @@ public int ContextSize { get; }
[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+### **RopeFrequency**
+
+Get the rope frequency this model was trained with
+
+```csharp
+public float RopeFrequency { get; }
+```
+
+#### Property Value
+
+[Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
### **EmbeddingSize**
Dimension of embedding vectors
@@ -49,6 +61,54 @@ public int EmbeddingSize { get; }
[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+### **SizeInBytes**
+
+Get the size of this model in bytes
+
+```csharp
+public ulong SizeInBytes { get; }
+```
+
+#### Property Value
+
+[UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
+
+### **ParameterCount**
+
+Get the number of parameters in this model
+
+```csharp
+public ulong ParameterCount { get; }
+```
+
+#### Property Value
+
+[UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
+
+### **Description**
+
+Get a description of this model
+
+```csharp
+public string Description { get; }
+```
+
+#### Property Value
+
+[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+### **MetadataCount**
+
+Get the number of metadata key/value pairs
+
+```csharp
+public int MetadataCount { get; }
+```
+
+#### Property Value
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
### **IsInvalid**
```csharp
@@ -69,6 +129,14 @@ public bool IsClosed { get; }
[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+## Constructors
+
+### **SafeLlamaModelHandle()**
+
+```csharp
+public SafeLlamaModelHandle()
+```
+
## Methods
### **ReleaseHandle()**
@@ -81,19 +149,19 @@ protected bool ReleaseHandle()
[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
-### **LoadFromFile(String, LLamaContextParams)**
+### **LoadFromFile(String, LLamaModelParams)**
Load a model from the given file path into memory
```csharp
-public static SafeLlamaModelHandle LoadFromFile(string modelPath, LLamaContextParams lparams)
+public static SafeLlamaModelHandle LoadFromFile(string modelPath, LLamaModelParams lparams)
```
#### Parameters
`modelPath` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
-`lparams` [LLamaContextParams](./llama.native.llamacontextparams.md)
+`lparams` [LLamaModelParams](./llama.native.llamamodelparams.md)
#### Returns
@@ -103,39 +171,93 @@ public static SafeLlamaModelHandle LoadFromFile(string modelPath, LLamaContextPa
[RuntimeError](./llama.exceptions.runtimeerror.md)
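+
+A sketch of loading weights with the default native parameters and creating a context from them (the path is a placeholder, and it is assumed the `llama_*_default_params` functions are static members of `NativeApi`):
+
+```csharp
+using LLama.Native;
+
+LLamaModelParams modelParams = NativeApi.llama_model_default_params();
+SafeLlamaModelHandle model = SafeLlamaModelHandle.LoadFromFile("model.gguf", modelParams);
+
+LLamaContextParams ctxParams = NativeApi.llama_context_default_params();
+SafeLLamaContextHandle ctx = model.CreateContext(ctxParams);
+```
+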
-### **ApplyLoraFromFile(String, String, Int32)**
+### **llama_model_apply_lora_from_file(SafeLlamaModelHandle, String, Single, String, Int32)**
+
+Apply a LoRA adapter to a loaded model
+ path_base_model is the path to a higher quality model to use as a base for
+ the layers modified by the adapter. Can be NULL to use the current loaded model.
+ The model needs to be reloaded before applying a new adapter, otherwise the adapter
+ will be applied on top of the previous one
+
+```csharp
+public static int llama_model_apply_lora_from_file(SafeLlamaModelHandle model_ptr, string path_lora, float scale, string path_base_model, int n_threads)
+```
+
+#### Parameters
+
+`model_ptr` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
+
+`path_lora` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`scale` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
+`path_base_model` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`n_threads` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+Returns 0 on success
+
+### **llama_model_meta_val_str(SafeLlamaModelHandle, Byte*, Byte*, Int64)**
+
+Get metadata value as a string by key name
+
+```csharp
+public static int llama_model_meta_val_str(SafeLlamaModelHandle model, Byte* key, Byte* buf, long buf_size)
+```
+
+#### Parameters
+
+`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
+
+`key` [Byte*](https://docs.microsoft.com/en-us/dotnet/api/system.byte*)
+
+`buf` [Byte*](https://docs.microsoft.com/en-us/dotnet/api/system.byte*)
+
+`buf_size` [Int64](https://docs.microsoft.com/en-us/dotnet/api/system.int64)
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+The length of the string on success, or -1 on failure
+
+### **ApplyLoraFromFile(String, Single, String, Nullable<Int32>)**
Apply a LoRA adapter to a loaded model
```csharp
-public void ApplyLoraFromFile(string lora, string modelBase, int threads)
+public void ApplyLoraFromFile(string lora, float scale, string modelBase, Nullable<int> threads)
```
#### Parameters
`lora` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+`scale` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
+
`modelBase` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
A path to a higher quality model to use as a base for the layers modified by the
adapter. Can be NULL to use the current loaded model.
-`threads` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+`threads` [Nullable<Int32>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
#### Exceptions
[RuntimeError](./llama.exceptions.runtimeerror.md)
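+
+For example (a sketch with a placeholder adapter path), applying an adapter on top of freshly loaded weights held in `model`:
+
+```csharp
+// Passing null for modelBase uses the currently loaded model as the base;
+// passing null for threads lets the library pick a thread count.
+model.ApplyLoraFromFile("adapter.bin", scale: 1.0f, modelBase: null, threads: null);
+```
+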
-### **TokenToSpan(Int32, Span<Byte>)**
+### **TokenToSpan(LLamaToken, Span<Byte>)**
Convert a single llama token into bytes
```csharp
-public int TokenToSpan(int llama_token, Span dest)
+public uint TokenToSpan(LLamaToken token, Span<byte> dest)
```
#### Parameters
-`llama_token` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+`token` [LLamaToken](./llama.native.llamatoken.md)
Token to decode
`dest` [Span<Byte>](https://docs.microsoft.com/en-us/dotnet/api/system.span-1)
@@ -143,78 +265,159 @@ A span to attempt to write into. If this is too small nothing will be written
#### Returns
-[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+[UInt32](https://docs.microsoft.com/en-us/dotnet/api/system.uint32)
The size of this token. **nothing will be written** if this is larger than `dest`
-### **TokenToString(Int32, Encoding)**
+### **TokensToSpan(IReadOnlyList<LLamaToken>, Span<Char>, Encoding)**
-Convert a single llama token into a string
+#### Caution
+
+Use a StreamingTokenDecoder instead
+
+---
+
+Convert a sequence of tokens into characters.
```csharp
-public string TokenToString(int llama_token, Encoding encoding)
+internal Span<char> TokensToSpan(IReadOnlyList<LLamaToken> tokens, Span<char> dest, Encoding encoding)
```
#### Parameters
-`llama_token` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+`tokens` [IReadOnlyList<LLamaToken>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ireadonlylist-1)
+
+`dest` [Span<Char>](https://docs.microsoft.com/en-us/dotnet/api/system.span-1)
`encoding` [Encoding](https://docs.microsoft.com/en-us/dotnet/api/system.text.encoding)
-Encoding to use to decode the bytes into a string
#### Returns
-[String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+[Span<Char>](https://docs.microsoft.com/en-us/dotnet/api/system.span-1)
+The section of the span which has valid data in it.
+ If there was insufficient space in the output span this will be
+ filled with as many characters as possible, starting from the _last_ token.
-### **TokenToString(Int32, Encoding, StringBuilder)**
+### **Tokenize(String, Boolean, Boolean, Encoding)**
-Append a single llama token to a string builder
+Convert a string of text into tokens
```csharp
-public void TokenToString(int llama_token, Encoding encoding, StringBuilder dest)
+public LLamaToken[] Tokenize(string text, bool add_bos, bool special, Encoding encoding)
```
#### Parameters
-`llama_token` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
-Token to decode
+`text` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+
+`add_bos` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+`special` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
`encoding` [Encoding](https://docs.microsoft.com/en-us/dotnet/api/system.text.encoding)
-`dest` [StringBuilder](https://docs.microsoft.com/en-us/dotnet/api/system.text.stringbuilder)
-string builder to append the result to
+#### Returns
-### **Tokenize(String, Boolean, Encoding)**
+[LLamaToken[]](./llama.native.llamatoken.md)
-Convert a string of text into tokens
+### **CreateContext(LLamaContextParams)**
+
+Create a new context for this model
```csharp
-public Int32[] Tokenize(string text, bool add_bos, Encoding encoding)
+public SafeLLamaContextHandle CreateContext(LLamaContextParams params)
```
#### Parameters
-`text` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+`params` [LLamaContextParams](./llama.native.llamacontextparams.md)
-`add_bos` [Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+#### Returns
-`encoding` [Encoding](https://docs.microsoft.com/en-us/dotnet/api/system.text.encoding)
+[SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+
+### **MetadataKeyByIndex(Int32)**
+
+Get the metadata key for the given index
+
+```csharp
+public Nullable<Memory<byte>> MetadataKeyByIndex(int index)
+```
+
+#### Parameters
+
+`index` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+The index to get
#### Returns
-[Int32[]](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+[Nullable<Memory<Byte>>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
+The key, null if there is no such key or if the buffer was too small
-### **CreateContext(LLamaContextParams)**
+### **MetadataValueByIndex(Int32)**
-Create a new context for this model
+Get the metadata value for the given index
```csharp
-public SafeLLamaContextHandle CreateContext(LLamaContextParams params)
+public Nullable<Memory<byte>> MetadataValueByIndex(int index)
```
#### Parameters
-`params` [LLamaContextParams](./llama.native.llamacontextparams.md)
+`index` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+The index to get
#### Returns
-[SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
+[Nullable<Memory<Byte>>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
+The value, null if there is no such value or if the buffer was too small
+
+### **ReadMetadata()**
+
+```csharp
+internal IReadOnlyDictionary<string, string> ReadMetadata()
+```
+
+#### Returns
+
+[IReadOnlyDictionary<String, String>](https://docs.microsoft.com/en-us/dotnet/api/system.collections.generic.ireadonlydictionary-2)
+
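+A sketch of dumping all metadata using the public accessors documented above (`MetadataCount`, `MetadataKeyByIndex`, `MetadataValueByIndex`), assuming `model` is a valid handle and the metadata is UTF-8 encoded:
+
+```csharp
+using System;
+using System.Text;
+using LLama.Native;
+
+for (var i = 0; i < model.MetadataCount; i++)
+{
+    var key = model.MetadataKeyByIndex(i);
+    var value = model.MetadataValueByIndex(i);
+    if (key.HasValue && value.HasValue)
+        Console.WriteLine($"{Encoding.UTF8.GetString(key.Value.Span)} = {Encoding.UTF8.GetString(value.Value.Span)}");
+}
+```
+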
+### **<llama_model_meta_key_by_index>g__llama_model_meta_key_by_index_native|23_0(SafeLlamaModelHandle, Int32, Byte*, Int64)**
+
+```csharp
+internal static int g__llama_model_meta_key_by_index_native|23_0(SafeLlamaModelHandle model, int index, Byte* buf, long buf_size)
+```
+
+#### Parameters
+
+`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
+
+`index` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`buf` [Byte*](https://docs.microsoft.com/en-us/dotnet/api/system.byte*)
+
+`buf_size` [Int64](https://docs.microsoft.com/en-us/dotnet/api/system.int64)
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+### **<llama_model_meta_val_str_by_index>g__llama_model_meta_val_str_by_index_native|24_0(SafeLlamaModelHandle, Int32, Byte*, Int64)**
+
+```csharp
+internal static int g__llama_model_meta_val_str_by_index_native|24_0(SafeLlamaModelHandle model, int index, Byte* buf, long buf_size)
+```
+
+#### Parameters
+
+`model` [SafeLlamaModelHandle](./llama.native.safellamamodelhandle.md)
+
+`index` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+
+`buf` [Byte*](https://docs.microsoft.com/en-us/dotnet/api/system.byte*)
+
+`buf_size` [Int64](https://docs.microsoft.com/en-us/dotnet/api/system.int64)
+
+#### Returns
+
+[Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
diff --git a/docs/xmldocs/llama.native.safellavaimageembedhandle.md b/docs/xmldocs/llama.native.safellavaimageembedhandle.md
new file mode 100644
index 00000000..741c5acf
--- /dev/null
+++ b/docs/xmldocs/llama.native.safellavaimageembedhandle.md
@@ -0,0 +1,94 @@
+# SafeLlavaImageEmbedHandle
+
+Namespace: LLama.Native
+
+A Reference to a llava Image Embed handle
+
+```csharp
+public sealed class SafeLlavaImageEmbedHandle : SafeLLamaHandleBase, System.IDisposable
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [CriticalFinalizerObject](https://docs.microsoft.com/en-us/dotnet/api/system.runtime.constrainedexecution.criticalfinalizerobject) → [SafeHandle](https://docs.microsoft.com/en-us/dotnet/api/system.runtime.interopservices.safehandle) → [SafeLLamaHandleBase](./llama.native.safellamahandlebase.md) → [SafeLlavaImageEmbedHandle](./llama.native.safellavaimageembedhandle.md)
+Implements [IDisposable](https://docs.microsoft.com/en-us/dotnet/api/system.idisposable)
+
+## Properties
+
+### **IsInvalid**
+
+```csharp
+public bool IsInvalid { get; }
+```
+
+#### Property Value
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **IsClosed**
+
+```csharp
+public bool IsClosed { get; }
+```
+
+#### Property Value
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+## Methods
+
+### **CreateFromFileName(SafeLlavaModelHandle, LLamaContext, String)**
+
+Create an image embed from an image file
+
+```csharp
+public static SafeLlavaImageEmbedHandle CreateFromFileName(SafeLlavaModelHandle ctxLlava, LLamaContext ctxLlama, string image)
+```
+
+#### Parameters
+
+`ctxLlava` [SafeLlavaModelHandle](./llama.native.safellavamodelhandle.md)
+
+`ctxLlama` [LLamaContext](./llama.llamacontext.md)
+
+`image` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+Path to the image file. Supported formats:
+ JPG, PNG, BMP, TGA
+
+#### Returns
+
+[SafeLlavaImageEmbedHandle](./llama.native.safellavaimageembedhandle.md)
+
+#### Exceptions
+
+[InvalidOperationException](https://docs.microsoft.com/en-us/dotnet/api/system.invalidoperationexception)
+
+### **CreateFromMemory(SafeLlavaModelHandle, LLamaContext, Byte[])**
+
+Create an image embed from the bytes of an image.
+
+```csharp
+public static SafeLlavaImageEmbedHandle CreateFromMemory(SafeLlavaModelHandle ctxLlava, LLamaContext ctxLlama, Byte[] image)
+```
+
+#### Parameters
+
+`ctxLlava` [SafeLlavaModelHandle](./llama.native.safellavamodelhandle.md)
+
+`ctxLlama` [LLamaContext](./llama.llamacontext.md)
+
+`image` [Byte[]](https://docs.microsoft.com/en-us/dotnet/api/system.byte)
+Image bytes. Supported formats:
+ JPG, PNG, BMP, TGA
+
+#### Returns
+
+[SafeLlavaImageEmbedHandle](./llama.native.safellavaimageembedhandle.md)
+
+### **ReleaseHandle()**
+
+```csharp
+protected bool ReleaseHandle()
+```
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
diff --git a/docs/xmldocs/llama.native.safellavamodelhandle.md b/docs/xmldocs/llama.native.safellavamodelhandle.md
new file mode 100644
index 00000000..4f8179f3
--- /dev/null
+++ b/docs/xmldocs/llama.native.safellavamodelhandle.md
@@ -0,0 +1,138 @@
+# SafeLlavaModelHandle
+
+Namespace: LLama.Native
+
+A reference to a set of llava model weights.
+
+```csharp
+public sealed class SafeLlavaModelHandle : SafeLLamaHandleBase, System.IDisposable
+```
+
+Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [CriticalFinalizerObject](https://docs.microsoft.com/en-us/dotnet/api/system.runtime.constrainedexecution.criticalfinalizerobject) → [SafeHandle](https://docs.microsoft.com/en-us/dotnet/api/system.runtime.interopservices.safehandle) → [SafeLLamaHandleBase](./llama.native.safellamahandlebase.md) → [SafeLlavaModelHandle](./llama.native.safellavamodelhandle.md)
+Implements [IDisposable](https://docs.microsoft.com/en-us/dotnet/api/system.idisposable)
+
+## Properties
+
+### **IsInvalid**
+
+```csharp
+public bool IsInvalid { get; }
+```
+
+#### Property Value
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **IsClosed**
+
+```csharp
+public bool IsClosed { get; }
+```
+
+#### Property Value
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+## Methods
+
+### **ReleaseHandle()**
+
+```csharp
+protected bool ReleaseHandle()
+```
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+
+### **LoadFromFile(String, Int32)**
+
+Load a model from the given file path into memory
+
+```csharp
+public static SafeLlavaModelHandle LoadFromFile(string modelPath, int verbosity)
+```
+
+#### Parameters
+
+`modelPath` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+MMP File (Multi-Modal Projections)
+
+`verbosity` [Int32](https://docs.microsoft.com/en-us/dotnet/api/system.int32)
+Verbosity level
+
+#### Returns
+
+[SafeLlavaModelHandle](./llama.native.safellavamodelhandle.md)
+SafeHandle of the Clip Model
+
+#### Exceptions
+
+[InvalidOperationException](https://docs.microsoft.com/en-us/dotnet/api/system.invalidoperationexception)
+
+[RuntimeError](./llama.exceptions.runtimeerror.md)
+
+### **CreateImageEmbeddings(LLamaContext, String)**
+
+Create the Image Embeddings.
+
+```csharp
+public SafeLlavaImageEmbedHandle CreateImageEmbeddings(LLamaContext ctxLlama, string image)
+```
+
+#### Parameters
+
+`ctxLlama` [LLamaContext](./llama.llamacontext.md)
+LLama Context
+
+`image` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+Image filename (it supports jpeg format only)
+
+#### Returns
+
+[SafeLlavaImageEmbedHandle](./llama.native.safellavaimageembedhandle.md)
+return the SafeHandle of these embeddings
+
+### **CreateImageEmbeddings(LLamaContext, Byte[])**
+
+Create the Image Embeddings.
+
+```csharp
+public SafeLlavaImageEmbedHandle CreateImageEmbeddings(LLamaContext ctxLlama, Byte[] image)
+```
+
+#### Parameters
+
+`ctxLlama` [LLamaContext](./llama.llamacontext.md)
+LLama Context
+
+`image` [Byte[]](https://docs.microsoft.com/en-us/dotnet/api/system.byte)
+Image in binary format (it supports jpeg format only)
+
+#### Returns
+
+[SafeLlavaImageEmbedHandle](./llama.native.safellavaimageembedhandle.md)
+return the SafeHandle of these embeddings
+
+### **EvalImageEmbed(LLamaContext, SafeLlavaImageEmbedHandle, Int32&)**
+
+Evaluates the image embeddings.
+
+```csharp
+public bool EvalImageEmbed(LLamaContext ctxLlama, SafeLlavaImageEmbedHandle imageEmbed, Int32& n_past)
+```
+
+#### Parameters
+
+`ctxLlama` [LLamaContext](./llama.llamacontext.md)
+Llama Context
+
+`imageEmbed` [SafeLlavaImageEmbedHandle](./llama.native.safellavaimageembedhandle.md)
+The current embeddings to evaluate
+
+`n_past` [Int32&](https://docs.microsoft.com/en-us/dotnet/api/system.int32&)
+
+#### Returns
+
+[Boolean](https://docs.microsoft.com/en-us/dotnet/api/system.boolean)
+True on success
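+
+A hedged end-to-end sketch: load the projection model, embed an image, then evaluate it against an existing [LLamaContext](./llama.llamacontext.md) (`context`, the file paths and the starting `n_past` value are placeholders):
+
+```csharp
+using LLama.Native;
+
+SafeLlavaModelHandle clip = SafeLlavaModelHandle.LoadFromFile("mmproj.gguf", verbosity: 1);
+
+SafeLlavaImageEmbedHandle embed = clip.CreateImageEmbeddings(context, "photo.jpg");
+
+int n_past = 0;
+bool ok = clip.EvalImageEmbed(context, embed, ref n_past);
+```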
diff --git a/docs/xmldocs/llama.native.samplingapi.md b/docs/xmldocs/llama.native.samplingapi.md
deleted file mode 100644
index db074c67..00000000
--- a/docs/xmldocs/llama.native.samplingapi.md
+++ /dev/null
@@ -1,338 +0,0 @@
-# SamplingApi
-
-Namespace: LLama.Native
-
-Direct translation of the llama.cpp sampling API
-
-```csharp
-public class SamplingApi
-```
-
-Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) β [SamplingApi](./llama.native.samplingapi.md)
-
-## Constructors
-
-### **SamplingApi()**
-
-```csharp
-public SamplingApi()
-```
-
-## Methods
-
-### **llama_sample_grammar(SafeLLamaContextHandle, LLamaTokenDataArray, SafeLLamaGrammarHandle)**
-
-Apply grammar rules to candidate tokens
-
-```csharp
-public static void llama_sample_grammar(SafeLLamaContextHandle ctx, LLamaTokenDataArray candidates, SafeLLamaGrammarHandle grammar)
-```
-
-#### Parameters
-
-`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-
-`candidates` [LLamaTokenDataArray](./llama.native.llamatokendataarray.md)
-
-`grammar` [SafeLLamaGrammarHandle](./llama.native.safellamagrammarhandle.md)
-
-### **llama_sample_repetition_penalty(SafeLLamaContextHandle, LLamaTokenDataArray, Memory<Int32>, UInt64, Single)**
-
-#### Caution
-
-last_tokens_size parameter is no longer needed
-
----
-
-Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-
-```csharp
-public static void llama_sample_repetition_penalty(SafeLLamaContextHandle ctx, LLamaTokenDataArray candidates, Memory last_tokens, ulong last_tokens_size, float penalty)
-```
-
-#### Parameters
-
-`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)
-
-`candidates` [LLamaTokenDataArray](./llama.native.llamatokendataarray.md)
-Pointer to LLamaTokenDataArray
-
-`last_tokens` [Memory<Int32>](https://docs.microsoft.com/en-us/dotnet/api/system.memory-1)
-
-`last_tokens_size` [UInt64](https://docs.microsoft.com/en-us/dotnet/api/system.uint64)
-
-`penalty` [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single)
-
-### **llama_sample_repetition_penalty(SafeLLamaContextHandle, LLamaTokenDataArray, Memory<Int32>, Single)**
-
-Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-
-```csharp
-public static void llama_sample_repetition_penalty(SafeLLamaContextHandle ctx, LLamaTokenDataArray candidates, Memory last_tokens, float penalty)
-```
-
-#### Parameters
-
-`ctx` [SafeLLamaContextHandle](./llama.native.safellamacontexthandle.md)