# BatchedExecutor Fork - Generate Multiple Completions With Shared Memory

```cs
using LLama.Batched;
using LLama.Common;
using LLama.Native;
using LLama.Sampling;
using Spectre.Console;

namespace LLama.Examples.Examples;

/// <summary>
/// This demonstrates generating multiple replies to the same prompt, with a shared cache
/// </summary>
public class BatchedExecutorFork
{
    private const int n_split = 16;
    private const int n_len = 72;

    public static async Task Run()
    {
        string modelPath = UserSettings.GetModelPath();

        var parameters = new ModelParams(modelPath);
        using var model = LLamaWeights.LoadFromFile(parameters);

        var prompt = AnsiConsole.Ask("Prompt (or ENTER for default):", "Not many people know that");

        // Create an executor that can evaluate a batch of conversations together
        using var executor = new BatchedExecutor(model, parameters);

        // Print some info
        var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
        Console.WriteLine($"Created executor with model: {name}");

        // Evaluate the initial prompt to create one conversation
        using var start = executor.Create();
        start.Prompt(prompt);
        await executor.Infer();

        // Create the root node of the tree
        var root = new Node(start);

        await AnsiConsole
            .Progress()
            .StartAsync(async progress =>
            {
                var reporter = progress.AddTask("Running Inference (1)", maxValue: n_len);

                // Run inference loop
                for (var i = 0; i < n_len; i++)
                {
                    if (i != 0)
                        await executor.Infer();

                    // Occasionally fork all the active conversations
                    if (i != 0 && i % n_split == 0)
                        root.Split();

                    // Sample all active conversations
                    root.Sample();

                    // Update progress bar
                    reporter.Increment(1);
                    reporter.Description($"Running Inference ({root.ActiveConversationCount})");
                }

                // Display results
                var display = new Tree(prompt);
                root.Display(display);
                AnsiConsole.Write(display);
            });
    }

    private class Node
    {
        private readonly StreamingTokenDecoder _decoder;
        private readonly DefaultSamplingPipeline _sampler;

        private Conversation? _conversation;

        private Node? _left;
        private Node? _right;

        public int ActiveConversationCount => _conversation != null ? 1 : _left!.ActiveConversationCount + _right!.ActiveConversationCount;

        public Node(Conversation conversation)
        {
            _sampler = new DefaultSamplingPipeline();
            _conversation = conversation;
            _decoder = new StreamingTokenDecoder(conversation.Executor.Context);
        }

        public void Sample()
        {
            if (_conversation == null)
            {
                _left?.Sample();
                _right?.Sample();
                return;
            }

            if (_conversation.RequiresInference)
                return;

            // Sample one token
            var ctx = _conversation.Executor.Context.NativeHandle;
            var token = _sampler.Sample(ctx, _conversation.Sample(), Array.Empty<LLamaToken>());
            _sampler.Accept(ctx, token);
            _decoder.Add(token);

            // Prompt the conversation with this token, to continue generating from there
            _conversation.Prompt(token);
        }

        public void Split()
        {
            if (_conversation != null)
            {
                _left = new Node(_conversation.Fork());
                _right = new Node(_conversation.Fork());

                _conversation.Dispose();
                _conversation = null;
            }
            else
            {
                _left?.Split();
                _right?.Split();
            }
        }

        public void Display<T>(T tree, int depth = 0)
            where T : IHasTreeNodes
        {
            var colors = new[] { "red", "green", "blue", "yellow", "white" };
            var color = colors[depth % colors.Length];

            var message = Markup.Escape(_decoder.Read().ReplaceLineEndings(""));

            var n = tree.AddNode($"[{color}]{message}[/]");

            _left?.Display(n, depth + 1);
            _right?.Display(n, depth + 1);
        }
    }
}
```

This example demonstrates using the `BatchedExecutor` to split one sequence into multiple sequences ("forking"). See the source code [here](https://github.com/SciSharp/LLamaSharp/blob/master/LLama.Examples/Examples/BatchedExecutorFork.cs).

Sequences share memory up to the point where they were split, meaning no extra memory is consumed by creating a fork. Inference runs for all sequences simultaneously, which means that running two sequences does _not_ take twice as much time as running one.

An example output, starting with the prompt `Not many people know that`:

```
Not many people know that
└── , in the 17th century, a military band led by Captain Charles
    ├── Bossler of Baden, Germany, composed and played a music suite titled
    │   ├── the "Civil Psalm," in order to rally German Protestants during
    │   │   ├── the Thirty Years' War. This tune became popular among German soldiers,
    │   │   │   ├── and its popularity continued long after the war
    │   │   │   └── and, eventually, reached France. The
    │   │   └── the Thirty Years' War.This music, with its clear call
    │   │       ├── to arms and strong Christian themes, helped
    │   │       └── to arms and unwavering belief
    │   └── "Baden's First National Symphony," with lyrics by a young Wol
    │       ├── fgang Amadeus Mozart. The story of the composition's creation
    │       │   ├── has long been forgotten. But the B
    │       │   └── was popularized by a novelty book
    │       └── fgang Amadeus Mozart. It's said that this music brought
    │           ├── peace to Europe, at least for a
    │           └── the troops together during difficult times. It
    └── Newdick played a mournful dirge to accompany the procession of
        ├── the head of King Charles I. It is the scene that opens my latest book
        │   ├── , "Death and Taxes." The book follows a British army captain named
        │   │   ├── Marcus as he seeks revenge for his wife
        │   │   └── William Darnay who becomes involved in
        │   └── , A King, A Pawn and a Prince. The murder of the king
        │       ├── and the civil war that followed are the
        │       └── is a watershed moment in the political
        └── the coffin of William Shakespeare, as it was carried to its final resting place
            ├── . That is the least that can be said for a man who is often regarded
            │   ├── as the greatest writer in the English language
            │   └── as the greatest writer the English language has
            └── at Stratford-upon-Avon. Shakespeare, of course
                ├── , was a famous English poet and play
                └── , was one of the greatest playwright
```

Forked sequences can be used for many possible things. For example:

- Evaluating the system prompt once and forking for each independent conversation (see the sketch below).
- Saving a "checkpoint" in a conversation to return to later.
- Beam search.
- Splitting a conversation, generating completions from several different "agents", and taking the best response.

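For the first of these, a minimal sketch (not part of the repository examples) is shown below. It assumes the same `BatchedExecutor`/`Conversation` API used in the code above; the class name, model path and prompts are placeholders.

```cs
using System.Threading.Tasks;
using LLama;
using LLama.Batched;
using LLama.Common;

public static class SharedSystemPromptSketch
{
    public static async Task Run(string modelPath)
    {
        var parameters = new ModelParams(modelPath);
        using var model = LLamaWeights.LoadFromFile(parameters);
        using var executor = new BatchedExecutor(model, parameters);

        // Evaluate the shared system prompt a single time
        using var root = executor.Create();
        root.Prompt("You are a helpful assistant.");
        await executor.Infer();

        // Each fork shares the system prompt's KV cache, so forking costs no extra memory
        using var chatA = root.Fork();
        using var chatB = root.Fork();

        chatA.Prompt("Question from user A.");
        chatB.Prompt("Question from user B.");

        // A single Infer() call evaluates both conversations together as one batch;
        // from here each conversation can be sampled and continued independently.
        await executor.Infer();
    }
}
```

Because both forks share the already-evaluated system prompt, only the newly added user tokens need to be processed when `Infer()` runs.
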
# BatchedExecutor Guidance - Classifier Free Guidance / Negative Prompting

This example demonstrates using `Classifier Free Guidance` (a.k.a. negative prompting) with a custom sampling pipeline. Negative prompting is a way of steering the model output away from certain topics. See the source code [here](https://github.com/SciSharp/LLamaSharp/blob/master/LLama.Examples/Examples/BatchedExecutorGuidance.cs).

Two conversations are created. The `guided` conversation starts with the prompt to be completed, whose completion is shown as the output, for example `"my favourite colour is"`. The `guidance` conversation starts with the negative prompt, for example `"I hate the colour red. My favourite colour is"`. Note that this is a _negative_ prompt, so this guidance will make the model answer as if it _likes_ the colour red.

```cs
using LLama.Batched;
using LLama.Common;
using LLama.Native;
using LLama.Sampling;
using Spectre.Console;

namespace LLama.Examples.Examples;

/// <summary>
/// This demonstrates using a batch to generate two sequences and then using one
/// sequence as the negative guidance ("classifier free guidance") for the other.
/// </summary>
public class BatchedExecutorGuidance
{
    private const int n_len = 32;

    public static async Task Run()
    {
        string modelPath = UserSettings.GetModelPath();

        var parameters = new ModelParams(modelPath);
        using var model = LLamaWeights.LoadFromFile(parameters);

        var positivePrompt = AnsiConsole.Ask("Positive Prompt (or ENTER for default):", "My favourite colour is").Trim();
        var negativePrompt = AnsiConsole.Ask("Negative Prompt (or ENTER for default):", "I hate the colour red. My favourite colour is").Trim();
        var weight = AnsiConsole.Ask("Guidance Weight (or ENTER for default):", 2.0f);

        // Create an executor that can evaluate a batch of conversations together
        using var executor = new BatchedExecutor(model, parameters);

        // Print some info
        var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
        Console.WriteLine($"Created executor with model: {name}");

        // Load the two prompts into two conversations
        using var guided = executor.Create();
        guided.Prompt(positivePrompt);
        using var guidance = executor.Create();
        guidance.Prompt(negativePrompt);

        // Run inference to evaluate prompts
        await AnsiConsole
            .Status()
            .Spinner(Spinner.Known.Line)
            .StartAsync("Evaluating Prompts...", _ => executor.Infer());

        // Fork the "guided" conversation. We'll run this one without guidance for comparison
        using var unguided = guided.Fork();

        // Run inference loop
        var unguidedSampler = new GuidedSampler(null, weight);
        var unguidedDecoder = new StreamingTokenDecoder(executor.Context);
        var guidedSampler = new GuidedSampler(guidance, weight);
        var guidedDecoder = new StreamingTokenDecoder(executor.Context);

        await AnsiConsole
            .Progress()
            .StartAsync(async progress =>
            {
                var reporter = progress.AddTask("Running Inference", maxValue: n_len);

                for (var i = 0; i < n_len; i++)
                {
                    if (i != 0)
                        await executor.Infer();

                    // Sample from the "unguided" conversation. This is just a conversation using the same prompt, without any
                    // guidance. This serves as a comparison to show the effect of guidance.
                    var u = unguidedSampler.Sample(executor.Context.NativeHandle, unguided.Sample(), Array.Empty<LLamaToken>());
                    unguidedDecoder.Add(u);
                    unguided.Prompt(u);

                    // Sample from the "guided" conversation. This sampler will internally use the "guidance" conversation
                    // to steer the conversation. See how this is done in GuidedSampler.ProcessLogits (bottom of this file).
                    var g = guidedSampler.Sample(executor.Context.NativeHandle, guided.Sample(), Array.Empty<LLamaToken>());
                    guidedDecoder.Add(g);

                    // Use this token to advance both guided _and_ guidance. Keeping them in sync (except for the initial prompt).
                    guided.Prompt(g);
                    guidance.Prompt(g);

                    // Early exit if we reach the natural end of the guided sentence
                    if (g == model.EndOfSentenceToken)
                        break;

                    // Update progress bar
                    reporter.Increment(1);
                }
            });

        AnsiConsole.MarkupLine($"[green]Unguided:[/][white]{unguidedDecoder.Read().ReplaceLineEndings(" ")}[/]");
        AnsiConsole.MarkupLine($"[green]Guided:[/][white]{guidedDecoder.Read().ReplaceLineEndings(" ")}[/]");
    }

    private class GuidedSampler(Conversation? guidance, float weight)
        : BaseSamplingPipeline
    {
        public override void Accept(SafeLLamaContextHandle ctx, LLamaToken token)
        {
        }

        public override ISamplingPipeline Clone()
        {
            throw new NotSupportedException();
        }

        protected override void ProcessLogits(SafeLLamaContextHandle ctx, Span<float> logits, ReadOnlySpan<LLamaToken> lastTokens)
        {
            if (guidance == null)
                return;

            // Get the logits generated by the guidance sequences
            var guidanceLogits = guidance.Sample();

            // Use those logits to guide this sequence
            NativeApi.llama_sample_apply_guidance(ctx, logits, guidanceLogits, weight);
        }

        protected override LLamaToken ProcessTokenDataArray(SafeLLamaContextHandle ctx, LLamaTokenDataArray candidates, ReadOnlySpan<LLamaToken> lastTokens)
        {
            candidates.Temperature(ctx, 0.8f);
            candidates.TopK(ctx, 25);
            return candidates.SampleToken(ctx);
        }
    }
}
```

A custom sampler samples the `guidance` conversation and uses that output to influence the output of the `guided` conversation. Once a token is selected, _both_ conversations are continued with this token.

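For intuition, the guidance step can be thought of as a per-token blend of the two logit vectors. The snippet below is an illustrative, assumed re-implementation of that blend in managed code; the example itself delegates this to `NativeApi.llama_sample_apply_guidance`, whose exact native formula may differ.

```cs
using System;

// Assumed sketch of the classifier-free-guidance blend: push the guided logits
// away from the logits produced by the negative ("guidance") prompt.
static class GuidanceMath
{
    public static void ApplyGuidance(Span<float> guidedLogits, ReadOnlySpan<float> guidanceLogits, float weight)
    {
        for (var i = 0; i < guidedLogits.Length; i++)
        {
            // weight == 1 leaves the guided logits unchanged;
            // weight > 1 amplifies the difference from the negative prompt.
            guidedLogits[i] = guidanceLogits[i] + weight * (guidedLogits[i] - guidanceLogits[i]);
        }
    }
}
```
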
# BatchedExecutor - Rewind

This example demonstrates using the `BatchedExecutor` to generate tokens and then rewind the conversation to an earlier state. See the source code [here](https://github.com/SciSharp/LLamaSharp/blob/master/LLama.Examples/Examples/BatchedExecutorRewind.cs).

```cs
using LLama.Batched;
using LLama.Common;
using LLama.Native;
using LLama.Sampling;
using Spectre.Console;

namespace LLama.Examples.Examples;

/// <summary>
/// This demonstrates generating tokens and then rewinding to an earlier state
/// </summary>
public class BatchedExecutorRewind
{
    private const int n_generate = 24;
    private const int n_rewind = 12;
    private const int n_repeats = 6;

    public static async Task Run()
    {
        string modelPath = UserSettings.GetModelPath();

        var parameters = new ModelParams(modelPath);
        using var model = LLamaWeights.LoadFromFile(parameters);

        var prompt = AnsiConsole.Ask("Prompt (or ENTER for default):", "Not many people know that");

        // Create an executor that can evaluate a batch of conversations together
        using var executor = new BatchedExecutor(model, parameters);

        // Print some info
        var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
        Console.WriteLine($"Created executor with model: {name}");

        // Evaluate the initial prompt to create one conversation
        using var conversation = executor.Create();
        conversation.Prompt(prompt);

        // Create the start node wrapping the conversation
        var node = new Node(executor.Context);

        // Print the prompt
        Console.ForegroundColor = ConsoleColor.Green;
        Console.WriteLine(prompt);

        for (var i = 0; i < n_repeats; i++)
        {
            for (var j = 0; j < n_generate; j++)
            {
                // Run inference
                await executor.Infer();

                // Sample a token
                var token = node.Sample(conversation);

                // Continue conversation with this token
                if (j != n_generate - 1)
                    conversation.Prompt(token);
            }

            // Write out what we generated
            node.Write(n_rewind, i + 1);

            // Rewind back a few tokens
            conversation.Rewind(n_rewind + 1);

            // Prompt with a token
            conversation.Prompt(node.GetToken(n_generate - n_rewind - 1));

            // Create a new node around the rewound conversation
            node = new Node(executor.Context);
        }

        Console.WriteLine("Press any key to exit demo");
        Console.ReadKey(true);
    }

    private class Node
    {
        private readonly LLamaContext _context;
        private readonly List<LLamaToken> _tokens = new List<LLamaToken>();
        private readonly DefaultSamplingPipeline Sampler;

        public Node(LLamaContext context)
        {
            _context = context;
            Sampler = new DefaultSamplingPipeline();
        }

        public LLamaToken Sample(Conversation conversation)
        {
            var token = Sampler.Sample(_context.NativeHandle, conversation.Sample(), Array.Empty<LLamaToken>());
            _tokens.Add(token);
            return token;
        }

        public void Write(int n_rewind, int depth)
        {
            var decoder = new StreamingTokenDecoder(_context);

            for (var i = 0; i < _tokens.Count - n_rewind; i++)
                decoder.Add(_tokens[i]);
            AnsiConsole.MarkupLine($"[green]{new string(' ', depth * 3) + decoder.Read().ReplaceLineEndings(" ")}[/]");

            for (var i = _tokens.Count - n_rewind; i < _tokens.Count; i++)
                decoder.Add(_tokens[i]);
            AnsiConsole.MarkupLine($"[maroon]{decoder.Read().ReplaceLineEndings(" ")}[/]");
        }

        public LLamaToken GetToken(int index)
        {
            return _tokens[index];
        }
    }
}
```

A single conversation is prompted and then continued for 24 tokens; after that it is rewound by 12 tokens and continued from there. Rewinding simply sets the conversation back to an earlier state and requires no extra computation.

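Distilled down, the checkpoint/rewind pattern looks roughly like the sketch below. It is not taken from the repository: it assumes `executor`, `conversation` and `sampler` have already been created and prompted as in the example above, and the token counts are arbitrary.

```cs
// Hypothetical fragment: generate a few tokens, then discard the end of the
// generation and continue from the earlier state.
var tokens = new List<LLamaToken>();

// Generate 8 tokens. The final token is sampled but not prompted, leaving the
// conversation in a state that can be rewound (as in the example above).
for (var i = 0; i < 8; i++)
{
    await executor.Infer();

    var token = sampler.Sample(executor.Context.NativeHandle, conversation.Sample(), Array.Empty<LLamaToken>());
    tokens.Add(token);

    if (i != 7)
        conversation.Prompt(token);
}

// Rewind over the last few tokens. This only moves the end of the conversation
// back to an earlier state; nothing is re-evaluated.
conversation.Rewind(4);

// Re-prompt the token just after the rewound position (as the example does), so
// the next Infer() produces logits there and generation can continue, possibly
// diverging, from that point onwards.
conversation.Prompt(tokens[tokens.Count - 5]);
await executor.Infer();
```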