# Batched executor - multiple outputs from one input
```cs
using LLama.Batched;
using LLama.Common;
using LLama.Native;
using LLama.Sampling;
using Spectre.Console;

namespace LLama.Examples.Examples;

/// <summary>
/// This demonstrates generating multiple replies to the same prompt, with a shared cache
/// </summary>
public class BatchedExecutorFork
{
    private const int n_split = 16;
    private const int n_len = 72;

    public static async Task Run()
    {
        string modelPath = UserSettings.GetModelPath();
        var parameters = new ModelParams(modelPath);
        using var model = LLamaWeights.LoadFromFile(parameters);

        var prompt = AnsiConsole.Ask("Prompt (or ENTER for default):", "Not many people know that");

        // Create an executor that can evaluate a batch of conversations together
        using var executor = new BatchedExecutor(model, parameters);

        // Print some info
        var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
        Console.WriteLine($"Created executor with model: {name}");

        // Evaluate the initial prompt to create one conversation
        using var start = executor.Create();
        start.Prompt(prompt);
        await executor.Infer();

        // Create the root node of the tree
        var root = new Node(start);

        await AnsiConsole
            .Progress()
            .StartAsync(async progress =>
            {
                var reporter = progress.AddTask("Running Inference (1)", maxValue: n_len);

                // Run inference loop
                for (var i = 0; i < n_len; i++)
                {
                    if (i != 0)
                        await executor.Infer();

                    // Occasionally fork all the active conversations
                    if (i != 0 && i % n_split == 0)
                        root.Split();

                    // Sample all active conversations
                    root.Sample();

                    // Update progress bar
                    reporter.Increment(1);
                    reporter.Description($"Running Inference ({root.ActiveConversationCount})");
                }

                // Display results
                var display = new Tree(prompt);
                root.Display(display);
                AnsiConsole.Write(display);
            });
    }

    private class Node
    {
        private readonly StreamingTokenDecoder _decoder;
        private readonly DefaultSamplingPipeline _sampler;

        private Conversation? _conversation;

        private Node? _left;
        private Node? _right;

        public int ActiveConversationCount => _conversation != null ? 1 : _left!.ActiveConversationCount + _right!.ActiveConversationCount;

        public Node(Conversation conversation)
        {
            _sampler = new DefaultSamplingPipeline();
            _conversation = conversation;
            _decoder = new StreamingTokenDecoder(conversation.Executor.Context);
        }

        public void Sample()
        {
            if (_conversation == null)
            {
                _left?.Sample();
                _right?.Sample();
                return;
            }

            if (_conversation.RequiresInference)
                return;

            // Sample one token
            var ctx = _conversation.Executor.Context.NativeHandle;
            var token = _sampler.Sample(ctx, _conversation.Sample(), Array.Empty<LLamaToken>());
            _sampler.Accept(ctx, token);
            _decoder.Add(token);

            // Prompt the conversation with this token, to continue generating from there
            _conversation.Prompt(token);
        }

        public void Split()
        {
            if (_conversation != null)
            {
                _left = new Node(_conversation.Fork());
                _right = new Node(_conversation.Fork());

                _conversation.Dispose();
                _conversation = null;
            }
            else
            {
                _left?.Split();
                _right?.Split();
            }
        }

        public void Display<T>(T tree, int depth = 0)
            where T : IHasTreeNodes
        {
            var colors = new[] { "red", "green", "blue", "yellow", "white" };
            var color = colors[depth % colors.Length];

            var message = Markup.Escape(_decoder.Read().ReplaceLineEndings(""));

            var n = tree.AddNode($"[{color}]{message}[/]");

            _left?.Display(n, depth + 1);
            _right?.Display(n, depth + 1);
        }
    }
}
```
This example demonstrates using the `BatchedExecutor` to split one sequence into multiple sequences, generating several different completions of the same prompt. See the source code [here](https://github.com/SciSharp/LLamaSharp/blob/master/LLama.Examples/Examples/BatchedExecutorFork.cs).

Sequences share memory up to the point they were split, meaning no extra memory is consumed by creating a fork. Inference runs for all sequences simultaneously, so running two sequences does _not_ take twice as much time as running one.
An example output, starting with the prompt `Not many people know that`:
```
Not many people know that
└── , in the 17th century, a military band led by Captain Charles
    ├── Bossler of Baden, Germany, composed and played a music suite titled
    │   ├── the "Civil Psalm," in order to rally German Protestants during
    │   │   ├── the Thirty Years' War. This tune became popular among German soldiers,
    │   │   │   ├── and its popularity continued long after the war
    │   │   │   └── and, eventually, reached France. The
    │   │   └── the Thirty Years' War.This music, with its clear call
    │   │       ├── to arms and strong Christian themes, helped
    │   │       └── to arms and unwavering belief
    │   └── "Baden's First National Symphony," with lyrics by a young Wol
    │       ├── fgang Amadeus Mozart. The story of the composition's creation
    │       │   ├── has long been forgotten. But the B
    │       │   └── was popularized by a novelty book
    │       └── fgang Amadeus Mozart. It's said that this music brought
    │           ├── peace to Europe, at least for a
    │           └── the troops together during difficult times. It
    └── Newdick played a mournful dirge to accompany the procession of
        ├── the head of King Charles I. It is the scene that opens my latest book
        │   ├── , "Death and Taxes." The book follows a British army captain named
        │   │   ├── Marcus as he seeks revenge for his wife
        │   │   └── William Darnay who becomes involved in
        │   └── , A King, A Pawn and a Prince. The murder of the king
        │       ├── and the civil war that followed are the
        │       └── is a watershed moment in the political
        └── the coffin of William Shakespeare, as it was carried to its final resting place
            ├── . That is the least that can be said for a man who is often regarded
            │   ├── as the greatest writer in the English language
            │   └── as the greatest writer the English language has
            └── at Stratford-upon-Avon. Shakespeare, of course
                ├── , was a famous English poet and play
                └── , was one of the greatest playwright
```
Forked sequences can be used in many ways. For example:

- Evaluating the system prompt once and forking for each independent conversation (see the sketch below).
- Saving a "checkpoint" in a conversation to return to later.
- Beam search.
- Splitting a conversation, generating completions from several different "agents", and taking the best response.
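As a minimal sketch of the first idea: evaluate a shared system prompt once, then fork one conversation per user. The prompts, names and wrapper method here are illustrative; only the `BatchedExecutor`, `Create`, `Prompt`, `Fork` and `Infer` calls come from the example above.

```cs
using LLama;
using LLama.Batched;
using LLama.Common;

public static class SharedSystemPromptSketch
{
    public static async Task RunAsync(LLamaWeights model, ModelParams parameters)
    {
        using var executor = new BatchedExecutor(model, parameters);

        // Evaluate the (potentially long) system prompt exactly once
        using var system = executor.Create();
        system.Prompt("You are a helpful assistant.");
        await executor.Infer();

        // Each user conversation forks from the shared prefix, so the system
        // prompt's KV cache is reused rather than re-evaluated.
        using var userA = system.Fork();
        using var userB = system.Fork();
        userA.Prompt("First message from user A");
        userB.Prompt("First message from user B");

        // Both conversations are evaluated together in a single batch
        await executor.Infer();

        // ...sample from userA.Sample() / userB.Sample() as in the example above.
    }
}
```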
# Batched executor - basic guidance
```cs
using LLama.Batched;
using LLama.Common;
using LLama.Native;
using LLama.Sampling;
using Spectre.Console;

namespace LLama.Examples.Examples;

/// <summary>
/// This demonstrates using a batch to generate two sequences and then using one
/// sequence as the negative guidance ("classifier free guidance") for the other.
/// </summary>
public class BatchedExecutorGuidance
{
    private const int n_len = 32;

    public static async Task Run()
    {
        string modelPath = UserSettings.GetModelPath();
        var parameters = new ModelParams(modelPath);
        using var model = LLamaWeights.LoadFromFile(parameters);

        var positivePrompt = AnsiConsole.Ask("Positive Prompt (or ENTER for default):", "My favourite colour is").Trim();
        var negativePrompt = AnsiConsole.Ask("Negative Prompt (or ENTER for default):", "I hate the colour red. My favourite colour is").Trim();
        var weight = AnsiConsole.Ask("Guidance Weight (or ENTER for default):", 2.0f);

        // Create an executor that can evaluate a batch of conversations together
        using var executor = new BatchedExecutor(model, parameters);

        // Print some info
        var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
        Console.WriteLine($"Created executor with model: {name}");

        // Load the two prompts into two conversations
        using var guided = executor.Create();
        guided.Prompt(positivePrompt);
        using var guidance = executor.Create();
        guidance.Prompt(negativePrompt);

        // Run inference to evaluate prompts
        await AnsiConsole
            .Status()
            .Spinner(Spinner.Known.Line)
            .StartAsync("Evaluating Prompts...", _ => executor.Infer());

        // Fork the "guided" conversation. We'll run this one without guidance for comparison
        using var unguided = guided.Fork();

        // Run inference loop
        var unguidedSampler = new GuidedSampler(null, weight);
        var unguidedDecoder = new StreamingTokenDecoder(executor.Context);
        var guidedSampler = new GuidedSampler(guidance, weight);
        var guidedDecoder = new StreamingTokenDecoder(executor.Context);

        await AnsiConsole
            .Progress()
            .StartAsync(async progress =>
            {
                var reporter = progress.AddTask("Running Inference", maxValue: n_len);

                for (var i = 0; i < n_len; i++)
                {
                    if (i != 0)
                        await executor.Infer();

                    // Sample from the "unguided" conversation. This is just a conversation using the same prompt, without any
                    // guidance. This serves as a comparison to show the effect of guidance.
                    var u = unguidedSampler.Sample(executor.Context.NativeHandle, unguided.Sample(), Array.Empty<LLamaToken>());
                    unguidedDecoder.Add(u);
                    unguided.Prompt(u);

                    // Sample from the "guided" conversation. This sampler will internally use the "guidance" conversation
                    // to steer the conversation. See how this is done in GuidedSampler.ProcessLogits (bottom of this file).
                    var g = guidedSampler.Sample(executor.Context.NativeHandle, guided.Sample(), Array.Empty<LLamaToken>());
                    guidedDecoder.Add(g);

                    // Use this token to advance both guided _and_ guidance. Keeping them in sync (except for the initial prompt).
                    guided.Prompt(g);
                    guidance.Prompt(g);

                    // Early exit if we reach the natural end of the guided sentence
                    if (g == model.EndOfSentenceToken)
                        break;

                    // Update progress bar
                    reporter.Increment(1);
                }
            });

        AnsiConsole.MarkupLine($"[green]Unguided:[/][white]{unguidedDecoder.Read().ReplaceLineEndings(" ")}[/]");
        AnsiConsole.MarkupLine($"[green]Guided:[/][white]{guidedDecoder.Read().ReplaceLineEndings(" ")}[/]");
    }

    private class GuidedSampler(Conversation? guidance, float weight)
        : BaseSamplingPipeline
    {
        public override void Accept(SafeLLamaContextHandle ctx, LLamaToken token)
        {
        }

        public override ISamplingPipeline Clone()
        {
            throw new NotSupportedException();
        }

        protected override void ProcessLogits(SafeLLamaContextHandle ctx, Span<float> logits, ReadOnlySpan<LLamaToken> lastTokens)
        {
            if (guidance == null)
                return;

            // Get the logits generated by the guidance sequences
            var guidanceLogits = guidance.Sample();

            // Use those logits to guide this sequence
            NativeApi.llama_sample_apply_guidance(ctx, logits, guidanceLogits, weight);
        }

        protected override LLamaToken ProcessTokenDataArray(SafeLLamaContextHandle ctx, LLamaTokenDataArray candidates, ReadOnlySpan<LLamaToken> lastTokens)
        {
            candidates.Temperature(ctx, 0.8f);
            candidates.TopK(ctx, 25);
            return candidates.SampleToken(ctx);
        }
    }
}
```
This example demonstrates using `Classifier Free Guidance` (a.k.a. negative prompting) with a custom sampling pipeline. Negative prompting is a way of steering the model output away from certain topics. See the source code [here](https://github.com/SciSharp/LLamaSharp/blob/master/LLama.Examples/Examples/BatchedExecutorGuidance.cs).

Two conversations are created. The `guided` conversation starts with the prompt whose completion is shown as the output, for example `"my favourite colour is"`. The `guidance` conversation starts with the negative prompt, for example `"I hate the colour red. My favourite colour is"`. Because it is a _negative_ prompt, the guidance steers the model towards answering as if it _likes_ the colour red.

A custom sampler samples the `guidance` conversation and uses that output to influence the output of the `guided` conversation. Once a token has been selected, _both_ conversations are continued with that token.
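The steering itself happens inside `NativeApi.llama_sample_apply_guidance`, which blends the guided and guidance logits. As a rough illustration only (a simplified sketch of the classifier free guidance blend, not necessarily the library's exact native implementation), the combination can be thought of like this:

```cs
using System;

// Simplified sketch of the logit blend used by classifier free guidance.
// The real work is done natively; this only shows the idea.
public static class GuidanceSketch
{
    public static void Apply(Span<float> logits, ReadOnlySpan<float> guidanceLogits, float weight)
    {
        for (var i = 0; i < logits.Length; i++)
        {
            // Start from the guidance (negative prompt) logits and move 'weight'
            // times the difference towards the guided logits. A weight greater
            // than 1 pushes the output further away from the negative prompt.
            logits[i] = guidanceLogits[i] + weight * (logits[i] - guidanceLogits[i]);
        }
    }
}
```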
# Batched executor - rewinding to an earlier state
```cs
using LLama.Batched;
using LLama.Common;
using LLama.Native;
using LLama.Sampling;
using Spectre.Console;

namespace LLama.Examples.Examples;

/// <summary>
/// This demonstrates generating tokens and then rewinding to an earlier state
/// </summary>
public class BatchedExecutorRewind
{
    private const int n_generate = 24;
    private const int n_rewind = 12;
    private const int n_repeats = 6;

    public static async Task Run()
    {
        string modelPath = UserSettings.GetModelPath();
        var parameters = new ModelParams(modelPath);
        using var model = LLamaWeights.LoadFromFile(parameters);

        var prompt = AnsiConsole.Ask("Prompt (or ENTER for default):", "Not many people know that");

        // Create an executor that can evaluate a batch of conversations together
        using var executor = new BatchedExecutor(model, parameters);

        // Print some info
        var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
        Console.WriteLine($"Created executor with model: {name}");

        // Evaluate the initial prompt to create one conversation
        using var conversation = executor.Create();
        conversation.Prompt(prompt);

        // Create the start node wrapping the conversation
        var node = new Node(executor.Context);

        // Print the prompt
        Console.ForegroundColor = ConsoleColor.Green;
        Console.WriteLine(prompt);

        for (var i = 0; i < n_repeats; i++)
        {
            for (var j = 0; j < n_generate; j++)
            {
                // Run inference
                await executor.Infer();

                // Sample a token
                var token = node.Sample(conversation);

                // Continue conversation with this token
                if (j != n_generate - 1)
                    conversation.Prompt(token);
            }

            // Write out what we generated
            node.Write(n_rewind, i + 1);

            // Rewind back a few tokens
            conversation.Rewind(n_rewind + 1);

            // Prompt with a token
            conversation.Prompt(node.GetToken(n_generate - n_rewind - 1));

            // Create a new node around the rewound conversation
            node = new Node(executor.Context);
        }

        Console.WriteLine("Press any key to exit demo");
        Console.ReadKey(true);
    }

    private class Node
    {
        private readonly LLamaContext _context;

        private readonly List<LLamaToken> _tokens = new List<LLamaToken>();
        private readonly DefaultSamplingPipeline Sampler;

        public Node(LLamaContext context)
        {
            _context = context;
            Sampler = new DefaultSamplingPipeline();
        }

        public LLamaToken Sample(Conversation conversation)
        {
            var token = Sampler.Sample(_context.NativeHandle, conversation.Sample(), Array.Empty<LLamaToken>());
            _tokens.Add(token);
            return token;
        }

        public void Write(int n_rewind, int depth)
        {
            var decoder = new StreamingTokenDecoder(_context);

            for (var i = 0; i < _tokens.Count - n_rewind; i++)
                decoder.Add(_tokens[i]);
            AnsiConsole.MarkupLine($"[green]{new string(' ', depth * 3) + decoder.Read().ReplaceLineEndings(" ")}[/]");

            for (var i = _tokens.Count - n_rewind; i < _tokens.Count; i++)
                decoder.Add(_tokens[i]);
            AnsiConsole.MarkupLine($"[maroon]{decoder.Read().ReplaceLineEndings(" ")}[/]");
        }

        public LLamaToken GetToken(int index)
        {
            return _tokens[index];
        }
    }
}
```
This example demonstrates using the `BatchedExecutor` to generate tokens and then rewind the conversation to an earlier state. See the source code [here](https://github.com/SciSharp/LLamaSharp/blob/master/LLama.Examples/Examples/BatchedExecutorRewind.cs).

A single conversation is prompted and then continued for 24 tokens; it is then rewound by 12 tokens and continued from there, and this is repeated several times. Rewinding simply sets the conversation back to an earlier state and requires no extra computation.
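The rewind pattern in isolation might look like the following minimal sketch. The token counts and wrapper method are illustrative; `Infer`, `Sample`, `Prompt` and `Rewind` are the same calls used in the example above.

```cs
using LLama.Batched;
using LLama.Native;
using LLama.Sampling;

public static class RewindCheckpointSketch
{
    public static async Task GenerateAndRewindAsync(BatchedExecutor executor, Conversation conversation)
    {
        var sampler = new DefaultSamplingPipeline();
        var tokens = new List<LLamaToken>();

        // Generate 8 tokens; the final token is not prompted back in, matching the example above
        for (var i = 0; i < 8; i++)
        {
            await executor.Infer();
            var token = sampler.Sample(executor.Context.NativeHandle, conversation.Sample(), Array.Empty<LLamaToken>());
            tokens.Add(token);
            if (i != 7)
                conversation.Prompt(token);
        }

        // Drop the last 6 tokens from the sequence. This is just bookkeeping on
        // the KV cache - nothing is recomputed.
        conversation.Rewind(6);

        // Prompt the token that follows the kept prefix, so the next Infer()
        // call has something to evaluate and generation can continue from there.
        conversation.Prompt(tokens[tokens.Count - 6 - 1]);
    }
}
```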