# Batched executor - multiple outputs from one input
```cs
using LLama.Batched;
using LLama.Common;
using LLama.Native;
using LLama.Sampling;
using Spectre.Console;

namespace LLama.Examples.Examples;

/// <summary>
/// This demonstrates generating multiple replies to the same prompt, with a shared cache
/// </summary>
public class BatchedExecutorFork
{
    private const int n_split = 16;
    private const int n_len = 72;

    public static async Task Run()
    {
        string modelPath = UserSettings.GetModelPath();
        var parameters = new ModelParams(modelPath);
        using var model = LLamaWeights.LoadFromFile(parameters);

        var prompt = AnsiConsole.Ask("Prompt (or ENTER for default):", "Not many people know that");

        // Create an executor that can evaluate a batch of conversations together
        using var executor = new BatchedExecutor(model, parameters);

        // Print some info
        var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
        Console.WriteLine($"Created executor with model: {name}");

        // Evaluate the initial prompt to create one conversation
        using var start = executor.Create();
        start.Prompt(prompt);
        await executor.Infer();

        // Create the root node of the tree
        var root = new Node(start);

        await AnsiConsole
            .Progress()
            .StartAsync(async progress =>
            {
                var reporter = progress.AddTask("Running Inference (1)", maxValue: n_len);

                // Run inference loop
                for (var i = 0; i < n_len; i++)
                {
                    if (i != 0)
                        await executor.Infer();

                    // Occasionally fork all the active conversations
                    if (i != 0 && i % n_split == 0)
                        root.Split();

                    // Sample all active conversations
                    root.Sample();

                    // Update progress bar
                    reporter.Increment(1);
                    reporter.Description($"Running Inference ({root.ActiveConversationCount})");
                }

                // Display results
                var display = new Tree(prompt);
                root.Display(display);
                AnsiConsole.Write(display);
            });
    }

    private class Node
    {
        private readonly StreamingTokenDecoder _decoder;
        private readonly DefaultSamplingPipeline _sampler;

        private Conversation? _conversation;

        private Node? _left;
        private Node? _right;

        public int ActiveConversationCount => _conversation != null ? 1 : _left!.ActiveConversationCount + _right!.ActiveConversationCount;

        public Node(Conversation conversation)
        {
            _sampler = new DefaultSamplingPipeline();
            _conversation = conversation;
            _decoder = new StreamingTokenDecoder(conversation.Executor.Context);
        }

        public void Sample()
        {
            if (_conversation == null)
            {
                _left?.Sample();
                _right?.Sample();
                return;
            }

            if (_conversation.RequiresInference)
                return;

            // Sample one token
            var ctx = _conversation.Executor.Context.NativeHandle;
            var token = _sampler.Sample(ctx, _conversation.Sample(), Array.Empty<LLamaToken>());
            _sampler.Accept(ctx, token);
            _decoder.Add(token);

            // Prompt the conversation with this token, to continue generating from there
            _conversation.Prompt(token);
        }

        public void Split()
        {
            if (_conversation != null)
            {
                _left = new Node(_conversation.Fork());
                _right = new Node(_conversation.Fork());

                _conversation.Dispose();
                _conversation = null;
            }
            else
            {
                _left?.Split();
                _right?.Split();
            }
        }

        public void Display<T>(T tree, int depth = 0)
            where T : IHasTreeNodes
        {
            var colors = new[] { "red", "green", "blue", "yellow", "white" };
            var color = colors[depth % colors.Length];

            var message = Markup.Escape(_decoder.Read().ReplaceLineEndings(""));

            var n = tree.AddNode($"[{color}]{message}[/]");

            _left?.Display(n, depth + 1);
            _right?.Display(n, depth + 1);
        }
    }
}
```
This example demonstrates using the `BatchedExecutor` to split one sequence into multiple sequences, generating several different completions of the same prompt. See the source code [here](https://github.com/SciSharp/LLamaSharp/blob/master/LLama.Examples/Examples/BatchedExecutorFork.cs).

Sequences share memory up to the point they were split, meaning no extra memory is consumed by creating a fork. Inference runs for all sequences simultaneously, so running two sequences does _not_ take twice as much time as running one.
An example output, starting with the prompt `Not many people know that`:
```
Not many people know that
└── , in the 17th century, a military band led by Captain Charles
    ├── Bossler of Baden, Germany, composed and played a music suite titled
    │   ├── the "Civil Psalm," in order to rally German Protestants during
    │   │   ├── the Thirty Years' War. This tune became popular among German soldiers,
    │   │   │   ├── and its popularity continued long after the war
    │   │   │   └── and, eventually, reached France. The
    │   │   └── the Thirty Years' War.This music, with its clear call
    │   │       ├── to arms and strong Christian themes, helped
    │   │       └── to arms and unwavering belief
    │   └── "Baden's First National Symphony," with lyrics by a young Wol
    │       ├── fgang Amadeus Mozart. The story of the composition's creation
    │       │   ├── has long been forgotten. But the B
    │       │   └── was popularized by a novelty book
    │       └── fgang Amadeus Mozart. It's said that this music brought
    │           ├── peace to Europe, at least for a
    │           └── the troops together during difficult times. It
    └── Newdick played a mournful dirge to accompany the procession of
        ├── the head of King Charles I. It is the scene that opens my latest book
        │   ├── , "Death and Taxes." The book follows a British army captain named
        │   │   ├── Marcus as he seeks revenge for his wife
        │   │   └── William Darnay who becomes involved in
        │   └── , A King, A Pawn and a Prince. The murder of the king
        │       ├── and the civil war that followed are the
        │       └── is a watershed moment in the political
        └── the coffin of William Shakespeare, as it was carried to its final resting place
            ├── . That is the least that can be said for a man who is often regarded
            │   ├── as the greatest writer in the English language
            │   └── as the greatest writer the English language has
            └── at Stratford-upon-Avon. Shakespeare, of course
                ├── , was a famous English poet and play
                └── , was one of the greatest playwright
```
Forked sequences can be used in many ways. For example:

- Evaluating the system prompt once and forking for each independent conversation (see the sketch below).
- Saving a "checkpoint" in a conversation to return to later.
- Beam search.
- Splitting a conversation, generating completions from several different "agents", and taking the best response.
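As a minimal sketch of the first idea: evaluate a shared system prompt once, then fork one conversation per user. The prompts, names and wrapper method here are illustrative; only the `BatchedExecutor`, `Create`, `Prompt`, `Fork` and `Infer` calls come from the example above.

```cs
using LLama;
using LLama.Batched;
using LLama.Common;

public static class SharedSystemPromptSketch
{
    public static async Task RunAsync(LLamaWeights model, ModelParams parameters)
    {
        using var executor = new BatchedExecutor(model, parameters);

        // Evaluate the (potentially long) system prompt exactly once
        using var system = executor.Create();
        system.Prompt("You are a helpful assistant.");
        await executor.Infer();

        // Each user conversation forks from the shared prefix, so the system
        // prompt's KV cache is reused rather than re-evaluated.
        using var userA = system.Fork();
        using var userB = system.Fork();
        userA.Prompt("First message from user A");
        userB.Prompt("First message from user B");

        // Both conversations are evaluated together in a single batch
        await executor.Infer();

        // ...sample from userA.Sample() / userB.Sample() as in the example above.
    }
}
```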
# Batched executor - basic guidance
```cs
using LLama.Batched;
using LLama.Common;
using LLama.Native;
using LLama.Sampling;
using Spectre.Console;

namespace LLama.Examples.Examples;

/// <summary>
/// This demonstrates using a batch to generate two sequences and then using one
/// sequence as the negative guidance ("classifier free guidance") for the other.
/// </summary>
public class BatchedExecutorGuidance
{
    private const int n_len = 32;

    public static async Task Run()
    {
        string modelPath = UserSettings.GetModelPath();
        var parameters = new ModelParams(modelPath);
        using var model = LLamaWeights.LoadFromFile(parameters);

        var positivePrompt = AnsiConsole.Ask("Positive Prompt (or ENTER for default):", "My favourite colour is").Trim();
        var negativePrompt = AnsiConsole.Ask("Negative Prompt (or ENTER for default):", "I hate the colour red. My favourite colour is").Trim();
        var weight = AnsiConsole.Ask("Guidance Weight (or ENTER for default):", 2.0f);

        // Create an executor that can evaluate a batch of conversations together
        using var executor = new BatchedExecutor(model, parameters);

        // Print some info
        var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
        Console.WriteLine($"Created executor with model: {name}");

        // Load the two prompts into two conversations
        using var guided = executor.Create();
        guided.Prompt(positivePrompt);
        using var guidance = executor.Create();
        guidance.Prompt(negativePrompt);

        // Run inference to evaluate prompts
        await AnsiConsole
            .Status()
            .Spinner(Spinner.Known.Line)
            .StartAsync("Evaluating Prompts...", _ => executor.Infer());

        // Fork the "guided" conversation. We'll run this one without guidance for comparison
        using var unguided = guided.Fork();

        // Run inference loop
        var unguidedSampler = new GuidedSampler(null, weight);
        var unguidedDecoder = new StreamingTokenDecoder(executor.Context);
        var guidedSampler = new GuidedSampler(guidance, weight);
        var guidedDecoder = new StreamingTokenDecoder(executor.Context);

        await AnsiConsole
            .Progress()
            .StartAsync(async progress =>
            {
                var reporter = progress.AddTask("Running Inference", maxValue: n_len);

                for (var i = 0; i < n_len; i++)
                {
                    if (i != 0)
                        await executor.Infer();

                    // Sample from the "unguided" conversation. This is just a conversation using the same prompt, without any
                    // guidance. This serves as a comparison to show the effect of guidance.
                    var u = unguidedSampler.Sample(executor.Context.NativeHandle, unguided.Sample(), Array.Empty<LLamaToken>());
                    unguidedDecoder.Add(u);
                    unguided.Prompt(u);

                    // Sample from the "guided" conversation. This sampler will internally use the "guidance" conversation
                    // to steer the conversation. See how this is done in GuidedSampler.ProcessLogits (bottom of this file).
                    var g = guidedSampler.Sample(executor.Context.NativeHandle, guided.Sample(), Array.Empty<LLamaToken>());
                    guidedDecoder.Add(g);

                    // Use this token to advance both guided _and_ guidance. Keeping them in sync (except for the initial prompt).
                    guided.Prompt(g);
                    guidance.Prompt(g);

                    // Early exit if we reach the natural end of the guided sentence
                    if (g == model.EndOfSentenceToken)
                        break;

                    // Update progress bar
                    reporter.Increment(1);
                }
            });

        AnsiConsole.MarkupLine($"[green]Unguided:[/][white]{unguidedDecoder.Read().ReplaceLineEndings(" ")}[/]");
        AnsiConsole.MarkupLine($"[green]Guided:[/][white]{guidedDecoder.Read().ReplaceLineEndings(" ")}[/]");
    }

    private class GuidedSampler(Conversation? guidance, float weight)
        : BaseSamplingPipeline
    {
        public override void Accept(SafeLLamaContextHandle ctx, LLamaToken token)
        {
        }

        public override ISamplingPipeline Clone()
        {
            throw new NotSupportedException();
        }

        protected override void ProcessLogits(SafeLLamaContextHandle ctx, Span<float> logits, ReadOnlySpan<LLamaToken> lastTokens)
        {
            if (guidance == null)
                return;

            // Get the logits generated by the guidance sequences
            var guidanceLogits = guidance.Sample();

            // Use those logits to guide this sequence
            NativeApi.llama_sample_apply_guidance(ctx, logits, guidanceLogits, weight);
        }

        protected override LLamaToken ProcessTokenDataArray(SafeLLamaContextHandle ctx, LLamaTokenDataArray candidates, ReadOnlySpan<LLamaToken> lastTokens)
        {
            candidates.Temperature(ctx, 0.8f);
            candidates.TopK(ctx, 25);
            return candidates.SampleToken(ctx);
        }
    }
}
```
This example demonstrates using `Classifier Free Guidance` (a.k.a. negative prompting) with a custom sampling pipeline. Negative prompting is a way of steering the model output away from certain topics. See the source code [here](https://github.com/SciSharp/LLamaSharp/blob/master/LLama.Examples/Examples/BatchedExecutorGuidance.cs).

Two conversations are created. The `guided` conversation starts with the prompt whose completion is shown as the output, for example `"my favourite colour is"`. The `guidance` conversation starts with the negative prompt, for example `"I hate the colour red. My favourite colour is"`. Because it is a _negative_ prompt, the guidance steers the model towards answering as if it _likes_ the colour red.

A custom sampler samples the `guidance` conversation and uses that output to influence the output of the `guided` conversation. Once a token has been selected, _both_ conversations are continued with that token.
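The steering itself happens inside `NativeApi.llama_sample_apply_guidance`, which blends the guided and guidance logits. As a rough illustration only (a simplified sketch of the classifier free guidance blend, not necessarily the library's exact native implementation), the combination can be thought of like this:

```cs
using System;

// Simplified sketch of the logit blend used by classifier free guidance.
// The real work is done natively; this only shows the idea.
public static class GuidanceSketch
{
    public static void Apply(Span<float> logits, ReadOnlySpan<float> guidanceLogits, float weight)
    {
        for (var i = 0; i < logits.Length; i++)
        {
            // Start from the guidance (negative prompt) logits and move 'weight'
            // times the difference towards the guided logits. A weight greater
            // than 1 pushes the output further away from the negative prompt.
            logits[i] = guidanceLogits[i] + weight * (logits[i] - guidanceLogits[i]);
        }
    }
}
```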
# Batched executor - rewinding to an earlier state
```cs
using LLama.Batched;
using LLama.Common;
using LLama.Native;
using LLama.Sampling;
using Spectre.Console;

namespace LLama.Examples.Examples;

/// <summary>
/// This demonstrates generating tokens and then rewinding to an earlier state
/// </summary>
public class BatchedExecutorRewind
{
    private const int n_generate = 24;
    private const int n_rewind = 12;
    private const int n_repeats = 6;

    public static async Task Run()
    {
        string modelPath = UserSettings.GetModelPath();
        var parameters = new ModelParams(modelPath);
        using var model = LLamaWeights.LoadFromFile(parameters);

        var prompt = AnsiConsole.Ask("Prompt (or ENTER for default):", "Not many people know that");

        // Create an executor that can evaluate a batch of conversations together
        using var executor = new BatchedExecutor(model, parameters);

        // Print some info
        var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
        Console.WriteLine($"Created executor with model: {name}");

        // Evaluate the initial prompt to create one conversation
        using var conversation = executor.Create();
        conversation.Prompt(prompt);

        // Create the start node wrapping the conversation
        var node = new Node(executor.Context);

        // Print the prompt
        Console.ForegroundColor = ConsoleColor.Green;
        Console.WriteLine(prompt);

        for (var i = 0; i < n_repeats; i++)
        {
            for (var j = 0; j < n_generate; j++)
            {
                // Run inference
                await executor.Infer();

                // Sample a token
                var token = node.Sample(conversation);

                // Continue conversation with this token
                if (j != n_generate - 1)
                    conversation.Prompt(token);
            }

            // Write out what we generated
            node.Write(n_rewind, i + 1);

            // Rewind back a few tokens
            conversation.Rewind(n_rewind + 1);

            // Prompt with a token
            conversation.Prompt(node.GetToken(n_generate - n_rewind - 1));

            // Create a new node around the rewound conversation
            node = new Node(executor.Context);
        }

        Console.WriteLine("Press any key to exit demo");
        Console.ReadKey(true);
    }

    private class Node
    {
        private readonly LLamaContext _context;

        private readonly List<LLamaToken> _tokens = new List<LLamaToken>();
        private readonly DefaultSamplingPipeline Sampler;

        public Node(LLamaContext context)
        {
            _context = context;
            Sampler = new DefaultSamplingPipeline();
        }

        public LLamaToken Sample(Conversation conversation)
        {
            var token = Sampler.Sample(_context.NativeHandle, conversation.Sample(), Array.Empty<LLamaToken>());
            _tokens.Add(token);
            return token;
        }

        public void Write(int n_rewind, int depth)
        {
            var decoder = new StreamingTokenDecoder(_context);

            for (var i = 0; i < _tokens.Count - n_rewind; i++)
                decoder.Add(_tokens[i]);
            AnsiConsole.MarkupLine($"[green]{new string(' ', depth * 3) + decoder.Read().ReplaceLineEndings(" ")}[/]");

            for (var i = _tokens.Count - n_rewind; i < _tokens.Count; i++)
                decoder.Add(_tokens[i]);
            AnsiConsole.MarkupLine($"[maroon]{decoder.Read().ReplaceLineEndings(" ")}[/]");
        }

        public LLamaToken GetToken(int index)
        {
            return _tokens[index];
        }
    }
}
```
This example demonstrates using the `BatchedExecutor` to generate tokens and then rewind the conversation to an earlier state. See the source code [here](https://github.com/SciSharp/LLamaSharp/blob/master/LLama.Examples/Examples/BatchedExecutorRewind.cs).

A single conversation is prompted and then continued for 24 tokens; it is then rewound by 12 tokens and continued from there, and this is repeated several times. Rewinding simply sets the conversation back to an earlier state and requires no extra computation.
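The rewind pattern in isolation might look like the following minimal sketch. The token counts and wrapper method are illustrative; `Infer`, `Sample`, `Prompt` and `Rewind` are the same calls used in the example above.

```cs
using LLama.Batched;
using LLama.Native;
using LLama.Sampling;

public static class RewindCheckpointSketch
{
    public static async Task GenerateAndRewindAsync(BatchedExecutor executor, Conversation conversation)
    {
        var sampler = new DefaultSamplingPipeline();
        var tokens = new List<LLamaToken>();

        // Generate 8 tokens; the final token is not prompted back in, matching the example above
        for (var i = 0; i < 8; i++)
        {
            await executor.Infer();
            var token = sampler.Sample(executor.Context.NativeHandle, conversation.Sample(), Array.Empty<LLamaToken>());
            tokens.Add(token);
            if (i != 7)
                conversation.Prompt(token);
        }

        // Drop the last 6 tokens from the sequence. This is just bookkeeping on
        // the KV cache - nothing is recomputed.
        conversation.Rewind(6);

        // Prompt the token that follows the kept prefix, so the next Infer()
        // call has something to evaluate and generation can continue from there.
        conversation.Prompt(tokens[tokens.Count - 6 - 1]);
    }
}
```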