
Rewritten some examples docs, explaining what these examples show instead of just showing the source code. (#728)

master
Martin Evans committed 1 year ago
parent commit 3d76ef7b6a
3 changed files with 55 additions and 394 deletions:

1. docs/Examples/BatchedExecutorFork.md (+48, -148)
2. docs/Examples/BatchedExecutorGuidance.md (+4, -127)
3. docs/Examples/BatchedExecutorRewind.md (+3, -119)

docs/Examples/BatchedExecutorFork.md (+48, -148)

@@ -1,148 +1,48 @@
# Batched executor - multi-output to one input

```cs
using LLama.Batched;
using LLama.Common;
using LLama.Native;
using LLama.Sampling;
using Spectre.Console;

namespace LLama.Examples.Examples;

/// <summary>
/// This demonstrates generating multiple replies to the same prompt, with a shared cache
/// </summary>
public class BatchedExecutorFork
{
    private const int n_split = 16;
    private const int n_len = 72;

    public static async Task Run()
    {
        string modelPath = UserSettings.GetModelPath();

        var parameters = new ModelParams(modelPath);
        using var model = LLamaWeights.LoadFromFile(parameters);

        var prompt = AnsiConsole.Ask("Prompt (or ENTER for default):", "Not many people know that");

        // Create an executor that can evaluate a batch of conversations together
        using var executor = new BatchedExecutor(model, parameters);

        // Print some info
        var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
        Console.WriteLine($"Created executor with model: {name}");

        // Evaluate the initial prompt to create one conversation
        using var start = executor.Create();
        start.Prompt(prompt);
        await executor.Infer();

        // Create the root node of the tree
        var root = new Node(start);

        await AnsiConsole
            .Progress()
            .StartAsync(async progress =>
            {
                var reporter = progress.AddTask("Running Inference (1)", maxValue: n_len);

                // Run inference loop
                for (var i = 0; i < n_len; i++)
                {
                    if (i != 0)
                        await executor.Infer();

                    // Occasionally fork all the active conversations
                    if (i != 0 && i % n_split == 0)
                        root.Split();

                    // Sample all active conversations
                    root.Sample();

                    // Update progress bar
                    reporter.Increment(1);
                    reporter.Description($"Running Inference ({root.ActiveConversationCount})");
                }

                // Display results
                var display = new Tree(prompt);
                root.Display(display);
                AnsiConsole.Write(display);
            });
    }

    private class Node
    {
        private readonly StreamingTokenDecoder _decoder;

        private readonly DefaultSamplingPipeline _sampler;
        private Conversation? _conversation;

        private Node? _left;
        private Node? _right;

        public int ActiveConversationCount => _conversation != null ? 1 : _left!.ActiveConversationCount + _right!.ActiveConversationCount;

        public Node(Conversation conversation)
        {
            _sampler = new DefaultSamplingPipeline();
            _conversation = conversation;
            _decoder = new StreamingTokenDecoder(conversation.Executor.Context);
        }

        public void Sample()
        {
            if (_conversation == null)
            {
                _left?.Sample();
                _right?.Sample();
                return;
            }

            if (_conversation.RequiresInference)
                return;

            // Sample one token
            var ctx = _conversation.Executor.Context.NativeHandle;
            var token = _sampler.Sample(ctx, _conversation.Sample(), Array.Empty<LLamaToken>());
            _sampler.Accept(ctx, token);
            _decoder.Add(token);

            // Prompt the conversation with this token, to continue generating from there
            _conversation.Prompt(token);
        }

        public void Split()
        {
            if (_conversation != null)
            {
                _left = new Node(_conversation.Fork());
                _right = new Node(_conversation.Fork());

                _conversation.Dispose();
                _conversation = null;
            }
            else
            {
                _left?.Split();
                _right?.Split();
            }
        }

        public void Display<T>(T tree, int depth = 0)
            where T : IHasTreeNodes
        {
            var colors = new[] { "red", "green", "blue", "yellow", "white" };
            var color = colors[depth % colors.Length];

            var message = Markup.Escape(_decoder.Read().ReplaceLineEndings(""));

            var n = tree.AddNode($"[{color}]{message}[/]");

            _left?.Display(n, depth + 1);
            _right?.Display(n, depth + 1);
        }
    }
}
```
# BatchedExecutor Fork - Generate Multiple Completions With Shared Memory

This example demonstrates using the `BatchedExecutor` to split one sequence into multiple sequences. See the source code [here](https://github.com/SciSharp/LLamaSharp/blob/master/LLama.Examples/Examples/BatchedExecutorFork.cs).

Sequences share memory up to the point they were split, meaning no extra memory is consumed by creating a fork. Inference runs for all sequences simultaneously, which means that running two sequences does _not_ take twice as much time as running one.
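As a minimal sketch of how a fork is created (based on the full example above; the model path and prompt are placeholders):

```cs
// Minimal sketch of forking, based on the full example above.
// "model.gguf" is a placeholder path.
var parameters = new ModelParams("model.gguf");
using var model = LLamaWeights.LoadFromFile(parameters);
using var executor = new BatchedExecutor(model, parameters);

// Evaluate the shared prompt once, in a single conversation
using var conversation = executor.Create();
conversation.Prompt("Not many people know that");
await executor.Infer();

// Fork it: both branches share the prompt's KV cache up to this point,
// but generate independently from here on
using var left = conversation.Fork();
using var right = conversation.Fork();
```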

An example output, starting with the prompt `Not many people know that`:

```
Not many people know that
└── , in the 17th century, a military band led by Captain Charles
├── Bossler of Baden, Germany, composed and played a music suite titled
│ ├── the "Civil Psalm," in order to rally German Protestants during
│ │ ├── the Thirty Years' War. This tune became popular among German soldiers,
│ │ │ ├── and its popularity continued long after the war
│ │ │ └── and, eventually, reached France. The
│ │ └── the Thirty Years' War.This music, with its clear call
│ │ ├── to arms and strong Christian themes, helped
│ │ └── to arms and unwavering belief
│ └── "Baden's First National Symphony," with lyrics by a young Wol
│ ├── fgang Amadeus Mozart. The story of the composition's creation
│ │ ├── has long been forgotten. But the B
│ │ └── was popularized by a novelty book
│ └── fgang Amadeus Mozart. It's said that this music brought
│ ├── peace to Europe, at least for a
│ └── the troops together during difficult times. It
└── Newdick played a mournful dirge to accompany the procession of
├── the head of King Charles I. It is the scene that opens my latest book
│ ├── , "Death and Taxes." The book follows a British army captain named
│ │ ├── Marcus as he seeks revenge for his wife
│ │ └── William Darnay who becomes involved in
│ └── , A King, A Pawn and a Prince. The murder of the king
│ ├── and the civil war that followed are the
│ └── is a watershed moment in the political
└── the coffin of William Shakespeare, as it was carried to its final resting place
├── . That is the least that can be said for a man who is often regarded
│ ├── as the greatest writer in the English language
│ └── as the greatest writer the English language has
└── at Stratford-upon-Avon. Shakespeare, of course
├── , was a famous English poet and play
└── , was one of the greatest playwright
```

Forked sequences can be used for many possible things. For example:
- Evaluating the system prompt once and forking for each independent conversation.
- Saving a "checkpoint" in a conversation to return to later (see the sketch after this list).
- Beam Search.
- Splitting a conversation, generating completions from several different "agents", and taking the best response.
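
For instance, the checkpoint idea can be sketched as follows (assuming `executor` has been created as in the example above; the prompt is a placeholder):

```cs
// Sketch: using a fork as a cheap "checkpoint", assuming `executor` exists as above.
using var conversation = executor.Create();
conversation.Prompt("Not many people know that");
await executor.Infer();

// Save a checkpoint: the fork shares everything evaluated so far,
// so creating it requires no extra inference work
using var checkpoint = conversation.Fork();

// ... generate freely on `conversation` ...

// If that branch turns out badly, simply carry on from `checkpoint` instead;
// everything up to the fork point is still evaluated and ready to reuse.
```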

docs/Examples/BatchedExecutorGuidance.md (+4, -127)

@@ -1,130 +1,7 @@
# Batched executor - basic guidance

```cs
using LLama.Batched;
using LLama.Common;
using LLama.Native;
using LLama.Sampling;
using Spectre.Console;

namespace LLama.Examples.Examples;

/// <summary>
/// This demonstrates using a batch to generate two sequences and then using one
/// sequence as the negative guidance ("classifier free guidance") for the other.
/// </summary>
public class BatchedExecutorGuidance
{
    private const int n_len = 32;

    public static async Task Run()
    {
        string modelPath = UserSettings.GetModelPath();

        var parameters = new ModelParams(modelPath);
        using var model = LLamaWeights.LoadFromFile(parameters);

        var positivePrompt = AnsiConsole.Ask("Positive Prompt (or ENTER for default):", "My favourite colour is").Trim();
        var negativePrompt = AnsiConsole.Ask("Negative Prompt (or ENTER for default):", "I hate the colour red. My favourite colour is").Trim();
        var weight = AnsiConsole.Ask("Guidance Weight (or ENTER for default):", 2.0f);

        // Create an executor that can evaluate a batch of conversations together
        using var executor = new BatchedExecutor(model, parameters);

        // Print some info
        var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
        Console.WriteLine($"Created executor with model: {name}");

        // Load the two prompts into two conversations
        using var guided = executor.Create();
        guided.Prompt(positivePrompt);
        using var guidance = executor.Create();
        guidance.Prompt(negativePrompt);

        // Run inference to evaluate prompts
        await AnsiConsole
            .Status()
            .Spinner(Spinner.Known.Line)
            .StartAsync("Evaluating Prompts...", _ => executor.Infer());

        // Fork the "guided" conversation. We'll run this one without guidance for comparison
        using var unguided = guided.Fork();

        // Run inference loop
        var unguidedSampler = new GuidedSampler(null, weight);
        var unguidedDecoder = new StreamingTokenDecoder(executor.Context);
        var guidedSampler = new GuidedSampler(guidance, weight);
        var guidedDecoder = new StreamingTokenDecoder(executor.Context);
        await AnsiConsole
            .Progress()
            .StartAsync(async progress =>
            {
                var reporter = progress.AddTask("Running Inference", maxValue: n_len);

                for (var i = 0; i < n_len; i++)
                {
                    if (i != 0)
                        await executor.Infer();

                    // Sample from the "unguided" conversation. This is just a conversation using the same prompt, without any
                    // guidance. This serves as a comparison to show the effect of guidance.
                    var u = unguidedSampler.Sample(executor.Context.NativeHandle, unguided.Sample(), Array.Empty<LLamaToken>());
                    unguidedDecoder.Add(u);
                    unguided.Prompt(u);

                    // Sample from the "guided" conversation. This sampler will internally use the "guidance" conversation
                    // to steer the conversation. See how this is done in GuidedSampler.ProcessLogits (bottom of this file).
                    var g = guidedSampler.Sample(executor.Context.NativeHandle, guided.Sample(), Array.Empty<LLamaToken>());
                    guidedDecoder.Add(g);

                    // Use this token to advance both guided _and_ guidance. Keeping them in sync (except for the initial prompt).
                    guided.Prompt(g);
                    guidance.Prompt(g);

                    // Early exit if we reach the natural end of the guided sentence
                    if (g == model.EndOfSentenceToken)
                        break;

                    // Update progress bar
                    reporter.Increment(1);
                }
            });

        AnsiConsole.MarkupLine($"[green]Unguided:[/][white]{unguidedDecoder.Read().ReplaceLineEndings(" ")}[/]");
        AnsiConsole.MarkupLine($"[green]Guided:[/][white]{guidedDecoder.Read().ReplaceLineEndings(" ")}[/]");
    }

    private class GuidedSampler(Conversation? guidance, float weight)
        : BaseSamplingPipeline
    {
        public override void Accept(SafeLLamaContextHandle ctx, LLamaToken token)
        {
        }

        public override ISamplingPipeline Clone()
        {
            throw new NotSupportedException();
        }

        protected override void ProcessLogits(SafeLLamaContextHandle ctx, Span<float> logits, ReadOnlySpan<LLamaToken> lastTokens)
        {
            if (guidance == null)
                return;

            // Get the logits generated by the guidance sequences
            var guidanceLogits = guidance.Sample();

            // Use those logits to guide this sequence
            NativeApi.llama_sample_apply_guidance(ctx, logits, guidanceLogits, weight);
        }

        protected override LLamaToken ProcessTokenDataArray(SafeLLamaContextHandle ctx, LLamaTokenDataArray candidates, ReadOnlySpan<LLamaToken> lastTokens)
        {
            candidates.Temperature(ctx, 0.8f);
            candidates.TopK(ctx, 25);

            return candidates.SampleToken(ctx);
        }
    }
}
```

# BatchedExecutor Guidance - Classifier Free Guidance / Negative Prompting

This example demonstrates using `Classifier Free Guidance` (a.k.a. negative prompting) with a custom sampling pipeline. Negative prompting is a way of steering the model output away from certain topics. See the source code [here](https://github.com/SciSharp/LLamaSharp/blob/master/LLama.Examples/Examples/BatchedExecutorGuidance.cs).

Two conversations are created. The `guided` conversation starts with the prompt that should be completed (its completion is shown as the output), for example `"My favourite colour is"`. The `guidance` conversation starts with the negative prompt, for example `"I hate the colour red. My favourite colour is"`. Because this is a _negative_ prompt, the guidance will make the model answer as if it _likes_ the colour red.

A custom sampler samples the `guidance` conversation and uses that output to influence the output of the `guided` conversation. Once a token is selected, _both_ conversations are continued with this token.
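
The per-token loop at the heart of this can be sketched as follows (adapted from the example source above; `executor`, `guided`, `guidance` and `guidedSampler` are assumed to be set up as in the full example):

```cs
// Per-token loop sketch, adapted from the example above.
for (var i = 0; i < 32; i++)
{
    if (i != 0)
        await executor.Infer();

    // The custom sampler reads logits from `guided` and, inside ProcessLogits,
    // mixes in logits from `guidance` via llama_sample_apply_guidance
    var token = guidedSampler.Sample(executor.Context.NativeHandle, guided.Sample(), Array.Empty<LLamaToken>());

    // Advance BOTH conversations with the chosen token so they stay in sync
    // (they differ only in their initial prompts)
    guided.Prompt(token);
    guidance.Prompt(token);
}
```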

docs/Examples/BatchedExecutorRewind.md (+3, -119)

@@ -1,121 +1,5 @@
# Batched executor - rewinding to an earlier state

```cs
using LLama.Batched;
using LLama.Common;
using LLama.Native;
using LLama.Sampling;
using Spectre.Console;

namespace LLama.Examples.Examples;

/// <summary>
/// This demonstrates generating tokens and then rewinding to an earlier state
/// </summary>
public class BatchedExecutorRewind
{
    private const int n_generate = 24;
    private const int n_rewind = 12;
    private const int n_repeats = 6;

    public static async Task Run()
    {
        string modelPath = UserSettings.GetModelPath();

        var parameters = new ModelParams(modelPath);
        using var model = LLamaWeights.LoadFromFile(parameters);

        var prompt = AnsiConsole.Ask("Prompt (or ENTER for default):", "Not many people know that");

        // Create an executor that can evaluate a batch of conversations together
        using var executor = new BatchedExecutor(model, parameters);

        // Print some info
        var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
        Console.WriteLine($"Created executor with model: {name}");

        // Evaluate the initial prompt to create one conversation
        using var conversation = executor.Create();
        conversation.Prompt(prompt);

        // Create the start node wrapping the conversation
        var node = new Node(executor.Context);

        // Print the prompt
        Console.ForegroundColor = ConsoleColor.Green;
        Console.WriteLine(prompt);

        for (var i = 0; i < n_repeats; i++)
        {
            for (var j = 0; j < n_generate; j++)
            {
                // Run inference
                await executor.Infer();

                // Sample a token
                var token = node.Sample(conversation);

                // Continue conversation with this token
                if (j != n_generate - 1)
                    conversation.Prompt(token);
            }

            // Write out what we generated
            node.Write(n_rewind, i + 1);

            // Rewind back a few tokens
            conversation.Rewind(n_rewind + 1);

            // Prompt with a token
            conversation.Prompt(node.GetToken(n_generate - n_rewind - 1));

            // Create a new node around the rewound conversation
            node = new Node(executor.Context);
        }

        Console.WriteLine("Press any key to exit demo");
        Console.ReadKey(true);
    }

    private class Node
    {
        private readonly LLamaContext _context;

        private readonly List<LLamaToken> _tokens = new List<LLamaToken>();
        private readonly DefaultSamplingPipeline Sampler;

        public Node(LLamaContext context)
        {
            _context = context;
            Sampler = new DefaultSamplingPipeline();
        }

        public LLamaToken Sample(Conversation conversation)
        {
            var token = Sampler.Sample(_context.NativeHandle, conversation.Sample(), Array.Empty<LLamaToken>());
            _tokens.Add(token);
            return token;
        }

        public void Write(int n_rewind, int depth)
        {
            var decoder = new StreamingTokenDecoder(_context);

            for (var i = 0; i < _tokens.Count - n_rewind; i++)
                decoder.Add(_tokens[i]);

            AnsiConsole.MarkupLine($"[green]{new string(' ', depth * 3) + decoder.Read().ReplaceLineEndings(" ")}[/]");

            for (var i = _tokens.Count - n_rewind; i < _tokens.Count; i++)
                decoder.Add(_tokens[i]);

            AnsiConsole.MarkupLine($"[maroon]{decoder.Read().ReplaceLineEndings(" ")}[/]");
        }

        public LLamaToken GetToken(int index)
        {
            return _tokens[index];
        }
    }
}
```

# BatchedExecutor - Rewind

This example demonstrates using the `BatchedExecutor` to rewind a conversation back to an earlier state. See the source code [here](https://github.com/SciSharp/LLamaSharp/blob/master/LLama.Examples/Examples/BatchedExecutorRewind.cs).

A single conversation is prompted and then continued for 24 tokens. After that it is rewound by 12 tokens and continued from there. Rewinding simply sets the conversation back to an earlier state and requires no extra computation.
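
A minimal sketch of the generate-then-rewind pattern (adapted from the example source above; `executor`, `conversation` and `sampler` are assumed to already be set up as in the full source):

```cs
// Generate 24 tokens, remembering them so one can be re-used after rewinding.
var tokens = new List<LLamaToken>();
for (var i = 0; i < 24; i++)
{
    await executor.Infer();
    var token = sampler.Sample(executor.Context.NativeHandle, conversation.Sample(), Array.Empty<LLamaToken>());
    tokens.Add(token);
    if (i != 23)
        conversation.Prompt(token);
}

// Rewind 12 tokens, plus one extra so there is a token left over to prompt with
conversation.Rewind(12 + 1);

// Re-prompt with the token at the rewind boundary; generation continues from there
conversation.Prompt(tokens[24 - 12 - 1]);
```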
