using System;
using System.Threading;
using System.Threading.Tasks;
using LLama.Abstractions;
using LLama.Native;

namespace LLama.Batched;

/// <summary>
/// A batched executor that can infer multiple separate "conversations" simultaneously.
/// </summary>
public sealed class BatchedExecutor
    : IDisposable
{
    private int _nextSequenceId;

    internal LLamaBatch Batch { get; }

    /// <summary>
    /// Epoch is incremented every time Infer is called. Conversations can use this to keep track of
    /// whether they're waiting for inference, or can be sampled.
    /// </summary>
    internal ulong Epoch { get; private set; }

    /// <summary>
    /// The <see cref="LLamaContext"/> this executor is using
    /// </summary>
    public LLamaContext Context { get; }

    /// <summary>
    /// The <see cref="LLamaWeights"/> this executor is using
    /// </summary>
    public LLamaWeights Model { get; }

    /// <summary>
    /// Get the number of tokens in the batch, waiting for <see cref="Infer"/> to be called
    /// </summary>
    public int BatchedTokenCount => Batch.TokenCount;

    /// <summary>
    /// Check if this executor has been disposed.
    /// </summary>
    public bool IsDisposed { get; private set; }

    /// <summary>
    /// Create a new batched executor
    /// </summary>
    /// <param name="model">The model to use</param>
    /// <param name="contextParams">Parameters to create a new context</param>
    public BatchedExecutor(LLamaWeights model, IContextParams contextParams)
    {
        Model = model;
        Batch = new LLamaBatch();
        Context = model.CreateContext(contextParams);
        Epoch = 1;
    }

    /// <summary>
    /// Finalizer for BatchedExecutor
    /// </summary>
    ~BatchedExecutor()
    {
        Dispose();
    }

    /// <summary>
    /// Start a new <see cref="Conversation"/> with the given prompt
    /// </summary>
    /// <param name="prompt"></param>
    /// <returns></returns>
    public Conversation Prompt(string prompt)
    {
        if (IsDisposed)
            throw new ObjectDisposedException(nameof(BatchedExecutor));

        var conversation = new Conversation(this, GetNextSequenceId(), 0);
        conversation.Prompt(prompt);

        return conversation;
    }

    /// <summary>
    /// Run inference for all conversations in the batch which have pending tokens.
    ///
    /// If the result is `NoKvSlot` then there is not enough memory for inference, try disposing some conversation
    /// threads and running inference again.
    /// </summary>
    public async Task<DecodeResult> Infer(CancellationToken cancellation = default)
    {
        if (IsDisposed)
            throw new ObjectDisposedException(nameof(BatchedExecutor));

        var status = await Context.DecodeAsync(Batch, cancellation);

        // Only clear the batch if the result was ok. Leaving all this state in place means that "Infer" can
        // be called again after a warning (e.g. NoKvSlot).
        if (status == DecodeResult.Ok)
        {
            Epoch++;
            Batch.Clear();
        }

        return status;
    }

    /// <inheritdoc />
    public void Dispose()
    {
        if (IsDisposed)
            return;
        IsDisposed = true;

        GC.SuppressFinalize(this);

        Context.Dispose();
    }

    internal LLamaSeqId GetNextSequenceId()
    {
        return checked((LLamaSeqId)_nextSequenceId++);
    }
}
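
// --- Usage sketch (illustrative, not part of the original file) ---
// A minimal example of driving the executor: prompt two conversations, then decode the
// pending tokens of both in a single Infer() call. `ModelParams`, `LLamaWeights.LoadFromFile`
// and `DecodeResult` are real LLamaSharp types, but the exact sampling API on `Conversation`
// varies between versions, so it is omitted here; treat this as an assumption-laden sketch
// rather than a verified end-to-end sample.
//
//     var parameters = new ModelParams("model.gguf");
//     using var model = LLamaWeights.LoadFromFile(parameters);
//     using var executor = new BatchedExecutor(model, parameters);
//
//     // Each Prompt() call allocates a fresh sequence id and queues the prompt tokens in the shared batch.
//     using var a = executor.Prompt("The capital of France is");
//     using var b = executor.Prompt("The capital of Japan is");
//
//     // One Infer() call decodes the queued tokens of *both* conversations together.
//     var status = await executor.Infer();
//     if (status == DecodeResult.NoKvSlot)
//     {
//         // Not enough KV cache memory: dispose one of the conversations and call Infer() again;
//         // the batch is deliberately left intact after a non-Ok result to allow this retry.
//     }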