
Add service for managing Models and Model Contexts

tags/v0.6.0
sa_ddam213, 2 years ago
parent commit c9108f8311
7 changed files with 582 additions and 94 deletions

1. LLama.Web/Async/AsyncLock.cs (+55, -0)
2. LLama.Web/Common/LLamaOptions.cs (+1, -0)
3. LLama.Web/Common/ModelLoadType.cs (+30, -0)
4. LLama.Web/Common/ModelOptions.cs (+112, -94)
5. LLama.Web/LLamaModel.cs (+106, -0)
6. LLama.Web/Services/IModelService.cs (+76, -0)
7. LLama.Web/Services/ModelService.cs (+202, -0)

LLama.Web/Async/AsyncLock.cs (+55, -0)

@@ -0,0 +1,55 @@
namespace LLama.Web.Async
{
    /// <summary>
    /// Creates an async lock for use in a using statement
    /// </summary>
    public sealed class AsyncLock
    {
        private readonly SemaphoreSlim _semaphore;
        private readonly Task<IDisposable> _releaser;


        /// <summary>
        /// Initializes a new instance of the <see cref="AsyncLock"/> class.
        /// </summary>
        public AsyncLock()
        {
            _semaphore = new SemaphoreSlim(1, 1);
            _releaser = Task.FromResult((IDisposable)new Releaser(this));
        }


        /// <summary>
        /// Acquires the lock asynchronously.
        /// </summary>
        /// <returns>An IDisposable that releases the lock when disposed</returns>
        public Task<IDisposable> LockAsync()
        {
            var wait = _semaphore.WaitAsync();
            if (wait.IsCompleted)
                return _releaser;

            // Fast path missed: hand the caller the shared releaser once the semaphore is acquired
            return wait.ContinueWith((_, state) => (IDisposable)state, _releaser.Result, CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default);
        }


        /// <summary>
        /// IDisposable wrapper class to release the lock on dispose
        /// </summary>
        /// <seealso cref="IDisposable" />
        private sealed class Releaser : IDisposable
        {
            private readonly AsyncLock _lockToRelease;

            internal Releaser(AsyncLock lockToRelease)
            {
                _lockToRelease = lockToRelease;
            }

            public void Dispose()
            {
                _lockToRelease._semaphore.Release();
            }
        }
    }
}
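For reference, a minimal usage sketch (not part of this commit; the class and names below are illustrative): callers await LockAsync and dispose the returned handle to release the semaphore.

    using LLama.Web.Async;

    public class SharedResource
    {
        // One AsyncLock instance guards the resource across all async callers
        private static readonly AsyncLock _lock = new AsyncLock();

        public async Task UseResourceAsync()
        {
            using (await _lock.LockAsync())
            {
                // Only one caller executes this section at a time;
                // Dispose() on the returned releaser frees the semaphore.
                await Task.Delay(100);
            }
        }
    }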

LLama.Web/Common/LLamaOptions.cs (+1, -0)

@@ -2,6 +2,7 @@
{
    public class LLamaOptions
    {
        public ModelLoadType ModelLoadType { get; set; }
        public List<ModelOptions> Models { get; set; }
        public List<PromptOptions> Prompts { get; set; } = new List<PromptOptions>();
        public List<ParameterOptions> Parameters { get; set; } = new List<ParameterOptions>();


LLama.Web/Common/ModelLoadType.cs (+30, -0)

@@ -0,0 +1,30 @@
namespace LLama.Web.Common
{
    /// <summary>
    /// The type of model load caching to use
    /// </summary>
    public enum ModelLoadType
    {
        /// <summary>
        /// Only one model will be loaded into memory at a time, any other models will be unloaded before the new one is loaded
        /// </summary>
        Single = 0,

        /// <summary>
        /// Multiple models will be loaded into memory, ensure you use the ModelConfigs to split the hardware resources
        /// </summary>
        Multiple = 1,

        /// <summary>
        /// The first model in the appsettings.json list will be preloaded into memory at app startup
        /// </summary>
        PreloadSingle = 2,

        /// <summary>
        /// All models in the appsettings.json list will be preloaded into memory at app startup, ensure you use the ModelConfigs to split the hardware resources
        /// </summary>
        PreloadMultiple = 3,
    }
}
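An illustrative appsettings.json fragment showing where ModelLoadType would sit; the "LLama" section name, model name, and path are assumptions, not part of this commit:

    {
      "LLama": {
        // Section name and values below are illustrative assumptions
        "ModelLoadType": "PreloadSingle",
        "Models": [
          {
            "Name": "wizard-7b",
            "ModelPath": "/models/wizard-7b.gguf",
            "MaxInstances": 2
          }
        ]
      }
    }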

LLama.Web/Common/ModelOptions.cs (+112, -94)

@@ -3,105 +3,123 @@ using LLama.Abstractions;


namespace LLama.Web.Common
{
    public class ModelOptions : IModelParams
    {
        /// <summary>
        /// Model friendly name
        /// </summary>
        public string Name { get; set; }

        /// <summary>
        /// Max context instances allowed per model
        /// </summary>
        public int MaxInstances { get; set; }

        /// <summary>
        /// Model context size (n_ctx)
        /// </summary>
        public int ContextSize { get; set; } = 512;

        /// <summary>
        /// the GPU that is used for scratch and small tensors
        /// </summary>
        public int MainGpu { get; set; } = 0;

        /// <summary>
        /// if true, reduce VRAM usage at the cost of performance
        /// </summary>
        public bool LowVram { get; set; } = false;

        /// <summary>
        /// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
        /// </summary>
        public int GpuLayerCount { get; set; } = 20;

        /// <summary>
        /// Seed for the random number generator (seed)
        /// </summary>
        public int Seed { get; set; } = 1686349486;

        /// <summary>
        /// Use f16 instead of f32 for memory kv (memory_f16)
        /// </summary>
        public bool UseFp16Memory { get; set; } = true;

        /// <summary>
        /// Use mmap for faster loads (use_mmap)
        /// </summary>
        public bool UseMemorymap { get; set; } = true;

        /// <summary>
        /// Use mlock to keep model in memory (use_mlock)
        /// </summary>
        public bool UseMemoryLock { get; set; } = false;

        /// <summary>
        /// Compute perplexity over the prompt (perplexity)
        /// </summary>
        public bool Perplexity { get; set; } = false;

        /// <summary>
        /// Model path (model)
        /// </summary>
        public string ModelPath { get; set; }

        /// <summary>
        /// model alias
        /// </summary>
        public string ModelAlias { get; set; } = "unknown";

        /// <summary>
        /// lora adapter path (lora_adapter)
        /// </summary>
        public string LoraAdapter { get; set; } = string.Empty;

        /// <summary>
        /// base model path for the lora adapter (lora_base)
        /// </summary>
        public string LoraBase { get; set; } = string.Empty;

        /// <summary>
        /// Number of threads (-1 = autodetect) (n_threads)
        /// </summary>
        public int Threads { get; set; } = Math.Max(Environment.ProcessorCount / 2, 1);

        /// <summary>
        /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
        /// </summary>
        public int BatchSize { get; set; } = 512;

        /// <summary>
        /// Whether to convert eos to newline during the inference.
        /// </summary>
        public bool ConvertEosToNewLine { get; set; } = false;

        /// <summary>
        /// Whether to use embedding mode. (embedding) Note that if this is set to true,
        /// The LLamaModel won't produce text response anymore.
        /// </summary>
        public bool EmbeddingMode { get; set; } = false;

        /// <summary>
        /// how split tensors should be distributed across GPUs
        /// </summary>
        public float[] TensorSplits { get; set; }

        /// <summary>
        /// RoPE base frequency
        /// </summary>
        public float RopeFrequencyBase { get; set; } = 10000.0f;

        /// <summary>
        /// RoPE frequency scaling factor
        /// </summary>
        public float RopeFrequencyScale { get; set; } = 1.0f;

        /// <summary>
        /// Use experimental mul_mat_q kernels
        /// </summary>
        public bool MulMatQ { get; set; }

        /// <summary>
        /// The encoding to use for models

LLama.Web/LLamaModel.cs (+106, -0)

@@ -0,0 +1,106 @@
using LLama.Abstractions;
using LLama.Web.Common;
using System.Collections.Concurrent;

namespace LLama.Web
{
    /// <summary>
    /// Wrapper class for LLamaSharp LLamaWeights
    /// </summary>
    /// <seealso cref="System.IDisposable" />
    public class LLamaModel : IDisposable
    {
        private readonly ModelOptions _config;
        private readonly LLamaWeights _weights;
        private readonly ConcurrentDictionary<string, LLamaContext> _contexts;

        /// <summary>
        /// Initializes a new instance of the <see cref="LLamaModel"/> class.
        /// </summary>
        /// <param name="modelParams">The model parameters.</param>
        public LLamaModel(ModelOptions modelParams)
        {
            _config = modelParams;
            _weights = LLamaWeights.LoadFromFile(modelParams);
            _contexts = new ConcurrentDictionary<string, LLamaContext>();
        }

        /// <summary>
        /// Gets the model configuration.
        /// </summary>
        public IModelParams ModelParams => _config;

        /// <summary>
        /// Gets the LLamaWeights
        /// </summary>
        public LLamaWeights LLamaWeights => _weights;


        /// <summary>
        /// Gets the context count.
        /// </summary>
        public int ContextCount => _contexts.Count;


        /// <summary>
        /// Creates a new context session on this model
        /// </summary>
        /// <param name="contextName">The unique context identifier</param>
        /// <returns>LLamaContext for this LLamaModel</returns>
        /// <exception cref="Exception">Context already exists or the instance limit is reached</exception>
        public Task<LLamaContext> CreateContext(string contextName)
        {
            if (_contexts.TryGetValue(contextName, out var context))
                throw new Exception($"Context with id {contextName} already exists.");

            if (_config.MaxInstances > -1 && ContextCount >= _config.MaxInstances)
                throw new Exception("Maximum model instances reached");

            context = _weights.CreateContext(_config);
            if (_contexts.TryAdd(contextName, context))
                return Task.FromResult(context);

            return Task.FromResult<LLamaContext>(null);
        }

        /// <summary>
        /// Gets a context belonging to this model
        /// </summary>
        /// <param name="contextName">The unique context identifier</param>
        /// <returns>LLamaContext for this LLamaModel with the specified contextName</returns>
        public Task<LLamaContext> GetContext(string contextName)
        {
            if (_contexts.TryGetValue(contextName, out var context))
                return Task.FromResult(context);

            return Task.FromResult<LLamaContext>(null);
        }

        /// <summary>
        /// Removes a context from this model
        /// </summary>
        /// <param name="contextName">The unique context identifier</param>
        /// <returns>true if removed, otherwise false</returns>
        public Task<bool> RemoveContext(string contextName)
        {
            if (!_contexts.TryRemove(contextName, out var context))
                return Task.FromResult(false);

            context?.Dispose();
            return Task.FromResult(true);
        }


        /// <summary>
        /// Performs application-defined tasks associated with freeing, releasing, or resetting unmanaged resources.
        /// </summary>
        public void Dispose()
        {
            foreach (var context in _contexts.Values)
            {
                context?.Dispose();
            }
            _weights.Dispose();
        }
    }
}
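A minimal usage sketch, assuming a valid GGUF model on disk (the name and path below are hypothetical): the model's weights are loaded once and each named context is created over them.

    var model = new LLamaModel(new ModelOptions
    {
        Name = "wizard-7b",                    // hypothetical friendly name
        ModelPath = "/models/wizard-7b.gguf",  // hypothetical model path
        MaxInstances = 2                       // at most two contexts on this model
    });

    // Each named context gets its own LLamaContext over the shared weights
    var context = await model.CreateContext("session-1");
    // ... run inference against the context ...
    await model.RemoveContext("session-1");
    model.Dispose();                           // disposes remaining contexts and the weights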

LLama.Web/Services/IModelService.cs (+76, -0)

@@ -0,0 +1,76 @@
using LLama.Web.Common;

namespace LLama.Web.Services
{
    /// <summary>
    /// Service for managing language Models
    /// </summary>
    public interface IModelService
    {
        /// <summary>
        /// Gets the model with the specified name.
        /// </summary>
        /// <param name="modelName">Name of the model.</param>
        Task<LLamaModel> GetModel(string modelName);


        /// <summary>
        /// Loads a model from a ModelConfig object.
        /// </summary>
        /// <param name="modelOptions">The model configuration.</param>
        Task<LLamaModel> LoadModel(ModelOptions modelOptions);


        /// <summary>
        /// Loads all models found in appsettings.json
        /// </summary>
        Task LoadModels();


        /// <summary>
        /// Unloads the model with the specified name.
        /// </summary>
        /// <param name="modelName">Name of the model.</param>
        Task UnloadModel(string modelName);


        /// <summary>
        /// Unloads all models.
        /// </summary>
        Task UnloadModels();


        /// <summary>
        /// Gets a context with the specified identifier
        /// </summary>
        /// <param name="modelName">Name of the model.</param>
        /// <param name="contextName">The context identifier.</param>
        Task<LLamaContext> GetContext(string modelName, string contextName);


        /// <summary>
        /// Removes the context.
        /// </summary>
        /// <param name="modelName">Name of the model.</param>
        /// <param name="contextName">The context identifier.</param>
        Task<bool> RemoveContext(string modelName, string contextName);


        /// <summary>
        /// Creates a context.
        /// </summary>
        /// <param name="modelName">Name of the model.</param>
        /// <param name="contextName">The context identifier.</param>
        Task<LLamaContext> CreateContext(string modelName, string contextName);


        /// <summary>
        /// Gets or creates the model and context.
        /// This will load a model from disk if not already loaded, and also create the context
        /// </summary>
        /// <param name="modelName">Name of the model.</param>
        /// <param name="contextName">The context identifier.</param>
        /// <returns>Both loaded Model and Context</returns>
        Task<(LLamaModel, LLamaContext)> GetOrCreateModelAndContext(string modelName, string contextName);
    }
}
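A hypothetical consumer sketch (the class and model name are illustrative, e.g. a SignalR hub or controller): GetOrCreateModelAndContext is the one-call path that loads the model on demand and reuses an existing context.

    public class InferenceHandler
    {
        private readonly IModelService _modelService;

        public InferenceHandler(IModelService modelService)
        {
            _modelService = modelService;
        }

        public async Task HandleAsync(string connectionId)
        {
            // "wizard-7b" is a hypothetical model name from appsettings.json;
            // the caller's connection id doubles as the context identifier
            var (model, context) = await _modelService.GetOrCreateModelAndContext("wizard-7b", connectionId);
            // ... run inference on the context ...
        }
    }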

LLama.Web/Services/ModelService.cs (+202, -0)

@@ -0,0 +1,202 @@
using LLama.Web.Async;
using LLama.Web.Common;
using System.Collections.Concurrent;

namespace LLama.Web.Services
{
    /// <summary>
    /// Service for handling Models, Weights & Contexts
    /// </summary>
    public class ModelService : IModelService
    {
        private readonly AsyncLock _modelLock;
        private readonly AsyncLock _contextLock;
        private readonly LLamaOptions _configuration;
        private readonly ConcurrentDictionary<string, LLamaModel> _modelInstances;


        /// <summary>
        /// Initializes a new instance of the <see cref="ModelService"/> class.
        /// </summary>
        /// <param name="configuration">The LLama options.</param>
        public ModelService(LLamaOptions configuration)
        {
            _modelLock = new AsyncLock();
            _contextLock = new AsyncLock();
            _configuration = configuration;
            _modelInstances = new ConcurrentDictionary<string, LLamaModel>();
        }


        /// <summary>
        /// Loads a model with the provided configuration.
        /// </summary>
        /// <param name="modelOptions">The model configuration.</param>
        /// <returns></returns>
        public async Task<LLamaModel> LoadModel(ModelOptions modelOptions)
        {
            if (_modelInstances.TryGetValue(modelOptions.Name, out var existingModel))
                return existingModel;

            using (await _modelLock.LockAsync())
            {
                // Double-check after acquiring the lock in case another caller loaded it first
                if (_modelInstances.TryGetValue(modelOptions.Name, out var model))
                    return model;

                // If in single mode unload any other models
                if (_configuration.ModelLoadType == ModelLoadType.Single
                    || _configuration.ModelLoadType == ModelLoadType.PreloadSingle)
                    await UnloadModels();


                model = new LLamaModel(modelOptions);
                _modelInstances.TryAdd(modelOptions.Name, model);
                return model;
            }
        }


        /// <summary>
        /// Preloads the models configured for preload at startup.
        /// </summary>
        public async Task LoadModels()
        {
            if (_configuration.ModelLoadType == ModelLoadType.Single
                || _configuration.ModelLoadType == ModelLoadType.Multiple)
                return;

            foreach (var modelConfig in _configuration.Models)
            {
                await LoadModel(modelConfig);

                // Only preload the first model if in PreloadSingle mode
                if (_configuration.ModelLoadType == ModelLoadType.PreloadSingle)
                    break;
            }
        }


        /// <summary>
        /// Unloads the model.
        /// </summary>
        /// <param name="modelName">Name of the model.</param>
        /// <returns></returns>
        public Task UnloadModel(string modelName)
        {
            if (_modelInstances.TryRemove(modelName, out var model))
            {
                model?.Dispose();
                return Task.FromResult(true);
            }
            return Task.FromResult(false);
        }


        /// <summary>
        /// Unloads all models.
        /// </summary>
        public async Task UnloadModels()
        {
            foreach (var modelName in _modelInstances.Keys)
            {
                await UnloadModel(modelName);
            }
        }


        /// <summary>
        /// Gets a model by name.
        /// </summary>
        /// <param name="modelName">Name of the model.</param>
        /// <returns></returns>
        public Task<LLamaModel> GetModel(string modelName)
        {
            _modelInstances.TryGetValue(modelName, out var model);
            return Task.FromResult(model);
        }


        /// <summary>
        /// Gets a context from the specified model.
        /// </summary>
        /// <param name="modelName">Name of the model.</param>
        /// <param name="contextName">The contextName.</param>
        /// <returns></returns>
        /// <exception cref="System.Exception">Model not found</exception>
        public async Task<LLamaContext> GetContext(string modelName, string contextName)
        {
            if (!_modelInstances.TryGetValue(modelName, out var model))
                throw new Exception("Model not found");

            return await model.GetContext(contextName);
        }


        /// <summary>
        /// Creates a context on the specified model.
        /// </summary>
        /// <param name="modelName">Name of the model.</param>
        /// <param name="contextName">The contextName.</param>
        /// <returns></returns>
        /// <exception cref="System.Exception">Model not found</exception>
        public async Task<LLamaContext> CreateContext(string modelName, string contextName)
        {
            if (!_modelInstances.TryGetValue(modelName, out var model))
                throw new Exception("Model not found");

            using (await _contextLock.LockAsync())
            {
                return await model.CreateContext(contextName);
            }
        }


        /// <summary>
        /// Removes a context from the specified model.
        /// </summary>
        /// <param name="modelName">Name of the model.</param>
        /// <param name="contextName">The contextName.</param>
        /// <returns></returns>
        /// <exception cref="System.Exception">Model not found</exception>
        public async Task<bool> RemoveContext(string modelName, string contextName)
        {
            if (!_modelInstances.TryGetValue(modelName, out var model))
                throw new Exception("Model not found");

            using (await _contextLock.LockAsync())
            {
                return await model.RemoveContext(contextName);
            }
        }


        /// <summary>
        /// Loads, gets, or creates a Model and a Context
        /// </summary>
        /// <param name="modelName">Name of the model.</param>
        /// <param name="contextName">The contextName.</param>
        /// <returns></returns>
        /// <exception cref="System.Exception">Model option '{modelName}' not found</exception>
        public async Task<(LLamaModel, LLamaContext)> GetOrCreateModelAndContext(string modelName, string contextName)
        {
            if (_modelInstances.TryGetValue(modelName, out var model))
                return (model, await model.GetContext(contextName) ?? await model.CreateContext(contextName));

            // Get model configuration
            var modelConfig = _configuration.Models.FirstOrDefault(x => x.Name == modelName);
            if (modelConfig is null)
                throw new Exception($"Model option '{modelName}' not found");

            // Load Model
            model = await LoadModel(modelConfig);

            // Get or Create Context
            return (model, await model.GetContext(contextName) ?? await model.CreateContext(contextName));
        }
    }
}
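A possible Program.cs wiring sketch; the "LLama" configuration section name and this exact registration pattern are assumptions, since the host setup is not part of this commit:

    var builder = WebApplication.CreateBuilder(args);

    // Bind LLamaOptions from configuration (section name is an assumption)
    var llamaOptions = builder.Configuration.GetSection("LLama").Get<LLamaOptions>();
    builder.Services.AddSingleton(llamaOptions);
    builder.Services.AddSingleton<IModelService, ModelService>();

    var app = builder.Build();

    // Honor PreloadSingle/PreloadMultiple by loading models at startup
    await app.Services.GetRequiredService<IModelService>().LoadModels();

    app.Run();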
