
Add service for managing Models and Model Contexts

tags/v0.6.0
sa_ddam213, 2 years ago
parent commit c9108f8311
7 changed files with 582 additions and 94 deletions

1. LLama.Web/Async/AsyncLock.cs (+55, -0)
2. LLama.Web/Common/LLamaOptions.cs (+1, -0)
3. LLama.Web/Common/ModelLoadType.cs (+30, -0)
4. LLama.Web/Common/ModelOptions.cs (+112, -94)
5. LLama.Web/LLamaModel.cs (+106, -0)
6. LLama.Web/Services/IModelService.cs (+76, -0)
7. LLama.Web/Services/ModelService.cs (+202, -0)

LLama.Web/Async/AsyncLock.cs (+55, -0)

@@ -0,0 +1,55 @@
namespace LLama.Web.Async
{
    /// <summary>
    /// Creates an async lock for use in a using statement
    /// </summary>
    public sealed class AsyncLock
    {
        private readonly SemaphoreSlim _semaphore;
        private readonly Task<IDisposable> _releaser;


        /// <summary>
        /// Initializes a new instance of the <see cref="AsyncLock"/> class.
        /// </summary>
        public AsyncLock()
        {
            _semaphore = new SemaphoreSlim(1, 1);
            _releaser = Task.FromResult((IDisposable)new Releaser(this));
        }


        /// <summary>
        /// Acquires the lock asynchronously.
        /// </summary>
        /// <returns>An IDisposable that releases the lock when disposed</returns>
        public Task<IDisposable> LockAsync()
        {
            var wait = _semaphore.WaitAsync();
            if (wait.IsCompleted)
                return _releaser;

            // Fast path missed: hand the caller the shared releaser once the semaphore is acquired
            return wait.ContinueWith((_, state) => (IDisposable)state, _releaser.Result, CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default);
        }


        /// <summary>
        /// IDisposable wrapper class to release the lock on dispose
        /// </summary>
        /// <seealso cref="IDisposable" />
        private sealed class Releaser : IDisposable
        {
            private readonly AsyncLock _lockToRelease;

            internal Releaser(AsyncLock lockToRelease)
            {
                _lockToRelease = lockToRelease;
            }

            public void Dispose()
            {
                _lockToRelease._semaphore.Release();
            }
        }
    }
}
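For reference, a minimal usage sketch (not part of this commit; the class and names below are illustrative): callers await LockAsync and dispose the returned handle to release the semaphore.

    using LLama.Web.Async;

    public class SharedResource
    {
        // One AsyncLock instance guards the resource across all async callers
        private static readonly AsyncLock _lock = new AsyncLock();

        public async Task UseResourceAsync()
        {
            using (await _lock.LockAsync())
            {
                // Only one caller executes this section at a time;
                // Dispose() on the returned releaser frees the semaphore.
                await Task.Delay(100);
            }
        }
    }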

LLama.Web/Common/LLamaOptions.cs (+1, -0)

@@ -2,6 +2,7 @@
{
    public class LLamaOptions
    {
        public ModelLoadType ModelLoadType { get; set; }
        public List<ModelOptions> Models { get; set; }
        public List<PromptOptions> Prompts { get; set; } = new List<PromptOptions>();
        public List<ParameterOptions> Parameters { get; set; } = new List<ParameterOptions>();


LLama.Web/Common/ModelLoadType.cs (+30, -0)

@@ -0,0 +1,30 @@
namespace LLama.Web.Common
{
    /// <summary>
    /// The type of model load caching to use
    /// </summary>
    public enum ModelLoadType
    {
        /// <summary>
        /// Only one model will be loaded into memory at a time, any other models will be unloaded before the new one is loaded
        /// </summary>
        Single = 0,

        /// <summary>
        /// Multiple models will be loaded into memory, ensure you use the ModelConfigs to split the hardware resources
        /// </summary>
        Multiple = 1,

        /// <summary>
        /// The first model in the appsettings.json list will be preloaded into memory at app startup
        /// </summary>
        PreloadSingle = 2,

        /// <summary>
        /// All models in the appsettings.json list will be preloaded into memory at app startup, ensure you use the ModelConfigs to split the hardware resources
        /// </summary>
        PreloadMultiple = 3,
    }
}
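An illustrative appsettings.json fragment showing where ModelLoadType would sit; the "LLama" section name, model name, and path are assumptions, not part of this commit:

    {
      "LLama": {
        // Section name and values below are illustrative assumptions
        "ModelLoadType": "PreloadSingle",
        "Models": [
          {
            "Name": "wizard-7b",
            "ModelPath": "/models/wizard-7b.gguf",
            "MaxInstances": 2
          }
        ]
      }
    }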

LLama.Web/Common/ModelOptions.cs (+112, -94)

@@ -3,105 +3,123 @@ using LLama.Abstractions;


namespace LLama.Web.Common
{
    public class ModelOptions : IModelParams
    {
        /// <summary>
        /// Model friendly name
        /// </summary>
        public string Name { get; set; }

        /// <summary>
        /// Max context instances allowed per model
        /// </summary>
        public int MaxInstances { get; set; }

        /// <summary>
        /// Model context size (n_ctx)
        /// </summary>
        public int ContextSize { get; set; } = 512;

        /// <summary>
        /// the GPU that is used for scratch and small tensors
        /// </summary>
        public int MainGpu { get; set; } = 0;

        /// <summary>
        /// if true, reduce VRAM usage at the cost of performance
        /// </summary>
        public bool LowVram { get; set; } = false;

        /// <summary>
        /// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
        /// </summary>
        public int GpuLayerCount { get; set; } = 20;

        /// <summary>
        /// Seed for the random number generator (seed)
        /// </summary>
        public int Seed { get; set; } = 1686349486;

        /// <summary>
        /// Use f16 instead of f32 for memory kv (memory_f16)
        /// </summary>
        public bool UseFp16Memory { get; set; } = true;

        /// <summary>
        /// Use mmap for faster loads (use_mmap)
        /// </summary>
        public bool UseMemorymap { get; set; } = true;

        /// <summary>
        /// Use mlock to keep model in memory (use_mlock)
        /// </summary>
        public bool UseMemoryLock { get; set; } = false;

        /// <summary>
        /// Compute perplexity over the prompt (perplexity)
        /// </summary>
        public bool Perplexity { get; set; } = false;

        /// <summary>
        /// Model path (model)
        /// </summary>
        public string ModelPath { get; set; }

        /// <summary>
        /// model alias
        /// </summary>
        public string ModelAlias { get; set; } = "unknown";

        /// <summary>
        /// lora adapter path (lora_adapter)
        /// </summary>
        public string LoraAdapter { get; set; } = string.Empty;

        /// <summary>
        /// base model path for the lora adapter (lora_base)
        /// </summary>
        public string LoraBase { get; set; } = string.Empty;

        /// <summary>
        /// Number of threads (-1 = autodetect) (n_threads)
        /// </summary>
        public int Threads { get; set; } = Math.Max(Environment.ProcessorCount / 2, 1);

        /// <summary>
        /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
        /// </summary>
        public int BatchSize { get; set; } = 512;

        /// <summary>
        /// Whether to convert eos to newline during the inference.
        /// </summary>
        public bool ConvertEosToNewLine { get; set; } = false;

        /// <summary>
        /// Whether to use embedding mode. (embedding) Note that if this is set to true,
        /// The LLamaModel won't produce text response anymore.
        /// </summary>
        public bool EmbeddingMode { get; set; } = false;

        /// <summary>
        /// how split tensors should be distributed across GPUs
        /// </summary>
        public float[] TensorSplits { get; set; }

        /// <summary>
        /// RoPE base frequency
        /// </summary>
        public float RopeFrequencyBase { get; set; } = 10000.0f;

        /// <summary>
        /// RoPE frequency scaling factor
        /// </summary>
        public float RopeFrequencyScale { get; set; } = 1.0f;

        /// <summary>
        /// Use experimental mul_mat_q kernels
        /// </summary>
        public bool MulMatQ { get; set; }

        /// <summary>
        /// The encoding to use for models

LLama.Web/LLamaModel.cs (+106, -0)

@@ -0,0 +1,106 @@
using LLama.Abstractions;
using LLama.Web.Common;
using System.Collections.Concurrent;

namespace LLama.Web
{
    /// <summary>
    /// Wrapper class for LLamaSharp LLamaWeights
    /// </summary>
    /// <seealso cref="System.IDisposable" />
    public class LLamaModel : IDisposable
    {
        private readonly ModelOptions _config;
        private readonly LLamaWeights _weights;
        private readonly ConcurrentDictionary<string, LLamaContext> _contexts;

        /// <summary>
        /// Initializes a new instance of the <see cref="LLamaModel"/> class.
        /// </summary>
        /// <param name="modelParams">The model parameters.</param>
        public LLamaModel(ModelOptions modelParams)
        {
            _config = modelParams;
            _weights = LLamaWeights.LoadFromFile(modelParams);
            _contexts = new ConcurrentDictionary<string, LLamaContext>();
        }

        /// <summary>
        /// Gets the model configuration.
        /// </summary>
        public IModelParams ModelParams => _config;

        /// <summary>
        /// Gets the LLamaWeights
        /// </summary>
        public LLamaWeights LLamaWeights => _weights;


        /// <summary>
        /// Gets the context count.
        /// </summary>
        public int ContextCount => _contexts.Count;


        /// <summary>
        /// Creates a new context session on this model
        /// </summary>
        /// <param name="contextName">The unique context identifier</param>
        /// <returns>LLamaContext for this LLamaModel</returns>
        /// <exception cref="Exception">Context already exists or the instance limit is reached</exception>
        public Task<LLamaContext> CreateContext(string contextName)
        {
            if (_contexts.TryGetValue(contextName, out var context))
                throw new Exception($"Context with id {contextName} already exists.");

            if (_config.MaxInstances > -1 && ContextCount >= _config.MaxInstances)
                throw new Exception("Maximum model instances reached");

            context = _weights.CreateContext(_config);
            if (_contexts.TryAdd(contextName, context))
                return Task.FromResult(context);

            return Task.FromResult<LLamaContext>(null);
        }

        /// <summary>
        /// Gets a context belonging to this model
        /// </summary>
        /// <param name="contextName">The unique context identifier</param>
        /// <returns>LLamaContext for this LLamaModel with the specified contextName</returns>
        public Task<LLamaContext> GetContext(string contextName)
        {
            if (_contexts.TryGetValue(contextName, out var context))
                return Task.FromResult(context);

            return Task.FromResult<LLamaContext>(null);
        }

        /// <summary>
        /// Removes a context from this model
        /// </summary>
        /// <param name="contextName">The unique context identifier</param>
        /// <returns>true if removed, otherwise false</returns>
        public Task<bool> RemoveContext(string contextName)
        {
            if (!_contexts.TryRemove(contextName, out var context))
                return Task.FromResult(false);

            context?.Dispose();
            return Task.FromResult(true);
        }


        /// <summary>
        /// Performs application-defined tasks associated with freeing, releasing, or resetting unmanaged resources.
        /// </summary>
        public void Dispose()
        {
            foreach (var context in _contexts.Values)
            {
                context?.Dispose();
            }
            _weights.Dispose();
        }
    }
}
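A minimal usage sketch, assuming a valid GGUF model on disk (the name and path below are hypothetical): the model's weights are loaded once and each named context is created over them.

    var model = new LLamaModel(new ModelOptions
    {
        Name = "wizard-7b",                    // hypothetical friendly name
        ModelPath = "/models/wizard-7b.gguf",  // hypothetical model path
        MaxInstances = 2                       // at most two contexts on this model
    });

    // Each named context gets its own LLamaContext over the shared weights
    var context = await model.CreateContext("session-1");
    // ... run inference against the context ...
    await model.RemoveContext("session-1");
    model.Dispose();                           // disposes remaining contexts and the weights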

LLama.Web/Services/IModelService.cs (+76, -0)

@@ -0,0 +1,76 @@
using LLama.Web.Common;

namespace LLama.Web.Services
{
    /// <summary>
    /// Service for managing language Models
    /// </summary>
    public interface IModelService
    {
        /// <summary>
        /// Gets the model with the specified name.
        /// </summary>
        /// <param name="modelName">Name of the model.</param>
        Task<LLamaModel> GetModel(string modelName);


        /// <summary>
        /// Loads a model from a ModelConfig object.
        /// </summary>
        /// <param name="modelOptions">The model configuration.</param>
        Task<LLamaModel> LoadModel(ModelOptions modelOptions);


        /// <summary>
        /// Loads all models found in appsettings.json
        /// </summary>
        Task LoadModels();


        /// <summary>
        /// Unloads the model with the specified name.
        /// </summary>
        /// <param name="modelName">Name of the model.</param>
        Task UnloadModel(string modelName);


        /// <summary>
        /// Unloads all models.
        /// </summary>
        Task UnloadModels();


        /// <summary>
        /// Gets a context with the specified identifier
        /// </summary>
        /// <param name="modelName">Name of the model.</param>
        /// <param name="contextName">The context identifier.</param>
        Task<LLamaContext> GetContext(string modelName, string contextName);


        /// <summary>
        /// Removes the context.
        /// </summary>
        /// <param name="modelName">Name of the model.</param>
        /// <param name="contextName">The context identifier.</param>
        Task<bool> RemoveContext(string modelName, string contextName);


        /// <summary>
        /// Creates a context.
        /// </summary>
        /// <param name="modelName">Name of the model.</param>
        /// <param name="contextName">The context identifier.</param>
        Task<LLamaContext> CreateContext(string modelName, string contextName);


        /// <summary>
        /// Gets or creates the model and context.
        /// This will load a model from disk if not already loaded, and also create the context
        /// </summary>
        /// <param name="modelName">Name of the model.</param>
        /// <param name="contextName">The context identifier.</param>
        /// <returns>Both loaded Model and Context</returns>
        Task<(LLamaModel, LLamaContext)> GetOrCreateModelAndContext(string modelName, string contextName);
    }
}
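A hypothetical consumer sketch (the class and model name are illustrative, e.g. a SignalR hub or controller): GetOrCreateModelAndContext is the one-call path that loads the model on demand and reuses an existing context.

    public class InferenceHandler
    {
        private readonly IModelService _modelService;

        public InferenceHandler(IModelService modelService)
        {
            _modelService = modelService;
        }

        public async Task HandleAsync(string connectionId)
        {
            // "wizard-7b" is a hypothetical model name from appsettings.json;
            // the caller's connection id doubles as the context identifier
            var (model, context) = await _modelService.GetOrCreateModelAndContext("wizard-7b", connectionId);
            // ... run inference on the context ...
        }
    }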

LLama.Web/Services/ModelService.cs (+202, -0)

@@ -0,0 +1,202 @@
using LLama.Web.Async;
using LLama.Web.Common;
using System.Collections.Concurrent;

namespace LLama.Web.Services
{
    /// <summary>
    /// Service for handling Models, Weights & Contexts
    /// </summary>
    public class ModelService : IModelService
    {
        private readonly AsyncLock _modelLock;
        private readonly AsyncLock _contextLock;
        private readonly LLamaOptions _configuration;
        private readonly ConcurrentDictionary<string, LLamaModel> _modelInstances;


        /// <summary>
        /// Initializes a new instance of the <see cref="ModelService"/> class.
        /// </summary>
        /// <param name="configuration">The LLama options.</param>
        public ModelService(LLamaOptions configuration)
        {
            _modelLock = new AsyncLock();
            _contextLock = new AsyncLock();
            _configuration = configuration;
            _modelInstances = new ConcurrentDictionary<string, LLamaModel>();
        }


        /// <summary>
        /// Loads a model with the provided configuration.
        /// </summary>
        /// <param name="modelOptions">The model configuration.</param>
        /// <returns></returns>
        public async Task<LLamaModel> LoadModel(ModelOptions modelOptions)
        {
            if (_modelInstances.TryGetValue(modelOptions.Name, out var existingModel))
                return existingModel;

            using (await _modelLock.LockAsync())
            {
                // Double-check after acquiring the lock in case another caller loaded it first
                if (_modelInstances.TryGetValue(modelOptions.Name, out var model))
                    return model;

                // If in single mode unload any other models
                if (_configuration.ModelLoadType == ModelLoadType.Single
                    || _configuration.ModelLoadType == ModelLoadType.PreloadSingle)
                    await UnloadModels();


                model = new LLamaModel(modelOptions);
                _modelInstances.TryAdd(modelOptions.Name, model);
                return model;
            }
        }


        /// <summary>
        /// Preloads the models configured for preload at startup.
        /// </summary>
        public async Task LoadModels()
        {
            if (_configuration.ModelLoadType == ModelLoadType.Single
                || _configuration.ModelLoadType == ModelLoadType.Multiple)
                return;

            foreach (var modelConfig in _configuration.Models)
            {
                await LoadModel(modelConfig);

                // Only preload the first model if in PreloadSingle mode
                if (_configuration.ModelLoadType == ModelLoadType.PreloadSingle)
                    break;
            }
        }


        /// <summary>
        /// Unloads the model.
        /// </summary>
        /// <param name="modelName">Name of the model.</param>
        /// <returns></returns>
        public Task UnloadModel(string modelName)
        {
            if (_modelInstances.TryRemove(modelName, out var model))
            {
                model?.Dispose();
                return Task.FromResult(true);
            }
            return Task.FromResult(false);
        }


        /// <summary>
        /// Unloads all models.
        /// </summary>
        public async Task UnloadModels()
        {
            foreach (var modelName in _modelInstances.Keys)
            {
                await UnloadModel(modelName);
            }
        }


        /// <summary>
        /// Gets a model by name.
        /// </summary>
        /// <param name="modelName">Name of the model.</param>
        /// <returns></returns>
        public Task<LLamaModel> GetModel(string modelName)
        {
            _modelInstances.TryGetValue(modelName, out var model);
            return Task.FromResult(model);
        }


        /// <summary>
        /// Gets a context from the specified model.
        /// </summary>
        /// <param name="modelName">Name of the model.</param>
        /// <param name="contextName">The contextName.</param>
        /// <returns></returns>
        /// <exception cref="System.Exception">Model not found</exception>
        public async Task<LLamaContext> GetContext(string modelName, string contextName)
        {
            if (!_modelInstances.TryGetValue(modelName, out var model))
                throw new Exception("Model not found");

            return await model.GetContext(contextName);
        }


        /// <summary>
        /// Creates a context on the specified model.
        /// </summary>
        /// <param name="modelName">Name of the model.</param>
        /// <param name="contextName">The contextName.</param>
        /// <returns></returns>
        /// <exception cref="System.Exception">Model not found</exception>
        public async Task<LLamaContext> CreateContext(string modelName, string contextName)
        {
            if (!_modelInstances.TryGetValue(modelName, out var model))
                throw new Exception("Model not found");

            using (await _contextLock.LockAsync())
            {
                return await model.CreateContext(contextName);
            }
        }


        /// <summary>
        /// Removes a context from the specified model.
        /// </summary>
        /// <param name="modelName">Name of the model.</param>
        /// <param name="contextName">The contextName.</param>
        /// <returns></returns>
        /// <exception cref="System.Exception">Model not found</exception>
        public async Task<bool> RemoveContext(string modelName, string contextName)
        {
            if (!_modelInstances.TryGetValue(modelName, out var model))
                throw new Exception("Model not found");

            using (await _contextLock.LockAsync())
            {
                return await model.RemoveContext(contextName);
            }
        }


        /// <summary>
        /// Loads, gets, or creates a Model and a Context
        /// </summary>
        /// <param name="modelName">Name of the model.</param>
        /// <param name="contextName">The contextName.</param>
        /// <returns></returns>
        /// <exception cref="System.Exception">Model option '{modelName}' not found</exception>
        public async Task<(LLamaModel, LLamaContext)> GetOrCreateModelAndContext(string modelName, string contextName)
        {
            if (_modelInstances.TryGetValue(modelName, out var model))
                return (model, await model.GetContext(contextName) ?? await model.CreateContext(contextName));

            // Get model configuration
            var modelConfig = _configuration.Models.FirstOrDefault(x => x.Name == modelName);
            if (modelConfig is null)
                throw new Exception($"Model option '{modelName}' not found");

            // Load Model
            model = await LoadModel(modelConfig);

            // Get or Create Context
            return (model, await model.GetContext(contextName) ?? await model.CreateContext(contextName));
        }
    }
}
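A possible Program.cs wiring sketch; the "LLama" configuration section name and this exact registration pattern are assumptions, since the host setup is not part of this commit:

    var builder = WebApplication.CreateBuilder(args);

    // Bind LLamaOptions from configuration (section name is an assumption)
    var llamaOptions = builder.Configuration.GetSection("LLama").Get<LLamaOptions>();
    builder.Services.AddSingleton(llamaOptions);
    builder.Services.AddSingleton<IModelService, ModelService>();

    var app = builder.Build();

    // Honor PreloadSingle/PreloadMultiple by loading models at startup
    await app.Services.GetRequiredService<IModelService>().LoadModels();

    app.Run();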
