Added a safe `TensorSplitsCollection` to the params which prevents incorrectly setting the `tensor_splits` collection

2 years ago · 6a4cd506bd
--- a/LLama.Web/Common/ModelOptions.cs
+++ b/LLama.Web/Common/ModelOptions.cs
@@ -106,7 +106,7 @@ namespace LLama.Web.Common
        /// <summary>
        /// how split tensors should be distributed across GPUs
        /// </summary>
        public float[] TensorSplits { get; set; }
        /// <remarks>"[ 3, 2 ]" will assign 60% of the data to GPU 0 and 40% to GPU 1.</remarks>
        public TensorSplitsCollection TensorSplits { get; set; } = new();

        /// <summary>
        /// RoPE base frequency
--- a/LLama/Abstractions/IModelParams.cs
+++ b/LLama/Abstractions/IModelParams.cs
@@ -1,6 +1,8 @@
 using System;
 using System.Buffers;
 using System.Collections.Generic;
 using System.Linq;
 using LLama.Native;

 namespace LLama.Abstractions
 {
@@ -37,7 +39,7 @@ namespace LLama.Abstractions
        /// <summary>
        /// how split tensors should be distributed across GPUs
        /// </summary>
        float[]? TensorSplits { get; set; }
        /// <remarks>"[ 3, 2 ]" will assign 60% of the data to GPU 0 and 40% to GPU 1.</remarks>
        TensorSplitsCollection TensorSplits { get; set; }

        /// <summary>
        /// Load vocab only (no weights)
@@ -98,4 +100,42 @@ namespace LLama.Abstractions
            }
        }
    }

    /// <summary>
    /// Fixed-length set of per-device values describing how tensors are split across multiple GPUs.
    /// The number of slots is read from <see cref="NativeApi.llama_max_devices"/> when the collection is created.
    /// </summary>
    public sealed class TensorSplitsCollection
    {
        private readonly float[] _splits = new float[NativeApi.llama_max_devices()];

        /// <summary>
        /// Number of slots held by this collection
        /// </summary>
        public int Length
        {
            get { return _splits.Length; }
        }

        /// <summary>
        /// Read or write the proportion of work assigned to a single device.
        /// </summary>
        /// <remarks>"[ 3, 2 ]" will assign 60% of the data to GPU 0 and 40% to GPU 1.</remarks>
        /// <param name="index">Zero-based slot of the device</param>
        /// <returns>The value currently stored for that device</returns>
        public float this[int index]
        {
            get { return _splits[index]; }
            set { _splits[index] = value; }
        }

        /// <summary>
        /// Reset every slot back to zero
        /// </summary>
        public void Clear()
        {
            Array.Clear(_splits, 0, Length);
        }

        /// <summary>
        /// Pin the backing array in memory and return the handle that holds the pin
        /// </summary>
        /// <returns>A handle to the pinned backing array</returns>
        internal MemoryHandle Pin()
        {
            Memory<float> memory = _splits;
            return memory.Pin();
        }
    }
 }
--- a/LLama/Common/ModelParams.cs
+++ b/LLama/Common/ModelParams.cs
@@ -82,9 +82,10 @@ namespace LLama.Common
        public bool EmbeddingMode { get; set; }

        /// <summary>
        /// how split tensors should be distributed across GPUs
        /// how split tensors should be distributed across GPUs.
        /// </summary>
        public float[]? TensorSplits { get; set; }
        /// <remarks>
        /// "[ 3, 2 ]" will assign 60% of the data to GPU 0 and 40% to GPU 1.
        /// Initialised to a fresh collection so the property is not null when it is pinned for the native call.
        /// </remarks>
        public TensorSplitsCollection TensorSplits { get; set; } = new TensorSplitsCollection();

 		/// <summary>
 		/// RoPE base frequency
--- a/LLama/Extensions/IModelParamsExtensions.cs
+++ b/LLama/Extensions/IModelParamsExtensions.cs
@@ -1,7 +1,6 @@
 using System.IO;
 using System;
 using System.Buffers;
 using System.Diagnostics;
 using LLama.Abstractions;
 using LLama.Native;

@@ -22,25 +21,6 @@ namespace LLama.Extensions
        /// <exception cref="ArgumentException"></exception>
        public static MemoryHandle ToLlamaModelParams(this IModelParams @params, out LLamaModelParams result)
        {
            var maxDevices = NativeApi.llama_max_devices();
            var splits = @params.TensorSplits;
            if (splits != null)
            {
                Debug.Assert(@params.TensorSplits != null);

                // If the splits array is too large just throw
                if (splits.Length > maxDevices)
                    throw new ArgumentException($"TensorSplits size must be <= NativeApi.llama_max_devices() ({maxDevices})");

                // If the splits array is too small pad it up to the necessary size
                if (splits.Length < maxDevices)
                {
                    splits = new float[maxDevices];
                    for (var i = 0; i < @params.TensorSplits.Length; i++)
                        splits[i] = @params.TensorSplits[i];
                }
            }

            result = NativeApi.llama_model_default_params();

            result.main_gpu = @params.MainGpu;
@@ -49,7 +29,7 @@ namespace LLama.Extensions
            result.use_mmap = @params.UseMemorymap;
            result.vocab_only = @params.VocabOnly;

            var pin = splits.AsMemory().Pin();
            var pin = @params.TensorSplits.Pin();
            unsafe
            {
                result.tensor_split = (float*)pin.Pointer;