| @@ -106,7 +106,7 @@ namespace LLama.Web.Common | |||
| /// <summary> | |||
| /// how split tensors should be distributed across GPUs | |||
| /// </summary> | |||
| public float[] TensorSplits { get; set; } | |||
| public TensorSplitsCollection TensorSplits { get; set; } = new(); | |||
| /// <summary> | |||
| /// RoPE base frequency | |||
| @@ -1,6 +1,8 @@ | |||
| using System; | |||
| using System.Buffers; | |||
| using System.Collections.Generic; | |||
| using System.Linq; | |||
| using LLama.Native; | |||
| namespace LLama.Abstractions | |||
| { | |||
| @@ -37,7 +39,7 @@ namespace LLama.Abstractions | |||
| /// <summary> | |||
| /// how split tensors should be distributed across GPUs | |||
| /// </summary> | |||
| float[]? TensorSplits { get; set; } | |||
| TensorSplitsCollection TensorSplits { get; set; } | |||
| /// <summary> | |||
| /// Load vocab only (no weights) | |||
| @@ -98,4 +100,42 @@ namespace LLama.Abstractions | |||
| } | |||
| } | |||
| } | |||
| /// <summary> | |||
| /// A fixed size array to set the tensor splits across multiple GPUs. | |||
| /// The number of entries is taken from <c>NativeApi.llama_max_devices()</c> when the collection is constructed and cannot be changed afterwards. | |||
| /// </summary> | |||
| public sealed class TensorSplitsCollection | |||
| { | |||
| // Backing storage: one float per device slot, all zero until assigned. This is the array that Pin() exposes. | |||
| private readonly float[] _array = new float[NativeApi.llama_max_devices()]; | |||
| /// <summary> | |||
| /// The size of this array, i.e. the number of device slots that can be assigned a split value. | |||
| /// </summary> | |||
| public int Length => _array.Length; | |||
| /// <summary> | |||
| /// Get or set the proportion of work to do on the given device. | |||
| /// </summary> | |||
| /// <remarks>"[ 3, 2 ]" will assign 60% of the data to GPU 0 and 40% to GPU 1.</remarks> | |||
| /// <param name="index">Zero-based device index; must be in the range <c>0</c> to <c>Length - 1</c>.</param> | |||
| /// <returns>The split value currently stored for that device (zero if never set).</returns> | |||
| /// <exception cref="IndexOutOfRangeException">Thrown by the underlying array access when <paramref name="index"/> is outside the valid range.</exception> | |||
| public float this[int index] | |||
| { | |||
| get => _array[index]; | |||
| set => _array[index] = value; | |||
| } | |||
| /// <summary> | |||
| /// Set all values to zero | |||
| /// </summary> | |||
| public void Clear() | |||
| { | |||
| Array.Clear(_array, 0, _array.Length); | |||
| } | |||
| /// <summary> | |||
| /// Pin the backing array so that a pointer to its contents can be handed to native code. | |||
| /// </summary> | |||
| /// <returns>A <see cref="MemoryHandle"/> over the whole backing array. The caller owns the handle and should dispose it to release the pin.</returns> | |||
| internal MemoryHandle Pin() | |||
| { | |||
| return _array.AsMemory().Pin(); | |||
| } | |||
| } | |||
| } | |||
| @@ -82,9 +82,10 @@ namespace LLama.Common | |||
| public bool EmbeddingMode { get; set; } | |||
| /// <summary> | |||
| /// how split tensors should be distributed across GPUs | |||
| /// how split tensors should be distributed across GPUs. | |||
| /// </summary> | |||
| public float[]? TensorSplits { get; set; } | |||
| /// <remarks> | |||
| /// "[ 3, 2 ]" will assign 60% of the data to GPU 0 and 40% to GPU 1. | |||
| /// Initialised to an empty (all zero) collection so the property is never null by default; | |||
| /// consumers call <c>Pin()</c> on it directly without a null check. | |||
| /// </remarks> | |||
| public TensorSplitsCollection TensorSplits { get; set; } = new(); | |||
| /// <summary> | |||
| /// RoPE base frequency | |||
| @@ -1,7 +1,6 @@ | |||
| using System.IO; | |||
| using System; | |||
| using System.Buffers; | |||
| using System.Diagnostics; | |||
| using LLama.Abstractions; | |||
| using LLama.Native; | |||
| @@ -22,25 +21,6 @@ namespace LLama.Extensions | |||
| /// <exception cref="ArgumentException"></exception> | |||
| public static MemoryHandle ToLlamaModelParams(this IModelParams @params, out LLamaModelParams result) | |||
| { | |||
| var maxDevices = NativeApi.llama_max_devices(); | |||
| var splits = @params.TensorSplits; | |||
| if (splits != null) | |||
| { | |||
| Debug.Assert(@params.TensorSplits != null); | |||
| // If the splits array is too large just throw | |||
| if (splits.Length > maxDevices) | |||
| throw new ArgumentException($"TensorSplits size must be <= NativeApi.llama_max_devices() ({maxDevices})"); | |||
| // If the splits array is too small pad it up to the necessary size | |||
| if (splits.Length < maxDevices) | |||
| { | |||
| splits = new float[maxDevices]; | |||
| for (var i = 0; i < @params.TensorSplits.Length; i++) | |||
| splits[i] = @params.TensorSplits[i]; | |||
| } | |||
| } | |||
| result = NativeApi.llama_model_default_params(); | |||
| result.main_gpu = @params.MainGpu; | |||
| @@ -49,7 +29,7 @@ namespace LLama.Extensions | |||
| result.use_mmap = @params.UseMemorymap; | |||
| result.vocab_only = @params.VocabOnly; | |||
| var pin = splits.AsMemory().Pin(); | |||
| var pin = @params.TensorSplits.Pin(); | |||
| unsafe | |||
| { | |||
| result.tensor_split = (float*)pin.Pointer; | |||