
Merge branch 'master' of github.com:SciSharp/LLamaSharp into rinne-dev

tags/v0.4.2-preview^2
Yaohui Liu, 2 years ago
commit 9fcbd16b74
9 changed files with 94 additions and 13 deletions:

  1. .github/workflows/main.yml (+55, -0)
  2. .gitignore (+4, -1)
  3. LLama.Unittest/BasicTest.cs (+6, -2)
  4. LLama.Unittest/LLama.Unittest.csproj (+14, -0)
  5. LLama/Common/FixedSizeQueue.cs (+2, -1)
  6. LLama/Common/ModelParams.cs (+1, -1)
  7. LLama/Native/LLamaContextParams.cs (+8, -6)
  8. LLama/Utils.cs (+4, -2)
  9. LLama/runtimes/libllama.dylib (BIN)

.github/workflows/main.yml (+55, -0)

@@ -0,0 +1,55 @@
name: CI
on:
  push:
    branches: [master]
  pull_request:
    branches: [master]

jobs:
  build:
    name: Test
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        build: [linux-debug, linux-release, macos-debug, macos-release, windows-debug, windows-release]
        include:
          - build: linux-debug
            os: ubuntu-latest
            config: debug
          - build: linux-release
            os: ubuntu-latest
            config: release
          - build: macos-debug
            os: macos-latest
            config: debug
          - build: macos-release
            os: macos-latest
            config: release
          - build: windows-debug
            os: windows-2019
            config: debug
          - build: windows-release
            os: windows-2019
            config: release
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-dotnet@v1
        with:
          dotnet-version: |
            6.0.x
            7.0.x
      - name: Cache Gradle packages
        uses: actions/cache@v3
        with:
          key: "unit_test_models"
          path: LLama.Unittest/Models
      # workaround for actions/setup-dotnet#155
      - name: Clear package cache
        run: dotnet clean LLamaSharp.sln && dotnet nuget locals all --clear
      - name: Restore packages
        run: dotnet restore LLamaSharp.sln
      - name: Build
        run: dotnet build LLamaSharp.sln -c ${{ matrix.config }} --no-restore
      - name: Test
        run: dotnet test LLamaSharp.sln -c ${{ matrix.config }}

.gitignore (+4, -1)

@@ -341,4 +341,7 @@ test/TensorFlowNET.Examples/mnist
*.xsd

# docs
site/

/LLama.Unittest/Models/*.bin


LLama.Unittest/BasicTest.cs (+6, -2)

@@ -1,11 +1,15 @@
using LLama;
using LLama.Common;

namespace LLama.Unittest
{
    public class BasicTest
    {
        [Fact]
        public void SimpleQA()
        public void LoadModel()
        {
            var model = new LLamaModel(new ModelParams("Models/llama-2-7b-chat.ggmlv3.q3_K_S.bin", contextSize: 256));
            model.Dispose();
        }
    }
}
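
Note: the renamed LoadModel test only checks that a model can be constructed and released. Below is a minimal sketch of the same pattern, assuming only the types visible in the diff above (LLamaModel, ModelParams) plus xUnit; it uses a using declaration instead of the explicit Dispose call, and the class/method names are illustrative.

    // Minimal sketch of the pattern exercised by LoadModel above; assumes the
    // model file has already been downloaded to Models/ (see the csproj target below).
    using LLama;
    using LLama.Common;
    using Xunit;

    namespace LLama.Unittest
    {
        public class BasicTestSketch
        {
            [Fact]
            public void LoadAndDispose()
            {
                var @params = new ModelParams("Models/llama-2-7b-chat.ggmlv3.q3_K_S.bin", contextSize: 256);

                // The using declaration releases the native model when the test
                // method exits, matching the explicit model.Dispose() call above.
                using var model = new LLamaModel(@params);
            }
        }
    }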

LLama.Unittest/LLama.Unittest.csproj (+14, -0)

@@ -23,8 +23,22 @@
    </PackageReference>
  </ItemGroup>

  <Target Name="DownloadContentFiles" BeforeTargets="Build">
    <DownloadFile SourceUrl="https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q3_K_S.bin" DestinationFolder="Models" DestinationFileName="llama-2-7b-chat.ggmlv3.q3_K_S.bin" SkipUnchangedFiles="true">
    </DownloadFile>
  </Target>

  <ItemGroup>
    <ProjectReference Include="..\LLama\LLamaSharp.csproj" />
  </ItemGroup>

  <ItemGroup>
    <Folder Include="Models\" />
  </ItemGroup>

  <ItemGroup>
    <None Update="Models\llama-2-7b-chat.ggmlv3.q3_K_S.bin">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
    </None>
  </ItemGroup>
</Project>

LLama/Common/FixedSizeQueue.cs (+2, -1)

@@ -30,6 +30,7 @@ namespace LLama.Common
        /// <param name="data"></param>
        public FixedSizeQueue(int size, IEnumerable<T> data)
        {
#if NETCOREAPP3_0_OR_GREATER
            // Try an early check on the amount of data supplied (if possible)
#if NETSTANDARD2_0
            var dataCount = data.Count();
@@ -52,7 +53,7 @@ namespace LLama.Common
                throw new ArgumentException($"The max size set for the quene is {size}, but got {count} initial values.");
#endif
        }
/
        /// <summary>
        /// Replace every item in the queue with the given value
        /// </summary>
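
Note: the conditional-compilation lines above guard an early size check on the constructor's data argument. The sketch below illustrates that check with the two target-dependent paths; it is not the library's exact code, and SizeCheck/EnsureFits are made-up names.

    // Illustrative only: an early size check like the one FixedSizeQueue performs.
    using System;
    using System.Collections.Generic;
    using System.Linq;

    internal static class SizeCheck
    {
        public static void EnsureFits<T>(int size, IEnumerable<T> data)
        {
#if NET6_0_OR_GREATER
            // Newer targets can often read the count without enumerating the sequence.
            if (data.TryGetNonEnumeratedCount(out var count) && count > size)
                throw new ArgumentException($"The max size set for the queue is {size}, but got {count} initial values.");
#else
            // On netstandard2.0 the sequence has to be enumerated to count it.
            var count = data.Count();
            if (count > size)
                throw new ArgumentException($"The max size set for the queue is {size}, but got {count} initial values.");
#endif
        }
    }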


LLama/Common/ModelParams.cs (+1, -1)

@@ -84,7 +84,7 @@ namespace LLama.Common
        /// <summary>
        /// how split tensors should be distributed across GPUs
        /// </summary>
        public float[] TensorSplits { get; set; } = new float[] { 0 };
        public nint TensorSplits { get; set; }

        /// <summary>
        ///


LLama/Native/LLamaContextParams.cs (+8, -6)

@@ -47,7 +47,8 @@ namespace LLama.Native
        /// <summary>
        /// how to split layers across multiple GPUs
        /// </summary>
        public float[] tensor_split;
        public nint tensor_split;


        /// <summary>
        /// ref: https://github.com/ggerganov/llama.cpp/pull/2054
@@ -78,6 +79,11 @@ namespace LLama.Native
        [MarshalAs(UnmanagedType.I1)]
        public bool low_vram;

        /// <summary>
        /// if true, use experimental mul_mat_q kernels
        /// </summary>
        [MarshalAs(UnmanagedType.I1)] public bool mul_mat_q;

        /// <summary>
        /// use fp16 for KV cache
        /// </summary>
@@ -114,9 +120,5 @@ namespace LLama.Native
        [MarshalAs(UnmanagedType.I1)]
        public bool embedding;
    }

    public struct TensorSplits
    {
        public float Item1;
    }
}


LLama/Utils.cs (+4, -2)

@@ -28,12 +28,14 @@ namespace LLama
            lparams.logits_all = @params.Perplexity;
            lparams.embedding = @params.EmbeddingMode;
            lparams.low_vram = @params.LowVram;

            /*
            if (@params.TensorSplits.Length != 1)
            {
                throw new ArgumentException("Currently multi-gpu support is not supported by " +
                    "both llama.cpp and LLamaSharp.");
            }
            }*/

            lparams.tensor_split = @params.TensorSplits;

            if (!File.Exists(@params.ModelPath))
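
Note: with TensorSplits/tensor_split now typed as nint, the per-GPU split values have to reach llama.cpp as a raw pointer rather than a marshalled float[]. The sketch below shows one way a caller could produce such a pointer by pinning a managed array; TensorSplitInterop and PinSplits are illustrative names, not part of this commit.

    using System;
    using System.Runtime.InteropServices;

    internal static class TensorSplitInterop
    {
        // Pins a managed float[] so native code can read it, and returns the
        // address as nint. The caller must keep the GCHandle alive while
        // llama.cpp may read the buffer and call handle.Free() afterwards.
        public static nint PinSplits(float[] splits, out GCHandle handle)
        {
            handle = GCHandle.Alloc(splits, GCHandleType.Pinned);
            return handle.AddrOfPinnedObject();
        }
    }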


LLama/runtimes/libllama.dylib (BIN)

