
Merge pull request #122 from martindevans/gguf

Add GGUF support
Tag: v0.5.1
Rinne (GitHub), 2 years ago
Commit: 4e83e48ad1

37 changed files with 830 additions and 658 deletions
1. .gitignore  (+1, -0)
2. LLama.Unittest/BasicTest.cs  (+1, -1)
3. LLama.Unittest/Constants.cs  (+7, -0)
4. LLama.Unittest/GrammarTest.cs  (+1, -1)
5. LLama.Unittest/LLama.Unittest.csproj  (+2, -2)
6. LLama.Unittest/LLamaContextTests.cs  (+1, -1)
7. LLama.Unittest/LLamaEmbedderTests.cs  (+12, -11)
8. LLama.Unittest/ModelsParamsTests.cs  (+14, -11)
9. LLama.Unittest/StatelessExecutorTest.cs  (+1, -1)
10. LLama.Web/Common/ModelOptions.cs  (+0, -10)
11. LLama/Abstractions/IModelParams.cs  (+0, -10)
12. LLama/Common/ModelParams.cs  (+1, -15)
13. LLama/Extensions/IModelParamsExtensions.cs  (+0, -2)
14. LLama/LLamaContext.cs  (+5, -4)
15. LLama/LLamaEmbedder.cs  (+0, -4)
16. LLama/LLamaInstructExecutor.cs  (+6, -5)
17. LLama/LLamaInteractExecutor.cs  (+8, -9)
18. LLama/LLamaSharp.Runtime.targets  (+2, -2)
19. LLama/Native/LLamaContextParams.cs  (+16, -26)
20. LLama/Native/LLamaFtype.cs  (+5, -0)
21. LLama/Native/NativeApi.cs  (+63, -11)
22. LLama/Native/SafeLLamaContextHandle.cs  (+18, -6)
23. LLama/Native/SafeLlamaModelHandle.cs  (+57, -15)
24. LLama/Native/SamplingApi.cs  (+13, -0)
25. LLama/OldVersion/LLamaModel.cs  (+4, -4)
26. LLama/runtimes/ggml-metal.metal  (+577, -500)
27. LLama/runtimes/libllama-cuda11.dll  (BIN)
28. LLama/runtimes/libllama-cuda11.so  (BIN)
29. LLama/runtimes/libllama-cuda12.dll  (BIN)
30. LLama/runtimes/libllama-cuda12.so  (BIN)
31. LLama/runtimes/libllama-metal.dylib  (BIN)
32. LLama/runtimes/libllama.dll  (BIN)
33. LLama/runtimes/libllama.dylib  (BIN)
34. LLama/runtimes/libllama.so  (BIN)
35. docs/ContributingGuide.md  (+2, -2)
36. docs/GetStarted.md  (+9, -1)
37. docs/Tricks.md  (+4, -4)

.gitignore  (+1, -0)

@@ -344,4 +344,5 @@ test/TensorFlowNET.Examples/mnist
 site/

 /LLama.Unittest/Models/*.bin
+/LLama.Unittest/Models/*.gguf



LLama.Unittest/BasicTest.cs  (+1, -1)

@@ -10,7 +10,7 @@ namespace LLama.Unittest

         public BasicTest()
         {
-            _params = new ModelParams("Models/llama-2-7b-chat.ggmlv3.q3_K_S.bin")
+            _params = new ModelParams(Constants.ModelPath)
             {
                 ContextSize = 2048
             };


LLama.Unittest/Constants.cs  (+7, -0)

@@ -0,0 +1,7 @@
+namespace LLama.Unittest
+{
+    internal static class Constants
+    {
+        public static string ModelPath = "Models/llama-2-7b.q4_0.gguf";
+    }
+}

LLama.Unittest/GrammarTest.cs  (+1, -1)

@@ -12,7 +12,7 @@ namespace LLama.Unittest

         public GrammarTest()
         {
-            _params = new ModelParams("Models/llama-2-7b-chat.ggmlv3.q3_K_S.bin")
+            _params = new ModelParams(Constants.ModelPath)
             {
                 ContextSize = 2048,
             };


LLama.Unittest/LLama.Unittest.csproj  (+2, -2)

@@ -24,7 +24,7 @@
   </ItemGroup>

   <Target Name="DownloadContentFiles" BeforeTargets="Build">
-    <DownloadFile SourceUrl="https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q3_K_S.bin" DestinationFolder="Models" DestinationFileName="llama-2-7b-chat.ggmlv3.q3_K_S.bin" SkipUnchangedFiles="true">
+    <DownloadFile SourceUrl="https://huggingface.co/narrative-bi/Llama-2-7B-GGUF/resolve/main/llama-2-7b.q4_0.gguf" DestinationFolder="Models" DestinationFileName="llama-2-7b.q4_0.gguf" SkipUnchangedFiles="true">
     </DownloadFile>
   </Target>

@@ -37,7 +37,7 @@
   </ItemGroup>

   <ItemGroup>
-    <None Update="Models\llama-2-7b-chat.ggmlv3.q3_K_S.bin">
+    <None Update="Models\llama-2-7b.q4_0.gguf">
       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
     </None>
   </ItemGroup>


LLama.Unittest/LLamaContextTests.cs  (+1, -1)

@@ -10,7 +10,7 @@ namespace LLama.Unittest

         public LLamaContextTests()
         {
-            var @params = new ModelParams("Models/llama-2-7b-chat.ggmlv3.q3_K_S.bin")
+            var @params = new ModelParams(Constants.ModelPath)
             {
                 ContextSize = 768,
             };


LLama.Unittest/LLamaEmbedderTests.cs  (+12, -11)

@@ -5,7 +5,7 @@ namespace LLama.Unittest;
 public class LLamaEmbedderTests
     : IDisposable
 {
-    private readonly LLamaEmbedder _embedder = new(new ModelParams("Models/llama-2-7b-chat.ggmlv3.q3_K_S.bin"));
+    private readonly LLamaEmbedder _embedder = new(new ModelParams(Constants.ModelPath));

     public void Dispose()
     {
@@ -36,18 +36,19 @@ public class LLamaEmbedderTests
         Assert.Equal(expected[i], actual[i], epsilon);
     }

-    [Fact]
-    public void EmbedBasic()
-    {
-        var cat = _embedder.GetEmbeddings("cat");
+    // todo: enable this one llama2 7B gguf is available
+    //[Fact]
+    //public void EmbedBasic()
+    //{
+    //    var cat = _embedder.GetEmbeddings("cat");

-        Assert.NotNull(cat);
-        Assert.NotEmpty(cat);
+    //    Assert.NotNull(cat);
+    //    Assert.NotEmpty(cat);

-        // Expected value generate with llama.cpp embedding.exe
-        var expected = new float[] { -0.127304f, -0.678057f, -0.085244f, -0.956915f, -0.638633f };
-        AssertApproxStartsWith(expected, cat);
-    }
+    //    // Expected value generate with llama.cpp embedding.exe
+    //    var expected = new float[] { -0.127304f, -0.678057f, -0.085244f, -0.956915f, -0.638633f };
+    //    AssertApproxStartsWith(expected, cat);
+    //}

     [Fact]
     public void EmbedCompare()


LLama.Unittest/ModelsParamsTests.cs  (+14, -11)

@@ -14,7 +14,6 @@ namespace LLama.Unittest
                 BatchSize = 17,
                 ContextSize = 42,
                 LoraAdapter = "adapter",
-                GroupedQueryAttention = 7,
                 Seed = 42,
                 GpuLayerCount = 111
             };
@@ -33,7 +32,6 @@ namespace LLama.Unittest
                 BatchSize = 17,
                 ContextSize = 42,
                 LoraAdapter = "adapter",
-                GroupedQueryAttention = 7,
                 Seed = 42,
                 GpuLayerCount = 111
             };
@@ -47,21 +45,26 @@ namespace LLama.Unittest
             Assert.Equal(expected, actual);
         }

-        private class NewtsonsoftEncodingConverter
-            : Newtonsoft.Json.JsonConverter<Encoding>
+        public class NewtsonsoftEncodingConverter : JsonConverter
         {
-            public override void WriteJson(JsonWriter writer, Encoding? value, JsonSerializer serializer)
+            public override bool CanConvert(Type objectType)
             {
-                writer.WriteValue((string?)value?.WebName);
+                return typeof(Encoding).IsAssignableFrom(objectType);
             }

-            public override Encoding? ReadJson(JsonReader reader, Type objectType, Encoding? existingValue, bool hasExistingValue, JsonSerializer serializer)
+            public override void WriteJson(JsonWriter writer, object value, JsonSerializer serializer)
             {
-                var name = (string?)reader.Value;
-                if (name == null)
-                    return null;
-                return Encoding.GetEncoding(name);
+                writer.WriteValue(((Encoding)value).WebName);
+            }
+
+            public override object ReadJson(JsonReader reader, Type objectType, object existingValue, JsonSerializer serializer)
+            {
+                return Encoding.GetEncoding((string)reader.Value);
             }
         }

     }
 }

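Note (editorial, not part of the diff): the rewritten `NewtsonsoftEncodingConverter` swaps the generic `JsonConverter<Encoding>` base for the non-generic `JsonConverter`, so it now gates on `CanConvert`. Below is a minimal usage sketch; the settings object, the `"abc/123"` path and assuming the converter type is in scope are illustrative assumptions, not code from this test.

```csharp
using System.Text;
using LLama.Common;
using Newtonsoft.Json;

// Register the converter so Encoding-typed members round-trip as their WebName (e.g. "utf-8").
var settings = new JsonSerializerSettings
{
    Converters = { new NewtsonsoftEncodingConverter() } // assumed to be in scope
};

var expected = new ModelParams("abc/123") { ContextSize = 42 };          // "abc/123" is a placeholder path
var json = JsonConvert.SerializeObject(expected, settings);              // serialize with the converter
var actual = JsonConvert.DeserializeObject<ModelParams>(json, settings); // and read it back
```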
LLama.Unittest/StatelessExecutorTest.cs  (+1, -1)

@@ -13,7 +13,7 @@ namespace LLama.Unittest
         public StatelessExecutorTest(ITestOutputHelper testOutputHelper)
         {
             _testOutputHelper = testOutputHelper;
-            _params = new ModelParams("Models/llama-2-7b-chat.ggmlv3.q3_K_S.bin")
+            _params = new ModelParams(Constants.ModelPath)
             {
                 ContextSize = 60,
                 Seed = 1754


LLama.Web/Common/ModelOptions.cs  (+0, -10)

@@ -88,16 +88,6 @@ namespace LLama.Web.Common
         /// </summary>
         public float[] TensorSplits { get; set; }

-        /// <summary>
-        /// Grouped-Query Attention
-        /// </summary>
-        public int GroupedQueryAttention { get; set; } = 1;
-
-        /// <summary>
-        /// RMS Norm Epsilon
-        /// </summary>
-        public float RmsNormEpsilon { get; set; } = 5e-6f;
-
         /// <summary>
         /// RoPE base frequency
         /// </summary>


LLama/Abstractions/IModelParams.cs  (+0, -10)

@@ -98,16 +98,6 @@ namespace LLama.Abstractions
         /// </summary>
         float[]? TensorSplits { get; set; }

-        /// <summary>
-        /// Grouped-Query Attention
-        /// </summary>
-        int GroupedQueryAttention { get; set; }
-
-        /// <summary>
-        /// RMS Norm Epsilon
-        /// </summary>
-        float RmsNormEpsilon { get; set; }
-
         /// <summary>
         /// RoPE base frequency
         /// </summary>


LLama/Common/ModelParams.cs  (+1, -15)

@@ -89,16 +89,6 @@ namespace LLama.Common
         /// </summary>
         public float[]? TensorSplits { get; set; }

-        /// <summary>
-        /// Grouped-Query Attention
-        /// </summary>
-        public int GroupedQueryAttention { get; set; } = 1;
-
-        /// <summary>
-        /// RMS Norm Epsilon
-        /// </summary>
-        public float RmsNormEpsilon { get; set; } = 5e-6f;
-
         /// <summary>
         /// RoPE base frequency
         /// </summary>
@@ -153,8 +143,6 @@ namespace LLama.Common
         /// <param name="batchSize">Batch size for prompt processing (must be >=32 to use BLAS) (n_batch)</param>
         /// <param name="convertEosToNewLine">Whether to convert eos to newline during the inference.</param>
         /// <param name="embeddingMode">Whether to use embedding mode. (embedding) Note that if this is set to true, The LLamaModel won't produce text response anymore.</param>
-        /// <param name="groupedQueryAttention">Grouped-Query Attention</param>
-        /// <param name="rmsNormEpsilon">RMS Norm Epsilon</param>
         /// <param name="ropeFrequencyBase">RoPE base frequency.</param>
         /// <param name="ropeFrequencyScale">RoPE frequency scaling factor</param>
         /// <param name="mulMatQ">Use experimental mul_mat_q kernels</param>
@@ -165,7 +153,7 @@ namespace LLama.Common
             bool useMemorymap = true, bool useMemoryLock = false, bool perplexity = false,
             string loraAdapter = "", string loraBase = "", int threads = -1, int batchSize = 512,
             bool convertEosToNewLine = false, bool embeddingMode = false,
-            int groupedQueryAttention = 1, float rmsNormEpsilon = 5e-6f, float ropeFrequencyBase = 10000.0f, float ropeFrequencyScale = 1f, bool mulMatQ = false,
+            float ropeFrequencyBase = 10000.0f, float ropeFrequencyScale = 1f, bool mulMatQ = false,
             string encoding = "UTF-8")
         {
             ContextSize = contextSize;
@@ -182,8 +170,6 @@ namespace LLama.Common
             BatchSize = batchSize;
             ConvertEosToNewLine = convertEosToNewLine;
             EmbeddingMode = embeddingMode;
-            GroupedQueryAttention = groupedQueryAttention;
-            RmsNormEpsilon = rmsNormEpsilon;
             RopeFrequencyBase = ropeFrequencyBase;
             RopeFrequencyScale = ropeFrequencyScale;
             MulMatQ = mulMatQ;

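Note (editorial, not part of the diff): with GGUF the grouped-query attention factor and RMS-norm epsilon are read from the model file itself (the removed native fields were marked "TEMP - will be moved to model hparams"), so `ModelParams` no longer exposes them. A minimal sketch of the surviving surface, using only members that appear elsewhere in this PR:

```csharp
// Sketch: constructing ModelParams after the GGUF change.
// GroupedQueryAttention / RmsNormEpsilon are gone; RoPE and mul_mat_q knobs remain.
var @params = new ModelParams("Models/llama-2-7b.q4_0.gguf")
{
    ContextSize = 2048,
    GpuLayerCount = 111,
    RopeFrequencyBase = 10000.0f,
    RopeFrequencyScale = 1f,
    MulMatQ = false
};
```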

LLama/Extensions/IModelParamsExtensions.cs  (+0, -2)

@@ -39,8 +39,6 @@ namespace LLama.Extensions
             result.logits_all = @params.Perplexity;
             result.embedding = @params.EmbeddingMode;
             result.low_vram = @params.LowVram;
-            result.n_gqa = @params.GroupedQueryAttention;
-            result.rms_norm_eps = @params.RmsNormEpsilon;
             result.rope_freq_base = @params.RopeFrequencyBase;
             result.rope_freq_scale = @params.RopeFrequencyScale;
             result.mul_mat_q = @params.MulMatQ;


LLama/LLamaContext.cs  (+5, -4)

@@ -132,9 +132,10 @@ namespace LLama
         /// <returns></returns>
         public string DeTokenize(IEnumerable<llama_token> tokens)
         {
-            StringBuilder sb = new();
-            foreach(var token in tokens)
-                sb.Append(_ctx.TokenToString(token, _encoding));
+            var sb = new StringBuilder();
+            foreach (var token in tokens)
+                _ctx.TokenToString(token, _encoding, sb);
+
             return sb.ToString();
         }

@@ -365,7 +366,7 @@
             }

             // Save the newline logit value
-            var nl_token = NativeApi.llama_token_nl();
+            var nl_token = NativeApi.llama_token_nl(_ctx);
             var nl_logit = logits[nl_token];

             // Convert logits into token candidates


LLama/LLamaEmbedder.cs  (+0, -4)

@@ -70,10 +70,6 @@ namespace LLama
         /// <exception cref="RuntimeError"></exception>
         public float[] GetEmbeddings(string text, bool addBos)
         {
-            if (addBos)
-            {
-                text = text.Insert(0, " ");
-            }

             var embed_inp_array = _ctx.Tokenize(text, addBos);




LLama/LLamaInstructExecutor.cs  (+6, -5)

@@ -5,6 +5,7 @@ using System;
 using System.Collections.Generic;
 using System.IO;
 using System.Linq;
+using System.Text;
 using System.Text.Json;
 using System.Text.Json.Serialization;

@@ -113,7 +114,6 @@
             if (_is_prompt_run)
             {
                 // When running the first input (prompt) in inteactive mode, we should specially process it.
-                text = " " + text;
                 _embed_inps = Context.Tokenize(text, true).ToList();
             }
             else
@@ -141,9 +141,10 @@
             {
                 if (args.Antiprompts is not null && args.Antiprompts.Count > 0)
                 {
-                    string last_output = "";
-                    foreach (var id in _last_n_tokens)
-                        last_output += Context.NativeHandle.TokenToString(id, Context.Encoding);
+                    var last_output_builder = new StringBuilder();
+                    foreach (var token in _last_n_tokens)
+                        Context.NativeHandle.TokenToString(token, Context.Encoding, last_output_builder);
+                    var last_output = last_output_builder.ToString();

                     foreach (var antiprompt in args.Antiprompts)
                     {
@@ -162,7 +163,7 @@
                 }
             }

-            if (_embeds.Count > 0 && _embeds.Last() == NativeApi.llama_token_eos())
+            if (_embeds.Count > 0 && _embeds.Last() == NativeApi.llama_token_eos(Context.NativeHandle))
             {
                 args.WaitForInput = true;
             }


LLama/LLamaInteractExecutor.cs  (+8, -9)

@@ -7,6 +7,7 @@ using System.IO;
 using System.Linq;
 using System.Text.Json;
 using System.Text.Json.Serialization;
+using System.Text;

 namespace LLama
 {
@@ -25,7 +26,7 @@ namespace LLama
         /// <param name="context"></param>
         public InteractiveExecutor(LLamaContext context) : base(context)
         {
-            _llama_token_newline = Context.NativeHandle.Tokenize("\n", false, Context.Encoding);
+            _llama_token_newline = new [] { NativeApi.llama_token_nl(Context.NativeHandle) };
         }

         /// <inheritdoc />
@@ -103,7 +104,6 @@ namespace LLama
             if (_is_prompt_run)
             {
                 // When running the first input (prompt) in inteactive mode, we should specially process it.
-                text = " " + text;
                 _embed_inps = Context.Tokenize(text, true).ToList();
             }
             else
@@ -132,11 +132,10 @@
             {
                 if (args.Antiprompts is not null && args.Antiprompts.Count > 0)
                 {
-                    string last_output = "";
-                    foreach (var id in _last_n_tokens)
-                    {
-                        last_output += Context.NativeHandle.TokenToString(id, Context.Encoding);
-                    }
+                    var last_output_builder = new StringBuilder();
+                    foreach (var token in _last_n_tokens)
+                        Context.NativeHandle.TokenToString(token, Context.Encoding, last_output_builder);
+                    var last_output = last_output_builder.ToString();

                     foreach (var antiprompt in args.Antiprompts)
                     {
@@ -154,7 +153,7 @@
                 }
             }

-            if (_embeds.Count > 0 && _embeds.Last() == NativeApi.llama_token_eos())
+            if (_embeds.Count > 0 && _embeds.Last() == NativeApi.llama_token_eos(Context.NativeHandle))
             {
                 extraOutputs = new[] { " [end of text]\n" };
                 return true;
@@ -215,7 +214,7 @@

             _last_n_tokens.Enqueue(id);

-            if (id == NativeApi.llama_token_eos())
+            if (id == NativeApi.llama_token_eos(Context.NativeHandle))
             {
                 id = _llama_token_newline.First();
                 if (args.Antiprompts is not null && args.Antiprompts.Count > 0)


LLama/LLamaSharp.Runtime.targets  (+2, -2)

@@ -32,11 +32,11 @@
       <Link>libllama.dylib</Link>
     </None>
     <None Include="$(MSBuildThisFileDirectory)runtimes/libllama-metal.dylib">
-      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+      <CopyToOutputDirectory>None</CopyToOutputDirectory>
       <Link>libllama-metal.dylib</Link>
     </None>
     <None Include="$(MSBuildThisFileDirectory)runtimes/ggml-metal.metal">
-      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+      <CopyToOutputDirectory>None</CopyToOutputDirectory>
       <Link>ggml-metal.metal</Link>
     </None>
   </ItemGroup>

LLama/Native/LLamaContextParams.cs  (+16, -26)

@@ -31,16 +31,6 @@ namespace LLama.Native
         /// </summary>
         public int n_batch;

-        /// <summary>
-        /// grouped-query attention (TEMP - will be moved to model hparams)
-        /// </summary>
-        public int n_gqa;
-
-        /// <summary>
-        /// rms norm epsilon (TEMP - will be moved to model hparams)
-        /// </summary>
-        public float rms_norm_eps;
-
         /// <summary>
         /// number of layers to store in VRAM
         /// </summary>
@@ -82,8 +72,8 @@
         /// if true, reduce VRAM usage at the cost of performance
         /// </summary>
         public bool low_vram
-        {
-            get => Convert.ToBoolean(_low_vram);
+        {
+            readonly get => Convert.ToBoolean(_low_vram);
             set => _low_vram = Convert.ToSByte(value);
         }
         private sbyte _low_vram;
@@ -92,8 +82,8 @@
         /// if true, use experimental mul_mat_q kernels
         /// </summary>
         public bool mul_mat_q
-        {
-            get => Convert.ToBoolean(_mul_mat_q);
+        {
+            readonly get => Convert.ToBoolean(_mul_mat_q);
             set => _mul_mat_q = Convert.ToSByte(value);
         }
         private sbyte _mul_mat_q;
@@ -102,8 +92,8 @@
         /// use fp16 for KV cache
         /// </summary>
         public bool f16_kv
-        {
-            get => Convert.ToBoolean(_f16_kv);
+        {
+            readonly get => Convert.ToBoolean(_f16_kv);
             set => _f16_kv = Convert.ToSByte(value);
         }
         private sbyte _f16_kv;
@@ -112,8 +102,8 @@
         /// the llama_eval() call computes all logits, not just the last one
         /// </summary>
         public bool logits_all
-        {
-            get => Convert.ToBoolean(_logits_all);
+        {
+            readonly get => Convert.ToBoolean(_logits_all);
             set => _logits_all = Convert.ToSByte(value);
         }
         private sbyte _logits_all;
@@ -122,8 +112,8 @@
         /// only load the vocabulary, no weights
         /// </summary>
         public bool vocab_only
-        {
-            get => Convert.ToBoolean(_vocab_only);
+        {
+            readonly get => Convert.ToBoolean(_vocab_only);
             set => _vocab_only = Convert.ToSByte(value);
         }
         private sbyte _vocab_only;
@@ -132,8 +122,8 @@
         /// use mmap if possible
         /// </summary>
         public bool use_mmap
-        {
-            get => Convert.ToBoolean(_use_mmap);
+        {
+            readonly get => Convert.ToBoolean(_use_mmap);
             set => _use_mmap = Convert.ToSByte(value);
         }
         private sbyte _use_mmap;
@@ -142,8 +132,8 @@
         /// force system to keep model in RAM
         /// </summary>
         public bool use_mlock
-        {
-            get => Convert.ToBoolean(_use_mlock);
+        {
+            readonly get => Convert.ToBoolean(_use_mlock);
             set => _use_mlock = Convert.ToSByte(value);
         }
         private sbyte _use_mlock;
@@ -152,8 +142,8 @@
         /// embedding mode only
         /// </summary>
         public bool embedding
-        {
-            get => Convert.ToBoolean(_embedding);
+        {
+            readonly get => Convert.ToBoolean(_embedding);
             set => _embedding = Convert.ToSByte(value);
         }
         private sbyte _embedding;


LLama/Native/LLamaFtype.cs  (+5, -0)

@@ -105,5 +105,10 @@
         /// </summary>
         /// <remarks>Benchmark@7B: 5.15GB, +0.0044 ppl</remarks>
         LLAMA_FTYPE_MOSTLY_Q6_K = 18,
+
+        /// <summary>
+        /// File type was not specified
+        /// </summary>
+        LLAMA_FTYPE_GUESSED = 1024
     }
 }

LLama/Native/NativeApi.cs  (+63, -11)

@@ -11,12 +11,18 @@ namespace LLama.Native
 {
     using llama_token = Int32;

+    /// <summary>
+    /// Callback from llama.cpp with log messages
+    /// </summary>
+    /// <param name="level"></param>
+    /// <param name="message"></param>
     public delegate void LLamaLogCallback(ILLamaLogger.LogLevel level, string message);

+    /// <summary>
+    /// Direct translation of the llama.cpp API
+    /// </summary>
     public unsafe partial class NativeApi
     {
+        public static readonly int LLAMA_MAX_DEVICES = 1;
+
         static NativeApi()
         {
             try
@@ -43,18 +49,43 @@ namespace LLama.Native
         [DllImport(libraryName, EntryPoint = "llama_mmap_supported", CallingConvention = CallingConvention.Cdecl)]
         public static extern bool llama_empty_call();

+        /// <summary>
+        /// Create a LLamaContextParams with default values
+        /// </summary>
+        /// <returns></returns>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
         public static extern LLamaContextParams llama_context_default_params();

+        /// <summary>
+        /// Create a LLamaModelQuantizeParams with default values
+        /// </summary>
+        /// <returns></returns>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
         public static extern LLamaModelQuantizeParams llama_model_quantize_default_params();

+        /// <summary>
+        /// Check if memory mapping is supported
+        /// </summary>
+        /// <returns></returns>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
         public static extern bool llama_mmap_supported();

+        /// <summary>
+        /// Check if memory locking is supported
+        /// </summary>
+        /// <returns></returns>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
         public static extern bool llama_mlock_supported();

+        /// <summary>
+        /// Export a static computation graph for context of 511 and batch size of 1
+        /// NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
+        /// parameters here to keep things simple
+        /// IMPORTANT: do not use for anything else other than debugging and testing!
+        /// </summary>
+        /// <param name="ctx"></param>
+        /// <param name="fname"></param>
+        /// <returns></returns>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
         public static extern int llama_eval_export(SafeLLamaContextHandle ctx, string fname);

@@ -69,6 +100,13 @@ namespace LLama.Native
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
         public static extern IntPtr llama_load_model_from_file(string path_model, LLamaContextParams @params);

+        /// <summary>
+        /// Create a new llama_context with the given model.
+        /// Return value should always be wrapped in SafeLLamaContextHandle!
+        /// </summary>
+        /// <param name="model"></param>
+        /// <param name="params"></param>
+        /// <returns></returns>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
         public static extern IntPtr llama_new_context_with_model(SafeLlamaModelHandle model, LLamaContextParams @params);

@@ -81,7 +119,7 @@
         public static extern void llama_backend_init(bool numa);

         /// <summary>
-        /// Frees all allocated memory
+        /// Frees all allocated memory in the given llama_context
         /// </summary>
         /// <param name="ctx"></param>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
@@ -341,14 +379,26 @@ namespace LLama.Native
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
         public static extern IntPtr llama_token_to_str(SafeLLamaContextHandle ctx, llama_token token);

+        /// <summary>
+        /// Get the "Beginning of sentence" token
+        /// </summary>
+        /// <returns></returns>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-        public static extern llama_token llama_token_bos();
+        public static extern llama_token llama_token_bos(SafeLLamaContextHandle ctx);

+        /// <summary>
+        /// Get the "End of sentence" token
+        /// </summary>
+        /// <returns></returns>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-        public static extern llama_token llama_token_eos();
+        public static extern llama_token llama_token_eos(SafeLLamaContextHandle ctx);

+        /// <summary>
+        /// Get the "new line" token
+        /// </summary>
+        /// <returns></returns>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-        public static extern llama_token llama_token_nl();
+        public static extern llama_token llama_token_nl(SafeLLamaContextHandle ctx);

         /// <summary>
         /// Print out timing information for this context
@@ -377,7 +427,7 @@ namespace LLama.Native
         /// <param name="model"></param>
         /// <returns></returns>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-        public static extern int llama_n_vocab_from_model(SafeLlamaModelHandle model);
+        public static extern int llama_model_n_vocab(SafeLlamaModelHandle model);

         /// <summary>
         /// Get the size of the context window for the model
@@ -385,7 +435,7 @@ namespace LLama.Native
         /// <param name="model"></param>
         /// <returns></returns>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-        public static extern int llama_n_ctx_from_model(SafeLlamaModelHandle model);
+        public static extern int llama_model_n_ctx(SafeLlamaModelHandle model);

         /// <summary>
         /// Get the dimension of embedding vectors from this model
@@ -393,16 +443,18 @@ namespace LLama.Native
         /// <param name="model"></param>
         /// <returns></returns>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-        public static extern int llama_n_embd_from_model(SafeLlamaModelHandle model);
+        public static extern int llama_model_n_embd(SafeLlamaModelHandle model);

         /// <summary>
         /// Convert a single token into text
         /// </summary>
         /// <param name="model"></param>
         /// <param name="llamaToken"></param>
-        /// <returns></returns>
+        /// <param name="buffer">buffer to write string into</param>
+        /// <param name="length">size of the buffer</param>
+        /// <returns>The length written, or if the buffer is too small a negative that indicates the length required</returns>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-        public static extern byte* llama_token_to_str_with_model(SafeLlamaModelHandle model, int llamaToken);
+        public static extern int llama_token_to_piece_with_model(SafeLlamaModelHandle model, int llamaToken, byte* buffer, int length);

         /// <summary>
         /// Convert text into tokens

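Note (editorial, not part of the diff): the new `llama_token_to_piece_with_model` import replaces the old null-terminated-pointer lookup with a caller-supplied buffer, and a negative return value reports the size that buffer would need. A minimal sketch of that convention follows; the helper name `DecodeToken` is hypothetical and simply mirrors the pattern used in `SafeLlamaModelHandle` later in this PR.

```csharp
// Hypothetical helper (not in this PR) illustrating the buffer convention of
// llama_token_to_piece_with_model: query the size first, then fill the buffer.
public static unsafe string DecodeToken(SafeLlamaModelHandle model, int token, Encoding encoding)
{
    // With a null/empty buffer the call returns 0 (empty token) or a negative
    // value whose magnitude is the number of bytes required.
    var required = NativeApi.llama_token_to_piece_with_model(model, token, null, 0);
    if (required == 0)
        return "";

    Span<byte> bytes = stackalloc byte[Math.Abs(required)];
    fixed (byte* ptr = bytes)
    {
        // Second call writes the token bytes and returns the (positive) length written.
        var written = NativeApi.llama_token_to_piece_with_model(model, token, ptr, bytes.Length);
        return encoding.GetString(ptr, written);
    }
}
```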

LLama/Native/SafeLLamaContextHandle.cs  (+18, -6)

@@ -183,7 +183,7 @@ namespace LLama.Native
         /// <summary>
         /// Convert a token into a string
         /// </summary>
-        /// <param name="token"></param>
+        /// <param name="token">Token to decode into a string</param>
         /// <param name="encoding"></param>
         /// <returns></returns>
         public string TokenToString(int token, Encoding encoding)
@@ -192,13 +192,25 @@
         }

         /// <summary>
-        /// Convert a token into a span of bytes that could be decoded into a string
+        /// Append a single llama token to a string builder
         /// </summary>
-        /// <param name="token"></param>
-        /// <returns></returns>
-        public ReadOnlySpan<byte> TokenToSpan(int token)
+        /// <param name="token">Token to decode</param>
+        /// <param name="encoding"></param>
+        /// <param name="dest">string builder to append the result to</param>
+        public void TokenToString(int token, Encoding encoding, StringBuilder dest)
+        {
+            ThrowIfDisposed().TokenToString(token, encoding, dest);
+        }
+
+        /// <summary>
+        /// Convert a single llama token into bytes
+        /// </summary>
+        /// <param name="token">Token to decode</param>
+        /// <param name="dest">A span to attempt to write into. If this is too small nothing will be written</param>
+        /// <returns>The size of this token. **nothing will be written** if this is larger than `dest`</returns>
+        public int TokenToSpan(int token, Span<byte> dest)
         {
-            return ThrowIfDisposed().TokenToSpan(token);
+            return ThrowIfDisposed().TokenToSpan(token, dest);
         }

         /// <summary>


LLama/Native/SafeLlamaModelHandle.cs  (+57, -15)

@@ -1,4 +1,5 @@
 using System;
+using System.Diagnostics;
 using System.Text;
 using LLama.Exceptions;

@@ -28,9 +29,9 @@ namespace LLama.Native
         internal SafeLlamaModelHandle(IntPtr handle)
             : base(handle)
         {
-            VocabCount = NativeApi.llama_n_vocab_from_model(this);
-            ContextSize = NativeApi.llama_n_ctx_from_model(this);
-            EmbeddingSize = NativeApi.llama_n_embd_from_model(this);
+            VocabCount = NativeApi.llama_model_n_vocab(this);
+            ContextSize = NativeApi.llama_model_n_ctx(this);
+            EmbeddingSize = NativeApi.llama_model_n_embd(this);
         }

         /// <inheritdoc />
@@ -82,17 +83,20 @@ namespace LLama.Native

         #region tokenize
         /// <summary>
-        /// Convert a single llama token into string bytes
+        /// Convert a single llama token into bytes
         /// </summary>
-        /// <param name="llama_token"></param>
-        /// <returns></returns>
-        public ReadOnlySpan<byte> TokenToSpan(int llama_token)
+        /// <param name="llama_token">Token to decode</param>
+        /// <param name="dest">A span to attempt to write into. If this is too small nothing will be written</param>
+        /// <returns>The size of this token. **nothing will be written** if this is larger than `dest`</returns>
+        public int TokenToSpan(int llama_token, Span<byte> dest)
         {
             unsafe
             {
-                var bytes = new ReadOnlySpan<byte>(NativeApi.llama_token_to_str_with_model(this, llama_token), int.MaxValue);
-                var terminator = bytes.IndexOf((byte)0);
-                return bytes.Slice(0, terminator);
+                fixed (byte* destPtr = dest)
+                {
+                    var length = NativeApi.llama_token_to_piece_with_model(this, llama_token, destPtr, dest.Length);
+                    return Math.Abs(length);
+                }
             }
         }

@@ -104,16 +108,54 @@
         /// <returns></returns>
         public string TokenToString(int llama_token, Encoding encoding)
         {
-            var span = TokenToSpan(llama_token);
+            unsafe
+            {
+                var length = NativeApi.llama_token_to_piece_with_model(this, llama_token, null, 0);
+                if (length == 0)
+                    return "";

-            if (span.Length == 0)
-                return "";
+                Span<byte> bytes = stackalloc byte[-length];

+                fixed (byte* bytePtr = bytes)
+                {
+                    var written = NativeApi.llama_token_to_piece_with_model(this, llama_token, bytePtr, bytes.Length);
+                    Debug.Assert(written == bytes.Length);
+
+                    return encoding.GetString(bytePtr, bytes.Length);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Append a single llama token to a string builder
+        /// </summary>
+        /// <param name="llama_token">Token to decode</param>
+        /// <param name="encoding"></param>
+        /// <param name="dest">string builder to append the result to</param>
+        public void TokenToString(int llama_token, Encoding encoding, StringBuilder dest)
+        {
             unsafe
             {
-                fixed (byte* ptr = &span[0])
+                var length = NativeApi.llama_token_to_piece_with_model(this, llama_token, null, 0);
+                if (length == 0)
+                    return;
+
+                Span<byte> bytes = stackalloc byte[-length];
+                fixed (byte* bytePtr = bytes)
                 {
-                    return encoding.GetString(ptr, span.Length);
+                    // Decode into bytes
+                    var written = NativeApi.llama_token_to_piece_with_model(this, llama_token, bytePtr, bytes.Length);
+                    Debug.Assert(written == bytes.Length);
+
+                    // Decode into chars
+                    var charCount = encoding.GetCharCount(bytePtr, bytes.Length);
+                    Span<char> chars = stackalloc char[charCount];
+                    fixed (char* charPtr = chars)
+                        encoding.GetChars(bytePtr, bytes.Length, charPtr, chars.Length);
+
+                    // Write it to the output
+                    for (var i = 0; i < chars.Length; i++)
+                        dest.Append(chars[i]);
                 }
             }
         }


LLama/Native/SamplingApi.cs  (+13, -0)

@@ -1,8 +1,14 @@
 using System;

+#pragma warning disable IDE1006 // Naming Styles
+
 namespace LLama.Native
 {
     using llama_token = Int32;

+    /// <summary>
+    /// Direct translation of the llama.cpp sampling API
+    /// </summary>
     public unsafe class SamplingApi
     {
         /// <summary>
@@ -140,6 +146,13 @@ namespace LLama.Native
             NativeApi.llama_sample_typical(ctx, ref st, p, min_keep);
         }

+        /// <summary>
+        /// Sample with temperature.
+        /// As temperature increases, the prediction becomes diverse but also vulnerable to hallucinations -- generating tokens that are sensible but not factual
+        /// </summary>
+        /// <param name="ctx"></param>
+        /// <param name="candidates"></param>
+        /// <param name="temp"></param>
         public static void llama_sample_temperature(SafeLLamaContextHandle ctx, LLamaTokenDataArray candidates, float temp)
         {
             using var handle = LLamaTokenDataArrayNative.Create(candidates, out var st);


LLama/OldVersion/LLamaModel.cs  (+4, -4)

@@ -634,7 +634,7 @@ namespace LLama.OldVersion
                 LLamaTokenDataArray candidates_p = new LLamaTokenDataArray(candidates);

                 // Apply penalties
-                float nl_logit = logits[NativeApi.llama_token_nl()];
+                float nl_logit = logits[NativeApi.llama_token_nl(_ctx)];
                 var last_n_repeat = Math.Min(Math.Min(_last_n_tokens.Count, repeat_last_n), _n_ctx);
                 SamplingApi.llama_sample_repetition_penalty(_ctx, candidates_p,
                     _last_n_tokens.Skip(_last_n_tokens.Count - last_n_repeat).ToArray(),
@@ -644,7 +644,7 @@ namespace LLama.OldVersion
                     (ulong)last_n_repeat, alpha_frequency, alpha_presence);
                 if (!penalize_nl)
                 {
-                    logits[NativeApi.llama_token_nl()] = nl_logit;
+                    logits[NativeApi.llama_token_nl(_ctx)] = nl_logit;
                 }

                 if (temp <= 0)
@@ -684,7 +684,7 @@ namespace LLama.OldVersion
                 }

                 // replace end of text token with newline token when in interactive mode
-                if (id == NativeApi.llama_token_eos() && _params.interactive && !_params.instruct)
+                if (id == NativeApi.llama_token_eos(_ctx) && _params.interactive && !_params.instruct)
                 {
                     id = _llama_token_newline[0];
                     if (_params.antiprompt.Count != 0)
@@ -760,7 +760,7 @@ namespace LLama.OldVersion
                     break;
                 }

-                if (_embed.Count > 0 && _embed.Last() == NativeApi.llama_token_eos())
+                if (_embed.Count > 0 && _embed.Last() == NativeApi.llama_token_eos(_ctx))
                 {
                     if (_params.instruct)
                     {


LLama/runtimes/ggml-metal.metal  (+577, -500)
File diff suppressed because it is too large.


LLama/runtimes/libllama-cuda11.dll  (BIN)

LLama/runtimes/libllama-cuda11.so  (BIN)

LLama/runtimes/libllama-cuda12.dll  (BIN)

LLama/runtimes/libllama-cuda12.so  (BIN)

LLama/runtimes/libllama-metal.dylib  (BIN)

LLama/runtimes/libllama.dll  (BIN)

LLama/runtimes/libllama.dylib  (BIN)

LLama/runtimes/libllama.so  (BIN)


docs/ContributingGuide.md  (+2, -2)

@@ -33,11 +33,11 @@ When adding the feature, please take care of the namespace and the naming conven

 ## Find the problem and fix the BUG

-If the issue is related to the LLM internal behaviors, such as endless generating the response, the best way to find the problem is to do comparison test between llama.cpp and LLamaSharp.
+If the issue is related to the LLM internal behaviour, such as endless generating the response, the best way to find the problem is to do comparison test between llama.cpp and LLamaSharp.

 You could use exactly the same prompt, the same model and the same parameters to run the inference in llama.cpp and LLamaSharp respectively to see if it's really a problem caused by the implementation in LLamaSharp.

-If the experiment showed that it worked well in llama.cpp but didn't in LLamaSharp, a the search for the problem could be started. While the reason of the problem could be various, the best way I think is to add log-print in the code of llama.cpp and use it in LLamaSharp after compilation. Thus, when running LLamaSharp, you could see what happened in the native library.
+If the experiment showed that it worked well in llama.cpp but didn't in LLamaSharp, a search for the problem could be started. While the reason of the problem could be various, the best way I think is to add log-print in the code of llama.cpp and use it in LLamaSharp after compilation. Thus, when running LLamaSharp, you could see what happened in the native library.

 After finding out the reason, a painful but happy process comes. When working on the BUG fix, there's only one rule to follow, that is keeping the examples working well. If the modification fixed the BUG but impact on other functions, it would not be a good fix.




docs/GetStarted.md  (+9, -1)

@@ -54,8 +54,16 @@ using LLama;
 string modelPath = "<Your model path>" // change it to your own model path
 var prompt = "Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\r\n\r\nUser: Hello, Bob.\r\nBob: Hello. How may I help you today?\r\nUser: Please tell me the largest city in Europe.\r\nBob: Sure. The largest city in Europe is Moscow, the capital of Russia.\r\nUser:"; // use the "chat-with-bob" prompt here.

+// Load model
+var parameters = new ModelParams(modelPath)
+{
+    ContextSize = 1024
+};
+using var model = LLamaWeights.LoadFromFile(parameters);
+
 // Initialize a chat session
-var ex = new InteractiveExecutor(new LLamaModel(new ModelParams(modelPath, contextSize: 1024, seed: 1337, gpuLayerCount: 5)));
+using var context = model.CreateContext(parameters);
+var ex = new InteractiveExecutor(context);
 ChatSession session = new ChatSession(ex);

 // show the prompt

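Note (editorial, not part of the diff): the updated GetStarted snippet stops just before running the session. A minimal continuation in the style of the LLamaSharp examples of this period is sketched below; the exact `ChatSession.Chat` and `InferenceParams` members are an assumption and may differ between library versions.

```csharp
// Sketch: drive the chat session. AntiPrompts stops generation when the model
// would start speaking for the user again ("User:").
Console.Write(prompt);
while (true)
{
    // Chat/InferenceParams signatures assumed from contemporary examples.
    foreach (var text in session.Chat(prompt, new InferenceParams
    {
        Temperature = 0.6f,
        AntiPrompts = new List<string> { "User:" }
    }))
    {
        Console.Write(text);
    }
    prompt = Console.ReadLine();
}
```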

docs/Tricks.md  (+4, -4)

@@ -1,11 +1,11 @@
 # Tricks for FAQ

-Sometimes, your application with LLM and LLamaSharp may have strange behaviors. Before opening an issue to report the BUG, the following tricks may worth a try.
+Sometimes, your application with LLM and LLamaSharp may have strange behaviours. Before opening an issue to report the BUG, the following tricks may worth a try.


 ## Carefully set the anti-prompts

-Anti-prompt can also be called as "Stop-keyword", which decides when to stop the response generation. Under interactive mode, the maximum tokens count is always not set, which makes the LLM generates responses infinitively. Therefore, setting anti-prompt correctly helps a lot to avoid the strange behaviors. For example, the prompt file `chat-with-bob.txt` has the following content:
+Anti-prompt can also be called as "Stop-keyword", which decides when to stop the response generation. Under interactive mode, the maximum tokens count is always not set, which makes the LLM generates responses infinitively. Therefore, setting anti-prompt correctly helps a lot to avoid the strange behaviours. For example, the prompt file `chat-with-bob.txt` has the following content:

 ```
 Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
@@ -19,7 +19,7 @@ User:

 Therefore, the anti-prompt should be set as "User:". If the last line of the prompt is removed, LLM will automatically generate a question (user) and a response (bob) for one time when running the chat session. Therefore, the antiprompt is suggested to be appended to the prompt when starting a chat session.

-What if an extra line is appended? The string "User:" in the prompt will be followed with a char "\n". Thus when running the model, the automatic generation of a pair of question and response may appear because the anti-prompt is "User:" but the last token is "User:\n". As for whether it will appear, it's an undefined behavior, which depends on the implementation inside the `LLamaExecutor`. Anyway, since it may leads to unexpected behaviors, it's recommended to trim your prompt or carefully keep consistent with your anti-prompt.
+What if an extra line is appended? The string "User:" in the prompt will be followed with a char "\n". Thus when running the model, the automatic generation of a pair of question and response may appear because the anti-prompt is "User:" but the last token is "User:\n". As for whether it will appear, it's an undefined behaviour, which depends on the implementation inside the `LLamaExecutor`. Anyway, since it may leads to unexpected behaviors, it's recommended to trim your prompt or carefully keep consistent with your anti-prompt.

 ## Pay attention to the length of prompt

@@ -37,7 +37,7 @@ If your chat bot has bad performance, trying different executor will possibly ma

 ## Choose models weight depending on you task

-The differences between modes may lead to much different behaviors under the same task. For example, if you're building a chat bot with non-English, a fine-tuned model specially for the language you want to use will have huge effect on the performance.
+The differences between modes may lead to much different behaviours under the same task. For example, if you're building a chat bot with non-English, a fine-tuned model specially for the language you want to use will have huge effect on the performance.

 ## Set the layer count you want to offload to GPU
## Set the layer count you want to offload to GPU ## Set the layer count you want to offload to GPU


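Note (editorial, not part of the diff): for the "Set the layer count you want to offload to GPU" tip visible in the context above, a minimal sketch using only members that appear elsewhere in this PR; the value 20 is a placeholder to tune against your available VRAM.

```csharp
// Sketch: offload some transformer layers to the GPU through ModelParams.
var @params = new ModelParams("Models/llama-2-7b.q4_0.gguf")
{
    ContextSize = 1024,
    GpuLayerCount = 20 // placeholder: raise until you run out of VRAM
};
using var model = LLamaWeights.LoadFromFile(@params);
```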
