Browse Source

Merge pull request #326 from AsakusaRinne/fix_chinese

fix: Chinese encoding error with gb2312.
tags/v0.8.1
Rinne GitHub 2 years ago
parent
commit
687fff5608
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 96 additions and 1 deletions
  1. +8
    -0
      LLama.Examples/Assets/chat-with-kunkun-chinese.txt
  2. +69
    -0
      LLama.Examples/Examples/ChatChineseGB2312.cs
  3. +1
    -0
      LLama.Examples/Examples/Runner.cs
  4. +4
    -1
      LLama.Examples/LLama.Examples.csproj
  5. +14
    -0
      LLama/Native/NativeApi.cs

+ 8
- 0
LLama.Examples/Assets/chat-with-kunkun-chinese.txt View File

@@ -0,0 +1,8 @@
指令:下面是一段你和用户的对话,你叫坤坤,是一个在各方面都拥有丰富经验的助理,你非常乐于回答用户的问题和帮助用户。

用户:你好,坤坤。
坤坤:你好,有什么我能帮助你的吗?
用户:中国的首都是哪座城市?
坤坤:中国的首都是北京市。
用户:特朗普是谁?
坤坤:

+ 69
- 0
LLama.Examples/Examples/ChatChineseGB2312.cs View File

@@ -0,0 +1,69 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using LLama.Common;

namespace LLama.Examples.Examples
{
/// <summary>
/// Example showing how to chat in Chinese when the prompt file is stored with the
/// gb2312 code page, which is common on Chinese-language Windows systems.
/// </summary>
public class ChatChineseGB2312
{
    /// <summary>
    /// Re-encodes <paramref name="input"/> from <paramref name="original"/> to
    /// <paramref name="target"/>. Characters with no representation in the target
    /// encoding are replaced by that encoding's fallback character.
    /// </summary>
    /// <param name="input">The text to convert.</param>
    /// <param name="original">The encoding the text is currently in.</param>
    /// <param name="target">The encoding to convert the text to.</param>
    /// <returns>The converted text.</returns>
    private static string ConvertFromEncodingToAnother(string input, Encoding original, Encoding target)
    {
        byte[] bytes = original.GetBytes(input);
        var convertedBytes = Encoding.Convert(original, target, bytes);
        return target.GetString(convertedBytes);
    }

    /// <summary>
    /// Runs the interactive gb2312 chat example.
    /// </summary>
    public static async Task Run()
    {
        // Register the code-page provider so Encoding.GetEncoding("gb2312") is
        // available (the legacy code pages are not built into .NET Core).
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

        Console.Write("Please input your model path: ");
        var modelPath = Console.ReadLine();
        if (string.IsNullOrWhiteSpace(modelPath))
        {
            // Console.ReadLine() returns null on end-of-input; bail out instead of
            // passing a null/empty path to ModelParams.
            Console.WriteLine("A model path is required to run this example.");
            return;
        }

        // The prompt asset is saved as gb2312; read it with that encoding and then
        // normalize it to UTF-8, which is the encoding the model context is given below.
        var prompt = File.ReadAllText("Assets/chat-with-kunkun-chinese.txt", encoding: Encoding.GetEncoding("gb2312")).Trim();
        prompt = ConvertFromEncodingToAnother(prompt, Encoding.GetEncoding("gb2312"), Encoding.UTF8);

        var parameters = new ModelParams(modelPath)
        {
            ContextSize = 1024,
            Seed = 1337,
            GpuLayerCount = 20,
            Encoding = Encoding.UTF8
        };
        using var model = LLamaWeights.LoadFromFile(parameters);
        using var context = model.CreateContext(parameters);
        var executor = new InteractiveExecutor(context);

        var session = new ChatSession(executor).WithHistoryTransform(new LLamaTransforms.DefaultHistoryTransform("用户"));

        Console.ForegroundColor = ConsoleColor.Yellow;
        Console.WriteLine("This example shows how to use Chinese with gb2312 encoding, which is common in windows. It's recommended" +
            " to use https://huggingface.co/hfl/chinese-alpaca-2-7b-gguf/blob/main/ggml-model-q5_0.gguf, which has been verified by LLamaSharp developers.");
        Console.ForegroundColor = ConsoleColor.White;

        // Echo the initial prompt so the user sees the conversation context.
        Console.Write(prompt);
        while (true)
        {
            await foreach (var text in session.ChatAsync(prompt, new InferenceParams()
            {
                Temperature = 0.3f,
                TopK = 5,
                TopP = 0.85f,
                AntiPrompts = new List<string> { "用户:" },
                MaxTokens = 2048,
                RepeatPenalty = 1.05f
            }))
            {
                // Convert the UTF-8 model output back to gb2312 before printing so it
                // renders correctly on a gb2312 console.
                Console.Write(ConvertFromEncodingToAnother(text, Encoding.UTF8, Encoding.GetEncoding("gb2312")));
            }

            Console.ForegroundColor = ConsoleColor.Green;
            // Fall back to an empty string on end-of-input so ChatAsync never receives null.
            prompt = Console.ReadLine() ?? string.Empty;
            Console.ForegroundColor = ConsoleColor.White;
        }
    }
}
}

+ 1
- 0
LLama.Examples/Examples/Runner.cs View File

@@ -23,6 +23,7 @@ public class Runner
{ "Coding Assistant.", CodingAssistant.Run },
{ "Batch Decoding.", BatchedDecoding.Run },
{ "SK Kernel Memory.", KernelMemory.Run },
{ "Chinese gb2312 chat", ChatChineseGB2312.Run },
{ "Exit", async () => Environment.Exit(0) }
};



+ 4
- 1
LLama.Examples/LLama.Examples.csproj View File

@@ -1,4 +1,4 @@
<Project Sdk="Microsoft.NET.Sdk">
<Project Sdk="Microsoft.NET.Sdk">
<Import Project="..\LLama\LLamaSharp.Runtime.targets" />
<PropertyGroup>
<OutputType>Exe</OutputType>
@@ -68,6 +68,9 @@
<None Update="Assets\sample-SK-Readme.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="Assets\chat-with-kunkun-chinese.txt">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
</ItemGroup>

</Project>

+ 14
- 0
LLama/Native/NativeApi.cs View File

@@ -9,6 +9,17 @@ namespace LLama.Native
{
using llama_token = Int32;

/// <summary>
/// The type of a llama token. Values mirror the llama_token_type enum exposed by the
/// native library (see the llama_token_get_type P/Invoke in this file).
/// </summary>
public enum LLamaTokenType
{
    /// <summary>The token's type is undefined.</summary>
    LLAMA_TOKEN_TYPE_UNDEFINED = 0,
    /// <summary>A normal vocabulary token.</summary>
    LLAMA_TOKEN_TYPE_NORMAL = 1,
    /// <summary>An unknown token.</summary>
    LLAMA_TOKEN_TYPE_UNKNOWN = 2,
    /// <summary>A control token (e.g. begin/end-of-sequence markers).</summary>
    LLAMA_TOKEN_TYPE_CONTROL = 3,
    /// <summary>A user-defined token.</summary>
    LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
    /// <summary>An unused vocabulary slot.</summary>
    LLAMA_TOKEN_TYPE_UNUSED = 5,
    /// <summary>A byte-level token.</summary>
    LLAMA_TOKEN_TYPE_BYTE = 6,
}

/// <summary>
/// Callback from llama.cpp with log messages
/// </summary>
@@ -243,6 +254,9 @@ namespace LLama.Native
}
}

/// <summary>
/// Get the type of the given token.
/// </summary>
/// <param name="model">The model whose vocabulary the token belongs to.</param>
/// <param name="token">The token to query.</param>
/// <returns>The <see cref="LLamaTokenType"/> of the token.</returns>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern LLamaTokenType llama_token_get_type(SafeLlamaModelHandle model, llama_token token);

/// <summary>
/// Get the size of the context window for the model for this context
/// </summary>


Loading…
Cancel
Save