From 02a9a12d0f12efb219ca877d1c1006c18d7c16e5 Mon Sep 17 00:00:00 2001
From: dota17
Date: Fri, 10 Jan 2020 10:43:28 +0800
Subject: [PATCH] validate utf-8 string

---
 json_tokener.c            | 52 ++++++++++++++++++++++++++++++++++++++++
 json_tokener.h            |  7 ++++++
 tests/test_parse.c        | 22 +++++++++++++++++
 tests/test_parse.expected | 18 +++++++++++++-
 4 files changed, 98 insertions(+), 1 deletion(-)

diff --git a/json_tokener.c b/json_tokener.c
index fc8fb65..f786c9f 100644
--- a/json_tokener.c
+++ b/json_tokener.c
@@ -83,6 +83,7 @@ static const char* json_tokener_errors[] = {
   "object value separator ',' expected",
   "invalid string sequence",
   "expected comment",
+  "invalid utf-8 string",
   "buffer size overflow"
 };
 
@@ -282,6 +283,13 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
   }
 #endif
 
+  if ((tok->flags & JSON_TOKENER_STRICT) &&
+      (!json_tokener_validate_utf8(str)))
+  {
+    tok->err = json_tokener_error_parse_utf8_string;
+    goto out;
+  }
+
   while (PEEK_CHAR(c, tok))
   {
   redo_char:
@@ -985,6 +993,50 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
   return NULL;
 }
 
+json_bool json_tokener_validate_utf8(const char *str)
+{
+  unsigned int nBytes = 0;
+  unsigned char chr = *str;
+  unsigned int i;
+  for (i = 0; str[i] != '\0'; ++i)
+  {
+    chr = *(str + i);
+    if (nBytes == 0)
+    {
+      /* Lead byte of a multibyte character: determine its length (nBytes) */
+      if (chr >= 0x80)
+      {
+        if (chr >= 0xFC && chr <= 0xFD)
+          nBytes = 6;
+        else if (chr >= 0xF8)
+          nBytes = 5;
+        else if (chr >= 0xF0)
+          nBytes = 4;
+        else if (chr >= 0xE0)
+          nBytes = 3;
+        else if (chr >= 0xC0)
+          nBytes = 2;
+        else
+          return 0;
+        nBytes--;
+      }
+    }
+    else
+    {
+      /* Continuation bytes of a multibyte character must be 10xxxxxx */
+      if ((chr & 0xC0) != 0x80)
+        return 0;
+      nBytes--;
+    }
+  }
+  /* A truncated multibyte sequence violates the UTF-8 encoding rules */
+  if (nBytes != 0)
+  {
+    return 0;
+  }
+  return 1;
+}
+
 void json_tokener_set_flags(struct json_tokener *tok, int flags)
 {
   tok->flags = flags;
diff --git a/json_tokener.h b/json_tokener.h
index da2b24c..7612a0d 100644
--- a/json_tokener.h
+++ b/json_tokener.h
@@ -38,6 +38,7 @@ enum json_tokener_error {
   json_tokener_error_parse_object_value_sep,
   json_tokener_error_parse_string,
   json_tokener_error_parse_comment,
+  json_tokener_error_parse_utf8_string,
   json_tokener_error_size
 };
 
@@ -162,6 +163,12 @@ JSON_EXPORT void json_tokener_reset(struct json_tokener *tok);
 JSON_EXPORT struct json_object* json_tokener_parse(const char *str);
 JSON_EXPORT struct json_object* json_tokener_parse_verbose(const char *str, enum json_tokener_error *error);
 
+/**
+ * Validate the utf-8 string before parsing it in strict mode.
+ * Returns 0 if the string is not valid utf-8.
+ */
+json_bool json_tokener_validate_utf8(const char *str);
+
 /**
  * Set flags that control how parsing will be done.
 */
diff --git a/tests/test_parse.c b/tests/test_parse.c
index 807b457..1dbfcf7 100644
--- a/tests/test_parse.c
+++ b/tests/test_parse.c
@@ -355,6 +355,28 @@ struct incremental_step {
   { "[1,2,3,]", -1, 7, json_tokener_error_parse_unexpected, 3 },
   { "{\"a\":1,}", -1, 7, json_tokener_error_parse_unexpected, 3 },
 
+  // ascii encoding "123asc$%&"
+  { "\x22\x31\x32\x33\x61\x73\x63\x24\x25\x26\x22", -1, -1, json_tokener_success, 3 },
+  { "\x22\x31\x32\x33\x61\x73\x63\x24\x25\x26\x22", -1, -1, json_tokener_success, 1 },
+  // utf-8 encoding "世界" "πφ" "𥑕"
+  { "\x22\xe4\xb8\x96\xe7\x95\x8c\x22", -1, -1, json_tokener_success, 3 },
+  { "\x22\xe4\xb8\x96\xe7\x95\x8c\x22", -1, -1, json_tokener_success, 1 },
+  { "\x22\xcf\x80\xcf\x86\x22", -1, -1, json_tokener_success, 3 },
+  { "\x22\xf0\xa5\x91\x95\x22", -1, -1, json_tokener_success, 3 },
+  { "\x22\xf8\xa5\xa5\x91\x95\x22", -1, -1, json_tokener_success, 3 },
+  { "\x22\xfd\xa5\xa5\xa5\x91\x95\x22", -1, -1, json_tokener_success, 3 },
+  // invalid utf-8 encoding
+  { "\x22\xe6\x9d\x4e\x22", -1, 0, json_tokener_error_parse_utf8_string, 3 },
+  { "\x22\xe6\x9d\x4e\x22", -1, 5, json_tokener_success, 1 },
+  // GBK encoding
+  { "\x22\xc0\xee\xc5\xf4\x22", -1, 0, json_tokener_error_parse_utf8_string, 3 },
+  { "\x22\xc0\xee\xc5\xf4\x22", -1, 6, json_tokener_success, 1 },
+  // ucs-2/utf-16 encoding
+  { "\x22\x11\xd2\x22", -1, 0, json_tokener_error_parse_utf8_string, 3 },
+  { "\x22\x11\xd2\x22", -1, 4, json_tokener_success, 1 },
+  { "\x22\x55\xd8\55\xdc\x22", -1, 0, json_tokener_error_parse_utf8_string, 3 },
+  { "\x22\x16\x4e\x4c\x75\x22", -1, 6, json_tokener_success, 1 },
+
   { NULL, -1, -1, json_tokener_success, 0 },
 };
diff --git a/tests/test_parse.expected b/tests/test_parse.expected
index af075b0..18f1e3d 100644
--- a/tests/test_parse.expected
+++ b/tests/test_parse.expected
@@ -183,5 +183,21 @@ json_tokener_parse_ex(tok, [1,2,3,] , 8) ... OK: got object of type [array]
 json_tokener_parse_ex(tok, [1,2,,3,] , 9) ... OK: got correct error: unexpected character
 json_tokener_parse_ex(tok, [1,2,3,] , 8) ... OK: got correct error: unexpected character
 json_tokener_parse_ex(tok, {"a":1,} , 8) ... OK: got correct error: unexpected character
-End Incremental Tests OK=105 ERROR=0
+json_tokener_parse_ex(tok, "123asc$%&" , 11) ... OK: got object of type [string]: "123asc$%&"
+json_tokener_parse_ex(tok, "123asc$%&" , 11) ... OK: got object of type [string]: "123asc$%&"
+json_tokener_parse_ex(tok, "世界" , 8) ... OK: got object of type [string]: "世界"
+json_tokener_parse_ex(tok, "世界" , 8) ... OK: got object of type [string]: "世界"
+json_tokener_parse_ex(tok, "πφ" , 6) ... OK: got object of type [string]: "πφ"
+json_tokener_parse_ex(tok, "𥑕" , 6) ... OK: got object of type [string]: "𥑕"
+json_tokener_parse_ex(tok, "" , 7) ... OK: got object of type [string]: ""
+json_tokener_parse_ex(tok, "" , 8) ... OK: got object of type [string]: ""
+json_tokener_parse_ex(tok, "N" , 5) ... OK: got correct error: invalid utf-8 string
+json_tokener_parse_ex(tok, "N" , 5) ... OK: got object of type [string]: "N"
+json_tokener_parse_ex(tok, "" , 6) ... OK: got correct error: invalid utf-8 string
+json_tokener_parse_ex(tok, "" , 6) ... OK: got object of type [string]: ""
+json_tokener_parse_ex(tok, "" , 4) ... OK: got correct error: invalid utf-8 string
+json_tokener_parse_ex(tok, "" , 4) ... OK: got object of type [string]: "\u0011"
+json_tokener_parse_ex(tok, "U-" , 6) ... OK: got correct error: invalid utf-8 string
+json_tokener_parse_ex(tok, "NLu" , 6) ... OK: got object of type [string]: "\u0016NLu"
+End Incremental Tests OK=121 ERROR=0
 ==================================
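
Note for reviewers: below is a minimal sketch of how the new strict-mode check can be exercised once this patch is applied. It is illustrative only and not part of the patch; it uses only the existing public tokener API (json_tokener_new, json_tokener_set_flags, json_tokener_parse_ex, json_tokener_get_error, json_tokener_error_desc, json_tokener_free), and the GBK byte sequence is copied from the test table above. With the patch applied it should print the new "invalid utf-8 string" error description; without JSON_TOKENER_STRICT the same input still parses as before.

/* utf8_strict_demo.c - illustrative only, not part of the patch */
#include <stdio.h>
#include <string.h>

#include "json_object.h"
#include "json_tokener.h"

int main(void)
{
  /* A GBK-encoded (non-UTF-8) string literal, taken from the test table above. */
  const char *gbk = "\x22\xc0\xee\xc5\xf4\x22";
  struct json_tokener *tok = json_tokener_new();
  struct json_object *obj;

  if (!tok)
    return 1;

  /* The new json_tokener_validate_utf8() check only runs in strict mode. */
  json_tokener_set_flags(tok, JSON_TOKENER_STRICT);

  obj = json_tokener_parse_ex(tok, gbk, (int)strlen(gbk));
  if (obj == NULL)
    printf("parse failed: %s\n",
           json_tokener_error_desc(json_tokener_get_error(tok)));
  else
    json_object_put(obj); /* not expected with this patch applied */

  json_tokener_free(tok);
  return 0;
}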