From 02a9a12d0f12efb219ca877d1c1006c18d7c16e5 Mon Sep 17 00:00:00 2001
From: dota17
Date: Fri, 10 Jan 2020 10:43:28 +0800
Subject: [PATCH] validate utf-8 string

---
 json_tokener.c            | 52 ++++++++++++++++++++++++++++++++++++++++
 json_tokener.h            |  7 ++++++
 tests/test_parse.c        | 22 +++++++++++++++++
 tests/test_parse.expected | 18 +++++++++++++-
 4 files changed, 98 insertions(+), 1 deletion(-)

diff --git a/json_tokener.c b/json_tokener.c
index fc8fb65..f786c9f 100644
--- a/json_tokener.c
+++ b/json_tokener.c
@@ -83,6 +83,7 @@ static const char* json_tokener_errors[] = {
   "object value separator ',' expected",
   "invalid string sequence",
   "expected comment",
+  "invalid utf-8 string",
   "buffer size overflow"
 };
 
@@ -282,6 +283,13 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
   }
 #endif
 
+  if ((tok->flags & JSON_TOKENER_STRICT) &&
+      (!json_tokener_validate_utf8(str)))
+  {
+    tok->err = json_tokener_error_parse_utf8_string;
+    goto out;
+  }
+
   while (PEEK_CHAR(c, tok))
   {
   redo_char:
@@ -985,6 +993,50 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
   return NULL;
 }
 
+json_bool json_tokener_validate_utf8(const char *str)
+{
+  unsigned int nBytes = 0;
+  unsigned char chr = *str;
+  unsigned int i;
+  for (i = 0; str[i] != '\0'; ++i)
+  {
+    chr = *(str + i);
+    if (nBytes == 0)
+    {
+      /* Lead byte of a multibyte character: determine its length (nBytes) */
+      if (chr >= 0x80)
+      {
+        if (chr >= 0xFC && chr <= 0xFD)
+          nBytes = 6;
+        else if (chr >= 0xF8)
+          nBytes = 5;
+        else if (chr >= 0xF0)
+          nBytes = 4;
+        else if (chr >= 0xE0)
+          nBytes = 3;
+        else if (chr >= 0xC0)
+          nBytes = 2;
+        else
+          return 0;
+        nBytes--;
+      }
+    }
+    else
+    {
+      /* Continuation bytes of a multibyte character must be 10xxxxxx */
+      if ((chr & 0xC0) != 0x80)
+        return 0;
+      nBytes--;
+    }
+  }
+  /* A truncated multibyte sequence violates the UTF-8 encoding rules */
+  if (nBytes != 0)
+  {
+    return 0;
+  }
+  return 1;
+}
+
 void json_tokener_set_flags(struct json_tokener *tok, int flags)
 {
   tok->flags = flags;
diff --git a/json_tokener.h b/json_tokener.h
index da2b24c..7612a0d 100644
--- a/json_tokener.h
+++ b/json_tokener.h
@@ -38,6 +38,7 @@ enum json_tokener_error {
   json_tokener_error_parse_object_value_sep,
   json_tokener_error_parse_string,
   json_tokener_error_parse_comment,
+  json_tokener_error_parse_utf8_string,
   json_tokener_error_size
 };
 
@@ -162,6 +163,12 @@ JSON_EXPORT void json_tokener_reset(struct json_tokener *tok);
 JSON_EXPORT struct json_object* json_tokener_parse(const char *str);
 JSON_EXPORT struct json_object* json_tokener_parse_verbose(const char *str, enum json_tokener_error *error);
 
+/**
+ * Validate the utf-8 string before parsing it in strict mode.
+ * Returns 0 if the string is not valid utf-8.
+ */
+json_bool json_tokener_validate_utf8(const char *str);
+
 /**
  * Set flags that control how parsing will be done.
 */
diff --git a/tests/test_parse.c b/tests/test_parse.c
index 807b457..1dbfcf7 100644
--- a/tests/test_parse.c
+++ b/tests/test_parse.c
@@ -355,6 +355,28 @@ struct incremental_step {
   { "[1,2,3,]", -1, 7, json_tokener_error_parse_unexpected, 3 },
   { "{\"a\":1,}", -1, 7, json_tokener_error_parse_unexpected, 3 },
 
+  // ascii encoding "123asc$%&"
+  { "\x22\x31\x32\x33\x61\x73\x63\x24\x25\x26\x22", -1, -1, json_tokener_success, 3 },
+  { "\x22\x31\x32\x33\x61\x73\x63\x24\x25\x26\x22", -1, -1, json_tokener_success, 1 },
+  // utf-8 encoding "世界" "πφ" "𥑕"
+  { "\x22\xe4\xb8\x96\xe7\x95\x8c\x22", -1, -1, json_tokener_success, 3 },
+  { "\x22\xe4\xb8\x96\xe7\x95\x8c\x22", -1, -1, json_tokener_success, 1 },
+  { "\x22\xcf\x80\xcf\x86\x22", -1, -1, json_tokener_success, 3 },
+  { "\x22\xf0\xa5\x91\x95\x22", -1, -1, json_tokener_success, 3 },
+  { "\x22\xf8\xa5\xa5\x91\x95\x22", -1, -1, json_tokener_success, 3 },
+  { "\x22\xfd\xa5\xa5\xa5\x91\x95\x22", -1, -1, json_tokener_success, 3 },
+  // invalid utf-8 encoding
+  { "\x22\xe6\x9d\x4e\x22", -1, 0, json_tokener_error_parse_utf8_string, 3 },
+  { "\x22\xe6\x9d\x4e\x22", -1, 5, json_tokener_success, 1 },
+  // GBK encoding
+  { "\x22\xc0\xee\xc5\xf4\x22", -1, 0, json_tokener_error_parse_utf8_string, 3 },
+  { "\x22\xc0\xee\xc5\xf4\x22", -1, 6, json_tokener_success, 1 },
+  // ucs-2/utf-16 encoding
+  { "\x22\x11\xd2\x22", -1, 0, json_tokener_error_parse_utf8_string, 3 },
+  { "\x22\x11\xd2\x22", -1, 4, json_tokener_success, 1 },
+  { "\x22\x55\xd8\55\xdc\x22", -1, 0, json_tokener_error_parse_utf8_string, 3 },
+  { "\x22\x16\x4e\x4c\x75\x22", -1, 6, json_tokener_success, 1 },
+
   { NULL, -1, -1, json_tokener_success, 0 },
 };
diff --git a/tests/test_parse.expected b/tests/test_parse.expected
index af075b0..18f1e3d 100644
--- a/tests/test_parse.expected
+++ b/tests/test_parse.expected
@@ -183,5 +183,21 @@ json_tokener_parse_ex(tok, [1,2,3,] , 8) ... OK: got object of type [array]
 json_tokener_parse_ex(tok, [1,2,,3,] , 9) ... OK: got correct error: unexpected character
 json_tokener_parse_ex(tok, [1,2,3,] , 8) ... OK: got correct error: unexpected character
 json_tokener_parse_ex(tok, {"a":1,} , 8) ... OK: got correct error: unexpected character
-End Incremental Tests OK=105 ERROR=0
+json_tokener_parse_ex(tok, "123asc$%&" , 11) ... OK: got object of type [string]: "123asc$%&"
+json_tokener_parse_ex(tok, "123asc$%&" , 11) ... OK: got object of type [string]: "123asc$%&"
+json_tokener_parse_ex(tok, "世界" , 8) ... OK: got object of type [string]: "世界"
+json_tokener_parse_ex(tok, "世界" , 8) ... OK: got object of type [string]: "世界"
+json_tokener_parse_ex(tok, "πφ" , 6) ... OK: got object of type [string]: "πφ"
+json_tokener_parse_ex(tok, "𥑕" , 6) ... OK: got object of type [string]: "𥑕"
+json_tokener_parse_ex(tok, "" , 7) ... OK: got object of type [string]: ""
+json_tokener_parse_ex(tok, "" , 8) ... OK: got object of type [string]: ""
+json_tokener_parse_ex(tok, "N" , 5) ... OK: got correct error: invalid utf-8 string
+json_tokener_parse_ex(tok, "N" , 5) ... OK: got object of type [string]: "N"
+json_tokener_parse_ex(tok, "" , 6) ... OK: got correct error: invalid utf-8 string
+json_tokener_parse_ex(tok, "" , 6) ... OK: got object of type [string]: ""
+json_tokener_parse_ex(tok, "" , 4) ... OK: got correct error: invalid utf-8 string
+json_tokener_parse_ex(tok, "" , 4) ... OK: got object of type [string]: "\u0011"
+json_tokener_parse_ex(tok, "U-" , 6) ... OK: got correct error: invalid utf-8 string
+json_tokener_parse_ex(tok, "NLu" , 6) ... OK: got object of type [string]: "\u0016NLu"
+End Incremental Tests OK=121 ERROR=0
 ==================================
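
Note for reviewers: below is a minimal sketch of how the new strict-mode check can be exercised once this patch is applied. It is illustrative only and not part of the patch; it uses only the existing public tokener API (json_tokener_new, json_tokener_set_flags, json_tokener_parse_ex, json_tokener_get_error, json_tokener_error_desc, json_tokener_free), and the GBK byte sequence is copied from the test table above. With the patch applied it should print the new "invalid utf-8 string" error description; without JSON_TOKENER_STRICT the same input still parses as before.

/* utf8_strict_demo.c - illustrative only, not part of the patch */
#include <stdio.h>
#include <string.h>

#include "json_object.h"
#include "json_tokener.h"

int main(void)
{
  /* A GBK-encoded (non-UTF-8) string literal, taken from the test table above. */
  const char *gbk = "\x22\xc0\xee\xc5\xf4\x22";
  struct json_tokener *tok = json_tokener_new();
  struct json_object *obj;

  if (!tok)
    return 1;

  /* The new json_tokener_validate_utf8() check only runs in strict mode. */
  json_tokener_set_flags(tok, JSON_TOKENER_STRICT);

  obj = json_tokener_parse_ex(tok, gbk, (int)strlen(gbk));
  if (obj == NULL)
    printf("parse failed: %s\n",
           json_tokener_error_desc(json_tokener_get_error(tok)));
  else
    json_object_put(obj); /* not expected with this patch applied */

  json_tokener_free(tok);
  return 0;
}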