validate utf-8 stringtags/json-c-0.14-20200419
@@ -83,6 +83,7 @@ static const char* json_tokener_errors[] = { | |||||
"object value separator ',' expected", | "object value separator ',' expected", | ||||
"invalid string sequence", | "invalid string sequence", | ||||
"expected comment", | "expected comment", | ||||
"invalid utf-8 string", | |||||
"buffer size overflow" | "buffer size overflow" | ||||
}; | }; | ||||
@@ -222,8 +223,12 @@ struct json_object* json_tokener_parse_verbose(const char *str, | |||||
: \ | : \ | ||||
(((tok)->err = json_tokener_continue), 0) \ | (((tok)->err = json_tokener_continue), 0) \ | ||||
) : \ | ) : \ | ||||
(((dest) = *str), 1) \ | |||||
) | |||||
(((tok->flags & JSON_TOKENER_VALIDATE_UTF8) && \ | |||||
(!json_tokener_validate_utf8(*str, nBytesp)))? \ | |||||
((tok->err = json_tokener_error_parse_utf8_string), 0) \ | |||||
: \ | |||||
(((dest) = *str), 1) \ | |||||
)) | |||||
/* ADVANCE_CHAR() macro: | /* ADVANCE_CHAR() macro: | ||||
* Increments str & tok->char_offset. | * Increments str & tok->char_offset. | ||||
@@ -242,6 +247,9 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok, | |||||
{ | { | ||||
struct json_object *obj = NULL; | struct json_object *obj = NULL; | ||||
char c = '\1'; | char c = '\1'; | ||||
unsigned int nBytes = 0; | |||||
unsigned int *nBytesp = &nBytes; | |||||
#ifdef HAVE_USELOCALE | #ifdef HAVE_USELOCALE | ||||
locale_t oldlocale = uselocale(NULL); | locale_t oldlocale = uselocale(NULL); | ||||
locale_t newloc; | locale_t newloc; | ||||
@@ -948,6 +956,10 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok, | |||||
} /* while(PEEK_CHAR) */ | } /* while(PEEK_CHAR) */ | ||||
out: | out: | ||||
if ((tok->flags & JSON_TOKENER_VALIDATE_UTF8) && (nBytes != 0)) | |||||
{ | |||||
tok->err = json_tokener_error_parse_utf8_string; | |||||
} | |||||
if (c && | if (c && | ||||
(state == json_tokener_state_finish) && | (state == json_tokener_state_finish) && | ||||
(tok->depth == 0) && | (tok->depth == 0) && | ||||
@@ -985,6 +997,32 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok, | |||||
return NULL; | return NULL; | ||||
} | } | ||||
/*
 * Incremental, byte-at-a-time structural check of UTF-8 input.
 *
 * c       the next input byte.
 * nBytes  in/out counter of continuation bytes still owed by the current
 *         multi-byte sequence; it must be 0 before the first byte is fed.
 *
 * Returns 1 when the byte is acceptable at this position, 0 otherwise.
 * On a 0 return *nBytes is left unchanged.
 *
 * Lead bytes are classified by their high-bit pattern only (110xxxxx,
 * 1110xxxx, 11110xxx); the decoded code point itself is not inspected.
 */
json_bool json_tokener_validate_utf8(const char c, unsigned int *nBytes)
{
	const unsigned char byte = (unsigned char)c;

	/* In the middle of a sequence: only a 10xxxxxx byte may follow. */
	if (*nBytes != 0)
	{
		if ((byte & 0xC0) != 0x80)
			return 0;
		*nBytes -= 1;
		return 1;
	}

	/* A 7-bit byte is a complete character on its own. */
	if (byte < 0x80)
		return 1;

	/* Start of a multi-byte sequence: record how many bytes must follow. */
	if ((byte & 0xe0) == 0xc0)
	{
		*nBytes = 1;
		return 1;
	}
	if ((byte & 0xf0) == 0xe0)
	{
		*nBytes = 2;
		return 1;
	}
	if ((byte & 0xf8) == 0xf0)
	{
		*nBytes = 3;
		return 1;
	}

	/* Stray continuation byte (10xxxxxx) or a 11111xxx byte. */
	return 0;
}
void json_tokener_set_flags(struct json_tokener *tok, int flags) | void json_tokener_set_flags(struct json_tokener *tok, int flags) | ||||
{ | { | ||||
tok->flags = flags; | tok->flags = flags; | ||||
@@ -38,6 +38,7 @@ enum json_tokener_error { | |||||
json_tokener_error_parse_object_value_sep, | json_tokener_error_parse_object_value_sep, | ||||
json_tokener_error_parse_string, | json_tokener_error_parse_string, | ||||
json_tokener_error_parse_comment, | json_tokener_error_parse_comment, | ||||
json_tokener_error_parse_utf8_string, | |||||
json_tokener_error_size | json_tokener_error_size | ||||
}; | }; | ||||
@@ -136,6 +137,17 @@ typedef struct json_tokener json_tokener; | |||||
*/ | */ | ||||
#define JSON_TOKENER_STRICT 0x01 | #define JSON_TOKENER_STRICT 0x01 | ||||
/** | |||||
* Make json_tokener_parse_ex() validate that its input is UTF-8. | |||||
* json_tokener_validate_utf8() checks each byte as it is read, | |||||
* before that byte is parsed. | |||||
* | |||||
* This flag is not set by default. | |||||
* | |||||
* @see json_tokener_set_flags() | |||||
*/ | |||||
#define JSON_TOKENER_VALIDATE_UTF8 0x10 | |||||
/** | /** | ||||
* Given an error previously returned by json_tokener_get_error(), | * Given an error previously returned by json_tokener_get_error(), | ||||
* return a human readable description of the error. | * return a human readable description of the error. | ||||
@@ -162,6 +174,11 @@ JSON_EXPORT void json_tokener_reset(struct json_tokener *tok); | |||||
JSON_EXPORT struct json_object* json_tokener_parse(const char *str); | JSON_EXPORT struct json_object* json_tokener_parse(const char *str); | ||||
JSON_EXPORT struct json_object* json_tokener_parse_verbose(const char *str, enum json_tokener_error *error); | JSON_EXPORT struct json_object* json_tokener_parse_verbose(const char *str, enum json_tokener_error *error); | ||||
/** | |||||
* Validate one byte of a UTF-8 sequence; nBytes tracks the continuation bytes still expected. | |||||
* Returns 0 if the byte is not valid at this position, 1 otherwise. | |||||
*/ | |||||
json_bool json_tokener_validate_utf8(const char c, unsigned int *nBytes); | |||||
/** | /** | ||||
* Set flags that control how parsing will be done. | * Set flags that control how parsing will be done. | ||||
*/ | */ | ||||
@@ -355,6 +355,39 @@ struct incremental_step { | |||||
{ "[1,2,3,]", -1, 7, json_tokener_error_parse_unexpected, 3 }, | { "[1,2,3,]", -1, 7, json_tokener_error_parse_unexpected, 3 }, | ||||
{ "{\"a\":1,}", -1, 7, json_tokener_error_parse_unexpected, 3 }, | { "{\"a\":1,}", -1, 7, json_tokener_error_parse_unexpected, 3 }, | ||||
// utf-8 test | |||||
// ASCII encoding | |||||
{ "\x22\x31\x32\x33\x61\x73\x63\x24\x25\x26\x22",-1, -1, json_tokener_success, 5 }, | |||||
{ "\x22\x31\x32\x33\x61\x73\x63\x24\x25\x26\x22",-1, -1, json_tokener_success, 1 }, | |||||
// utf-8 encoding | |||||
{ "\x22\xe4\xb8\x96\xe7\x95\x8c\x22",-1, -1, json_tokener_success, 5 }, | |||||
{ "\x22\xe4\xb8",-1, 3, json_tokener_error_parse_utf8_string, 4 }, | |||||
{ "\x96\xe7\x95\x8c\x22",-1, 0, json_tokener_error_parse_utf8_string, 5 }, | |||||
{ "\x22\xe4\xb8\x96\xe7\x95\x8c\x22",-1, -1, json_tokener_success, 1 }, | |||||
{ "\x22\xcf\x80\xcf\x86\x22",-1, -1, json_tokener_success, 5 }, | |||||
{ "\x22\xf0\xa5\x91\x95\x22",-1, -1, json_tokener_success, 5 }, | |||||
// wrong utf-8 encoding | |||||
{ "\x22\xe6\x9d\x4e\x22",-1, 3, json_tokener_error_parse_utf8_string, 5 }, | |||||
{ "\x22\xe6\x9d\x4e\x22",-1, 5, json_tokener_success, 1 }, | |||||
// GBK encoding | |||||
{ "\x22\xc0\xee\xc5\xf4\x22",-1, 2, json_tokener_error_parse_utf8_string, 5 }, | |||||
{ "\x22\xc0\xee\xc5\xf4\x22",-1, 6, json_tokener_success, 1 }, | |||||
// char after space | |||||
{ "\x20\x20\x22\xe4\xb8\x96\x22",-1, -1, json_tokener_success, 5 }, | |||||
{ "\x20\x20\x81\x22\xe4\xb8\x96\x22",-1, 2, json_tokener_error_parse_utf8_string, 5 }, | |||||
{ "\x5b\x20\x81\x31\x5d",-1, 2, json_tokener_error_parse_utf8_string, 5 }, | |||||
// char in state inf | |||||
{ "\x49\x6e\x66\x69\x6e\x69\x74\x79",9, 8, json_tokener_success, 1 }, | |||||
{ "\x49\x6e\x66\x81\x6e\x69\x74\x79",-1, 3, json_tokener_error_parse_utf8_string, 5 }, | |||||
// char in escape unicode | |||||
{ "\x22\x5c\x75\x64\x38\x35\x35\x5c\x75\x64\x63\x35\x35\x22",15, 14, json_tokener_success, 5 }, | |||||
{ "\x22\x5c\x75\x64\x38\x35\x35\xc0\x75\x64\x63\x35\x35\x22",-1, 8, json_tokener_error_parse_utf8_string, 5 }, | |||||
{ "\x22\x5c\x75\x64\x30\x30\x33\x31\xc0\x22",-1, 9, json_tokener_error_parse_utf8_string, 5 }, | |||||
// char in number | |||||
{ "\x31\x31\x81\x31\x31",-1, 2, json_tokener_error_parse_utf8_string, 5 }, | |||||
// char in object | |||||
{ "\x7b\x22\x31\x81\x22\x3a\x31\x7d",-1, 3, json_tokener_error_parse_utf8_string, 5 }, | |||||
{ NULL, -1, -1, json_tokener_success, 0 }, | { NULL, -1, -1, json_tokener_success, 0 }, | ||||
}; | }; | ||||
@@ -389,9 +422,19 @@ static void test_incremental_parse() | |||||
size_t expected_char_offset; | size_t expected_char_offset; | ||||
if (step->reset_tokener & 2) | if (step->reset_tokener & 2) | ||||
json_tokener_set_flags(tok, JSON_TOKENER_STRICT); | |||||
{ | |||||
if (step->reset_tokener & 4) | |||||
json_tokener_set_flags(tok, 3); | |||||
else | |||||
json_tokener_set_flags(tok, JSON_TOKENER_STRICT); | |||||
} | |||||
else | else | ||||
json_tokener_set_flags(tok, 0); | |||||
{ | |||||
if (step->reset_tokener & 4) | |||||
json_tokener_set_flags(tok, JSON_TOKENER_VALIDATE_UTF8); | |||||
else | |||||
json_tokener_set_flags(tok, 0); | |||||
} | |||||
if (length == -1) | if (length == -1) | ||||
length = strlen(step->string_to_parse); | length = strlen(step->string_to_parse); | ||||
@@ -183,5 +183,27 @@ json_tokener_parse_ex(tok, [1,2,3,] , 8) ... OK: got object of type [array] | |||||
json_tokener_parse_ex(tok, [1,2,,3,] , 9) ... OK: got correct error: unexpected character | json_tokener_parse_ex(tok, [1,2,,3,] , 9) ... OK: got correct error: unexpected character | ||||
json_tokener_parse_ex(tok, [1,2,3,] , 8) ... OK: got correct error: unexpected character | json_tokener_parse_ex(tok, [1,2,3,] , 8) ... OK: got correct error: unexpected character | ||||
json_tokener_parse_ex(tok, {"a":1,} , 8) ... OK: got correct error: unexpected character | json_tokener_parse_ex(tok, {"a":1,} , 8) ... OK: got correct error: unexpected character | ||||
End Incremental Tests OK=105 ERROR=0 | |||||
json_tokener_parse_ex(tok, "123asc$%&" , 11) ... OK: got object of type [string]: "123asc$%&" | |||||
json_tokener_parse_ex(tok, "123asc$%&" , 11) ... OK: got object of type [string]: "123asc$%&" | |||||
json_tokener_parse_ex(tok, "世界" , 8) ... OK: got object of type [string]: "世界" | |||||
json_tokener_parse_ex(tok, "ä¸ , 3) ... OK: got correct error: invalid utf-8 string | |||||
json_tokener_parse_ex(tok, –界" , 5) ... OK: got correct error: invalid utf-8 string | |||||
json_tokener_parse_ex(tok, "世界" , 8) ... OK: got object of type [string]: "世界" | |||||
json_tokener_parse_ex(tok, "πφ" , 6) ... OK: got object of type [string]: "πφ" | |||||
json_tokener_parse_ex(tok, "𥑕" , 6) ... OK: got object of type [string]: "𥑕" | |||||
json_tokener_parse_ex(tok, "æ�N" , 5) ... OK: got correct error: invalid utf-8 string | |||||
json_tokener_parse_ex(tok, "æ�N" , 5) ... OK: got object of type [string]: "æ�N" | |||||
json_tokener_parse_ex(tok, "ÀîÅô" , 6) ... OK: got correct error: invalid utf-8 string | |||||
json_tokener_parse_ex(tok, "ÀîÅô" , 6) ... OK: got object of type [string]: "ÀîÅô" | |||||
json_tokener_parse_ex(tok, "世" , 7) ... OK: got object of type [string]: "世" | |||||
json_tokener_parse_ex(tok, �"世" , 8) ... OK: got correct error: invalid utf-8 string | |||||
json_tokener_parse_ex(tok, [ �1] , 5) ... OK: got correct error: invalid utf-8 string | |||||
json_tokener_parse_ex(tok, Infinity , 9) ... OK: got object of type [double]: Infinity | |||||
json_tokener_parse_ex(tok, Inf�nity , 8) ... OK: got correct error: invalid utf-8 string | |||||
json_tokener_parse_ex(tok, "\ud855\udc55", 15) ... OK: got object of type [string]: "𥑕" | |||||
json_tokener_parse_ex(tok, "\ud855Àudc55", 14) ... OK: got correct error: invalid utf-8 string | |||||
json_tokener_parse_ex(tok, "\ud0031À" , 10) ... OK: got correct error: invalid utf-8 string | |||||
json_tokener_parse_ex(tok, 11�11 , 5) ... OK: got correct error: invalid utf-8 string | |||||
json_tokener_parse_ex(tok, {"1�":1} , 8) ... OK: got correct error: invalid utf-8 string | |||||
End Incremental Tests OK=127 ERROR=0 | |||||
================================== | ================================== |