Browse Source

validate utf-8 string

pull/530/head
dota17 5 years ago
parent
commit
02a9a12d0f
4 changed files with 98 additions and 1 deletions
  1. +52
    -0
      json_tokener.c
  2. +7
    -0
      json_tokener.h
  3. +22
    -0
      tests/test_parse.c
  4. +17
    -1
      tests/test_parse.expected

+ 52
- 0
json_tokener.c View File

@@ -83,6 +83,7 @@ static const char* json_tokener_errors[] = {
"object value separator ',' expected",
"invalid string sequence",
"expected comment",
"invalid utf-8 string",
"buffer size overflow"
};

@@ -282,6 +283,13 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
}
#endif

if ((tok->flags & JSON_TOKENER_STRICT) &&
(!json_tokener_validate_utf8(str)))
{
tok->err = json_tokener_error_parse_utf8_string;
goto out;
}

while (PEEK_CHAR(c, tok)) {

redo_char:
@@ -985,6 +993,50 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
return NULL;
}

/**
 * Check that a NUL-terminated byte string has the lead-byte /
 * continuation-byte structure of UTF-8.
 *
 * Only the framing is checked: every byte >= 0x80 must either start a
 * multibyte sequence or be one of the 10xxxxxx continuation bytes that
 * sequence announced.  Overlong forms, surrogates and code point range are
 * not checked.  The legacy 5- and 6-byte forms (lead bytes 0xF8-0xFD) are
 * still accepted; 0xFE and 0xFF, which can never appear in UTF-8, are
 * rejected.
 *
 * @param str  string to check; scanned up to (not including) the first
 *             '\0' byte.  Must not be NULL.
 * @return 1 if the whole string is well-framed, 0 otherwise (including a
 *         multibyte sequence truncated by the end of the string).
 */
json_bool json_tokener_validate_utf8(const char *str)
{
	unsigned int nBytes = 0; /* continuation bytes still expected */
	unsigned int i;

	for (i = 0; str[i] != '\0'; ++i)
	{
		/* Work on the unsigned value: plain char may be signed. */
		unsigned char chr = (unsigned char)str[i];

		if (nBytes == 0)
		{
			/* Single-byte (ASCII) character, nothing to track. */
			if (chr < 0x80)
				continue;

			/*
			 * Lead byte: the number of leading 1 bits gives the sequence
			 * length.  Matching on exact bit patterns (instead of >=
			 * comparisons) makes 0xFE/0xFF fall through to the error
			 * branch rather than being mistaken for a 5-byte lead.
			 */
			if ((chr & 0xE0) == 0xC0)      /* 110xxxxx */
				nBytes = 1;
			else if ((chr & 0xF0) == 0xE0) /* 1110xxxx */
				nBytes = 2;
			else if ((chr & 0xF8) == 0xF0) /* 11110xxx */
				nBytes = 3;
			else if ((chr & 0xFC) == 0xF8) /* 111110xx */
				nBytes = 4;
			else if ((chr & 0xFE) == 0xFC) /* 1111110x */
				nBytes = 5;
			else
				return 0; /* stray continuation byte, or 0xFE/0xFF */
		}
		else
		{
			/* Every non-first byte of a sequence must be 10xxxxxx. */
			if ((chr & 0xC0) != 0x80)
				return 0;
			nBytes--;
		}
	}

	/* A sequence cut short by the terminator violates the encoding. */
	return nBytes == 0;
}

void json_tokener_set_flags(struct json_tokener *tok, int flags)
{
tok->flags = flags;


+ 7
- 0
json_tokener.h View File

@@ -38,6 +38,7 @@ enum json_tokener_error {
json_tokener_error_parse_object_value_sep,
json_tokener_error_parse_string,
json_tokener_error_parse_comment,
json_tokener_error_parse_utf8_string,
json_tokener_error_size
};

@@ -162,6 +163,12 @@ JSON_EXPORT void json_tokener_reset(struct json_tokener *tok);
JSON_EXPORT struct json_object* json_tokener_parse(const char *str);
JSON_EXPORT struct json_object* json_tokener_parse_verbose(const char *str, enum json_tokener_error *error);

/**
 * Validate that a NUL-terminated string is framed as UTF-8.
 *
 * json_tokener_parse_ex() calls this on its input before parsing when the
 * JSON_TOKENER_STRICT flag is set, and fails with
 * json_tokener_error_parse_utf8_string if the check does not pass.
 *
 * @param str  the string to check; it is scanned up to the first '\0' byte.
 * @return 1 if the string passes the check, 0 if a malformed or truncated
 *         multibyte sequence is found.
 */
json_bool json_tokener_validate_utf8(const char *str);

/**
* Set flags that control how parsing will be done.
*/


+ 22
- 0
tests/test_parse.c View File

@@ -355,6 +355,28 @@ struct incremental_step {
{ "[1,2,3,]", -1, 7, json_tokener_error_parse_unexpected, 3 },
{ "{\"a\":1,}", -1, 7, json_tokener_error_parse_unexpected, 3 },

// ascii encoding "123asc$%&"
{ "\x22\x31\x32\x33\x61\x73\x63\x24\x25\x26\x22", -1, -1, json_tokener_success, 3 },
{ "\x22\x31\x32\x33\x61\x73\x63\x24\x25\x26\x22", -1, -1, json_tokener_success, 1 },
// utf-8 encoding "世界" "πφ" "𥑕"
{ "\x22\xe4\xb8\x96\xe7\x95\x8c\x22", -1, -1, json_tokener_success, 3 },
{ "\x22\xe4\xb8\x96\xe7\x95\x8c\x22", -1, -1, json_tokener_success, 1 },
{ "\x22\xcf\x80\xcf\x86\x22", -1, -1, json_tokener_success, 3 },
{ "\x22\xf0\xa5\x91\x95\x22", -1, -1, json_tokener_success, 3 },
{ "\x22\xf8\xa5\xa5\x91\x95\x22", -1, -1, json_tokener_success, 3 },
{ "\x22\xfd\xa5\xa5\xa5\x91\x95\x22", -1, -1, json_tokener_success, 3 },
// wrong utf-8 encoding
{ "\x22\xe6\x9d\x4e\x22", -1, 0, json_tokener_error_parse_utf8_string, 3 },
{ "\x22\xe6\x9d\x4e\x22", -1, 5, json_tokener_success, 1 },
// GBK encoding
{ "\x22\xc0\xee\xc5\xf4\x22", -1, 0, json_tokener_error_parse_utf8_string, 3 },
{ "\x22\xc0\xee\xc5\xf4\x22", -1, 6, json_tokener_success, 1 },
// ucs-2/utf-16 encoding
{ "\x22\x11\xd2\x22", -1, 0, json_tokener_error_parse_utf8_string, 3 },
{ "\x22\x11\xd2\x22", -1, 4, json_tokener_success, 1 },
{ "\x22\x55\xd8\55\xdc\x22", -1, 0, json_tokener_error_parse_utf8_string, 3 },
{ "\x22\x16\x4e\x4c\x75\x22", -1, 6, json_tokener_success, 1 },

{ NULL, -1, -1, json_tokener_success, 0 },
};



+ 17
- 1
tests/test_parse.expected View File

@@ -183,5 +183,21 @@ json_tokener_parse_ex(tok, [1,2,3,] , 8) ... OK: got object of type [array]
json_tokener_parse_ex(tok, [1,2,,3,] , 9) ... OK: got correct error: unexpected character
json_tokener_parse_ex(tok, [1,2,3,] , 8) ... OK: got correct error: unexpected character
json_tokener_parse_ex(tok, {"a":1,} , 8) ... OK: got correct error: unexpected character
End Incremental Tests OK=105 ERROR=0
json_tokener_parse_ex(tok, "123asc$%&" , 11) ... OK: got object of type [string]: "123asc$%&"
json_tokener_parse_ex(tok, "123asc$%&" , 11) ... OK: got object of type [string]: "123asc$%&"
json_tokener_parse_ex(tok, "世界" , 8) ... OK: got object of type [string]: "世界"
json_tokener_parse_ex(tok, "世界" , 8) ... OK: got object of type [string]: "世界"
json_tokener_parse_ex(tok, "πφ" , 6) ... OK: got object of type [string]: "πφ"
json_tokener_parse_ex(tok, "𥑕" , 6) ... OK: got object of type [string]: "𥑕"
json_tokener_parse_ex(tok, "ø¥¥‘•" , 7) ... OK: got object of type [string]: "ø¥¥‘•"
json_tokener_parse_ex(tok, "ý¥¥¥‘•" , 8) ... OK: got object of type [string]: "ý¥¥¥‘•"
json_tokener_parse_ex(tok, "æ�N" , 5) ... OK: got correct error: invalid utf-8 string
json_tokener_parse_ex(tok, "æ�N" , 5) ... OK: got object of type [string]: "æ�N"
json_tokener_parse_ex(tok, "ÀîÅô" , 6) ... OK: got correct error: invalid utf-8 string
json_tokener_parse_ex(tok, "ÀîÅô" , 6) ... OK: got object of type [string]: "ÀîÅô"
json_tokener_parse_ex(tok, "Ò" , 4) ... OK: got correct error: invalid utf-8 string
json_tokener_parse_ex(tok, "Ò" , 4) ... OK: got object of type [string]: "\u0011Ò"
json_tokener_parse_ex(tok, "UØ-Ü" , 6) ... OK: got correct error: invalid utf-8 string
json_tokener_parse_ex(tok, "NLu" , 6) ... OK: got object of type [string]: "\u0016NLu"
End Incremental Tests OK=121 ERROR=0
==================================

Loading…
Cancel
Save