Browse Source

Merge pull request #633 from dota17/issue616

fix issue 616: support the surrogate pair in split file.
tags/json-c-0.15-20200726
Eric Hawicz GitHub 5 years ago
parent
commit
da76ee26e7
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 113 additions and 35 deletions
  1. +18
    -7
      json_tokener.c
  2. +1
    -1
      json_tokener.h
  3. +69
    -26
      tests/test_parse.c
  4. +25
    -1
      tests/test_parse.expected

+ 18
- 7
json_tokener.c View File

@@ -630,8 +630,6 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *


case json_tokener_state_escape_unicode: case json_tokener_state_escape_unicode:
{ {
unsigned int got_hi_surrogate = 0;

/* Handle a 4-byte sequence, or two sequences if a surrogate pair */ /* Handle a 4-byte sequence, or two sequences if a surrogate pair */
while (1) while (1)
{ {
@@ -643,14 +641,24 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
{ {
unsigned char unescaped_utf[4]; unsigned char unescaped_utf[4];


if (got_hi_surrogate)
if (tok->got_hi_surrogate)
{ {
if (IS_LOW_SURROGATE(tok->ucs_char)) if (IS_LOW_SURROGATE(tok->ucs_char))
{ {
/* Remove the utf8_replacement_char */
/* that may have been appended while */
/* parsing the high surrogate. */
if (!strcmp(
tok->pb->buf,
(char *)
utf8_replacement_char))
{
printbuf_reset(tok->pb);
}
/* Recalculate the ucs_char, then fall thru to process normally */ /* Recalculate the ucs_char, then fall thru to process normally */
tok->ucs_char = tok->ucs_char =
DECODE_SURROGATE_PAIR( DECODE_SURROGATE_PAIR(
got_hi_surrogate,
tok->got_hi_surrogate,
tok->ucs_char); tok->ucs_char);
} }
else else
@@ -662,7 +670,7 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
(char *)utf8_replacement_char, (char *)utf8_replacement_char,
3); 3);
} }
got_hi_surrogate = 0;
tok->got_hi_surrogate = 0;
} }


if (tok->ucs_char < 0x80) if (tok->ucs_char < 0x80)
@@ -686,7 +694,7 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
* the beginning of another sequence, which * the beginning of another sequence, which
* should be the low surrogate. * should be the low surrogate.
*/ */
got_hi_surrogate = tok->ucs_char;
tok->got_hi_surrogate = tok->ucs_char;
/* Not at end, and the next two chars should be "\u" */ /* Not at end, and the next two chars should be "\u" */
if ((len == -1 || if ((len == -1 ||
len > (tok->char_offset + 2)) && len > (tok->char_offset + 2)) &&
@@ -717,6 +725,8 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
(char *) (char *)
utf8_replacement_char, utf8_replacement_char,
3); 3);
tok->ucs_char = 0;
tok->st_pos = 0;
goto out; goto out;
} }
tok->ucs_char = 0; tok->ucs_char = 0;
@@ -786,7 +796,8 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
if (!ADVANCE_CHAR(str, tok) || !PEEK_CHAR(c, tok)) if (!ADVANCE_CHAR(str, tok) || !PEEK_CHAR(c, tok))
{ {
/* Clean up any pending chars */ /* Clean up any pending chars */
if (got_hi_surrogate)
if (tok->got_hi_surrogate &&
strcmp(tok->pb->buf, (char *)utf8_replacement_char))
printbuf_memappend_fast( printbuf_memappend_fast(
tok->pb, (char *)utf8_replacement_char, 3); tok->pb, (char *)utf8_replacement_char, 3);
goto out; goto out;


+ 1
- 1
json_tokener.h View File

@@ -111,7 +111,7 @@ struct json_tokener
* @deprecated See json_tokener_get_error() instead. * @deprecated See json_tokener_get_error() instead.
*/ */
enum json_tokener_error err; enum json_tokener_error err;
unsigned int ucs_char;
unsigned int ucs_char, got_hi_surrogate;
char quote_char; char quote_char;
struct json_tokener_srec *stack; struct json_tokener_srec *stack;
int flags; int flags;


+ 69
- 26
tests/test_parse.c View File

@@ -224,6 +224,35 @@ struct incremental_step
{"\": {\"bar", -1, -1, json_tokener_continue, 0}, {"\": {\"bar", -1, -1, json_tokener_continue, 0},
{"\":13}}", -1, -1, json_tokener_success, 1}, {"\":13}}", -1, -1, json_tokener_success, 1},


/* Check the UTF-16 surrogate pair */
/* parse one char at a time */
{"\"\\", -1, -1, json_tokener_continue, 0},
{"u", -1, -1, json_tokener_continue, 0},
{"d", -1, -1, json_tokener_continue, 0},
{"8", -1, -1, json_tokener_continue, 0},
{"3", -1, -1, json_tokener_continue, 0},
{"4", -1, -1, json_tokener_continue, 0},
{"\\", -1, -1, json_tokener_continue, 0},
{"u", -1, -1, json_tokener_continue, 0},
{"d", -1, -1, json_tokener_continue, 0},
{"d", -1, -1, json_tokener_continue, 0},
{"1", -1, -1, json_tokener_continue, 0},
{"e\"", -1, -1, json_tokener_success, 1},
/* parse two chars at a time */
{"\"\\u", -1, -1, json_tokener_continue, 0},
{"d8", -1, -1, json_tokener_continue, 0},
{"34", -1, -1, json_tokener_continue, 0},
{"\\u", -1, -1, json_tokener_continue, 0},
{"dd", -1, -1, json_tokener_continue, 0},
{"1e\"", -1, -1, json_tokener_success, 1},
/* check the low surrogate pair */
{"\"\\ud834", -1, -1, json_tokener_continue, 0},
{"\\udd1e\"", -1, -1, json_tokener_success, 1},
{"\"\\ud834\\", -1, -1, json_tokener_continue, 0},
{"udd1e\"", -1, -1, json_tokener_success, 1},
{"\"\\ud834\\u", -1, -1, json_tokener_continue, 0},
{"dd1e\"", -1, -1, json_tokener_success, 1},

/* Check that json_tokener_reset actually resets */ /* Check that json_tokener_reset actually resets */
{"{ \"foo", -1, -1, json_tokener_continue, 1}, {"{ \"foo", -1, -1, json_tokener_continue, 1},
{": \"bar\"}", -1, 0, json_tokener_error_parse_unexpected, 1}, {": \"bar\"}", -1, 0, json_tokener_error_parse_unexpected, 1},
@@ -239,11 +268,13 @@ struct incremental_step
{"\"Y\"", -1, -1, json_tokener_success, 1}, {"\"Y\"", -1, -1, json_tokener_success, 1},


/* Trailing characters should cause a failure in strict mode */ /* Trailing characters should cause a failure in strict mode */
{"{\"foo\":9}{\"bar\":8}", -1, 9, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT },
{"{\"foo\":9}{\"bar\":8}", -1, 9, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT},


/* ... unless explicitly allowed. */ /* ... unless explicitly allowed. */
{"{\"foo\":9}{\"bar\":8}", -1, 9, json_tokener_success, 0, JSON_TOKENER_STRICT|JSON_TOKENER_ALLOW_TRAILING_CHARS },
{"{\"b\":8}ignored garbage", -1, 7, json_tokener_success, 1, JSON_TOKENER_STRICT|JSON_TOKENER_ALLOW_TRAILING_CHARS },
{"{\"foo\":9}{\"bar\":8}", -1, 9, json_tokener_success, 0,
JSON_TOKENER_STRICT | JSON_TOKENER_ALLOW_TRAILING_CHARS},
{"{\"b\":8}ignored garbage", -1, 7, json_tokener_success, 1,
JSON_TOKENER_STRICT | JSON_TOKENER_ALLOW_TRAILING_CHARS},


/* To stop parsing a number we need to reach a non-digit, e.g. a \0 */ /* To stop parsing a number we need to reach a non-digit, e.g. a \0 */
{"1", 1, 1, json_tokener_continue, 0}, {"1", 1, 1, json_tokener_continue, 0},
@@ -251,7 +282,7 @@ struct incremental_step
{"2", 2, 1, json_tokener_success, 0}, {"2", 2, 1, json_tokener_success, 0},
{"12{", 3, 2, json_tokener_success, 1}, {"12{", 3, 2, json_tokener_success, 1},
/* Parse number in strict mode */ /* Parse number in strict mode */
{"[02]", -1, 3, json_tokener_error_parse_number, 1, JSON_TOKENER_STRICT },
{"[02]", -1, 3, json_tokener_error_parse_number, 1, JSON_TOKENER_STRICT},


/* Similar tests for other kinds of objects: */ /* Similar tests for other kinds of objects: */
/* These could all return success immediately, since regardless of /* These could all return success immediately, since regardless of
@@ -267,8 +298,8 @@ struct incremental_step
{"Infinity", 9, 8, json_tokener_success, 1}, {"Infinity", 9, 8, json_tokener_success, 1},
{"infinity", 9, 8, json_tokener_success, 1}, {"infinity", 9, 8, json_tokener_success, 1},
{"-infinity", 10, 9, json_tokener_success, 1}, {"-infinity", 10, 9, json_tokener_success, 1},
{"infinity", 9, 0, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT },
{"-infinity", 10, 1, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT },
{"infinity", 9, 0, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT},
{"-infinity", 10, 1, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT},


{"inf", 3, 3, json_tokener_continue, 0}, {"inf", 3, 3, json_tokener_continue, 0},
{"inity", 6, 5, json_tokener_success, 1}, {"inity", 6, 5, json_tokener_success, 1},
@@ -350,7 +381,7 @@ struct incremental_step
{"\"\\a\"", -1, 2, json_tokener_error_parse_string, 1}, {"\"\\a\"", -1, 2, json_tokener_error_parse_string, 1},


/* Check '\'' in strict mode */ /* Check '\'' in strict mode */
{"\'foo\'", -1, 0, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT },
{"\'foo\'", -1, 0, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT},


/* Parse array/object */ /* Parse array/object */
{"[1,2,3]", -1, -1, json_tokener_success, 0}, {"[1,2,3]", -1, -1, json_tokener_success, 0},
@@ -372,42 +403,54 @@ struct incremental_step
{"[1,2,3,]", -1, -1, json_tokener_success, 0}, {"[1,2,3,]", -1, -1, json_tokener_success, 0},
{"[1,2,,3,]", -1, 5, json_tokener_error_parse_unexpected, 0}, {"[1,2,,3,]", -1, 5, json_tokener_error_parse_unexpected, 0},


{"[1,2,3,]", -1, 7, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT },
{"{\"a\":1,}", -1, 7, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT },
{"[1,2,3,]", -1, 7, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT},
{"{\"a\":1,}", -1, 7, json_tokener_error_parse_unexpected, 1, JSON_TOKENER_STRICT},


// utf-8 test // utf-8 test
// ASCII encoding // ASCII encoding
{"\x22\x31\x32\x33\x61\x73\x63\x24\x25\x26\x22", -1, -1, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8 },
{"\x22\x31\x32\x33\x61\x73\x63\x24\x25\x26\x22", -1, -1, json_tokener_success, 1,
JSON_TOKENER_VALIDATE_UTF8},
{"\x22\x31\x32\x33\x61\x73\x63\x24\x25\x26\x22", -1, -1, json_tokener_success, 1}, {"\x22\x31\x32\x33\x61\x73\x63\x24\x25\x26\x22", -1, -1, json_tokener_success, 1},
// utf-8 encoding // utf-8 encoding
{"\x22\xe4\xb8\x96\xe7\x95\x8c\x22", -1, -1, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8 },
{"\x22\xe4\xb8", -1, 3, json_tokener_error_parse_utf8_string, 0, JSON_TOKENER_VALIDATE_UTF8 },
{"\x96\xe7\x95\x8c\x22", -1, 0, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 },
{"\x22\xe4\xb8\x96\xe7\x95\x8c\x22", -1, -1, json_tokener_success, 1,
JSON_TOKENER_VALIDATE_UTF8},
{"\x22\xe4\xb8", -1, 3, json_tokener_error_parse_utf8_string, 0, JSON_TOKENER_VALIDATE_UTF8},
{"\x96\xe7\x95\x8c\x22", -1, 0, json_tokener_error_parse_utf8_string, 1,
JSON_TOKENER_VALIDATE_UTF8},
{"\x22\xe4\xb8\x96\xe7\x95\x8c\x22", -1, -1, json_tokener_success, 1}, {"\x22\xe4\xb8\x96\xe7\x95\x8c\x22", -1, -1, json_tokener_success, 1},
{"\x22\xcf\x80\xcf\x86\x22", -1, -1, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8 },
{"\x22\xf0\xa5\x91\x95\x22", -1, -1, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8 },
{"\x22\xcf\x80\xcf\x86\x22", -1, -1, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8},
{"\x22\xf0\xa5\x91\x95\x22", -1, -1, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8},
// wrong utf-8 encoding // wrong utf-8 encoding
{"\x22\xe6\x9d\x4e\x22", -1, 3, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 },
{"\x22\xe6\x9d\x4e\x22", -1, 3, json_tokener_error_parse_utf8_string, 1,
JSON_TOKENER_VALIDATE_UTF8},
{"\x22\xe6\x9d\x4e\x22", -1, 5, json_tokener_success, 1}, {"\x22\xe6\x9d\x4e\x22", -1, 5, json_tokener_success, 1},
// GBK encoding // GBK encoding
{"\x22\xc0\xee\xc5\xf4\x22", -1, 2, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 },
{"\x22\xc0\xee\xc5\xf4\x22", -1, 2, json_tokener_error_parse_utf8_string, 1,
JSON_TOKENER_VALIDATE_UTF8},
{"\x22\xc0\xee\xc5\xf4\x22", -1, 6, json_tokener_success, 1}, {"\x22\xc0\xee\xc5\xf4\x22", -1, 6, json_tokener_success, 1},
// char after space // char after space
{"\x20\x20\x22\xe4\xb8\x96\x22", -1, -1, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8 },
{"\x20\x20\x81\x22\xe4\xb8\x96\x22", -1, 2, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 },
{"\x5b\x20\x81\x31\x5d", -1, 2, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 },
{"\x20\x20\x22\xe4\xb8\x96\x22", -1, -1, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8},
{"\x20\x20\x81\x22\xe4\xb8\x96\x22", -1, 2, json_tokener_error_parse_utf8_string, 1,
JSON_TOKENER_VALIDATE_UTF8},
{"\x5b\x20\x81\x31\x5d", -1, 2, json_tokener_error_parse_utf8_string, 1,
JSON_TOKENER_VALIDATE_UTF8},
// char in state inf // char in state inf
{"\x49\x6e\x66\x69\x6e\x69\x74\x79", 9, 8, json_tokener_success, 1}, {"\x49\x6e\x66\x69\x6e\x69\x74\x79", 9, 8, json_tokener_success, 1},
{"\x49\x6e\x66\x81\x6e\x69\x74\x79", -1, 3, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 },
{"\x49\x6e\x66\x81\x6e\x69\x74\x79", -1, 3, json_tokener_error_parse_utf8_string, 1,
JSON_TOKENER_VALIDATE_UTF8},
// char in escape unicode // char in escape unicode
{"\x22\x5c\x75\x64\x38\x35\x35\x5c\x75\x64\x63\x35\x35\x22", 15, 14, json_tokener_success, 1, JSON_TOKENER_VALIDATE_UTF8 },
{"\x22\x5c\x75\x64\x38\x35\x35\x5c\x75\x64\x63\x35\x35\x22", 15, 14, json_tokener_success, 1,
JSON_TOKENER_VALIDATE_UTF8},
{"\x22\x5c\x75\x64\x38\x35\x35\xc0\x75\x64\x63\x35\x35\x22", -1, 8, {"\x22\x5c\x75\x64\x38\x35\x35\xc0\x75\x64\x63\x35\x35\x22", -1, 8,
json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 },
{"\x22\x5c\x75\x64\x30\x30\x33\x31\xc0\x22", -1, 9, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 },
json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8},
{"\x22\x5c\x75\x64\x30\x30\x33\x31\xc0\x22", -1, 9, json_tokener_error_parse_utf8_string, 1,
JSON_TOKENER_VALIDATE_UTF8},
// char in number // char in number
{"\x31\x31\x81\x31\x31", -1, 2, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 },
{"\x31\x31\x81\x31\x31", -1, 2, json_tokener_error_parse_utf8_string, 1,
JSON_TOKENER_VALIDATE_UTF8},
// char in object // char in object
{"\x7b\x22\x31\x81\x22\x3a\x31\x7d", -1, 3, json_tokener_error_parse_utf8_string, 1, JSON_TOKENER_VALIDATE_UTF8 },
{"\x7b\x22\x31\x81\x22\x3a\x31\x7d", -1, 3, json_tokener_error_parse_utf8_string, 1,
JSON_TOKENER_VALIDATE_UTF8},


{NULL, -1, -1, json_tokener_success, 0}, {NULL, -1, -1, json_tokener_success, 0},
}; };


+ 25
- 1
tests/test_parse.expected View File

@@ -100,6 +100,30 @@ json_tokener_parse_ex(tok, // hello"foo", 13) ... OK: got correct error: contin
json_tokener_parse_ex(tok, { "foo , 6) ... OK: got correct error: continue json_tokener_parse_ex(tok, { "foo , 6) ... OK: got correct error: continue
json_tokener_parse_ex(tok, ": {"bar , 8) ... OK: got correct error: continue json_tokener_parse_ex(tok, ": {"bar , 8) ... OK: got correct error: continue
json_tokener_parse_ex(tok, ":13}} , 6) ... OK: got object of type [object]: { "foo": { "bar": 13 } } json_tokener_parse_ex(tok, ":13}} , 6) ... OK: got object of type [object]: { "foo": { "bar": 13 } }
json_tokener_parse_ex(tok, "\ , 2) ... OK: got correct error: continue
json_tokener_parse_ex(tok, u , 1) ... OK: got correct error: continue
json_tokener_parse_ex(tok, d , 1) ... OK: got correct error: continue
json_tokener_parse_ex(tok, 8 , 1) ... OK: got correct error: continue
json_tokener_parse_ex(tok, 3 , 1) ... OK: got correct error: continue
json_tokener_parse_ex(tok, 4 , 1) ... OK: got correct error: continue
json_tokener_parse_ex(tok, \ , 1) ... OK: got correct error: continue
json_tokener_parse_ex(tok, u , 1) ... OK: got correct error: continue
json_tokener_parse_ex(tok, d , 1) ... OK: got correct error: continue
json_tokener_parse_ex(tok, d , 1) ... OK: got correct error: continue
json_tokener_parse_ex(tok, 1 , 1) ... OK: got correct error: continue
json_tokener_parse_ex(tok, e" , 2) ... OK: got object of type [string]: "𝄞"
json_tokener_parse_ex(tok, "\u , 3) ... OK: got correct error: continue
json_tokener_parse_ex(tok, d8 , 2) ... OK: got correct error: continue
json_tokener_parse_ex(tok, 34 , 2) ... OK: got correct error: continue
json_tokener_parse_ex(tok, \u , 2) ... OK: got correct error: continue
json_tokener_parse_ex(tok, dd , 2) ... OK: got correct error: continue
json_tokener_parse_ex(tok, 1e" , 3) ... OK: got object of type [string]: "𝄞"
json_tokener_parse_ex(tok, "\ud834 , 7) ... OK: got correct error: continue
json_tokener_parse_ex(tok, \udd1e" , 7) ... OK: got object of type [string]: "𝄞"
json_tokener_parse_ex(tok, "\ud834\ , 8) ... OK: got correct error: continue
json_tokener_parse_ex(tok, udd1e" , 6) ... OK: got object of type [string]: "𝄞"
json_tokener_parse_ex(tok, "\ud834\u , 9) ... OK: got correct error: continue
json_tokener_parse_ex(tok, dd1e" , 5) ... OK: got object of type [string]: "𝄞"
json_tokener_parse_ex(tok, { "foo , 6) ... OK: got correct error: continue json_tokener_parse_ex(tok, { "foo , 6) ... OK: got correct error: continue
json_tokener_parse_ex(tok, : "bar"} , 8) ... OK: got correct error: unexpected character json_tokener_parse_ex(tok, : "bar"} , 8) ... OK: got correct error: unexpected character
json_tokener_parse_ex(tok, { "foo , 6) ... OK: got correct error: continue json_tokener_parse_ex(tok, { "foo , 6) ... OK: got correct error: continue
@@ -216,5 +240,5 @@ json_tokener_parse_ex(tok, "\ud855
json_tokener_parse_ex(tok, "\ud0031À" , 10) ... OK: got correct error: invalid utf-8 string json_tokener_parse_ex(tok, "\ud0031À" , 10) ... OK: got correct error: invalid utf-8 string
json_tokener_parse_ex(tok, 11�11 , 5) ... OK: got correct error: invalid utf-8 string json_tokener_parse_ex(tok, 11�11 , 5) ... OK: got correct error: invalid utf-8 string
json_tokener_parse_ex(tok, {"1�":1} , 8) ... OK: got correct error: invalid utf-8 string json_tokener_parse_ex(tok, {"1�":1} , 8) ... OK: got correct error: invalid utf-8 string
End Incremental Tests OK=130 ERROR=0
End Incremental Tests OK=154 ERROR=0
================================== ==================================

Loading…
Cancel
Save