Issue #616: Change the parsing of surrogate pairs in unicode escapes so it uses a couple of additional states instead of assuming the low surrogate is already present, to ensure that we correctly handle various cases of incremental parsing.

5 years ago · a68566bf6a
--- a/+ 2
+++ b/+ 2
@@ -25,6 +25,8 @@ Other changes
   Add json_object_array_shrink() and array_list_shrink() functions.
 * Add json_object_new_array_ext(int) and array_list_new_2(int) to allow
   arrays to be allocated with the exact size needed, when known.
 * Parsing of surrogate pairs in unicode escapes now properly handles
   incremental parsing.


 ***
--- a/json_tokener.c
+++ b/json_tokener.c
@@ -295,7 +295,7 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
 	}
 #endif

 	while (PEEK_CHAR(c, tok))
 	while (PEEK_CHAR(c, tok)) // Note: c might be '\0' !
 	{

 	redo_char:
@@ -628,9 +628,11 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
 			}
 			break;

 			// ===================================================

 		case json_tokener_state_escape_unicode:
 		{
 			/* Handle a 4-byte sequence, or two sequences if a surrogate pair */
 			/* Handle a 4-byte \uNNNN sequence, or two sequences if a surrogate pair */
 			while (1)
 			{
 				if (!c || !strchr(json_hex_chars, c))
@@ -638,181 +640,153 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
 					tok->err = json_tokener_error_parse_string;
 					goto out;
 				}
 				tok->ucs_char |= ((unsigned int)jt_hexdigit(c)
 								  << ((3 - tok->st_pos) * 4));
 				tok->ucs_char |=
 				    ((unsigned int)jt_hexdigit(c) << ((3 - tok->st_pos) * 4));
 				tok->st_pos++;
 				if (tok->st_pos < 4)
 				{
 					ADVANCE_CHAR(str, tok);
 					if (!PEEK_CHAR(c, tok))
 					{
 						/*
 						 * We're out of characters in the current call to
 						 * json_tokener_parse(), but a subsequent call might
 						 * provide us with more, so leave our current state
 						 * as-is (including tok->high_surrogate) and return.
 						 */
 						goto out;
 					}
 					continue;
 				}

 				/* Now, we have a full \uNNNN sequence in tok->ucs_char */

 				if (tok->high_surrogate)
 				{
 					if (IS_LOW_SURROGATE(tok->ucs_char))
 					{
 						/* remove the utf8_replacement_char */
 						/* which may generate during */
 						/* parsing the high surrogate pair. */
 						if (!strcmp(
 								tok->pb->buf,
 								(char *)
 									utf8_replacement_char))
 						{
 							printbuf_reset(tok->pb);
 						}
 						/* Recalculate the ucs_char, then fall thru to process normally */
 						tok->ucs_char =
 							DECODE_SURROGATE_PAIR(
 								tok->high_surrogate,
 								tok->ucs_char);
 					}
 					else
 					{
 						/* High surrogate was not followed by a low surrogate
 						 * Replace the high and process the rest normally
 						 */
 						printbuf_memappend_fast(
 							tok->pb,
 							(char *)utf8_replacement_char,
 							3);
 					}
 					tok->high_surrogate = 0;
 				}
 				if (tok->st_pos >= 4)
 					break;

 				if (tok->ucs_char < 0x80)
 				{
 					unsigned char unescaped_utf[1];
 					unescaped_utf[0] = tok->ucs_char;
 					printbuf_memappend_fast(
 						tok->pb, (char *)unescaped_utf, 1);
 				}
 				else if (tok->ucs_char < 0x800)
 				{
 					unsigned char unescaped_utf[2];
 					unescaped_utf[0] =
 						0xc0 | (tok->ucs_char >> 6);
 					unescaped_utf[1] =
 						0x80 | (tok->ucs_char & 0x3f);
 					printbuf_memappend_fast(
 						tok->pb, (char *)unescaped_utf, 2);
 				}
 				else if (IS_HIGH_SURROGATE(tok->ucs_char))
 				ADVANCE_CHAR(str, tok);
 				if (!PEEK_CHAR(c, tok))
 				{
 					/* Got a high surrogate.  Remember it and look for
 					 * the beginning of another \uNNNN sequence, which
 					 * should be the low surrogate.
 					/*
 					 * We're out of characters in the current call to
 					 * json_tokener_parse(), but a subsequent call might
 					 * provide us with more, so leave our current state
 					 * as-is (including tok->high_surrogate) and return.
 					 */
 					tok->high_surrogate = tok->ucs_char;
 					/* Not at end, and the next two chars should be "\u" */
 					if ((len == -1 ||
 						 len > (tok->char_offset + 2)) &&
 						// str[0] != '0' &&  // implied by json_hex_chars, above.
 						(str[1] == '\\') && (str[2] == 'u'))
 					{
 						/* Advance through the 16 bit surrogate, and move
 						 * on to the next sequence. The next step is to
 						 * process the following characters.
 						 */
 						if (!ADVANCE_CHAR(str, tok) ||
 							!ADVANCE_CHAR(str, tok))
 						{
 							printbuf_memappend_fast(
 								tok->pb,
 								(char *)
 									utf8_replacement_char,
 								3);
 						}
 						/* Advance to the first char of the next sequence and
 						 * continue processing with the next sequence.
 						 */
 						if (!ADVANCE_CHAR(str, tok) ||
 							!PEEK_CHAR(c, tok))
 						{
 							printbuf_memappend_fast(
 								tok->pb,
 								(char *)
 									utf8_replacement_char,
 								3);
 							tok->ucs_char = 0;
 							tok->st_pos = 0;
 							goto out;
 						}
 						tok->ucs_char = 0;
 						tok->st_pos = 0;
 						/* other json_tokener_state_escape_unicode */
 						continue;
 					}
 					else
 					{
 						/* Got a high surrogate without another sequence following
 						 * it.  Put a replacement char in for the high surrogate
 						 * and pretend we finished.
 						 */
 						printbuf_memappend_fast(
 							tok->pb,
 							(char *)utf8_replacement_char,
 							3);
 					}
 				}
 				else if (IS_LOW_SURROGATE(tok->ucs_char))
 				{
 					/* Got a low surrogate not preceded by a high */
 					printbuf_memappend_fast(
 						tok->pb, (char *)utf8_replacement_char,
 						3);
 				}
 				else if (tok->ucs_char < 0x10000)
 				{
 					unsigned char unescaped_utf[3];
 					unescaped_utf[0] =
 						0xe0 | (tok->ucs_char >> 12);
 					unescaped_utf[1] =
 						0x80 | ((tok->ucs_char >> 6) & 0x3f);
 					unescaped_utf[2] =
 						0x80 | (tok->ucs_char & 0x3f);
 					printbuf_memappend_fast(
 						tok->pb, (char *)unescaped_utf, 3);
 					goto out;
 				}
 				else if (tok->ucs_char < 0x110000)
 			}
 			tok->st_pos = 0;

 			/* Now, we have a full \uNNNN sequence in tok->ucs_char */

 			/* If the *previous* sequence was a high surrogate ... */
 			if (tok->high_surrogate)
 			{
 				if (IS_LOW_SURROGATE(tok->ucs_char))
 				{
 					unsigned char unescaped_utf[4];
 					unescaped_utf[0] =
 						0xf0 | ((tok->ucs_char >> 18) & 0x07);
 					unescaped_utf[1] =
 						0x80 | ((tok->ucs_char >> 12) & 0x3f);
 					unescaped_utf[2] =
 						0x80 | ((tok->ucs_char >> 6) & 0x3f);
 					unescaped_utf[3] =
 						0x80 | (tok->ucs_char & 0x3f);
 					printbuf_memappend_fast(
 						tok->pb, (char *)unescaped_utf, 4);
 					/* Recalculate the ucs_char, then fall thru to process normally */
 					tok->ucs_char = DECODE_SURROGATE_PAIR(tok->high_surrogate,
 					                                      tok->ucs_char);
 				}
 				else
 				{
 					/* Don't know what we got--insert the replacement char */
 					printbuf_memappend_fast(
 						tok->pb, (char *)utf8_replacement_char,
 						3);
 					/* High surrogate was not followed by a low surrogate
 					 * Replace the high and process the rest normally
 					 */
 					printbuf_memappend_fast(tok->pb,
 					                        (char *)utf8_replacement_char, 3);
 				}
 				state = saved_state; // i.e. _state_string or _object_field
 				tok->high_surrogate = 0;
 			}

 			if (tok->ucs_char < 0x80)
 			{
 				unsigned char unescaped_utf[1];
 				unescaped_utf[0] = tok->ucs_char;
 				printbuf_memappend_fast(tok->pb, (char *)unescaped_utf, 1);
 			}
 			else if (tok->ucs_char < 0x800)
 			{
 				unsigned char unescaped_utf[2];
 				unescaped_utf[0] = 0xc0 | (tok->ucs_char >> 6);
 				unescaped_utf[1] = 0x80 | (tok->ucs_char & 0x3f);
 				printbuf_memappend_fast(tok->pb, (char *)unescaped_utf, 2);
 			}
 			else if (IS_HIGH_SURROGATE(tok->ucs_char))
 			{
 				/*
 				 * The next two characters should be \u, HOWEVER,
 				 * we can't simply peek ahead here, because the
 				 * characters we need might not be passed to us
 				 * until a subsequent call to json_tokener_parse.
 				 * Instead, transition through a couple of states.
 				 * (now):
 				 *   _escape_unicode => _unicode_need_escape
 				 * (see a '\\' char):
 				 *   _unicode_need_escape => _unicode_need_u
 				 * (see a 'u' char):
 				 *   _unicode_need_u => _escape_unicode
 				 *      ...and we'll end up back around here.
 				 */
 				tok->high_surrogate = tok->ucs_char;
 				tok->ucs_char = 0;
 				state = json_tokener_state_escape_unicode_need_escape;
 				break;
 			}
 			else if (IS_LOW_SURROGATE(tok->ucs_char))
 			{
 				/* Got a low surrogate not preceded by a high */
 				printbuf_memappend_fast(tok->pb, (char *)utf8_replacement_char, 3);
 			}
 			else if (tok->ucs_char < 0x10000)
 			{
 				unsigned char unescaped_utf[3];
 				unescaped_utf[0] = 0xe0 | (tok->ucs_char >> 12);
 				unescaped_utf[1] = 0x80 | ((tok->ucs_char >> 6) & 0x3f);
 				unescaped_utf[2] = 0x80 | (tok->ucs_char & 0x3f);
 				printbuf_memappend_fast(tok->pb, (char *)unescaped_utf, 3);
 			}
 			else if (tok->ucs_char < 0x110000)
 			{
 				unsigned char unescaped_utf[4];
 				unescaped_utf[0] = 0xf0 | ((tok->ucs_char >> 18) & 0x07);
 				unescaped_utf[1] = 0x80 | ((tok->ucs_char >> 12) & 0x3f);
 				unescaped_utf[2] = 0x80 | ((tok->ucs_char >> 6) & 0x3f);
 				unescaped_utf[3] = 0x80 | (tok->ucs_char & 0x3f);
 				printbuf_memappend_fast(tok->pb, (char *)unescaped_utf, 4);
 			}
 			else
 			{
 				/* Don't know what we got--insert the replacement char */
 				printbuf_memappend_fast(tok->pb, (char *)utf8_replacement_char, 3);
 			}
 			state = saved_state; // i.e. _state_string or _state_object_field
 		}
 		break;

 		case json_tokener_state_escape_unicode_need_escape:
 			// We get here after processing a high_surrogate
 			// require a '\\' char
 			if (!c || c != '\\')
 			{
 				/* Got a high surrogate without another sequence following
 				 * it.  Put a replacement char in for the high surrogate
 				 * and pop back up to _state_string or _state_object_field.
 				 */
 				printbuf_memappend_fast(tok->pb, (char *)utf8_replacement_char, 3);
 				tok->high_surrogate = 0;
 				tok->ucs_char = 0;
 				tok->st_pos = 0;
 				state = saved_state;
 				goto redo_char;
 			}
 			state = json_tokener_state_escape_unicode_need_u;
 			break;

 		case json_tokener_state_escape_unicode_need_u:
 			/* We already had a \ char, check that it's \u */
 			if (!c || c != 'u')
 			{
 				/* Got a high surrogate with some non-unicode escape
 				 * sequence following it.
 				 * Put a replacement char in for the high surrogate
 				 * and handle the escape sequence normally.
 				 */
 				printbuf_memappend_fast(tok->pb, (char *)utf8_replacement_char, 3);
 				tok->high_surrogate = 0;
 				tok->ucs_char = 0;
 				tok->st_pos = 0;
 				state = json_tokener_state_string_escape;
 				goto redo_char;
 			}
 			state = json_tokener_state_escape_unicode;
 			break;

 			// ===================================================

 		case json_tokener_state_boolean:
 		{
 			int size1, size2;
@@ -1146,8 +1120,9 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
 			}
 			break;
 		}
 		if (!ADVANCE_CHAR(str, tok))
 			goto out;
 		(void)ADVANCE_CHAR(str, tok);
 		if (!c) // This is the char *before* advancing
 			break;
 	} /* while(PEEK_CHAR) */

 out:
@@ -1156,7 +1131,8 @@ out:
 		tok->err = json_tokener_error_parse_utf8_string;
 	}
 	if (c && (state == json_tokener_state_finish) && (tok->depth == 0) &&
 	    (tok->flags & (JSON_TOKENER_STRICT|JSON_TOKENER_ALLOW_TRAILING_CHARS)) == JSON_TOKENER_STRICT)
 	    (tok->flags & (JSON_TOKENER_STRICT | JSON_TOKENER_ALLOW_TRAILING_CHARS)) ==
 	        JSON_TOKENER_STRICT)
 	{
 		/* unexpected char after JSON data */
 		tok->err = json_tokener_error_parse_unexpected;
--- a/json_tokener.h
+++ b/json_tokener.h
@@ -59,6 +59,8 @@ enum json_tokener_state
 	json_tokener_state_string,
 	json_tokener_state_string_escape,
 	json_tokener_state_escape_unicode,
 	json_tokener_state_escape_unicode_need_escape,
 	json_tokener_state_escape_unicode_need_u,
 	json_tokener_state_boolean,
 	json_tokener_state_number,
 	json_tokener_state_array,
--- a/tests/test_parse.c
+++ b/tests/test_parse.c
@@ -68,8 +68,8 @@ static void single_incremental_parse(const char *test_string, int clear_serializ

 	if (strcmp(all_at_once_str, new_str) != 0)
 	{
 		printf("ERROR: failed to parse (%s) in %d byte chunks: %s != %s\n",
 		    test_string, chunksize, all_at_once_str, new_str);
 		printf("ERROR: failed to parse (%s) in %d byte chunks: %s != %s\n", test_string,
 		       chunksize, all_at_once_str, new_str);
 	}
 	json_tokener_free(tok);
 }
@@ -193,8 +193,8 @@ static void test_utf8_parse()
 	// json_tokener_parse doesn't support checking for byte order marks.
 	// It's the responsibility of the caller to detect and skip a BOM.
 	// Both of these checks return null.
 	char* utf8_bom = "\xEF\xBB\xBF";
 	char* utf8_bom_and_chars = "\xEF\xBB\xBF{}";
 	char *utf8_bom = "\xEF\xBB\xBF";
 	char *utf8_bom_and_chars = "\xEF\xBB\xBF{}";
 	single_basic_parse(utf8_bom, 0);
 	single_basic_parse(utf8_bom_and_chars, 0);
 }
@@ -245,7 +245,7 @@ struct incremental_step
 	int char_offset;
 	enum json_tokener_error expected_error;
 	int reset_tokener; /* Set to 1 to call json_tokener_reset() after parsing */
 	int tok_flags; /* JSON_TOKENER_* flags to pass to json_tokener_set_flags() */
 	int tok_flags;     /* JSON_TOKENER_* flags to pass to json_tokener_set_flags() */
 } incremental_steps[] = {

    /* Check that full json messages can be parsed, both w/ and w/o a reset */
@@ -268,7 +268,11 @@ struct incremental_step
    {"\": {\"bar", -1, -1, json_tokener_continue, 0},
    {"\":13}}", -1, -1, json_tokener_success, 1},

    /* Check the UTF-16 surrogate pair */
    /* Check the UTF-16 surrogate pair handling in various ways.
 	 * Note: \ud834\udd1e is U+1D11E, Musical Symbol G Clef
 	 * Your terminal may not display these correctly, in particular
 	 *  PuTTY doesn't currently show this character.
 	 */
    /* parse one char at a time */
    {"\"\\", -1, -1, json_tokener_continue, 0},
    {"u", -1, -1, json_tokener_continue, 0},
@@ -296,6 +300,16 @@ struct incremental_step
    {"udd1e\"", -1, -1, json_tokener_success, 1},
    {"\"\\ud834\\u", -1, -1, json_tokener_continue, 0},
    {"dd1e\"", -1, -1, json_tokener_success, 1},
    {"\"fff \\ud834\\ud", -1, -1, json_tokener_continue, 0},
    {"d1e bar\"", -1, -1, json_tokener_success, 1},
    {"\"fff \\ud834\\udd", -1, -1, json_tokener_continue, 0},
    {"1e bar\"", -1, -1, json_tokener_success, 1},

    /* \ud83d\ude00 is U+1F600, Grinning Face
 	 * Displays fine in PuTTY, though you may need "less -r"
 	 */
    {"\"fff \\ud83d\\ude", -1, -1, json_tokener_continue, 0},
    {"00 bar\"", -1, -1, json_tokener_success, 1},

    /* Check that json_tokener_reset actually resets */
    {"{ \"foo", -1, -1, json_tokener_continue, 1},
--- a/tests/test_parse.expected
+++ b/tests/test_parse.expected
@@ -124,6 +124,12 @@ json_tokener_parse_ex(tok, "\ud834\    ,   8) ... OK: got correct error: continu
 json_tokener_parse_ex(tok, udd1e"      ,   6) ... OK: got object of type [string]: "𝄞"
 json_tokener_parse_ex(tok, "\ud834\u   ,   9) ... OK: got correct error: continue
 json_tokener_parse_ex(tok, dd1e"       ,   5) ... OK: got object of type [string]: "𝄞"
 json_tokener_parse_ex(tok, "fff \ud834\ud,  14) ... OK: got correct error: continue
 json_tokener_parse_ex(tok, d1e bar"    ,   8) ... OK: got object of type [string]: "fff 𝄞 bar"
 json_tokener_parse_ex(tok, "fff \ud834\udd,  15) ... OK: got correct error: continue
 json_tokener_parse_ex(tok, 1e bar"     ,   7) ... OK: got object of type [string]: "fff 𝄞 bar"
 json_tokener_parse_ex(tok, "fff \ud83d\ude,  15) ... OK: got correct error: continue
 json_tokener_parse_ex(tok, 00 bar"     ,   7) ... OK: got object of type [string]: "fff 😀 bar"
 json_tokener_parse_ex(tok, { "foo      ,   6) ... OK: got correct error: continue
 json_tokener_parse_ex(tok, : "bar"}    ,   8) ... OK: got correct error: unexpected character
 json_tokener_parse_ex(tok, { "foo      ,   6) ... OK: got correct error: continue
@@ -240,5 +246,5 @@ json_tokener_parse_ex(tok, "\ud855
 json_tokener_parse_ex(tok, "\ud0031À"  ,  10) ... OK: got correct error: invalid utf-8 string
 json_tokener_parse_ex(tok, 11�11       ,   5) ... OK: got correct error: invalid utf-8 string
 json_tokener_parse_ex(tok, {"1�":1}    ,   8) ... OK: got correct error: invalid utf-8 string
 End Incremental Tests OK=154 ERROR=0
 End Incremental Tests OK=160 ERROR=0
 ==================================