Rearrange the json_tokener_state_escape_unicode case in json_tokener to simplify the code slightly and make it a bit easier to understand.

While here, drop the utf8_replacement_char that is unnecesarily added if we run out of input in the middle of a unicode escape. No other functional changes (yet).
5 years ago · 36118b681e
--- a/json_tokener.c
+++ b/json_tokener.c
@@ -223,7 +223,7 @@ struct json_object *json_tokener_parse_verbose(const char *str, enum json_tokene
 /* PEEK_CHAR(dest, tok) macro:
 *   Peeks at the current char and stores it in dest.
 *   Returns 1 on success, sets tok->err and returns 0 if no more chars.
 *   Implicit inputs:  str, len vars
 *   Implicit inputs:  str, len, nBytesp vars
 */
 #define PEEK_CHAR(dest, tok)                                                 \
 	(((tok)->char_offset == len)                                         \
@@ -633,175 +633,182 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
 			/* Handle a 4-byte sequence, or two sequences if a surrogate pair */
 			while (1)
 			{
 				if (c && strchr(json_hex_chars, c))
 				if (!c || !strchr(json_hex_chars, c))
 				{
 					tok->ucs_char += ((unsigned int)jt_hexdigit(c)
 					                  << ((3 - tok->st_pos++) * 4));
 					if (tok->st_pos == 4)
 					tok->err = json_tokener_error_parse_string;
 					goto out;
 				}
 				tok->ucs_char |= ((unsigned int)jt_hexdigit(c)
 								  << ((3 - tok->st_pos) * 4));
 				tok->st_pos++;
 				if (tok->st_pos < 4)
 				{
 					ADVANCE_CHAR(str, tok);
 					if (!PEEK_CHAR(c, tok))
 					{
 						unsigned char unescaped_utf[4];
 						/*
 						 * We're out of characters in the current call to
 						 * json_tokener_parse(), but a subsequent call might
 						 * provide us with more, so leave our current state
 						 * as-is (including tok->high_surrogate) and return.
 						 */
 						goto out;
 					}
 					continue;
 				}

 						if (tok->got_hi_surrogate)
 						{
 							if (IS_LOW_SURROGATE(tok->ucs_char))
 							{
 								/* remove the utf8_replacement_char */
 								/* which may generate during */
 								/* parsing the high surrogate pair. */
 								if (!strcmp(
 								        tok->pb->buf,
 								        (char *)
 								            utf8_replacement_char))
 								{
 									printbuf_reset(tok->pb);
 								}
 								/* Recalculate the ucs_char, then fall thru to process normally */
 								tok->ucs_char =
 								    DECODE_SURROGATE_PAIR(
 								        tok->got_hi_surrogate,
 								        tok->ucs_char);
 							}
 							else
 							{
 								/* Hi surrogate was not followed by a low surrogate */
 								/* Replace the hi and process the rest normally */
 								printbuf_memappend_fast(
 								    tok->pb,
 								    (char *)utf8_replacement_char,
 								    3);
 							}
 							tok->got_hi_surrogate = 0;
 						}
 				/* Now, we have a full \uNNNN sequence in tok->ucs_char */

 						if (tok->ucs_char < 0x80)
 						{
 							unescaped_utf[0] = tok->ucs_char;
 							printbuf_memappend_fast(
 							    tok->pb, (char *)unescaped_utf, 1);
 						}
 						else if (tok->ucs_char < 0x800)
 						{
 							unescaped_utf[0] =
 							    0xc0 | (tok->ucs_char >> 6);
 							unescaped_utf[1] =
 							    0x80 | (tok->ucs_char & 0x3f);
 							printbuf_memappend_fast(
 							    tok->pb, (char *)unescaped_utf, 2);
 						}
 						else if (IS_HIGH_SURROGATE(tok->ucs_char))
 						{
 							/* Got a high surrogate.  Remember it and look for
 							 * the beginning of another sequence, which
 							 * should be the low surrogate.
 							 */
 							tok->got_hi_surrogate = tok->ucs_char;
 							/* Not at end, and the next two chars should be "\u" */
 							if ((len == -1 ||
 							     len > (tok->char_offset + 2)) &&
 							    // str[0] != '0' &&  // implied by json_hex_chars, above.
 							    (str[1] == '\\') && (str[2] == 'u'))
 							{
 								/* Advance through the 16 bit surrogate, and move
 								 * on to the next sequence. The next step is to
 								 * process the following characters.
 								 */
 								if (!ADVANCE_CHAR(str, tok) ||
 								    !ADVANCE_CHAR(str, tok))
 								{
 									printbuf_memappend_fast(
 									    tok->pb,
 									    (char *)
 									        utf8_replacement_char,
 									    3);
 								}
 								/* Advance to the first char of the next sequence and
 								 * continue processing with the next sequence.
 								 */
 								if (!ADVANCE_CHAR(str, tok) ||
 								    !PEEK_CHAR(c, tok))
 								{
 									printbuf_memappend_fast(
 									    tok->pb,
 									    (char *)
 									        utf8_replacement_char,
 									    3);
 									tok->ucs_char = 0;
 									tok->st_pos = 0;
 									goto out;
 								}
 								tok->ucs_char = 0;
 								tok->st_pos = 0;
 								/* other json_tokener_state_escape_unicode */
 								continue;
 							}
 							else
 							{
 								/* Got a high surrogate without another sequence following
 								 * it.  Put a replacement char in for the hi surrogate
 								 * and pretend we finished.
 								 */
 								printbuf_memappend_fast(
 								    tok->pb,
 								    (char *)utf8_replacement_char,
 								    3);
 							}
 						}
 						else if (IS_LOW_SURROGATE(tok->ucs_char))
 						{
 							/* Got a low surrogate not preceded by a high */
 							printbuf_memappend_fast(
 							    tok->pb, (char *)utf8_replacement_char,
 							    3);
 						}
 						else if (tok->ucs_char < 0x10000)
 				if (tok->high_surrogate)
 				{
 					if (IS_LOW_SURROGATE(tok->ucs_char))
 					{
 						/* remove the utf8_replacement_char */
 						/* which may generate during */
 						/* parsing the high surrogate pair. */
 						if (!strcmp(
 								tok->pb->buf,
 								(char *)
 									utf8_replacement_char))
 						{
 							unescaped_utf[0] =
 							    0xe0 | (tok->ucs_char >> 12);
 							unescaped_utf[1] =
 							    0x80 | ((tok->ucs_char >> 6) & 0x3f);
 							unescaped_utf[2] =
 							    0x80 | (tok->ucs_char & 0x3f);
 							printbuf_memappend_fast(
 							    tok->pb, (char *)unescaped_utf, 3);
 							printbuf_reset(tok->pb);
 						}
 						else if (tok->ucs_char < 0x110000)
 						/* Recalculate the ucs_char, then fall thru to process normally */
 						tok->ucs_char =
 							DECODE_SURROGATE_PAIR(
 								tok->high_surrogate,
 								tok->ucs_char);
 					}
 					else
 					{
 						/* High surrogate was not followed by a low surrogate
 						 * Replace the high and process the rest normally
 						 */
 						printbuf_memappend_fast(
 							tok->pb,
 							(char *)utf8_replacement_char,
 							3);
 					}
 					tok->high_surrogate = 0;
 				}

 				if (tok->ucs_char < 0x80)
 				{
 					unsigned char unescaped_utf[1];
 					unescaped_utf[0] = tok->ucs_char;
 					printbuf_memappend_fast(
 						tok->pb, (char *)unescaped_utf, 1);
 				}
 				else if (tok->ucs_char < 0x800)
 				{
 					unsigned char unescaped_utf[2];
 					unescaped_utf[0] =
 						0xc0 | (tok->ucs_char >> 6);
 					unescaped_utf[1] =
 						0x80 | (tok->ucs_char & 0x3f);
 					printbuf_memappend_fast(
 						tok->pb, (char *)unescaped_utf, 2);
 				}
 				else if (IS_HIGH_SURROGATE(tok->ucs_char))
 				{
 					/* Got a high surrogate.  Remember it and look for
 					 * the beginning of another \uNNNN sequence, which
 					 * should be the low surrogate.
 					 */
 					tok->high_surrogate = tok->ucs_char;
 					/* Not at end, and the next two chars should be "\u" */
 					if ((len == -1 ||
 						 len > (tok->char_offset + 2)) &&
 						// str[0] != '0' &&  // implied by json_hex_chars, above.
 						(str[1] == '\\') && (str[2] == 'u'))
 					{
 						/* Advance through the 16 bit surrogate, and move
 						 * on to the next sequence. The next step is to
 						 * process the following characters.
 						 */
 						if (!ADVANCE_CHAR(str, tok) ||
 							!ADVANCE_CHAR(str, tok))
 						{
 							unescaped_utf[0] =
 							    0xf0 | ((tok->ucs_char >> 18) & 0x07);
 							unescaped_utf[1] =
 							    0x80 | ((tok->ucs_char >> 12) & 0x3f);
 							unescaped_utf[2] =
 							    0x80 | ((tok->ucs_char >> 6) & 0x3f);
 							unescaped_utf[3] =
 							    0x80 | (tok->ucs_char & 0x3f);
 							printbuf_memappend_fast(
 							    tok->pb, (char *)unescaped_utf, 4);
 								tok->pb,
 								(char *)
 									utf8_replacement_char,
 								3);
 						}
 						else
 						/* Advance to the first char of the next sequence and
 						 * continue processing with the next sequence.
 						 */
 						if (!ADVANCE_CHAR(str, tok) ||
 							!PEEK_CHAR(c, tok))
 						{
 							/* Don't know what we got--insert the replacement char */
 							printbuf_memappend_fast(
 							    tok->pb, (char *)utf8_replacement_char,
 							    3);
 								tok->pb,
 								(char *)
 									utf8_replacement_char,
 								3);
 							tok->ucs_char = 0;
 							tok->st_pos = 0;
 							goto out;
 						}
 						state = saved_state;
 						break;
 						tok->ucs_char = 0;
 						tok->st_pos = 0;
 						/* other json_tokener_state_escape_unicode */
 						continue;
 					}
 					else
 					{
 						/* Got a high surrogate without another sequence following
 						 * it.  Put a replacement char in for the high surrogate
 						 * and pretend we finished.
 						 */
 						printbuf_memappend_fast(
 							tok->pb,
 							(char *)utf8_replacement_char,
 							3);
 					}
 				}
 				else
 				else if (IS_LOW_SURROGATE(tok->ucs_char))
 				{
 					tok->err = json_tokener_error_parse_string;
 					goto out;
 					/* Got a low surrogate not preceded by a high */
 					printbuf_memappend_fast(
 						tok->pb, (char *)utf8_replacement_char,
 						3);
 				}
 				if (!ADVANCE_CHAR(str, tok) || !PEEK_CHAR(c, tok))
 				else if (tok->ucs_char < 0x10000)
 				{
 					/* Clean up any pending chars */
 					if (tok->got_hi_surrogate &&
 					    strcmp(tok->pb->buf, (char *)utf8_replacement_char))
 						printbuf_memappend_fast(
 						    tok->pb, (char *)utf8_replacement_char, 3);
 					goto out;
 					unsigned char unescaped_utf[3];
 					unescaped_utf[0] =
 						0xe0 | (tok->ucs_char >> 12);
 					unescaped_utf[1] =
 						0x80 | ((tok->ucs_char >> 6) & 0x3f);
 					unescaped_utf[2] =
 						0x80 | (tok->ucs_char & 0x3f);
 					printbuf_memappend_fast(
 						tok->pb, (char *)unescaped_utf, 3);
 				}
 				else if (tok->ucs_char < 0x110000)
 				{
 					unsigned char unescaped_utf[4];
 					unescaped_utf[0] =
 						0xf0 | ((tok->ucs_char >> 18) & 0x07);
 					unescaped_utf[1] =
 						0x80 | ((tok->ucs_char >> 12) & 0x3f);
 					unescaped_utf[2] =
 						0x80 | ((tok->ucs_char >> 6) & 0x3f);
 					unescaped_utf[3] =
 						0x80 | (tok->ucs_char & 0x3f);
 					printbuf_memappend_fast(
 						tok->pb, (char *)unescaped_utf, 4);
 				}
 				else
 				{
 					/* Don't know what we got--insert the replacement char */
 					printbuf_memappend_fast(
 						tok->pb, (char *)utf8_replacement_char,
 						3);
 				}
 				state = saved_state; // i.e. _state_string or _object_field
 				break;
 			}
 		}
 		break;
--- a/json_tokener.h
+++ b/json_tokener.h
@@ -111,7 +111,7 @@ struct json_tokener
 	 * @deprecated See json_tokener_get_error() instead.
 	 */
 	enum json_tokener_error err;
 	unsigned int ucs_char, got_hi_surrogate;
 	unsigned int ucs_char, high_surrogate;
 	char quote_char;
 	struct json_tokener_srec *stack;
 	int flags;