Adding UTF-8 validation. Fixes #122

8 years ago · b58076b242
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,7 @@
 /tests/test_charcase
 /tests/test_compare
 /tests/test_double_serializer
 /tests/test_float
 /tests/test_locale
 /tests/test_null
 /tests/test_parse
@@ -36,6 +37,7 @@
 /tests/test_util_file
 /tests/test_visit
 /tests/test_json_pointer
 /tests/test_utf8
 /tests/*.vg.out
 /tests/*.log
 /tests/*.trs
--- a/json_object.c
+++ b/json_object.c
@@ -105,13 +105,30 @@ get_string_component(const struct json_object *jso)

 static int json_escape_str(struct printbuf *pb, const char *str, int len, int flags)
 {
 	int pos = 0, start_offset = 0;
 	int pos = 0, start_offset = 0, utf8_start = 0, utf8_end = 0;
 	unsigned char c;
 	while (len--)
 	{
 		c = str[pos];
 		switch(c)
 		{
 		if (utf8_end > pos) {
 			// Expecting a continuation byte.
 			if (c >= 0x80 && c <= 0xBf) {
 				// Found the continuation byte.
 				goto utf8_loop_end;
 			} else {
 				// Invalid byte.
 				if(utf8_start - start_offset > 0)
 					printbuf_memappend(pb, str + start_offset, utf8_start - start_offset);
 				printbuf_memappend(pb, "\xEF\xBF\xBD", 3);
 				start_offset = pos;
 				utf8_end = pos; // get out of the UTF-8 state
 				goto utf8_reset;
 			}
 		}

 utf8_reset:
 		switch(c) {

 		case '\b':
 		case '\n':
 		case '\r':
@@ -122,7 +139,6 @@ static int json_escape_str(struct printbuf *pb, const char *str, int len, int fl
 		case '/':
 			if((flags & JSON_C_TO_STRING_NOSLASHESCAPE) && c == '/')
 			{
 				pos++;
 				break;
 			}

@@ -138,7 +154,7 @@ static int json_escape_str(struct printbuf *pb, const char *str, int len, int fl
 			else if(c == '\\') printbuf_memappend(pb, "\\\\", 2);
 			else if(c == '/') printbuf_memappend(pb, "\\/", 2);

 			start_offset = ++pos;
 			start_offset = pos + 1;
 			break;
 		default:
 			if(c < ' ')
@@ -150,12 +166,45 @@ static int json_escape_str(struct printbuf *pb, const char *str, int len, int fl
 				sprintbuf(pb, "\\u00%c%c",
 				json_hex_chars[c >> 4],
 				json_hex_chars[c & 0xf]);
 				start_offset = ++pos;
 			} else
 				pos++;
 				start_offset = pos + 1;
 			} else if (c >= 0x80) {
 				// Expecting a start byte.
 				if (c >= 0xC2 && c <= 0xDF) {
 					// 2-byte start byte.
 					utf8_start = pos;
 					utf8_end = pos + 2;
 				} else if (c >= 0xE0 && c <= 0xEF) {
 					// 3-byte start byte.
 					utf8_start = pos;
 					utf8_end = pos + 3;
 				} else if (c >= 0xF0 && c <= 0xF4) {
 					// 4-byte start byte.
 					utf8_start = pos;
 					utf8_end = pos + 4;
 				} else {
 					// Invalid byte.
 					if(pos - start_offset > 0)
 						printbuf_memappend(pb,
 								   str + start_offset,
 								   pos - start_offset);
 					printbuf_memappend(pb, "\xEF\xBF\xBD", 3);
 					start_offset = pos + 1;
 				}
 			} else {
 				// Some other valid ASCII character.
 			}
 			break;
 		}

 utf8_loop_end:
 		pos++;
 	}
 	if (utf8_end > pos) {
 		if(utf8_start - start_offset > 0)
 			printbuf_memappend(pb, str + start_offset, utf8_start - start_offset);
 		printbuf_memappend(pb, "\xEF\xBF\xBD", 3);
 	}
 	if (pos - start_offset > 0)
 	else if (pos - start_offset > 0)
 		printbuf_memappend(pb, str + start_offset, pos - start_offset);
 	return 0;
 }
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -25,6 +25,7 @@ TESTS+= test_compare.test
 TESTS+= test_set_value.test
 TESTS+= test_visit.test
 TESTS+= test_json_pointer.test
 TESTS+= test_utf8.test

 check_PROGRAMS=
 check_PROGRAMS += $(TESTS:.test=)
--- a/tests/test_utf8.c
+++ b/tests/test_utf8.c
@@ -0,0 +1,73 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 #include "json.h"

 /*
  * Regression test for UTF-8 handling when serializing JSON strings.
  *
  * Every entry of inputs[] is turned into a JSON string object, serialized
  * with json_object_to_json_string(), and compared byte-for-byte with the
  * entry at the same index of outputs[]. The expected strings show each
  * invalid or truncated UTF-8 sequence replaced by "\xEF\xBF\xBD" (the UTF-8
  * encoding of U+FFFD) while valid text passes through unchanged.
  *
  * Prints "PASS CASE n" for each matching case (these lines are what the
  * accompanying .expected file lists) and a "FAIL ON CASE n" diagnostic
  * otherwise. All cases are always run.
  *
  * Returns 0 when every case passes. Otherwise returns the code of the last
  * failing case: 1 = length mismatch, 2 = content mismatch, 3 = the string
  * object could not be created or serialized. Returns 4 without running any
  * case if the two tables have different numbers of entries.
  */
 int main(void) {
 	const char inputs[][20] = {
 		"\0",  // empty string
 		"AbC;dE\0",  // ASCII string
 		"\xE2\x82\xAC\0",  // A single valid UTF-8
 		"Ab\xE2\x82\xAC;dE\0",  // Valid UTF-8 in context
 		"Ab\xFF;dE\0",  // One illegal byte
 		"Ab\xE2;dE\0",  // One invalid start byte
 		"Ab\xE2\xE2;dE\0",  // Two invalid start bytes
 		"Ab\xE2\xE2\xE2;dE\0",  // Three invalid start bytes
 		"Ab\xE2\xE2...\xE2;dE\0",  // Two disjoint invalid sequences
 		"Ab\xE2\x82\xFF;dE\0",  // First two bytes are OK but not the third
 		"Ab\xE2\x82\xFF\xE2;dE\0",  // Like above but with another start byte
 		"\xE2\0",  // A start byte that "overhangs" the end
 		"A\xFD\0",  // Normal ASCII character with invalid byte at end
 	};
 	const char outputs[][30] = {
 		"\"\"\0",
 		"\"AbC;dE\"\0",
 		"\"\xE2\x82\xAC\"\0",
 		"\"Ab\xE2\x82\xAC;dE\"\0",
 		"\"Ab\xEF\xBF\xBD;dE\"\0",
 		"\"Ab\xEF\xBF\xBD;dE\"\0",
 		"\"Ab\xEF\xBF\xBD\xEF\xBF\xBD;dE\"\0",
 		"\"Ab\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD;dE\"\0",
 		"\"Ab\xEF\xBF\xBD\xEF\xBF\xBD...\xEF\xBF\xBD;dE\"\0",
 		"\"Ab\xEF\xBF\xBD\xEF\xBF\xBD;dE\"\0",
 		"\"Ab\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD;dE\"\0",
 		"\"\xEF\xBF\xBD\"",
 		"\"A\xEF\xBF\xBD\"",
 	};
 	// Derive the case count from the table itself so that adding a case
 	// cannot silently leave it untested.
 	const size_t num_cases = sizeof(inputs) / sizeof(inputs[0]);

 	int errcode = 0;

 	// The tables are parallel; refuse to run if they have drifted apart,
 	// since indexing the shorter one would read out of bounds.
 	if (sizeof(outputs) / sizeof(outputs[0]) != num_cases) {
 		printf("FAIL: %d inputs but %d expected outputs\n",
 			(int)num_cases,
 			(int)(sizeof(outputs) / sizeof(outputs[0])));
 		return 4;
 	}

 	for (size_t i = 0; i < num_cases; i++) {
 		const char *in = inputs[i];
 		const char *expected = outputs[i];
 		const size_t expected_len = strlen(expected);

 		json_object *strobj = json_object_new_string(in);
 		// Only serialize when the object was actually created, and never
 		// hand a NULL result to strlen()/memcmp()/printf("%s").
 		const char *actual = (strobj != NULL)
 			? json_object_to_json_string(strobj)
 			: NULL;

 		if (actual == NULL) {
 			printf("FAIL ON CASE %d: could not create or serialize the string object\n",
 				(int)i);
 			errcode = 3;
 		} else if (strlen(actual) != expected_len) {
 			printf("FAIL ON CASE %d: expected length %d but got %d\n",
 				(int)i, (int)expected_len, (int)strlen(actual));
 			printf("%s\n", actual);
 			errcode = 1;
 		} else if (memcmp(expected, actual, expected_len) != 0) {
 			printf("FAIL ON CASE %d: expected '%s' but got '%s'\n",
 				(int)i, expected, actual);
 			errcode = 2;
 		} else {
 			printf("PASS CASE %d\n", (int)i);
 		}

 		// Release the object on every path; `actual` is not used past
 		// this point.
 		if (strobj != NULL)
 			json_object_put(strobj);
 	}

 	return errcode;
 }
--- a/tests/test_utf8.expected
+++ b/tests/test_utf8.expected
@@ -0,0 +1,13 @@
 PASS CASE 0
 PASS CASE 1
 PASS CASE 2
 PASS CASE 3
 PASS CASE 4
 PASS CASE 5
 PASS CASE 6
 PASS CASE 7
 PASS CASE 8
 PASS CASE 9
 PASS CASE 10
 PASS CASE 11
 PASS CASE 12
--- a/tests/test_utf8.test
+++ b/tests/test_utf8.test
@@ -0,0 +1,12 @@
 #!/bin/sh

 # Work out where the test support files live. A caller-supplied $srcdir
 # wins; otherwise derive it from the path this script was invoked with.
 if [ -z "$srcdir" ]; then
     srcdir="${0%/*}"
     # If $0 had no slash the expansion leaves it unchanged, and if the
     # script sits directly under "/" the result is empty; in both cases
     # fall back to the current directory.
     if [ "$srcdir" = "$0" ] || [ -z "$srcdir" ]; then
         srcdir=.
     fi
 fi

 # Load the common test definitions (presumably where run_output_test is
 # defined -- it is not defined in this script).
 . "$srcdir/test-defs.sh"

 # Run the test_utf8 case and exit with whatever status the helper returned.
 run_output_test test_utf8
 exit $?