@@ -26,6 +26,7 @@ | |||||
/tests/test_charcase | /tests/test_charcase | ||||
/tests/test_compare | /tests/test_compare | ||||
/tests/test_double_serializer | /tests/test_double_serializer | ||||
/tests/test_float | |||||
/tests/test_locale | /tests/test_locale | ||||
/tests/test_null | /tests/test_null | ||||
/tests/test_parse | /tests/test_parse | ||||
@@ -36,6 +37,7 @@ | |||||
/tests/test_util_file | /tests/test_util_file | ||||
/tests/test_visit | /tests/test_visit | ||||
/tests/test_json_pointer | /tests/test_json_pointer | ||||
/tests/test_utf8 | |||||
/tests/*.vg.out | /tests/*.vg.out | ||||
/tests/*.log | /tests/*.log | ||||
/tests/*.trs | /tests/*.trs | ||||
@@ -105,13 +105,30 @@ get_string_component(const struct json_object *jso) | |||||
static int json_escape_str(struct printbuf *pb, const char *str, int len, int flags) | static int json_escape_str(struct printbuf *pb, const char *str, int len, int flags) | ||||
{ | { | ||||
int pos = 0, start_offset = 0; | |||||
int pos = 0, start_offset = 0, utf8_start = 0, utf8_end = 0; | |||||
unsigned char c; | unsigned char c; | ||||
while (len--) | while (len--) | ||||
{ | { | ||||
c = str[pos]; | c = str[pos]; | ||||
switch(c) | |||||
{ | |||||
if (utf8_end > pos) { | |||||
// Expecting a continuation byte. | |||||
if (c >= 0x80 && c <= 0xBf) { | |||||
// Found the continuation byte. | |||||
goto utf8_loop_end; | |||||
} else { | |||||
// Invalid byte. | |||||
if(utf8_start - start_offset > 0) | |||||
printbuf_memappend(pb, str + start_offset, utf8_start - start_offset); | |||||
printbuf_memappend(pb, "\xEF\xBF\xBD", 3); | |||||
start_offset = pos; | |||||
utf8_end = pos; // get out of the UTF-8 state | |||||
goto utf8_reset; | |||||
} | |||||
} | |||||
utf8_reset: | |||||
switch(c) { | |||||
case '\b': | case '\b': | ||||
case '\n': | case '\n': | ||||
case '\r': | case '\r': | ||||
@@ -122,7 +139,6 @@ static int json_escape_str(struct printbuf *pb, const char *str, int len, int fl | |||||
case '/': | case '/': | ||||
if((flags & JSON_C_TO_STRING_NOSLASHESCAPE) && c == '/') | if((flags & JSON_C_TO_STRING_NOSLASHESCAPE) && c == '/') | ||||
{ | { | ||||
pos++; | |||||
break; | break; | ||||
} | } | ||||
@@ -138,7 +154,7 @@ static int json_escape_str(struct printbuf *pb, const char *str, int len, int fl | |||||
else if(c == '\\') printbuf_memappend(pb, "\\\\", 2); | else if(c == '\\') printbuf_memappend(pb, "\\\\", 2); | ||||
else if(c == '/') printbuf_memappend(pb, "\\/", 2); | else if(c == '/') printbuf_memappend(pb, "\\/", 2); | ||||
start_offset = ++pos; | |||||
start_offset = pos + 1; | |||||
break; | break; | ||||
default: | default: | ||||
if(c < ' ') | if(c < ' ') | ||||
@@ -150,12 +166,45 @@ static int json_escape_str(struct printbuf *pb, const char *str, int len, int fl | |||||
sprintbuf(pb, "\\u00%c%c", | sprintbuf(pb, "\\u00%c%c", | ||||
json_hex_chars[c >> 4], | json_hex_chars[c >> 4], | ||||
json_hex_chars[c & 0xf]); | json_hex_chars[c & 0xf]); | ||||
start_offset = ++pos; | |||||
} else | |||||
pos++; | |||||
start_offset = pos + 1; | |||||
} else if (c >= 0x80) { | |||||
// Expecting a start byte. | |||||
if (c >= 0xC2 && c <= 0xDF) { | |||||
// 2-byte start byte. | |||||
utf8_start = pos; | |||||
utf8_end = pos + 2; | |||||
} else if (c >= 0xE0 && c <= 0xEF) { | |||||
// 3-byte start byte. | |||||
utf8_start = pos; | |||||
utf8_end = pos + 3; | |||||
} else if (c >= 0xF0 && c <= 0xF4) { | |||||
// 4-byte start byte. | |||||
utf8_start = pos; | |||||
utf8_end = pos + 4; | |||||
} else { | |||||
// Invalid byte. | |||||
if(pos - start_offset > 0) | |||||
printbuf_memappend(pb, | |||||
str + start_offset, | |||||
pos - start_offset); | |||||
printbuf_memappend(pb, "\xEF\xBF\xBD", 3); | |||||
start_offset = pos + 1; | |||||
} | |||||
} else { | |||||
// Some other valid ASCII character. | |||||
} | |||||
break; | |||||
} | } | ||||
utf8_loop_end: | |||||
pos++; | |||||
} | |||||
if (utf8_end > pos) { | |||||
if(utf8_start - start_offset > 0) | |||||
printbuf_memappend(pb, str + start_offset, utf8_start - start_offset); | |||||
printbuf_memappend(pb, "\xEF\xBF\xBD", 3); | |||||
} | } | ||||
if (pos - start_offset > 0) | |||||
else if (pos - start_offset > 0) | |||||
printbuf_memappend(pb, str + start_offset, pos - start_offset); | printbuf_memappend(pb, str + start_offset, pos - start_offset); | ||||
return 0; | return 0; | ||||
} | } | ||||
@@ -25,6 +25,7 @@ TESTS+= test_compare.test | |||||
TESTS+= test_set_value.test | TESTS+= test_set_value.test | ||||
TESTS+= test_visit.test | TESTS+= test_visit.test | ||||
TESTS+= test_json_pointer.test | TESTS+= test_json_pointer.test | ||||
TESTS+= test_utf8.test | |||||
check_PROGRAMS= | check_PROGRAMS= | ||||
check_PROGRAMS += $(TESTS:.test=) | check_PROGRAMS += $(TESTS:.test=) | ||||
@@ -0,0 +1,73 @@ | |||||
#include <stdio.h> | |||||
#include <stdlib.h> | |||||
#include <string.h> | |||||
#include "json.h" | |||||
int main() { | |||||
const char inputs[][20] = { | |||||
"\0", // empty string | |||||
"AbC;dE\0", // ASCII string | |||||
"\xE2\x82\xAC\0", // A single valid UTF-8 | |||||
"Ab\xE2\x82\xAC;dE\0", // Valid UTF-8 in context | |||||
"Ab\xFF;dE\0", // One illegal byte | |||||
"Ab\xE2;dE\0", // One invalid start byte | |||||
"Ab\xE2\xE2;dE\0", // Two invalid start bytes | |||||
"Ab\xE2\xE2\xE2;dE\0", // Three invalid start bytes | |||||
"Ab\xE2\xE2...\xE2;dE\0", // Two disjoint invalid sequences | |||||
"Ab\xE2\x82\xFF;dE\0", // First two bytes are OK but not the third | |||||
"Ab\xE2\x82\xFF\xE2;dE\0", // Like above but with another start byte | |||||
"\xE2\0", // A start byte that "overhangs" the end | |||||
"A\xFD\0", // Normal ASCII character with invalid byte at end | |||||
}; | |||||
const char outputs[][30] = { | |||||
"\"\"\0", | |||||
"\"AbC;dE\"\0", | |||||
"\"\xE2\x82\xAC\"\0", | |||||
"\"Ab\xE2\x82\xAC;dE\"\0", | |||||
"\"Ab\xEF\xBF\xBD;dE\"\0", | |||||
"\"Ab\xEF\xBF\xBD;dE\"\0", | |||||
"\"Ab\xEF\xBF\xBD\xEF\xBF\xBD;dE\"\0", | |||||
"\"Ab\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD;dE\"\0", | |||||
"\"Ab\xEF\xBF\xBD\xEF\xBF\xBD...\xEF\xBF\xBD;dE\"\0", | |||||
"\"Ab\xEF\xBF\xBD\xEF\xBF\xBD;dE\"\0", | |||||
"\"Ab\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD;dE\"\0", | |||||
"\"\xEF\xBF\xBD\"", | |||||
"\"A\xEF\xBF\xBD\"", | |||||
}; | |||||
const size_t num_cases = 13; | |||||
int errcode = 0; | |||||
for (size_t i=0; i<num_cases; i++) { | |||||
const char* in = inputs[i]; | |||||
const char* expected = outputs[i]; | |||||
const size_t expected_len = strlen(expected); | |||||
json_object* strobj = json_object_new_string(in); | |||||
const char* actual = json_object_to_json_string(strobj); | |||||
size_t actual_len = strlen(actual); | |||||
if (expected_len != actual_len) { | |||||
printf("FAIL ON CASE %d: expected length %d but got %d\n", | |||||
(int)i, (int)expected_len, (int)actual_len); | |||||
printf("%s\n", actual); | |||||
errcode = 1; | |||||
goto cleanup; | |||||
} | |||||
if (memcmp(expected, actual, actual_len) != 0) { | |||||
printf("FAIL ON CASE %d: expected '%s' but got '%s'\n", | |||||
(int)i, expected, actual); | |||||
errcode = 2; | |||||
goto cleanup; | |||||
} | |||||
printf("PASS CASE %d\n", (int)i); | |||||
cleanup: | |||||
json_object_put(strobj); | |||||
} | |||||
return errcode; | |||||
} |
@@ -0,0 +1,13 @@ | |||||
PASS CASE 0 | |||||
PASS CASE 1 | |||||
PASS CASE 2 | |||||
PASS CASE 3 | |||||
PASS CASE 4 | |||||
PASS CASE 5 | |||||
PASS CASE 6 | |||||
PASS CASE 7 | |||||
PASS CASE 8 | |||||
PASS CASE 9 | |||||
PASS CASE 10 | |||||
PASS CASE 11 | |||||
PASS CASE 12 |
@@ -0,0 +1,12 @@ | |||||
#!/bin/sh

# Wrapper that runs the test_utf8 program through the shared output-test helper.

# When $srcdir is not already set, derive it from the path used to invoke
# this script.
if [ -z "$srcdir" ]; then
	srcdir="${0%/*}"
	# "$0" unchanged means it contained no slash; an empty result means the
	# only slash was the leading one. Either way, fall back to the current
	# directory.
	if [ "$srcdir" = "$0" ] || [ -z "$srcdir" ]; then
		srcdir=.
	fi
fi

# Pull in the common definitions, which provide run_output_test.
. "$srcdir/test-defs.sh"

run_output_test test_utf8
exit $?