diff --git a/.gitignore b/.gitignore index 6e85d6f..4253967 100644 --- a/.gitignore +++ b/.gitignore @@ -26,6 +26,7 @@ /tests/test_charcase /tests/test_compare /tests/test_double_serializer +/tests/test_float /tests/test_locale /tests/test_null /tests/test_parse @@ -36,6 +37,7 @@ /tests/test_util_file /tests/test_visit /tests/test_json_pointer +/tests/test_utf8 /tests/*.vg.out /tests/*.log /tests/*.trs diff --git a/json_object.c b/json_object.c index 139d857..d8e3b67 100644 --- a/json_object.c +++ b/json_object.c @@ -105,13 +105,30 @@ get_string_component(const struct json_object *jso) static int json_escape_str(struct printbuf *pb, const char *str, int len, int flags) { - int pos = 0, start_offset = 0; + int pos = 0, start_offset = 0, utf8_start = 0, utf8_end = 0; unsigned char c; while (len--) { c = str[pos]; - switch(c) - { + if (utf8_end > pos) { + // Expecting a continuation byte. + if (c >= 0x80 && c <= 0xBf) { + // Found the continuation byte. + goto utf8_loop_end; + } else { + // Invalid byte. + if(utf8_start - start_offset > 0) + printbuf_memappend(pb, str + start_offset, utf8_start - start_offset); + printbuf_memappend(pb, "\xEF\xBF\xBD", 3); + start_offset = pos; + utf8_end = pos; // get out of the UTF-8 state + goto utf8_reset; + } + } + +utf8_reset: + switch(c) { + case '\b': case '\n': case '\r': @@ -122,7 +139,6 @@ static int json_escape_str(struct printbuf *pb, const char *str, int len, int fl case '/': if((flags & JSON_C_TO_STRING_NOSLASHESCAPE) && c == '/') { - pos++; break; } @@ -138,7 +154,7 @@ static int json_escape_str(struct printbuf *pb, const char *str, int len, int fl else if(c == '\\') printbuf_memappend(pb, "\\\\", 2); else if(c == '/') printbuf_memappend(pb, "\\/", 2); - start_offset = ++pos; + start_offset = pos + 1; break; default: if(c < ' ') @@ -150,12 +166,45 @@ static int json_escape_str(struct printbuf *pb, const char *str, int len, int fl sprintbuf(pb, "\\u00%c%c", json_hex_chars[c >> 4], json_hex_chars[c & 0xf]); - start_offset = ++pos; - } else - pos++; + start_offset = pos + 1; + } else if (c >= 0x80) { + // Expecting a start byte. + if (c >= 0xC2 && c <= 0xDF) { + // 2-byte start byte. + utf8_start = pos; + utf8_end = pos + 2; + } else if (c >= 0xE0 && c <= 0xEF) { + // 3-byte start byte. + utf8_start = pos; + utf8_end = pos + 3; + } else if (c >= 0xF0 && c <= 0xF4) { + // 4-byte start byte. + utf8_start = pos; + utf8_end = pos + 4; + } else { + // Invalid byte. + if(pos - start_offset > 0) + printbuf_memappend(pb, + str + start_offset, + pos - start_offset); + printbuf_memappend(pb, "\xEF\xBF\xBD", 3); + start_offset = pos + 1; + } + } else { + // Some other valid ASCII character. + } + break; } + +utf8_loop_end: + pos++; + } + if (utf8_end > pos) { + if(utf8_start - start_offset > 0) + printbuf_memappend(pb, str + start_offset, utf8_start - start_offset); + printbuf_memappend(pb, "\xEF\xBF\xBD", 3); } - if (pos - start_offset > 0) + else if (pos - start_offset > 0) printbuf_memappend(pb, str + start_offset, pos - start_offset); return 0; } diff --git a/tests/Makefile.am b/tests/Makefile.am index 824ed30..5aa317c 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -25,6 +25,7 @@ TESTS+= test_compare.test TESTS+= test_set_value.test TESTS+= test_visit.test TESTS+= test_json_pointer.test +TESTS+= test_utf8.test check_PROGRAMS= check_PROGRAMS += $(TESTS:.test=) diff --git a/tests/test_utf8.c b/tests/test_utf8.c new file mode 100644 index 0000000..0823105 --- /dev/null +++ b/tests/test_utf8.c @@ -0,0 +1,73 @@ +#include +#include +#include + +#include "json.h" + +int main() { + const char inputs[][20] = { + "\0", // empty string + "AbC;dE\0", // ASCII string + "\xE2\x82\xAC\0", // A single valid UTF-8 + "Ab\xE2\x82\xAC;dE\0", // Valid UTF-8 in context + "Ab\xFF;dE\0", // One illegal byte + "Ab\xE2;dE\0", // One invalid start byte + "Ab\xE2\xE2;dE\0", // Two invalid start bytes + "Ab\xE2\xE2\xE2;dE\0", // Three invalid start bytes + "Ab\xE2\xE2...\xE2;dE\0", // Two disjoint invalid sequences + "Ab\xE2\x82\xFF;dE\0", // First two bytes are OK but not the third + "Ab\xE2\x82\xFF\xE2;dE\0", // Like above but with another start byte + "\xE2\0", // A start byte that "overhangs" the end + "A\xFD\0", // Normal ASCII character with invalid byte at end + }; + const char outputs[][30] = { + "\"\"\0", + "\"AbC;dE\"\0", + "\"\xE2\x82\xAC\"\0", + "\"Ab\xE2\x82\xAC;dE\"\0", + "\"Ab\xEF\xBF\xBD;dE\"\0", + "\"Ab\xEF\xBF\xBD;dE\"\0", + "\"Ab\xEF\xBF\xBD\xEF\xBF\xBD;dE\"\0", + "\"Ab\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD;dE\"\0", + "\"Ab\xEF\xBF\xBD\xEF\xBF\xBD...\xEF\xBF\xBD;dE\"\0", + "\"Ab\xEF\xBF\xBD\xEF\xBF\xBD;dE\"\0", + "\"Ab\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD;dE\"\0", + "\"\xEF\xBF\xBD\"", + "\"A\xEF\xBF\xBD\"", + }; + const size_t num_cases = 13; + + int errcode = 0; + + for (size_t i=0; i