Browse Source

Adding UTF-8 validation. Fixes #122

pull/294/head
Shane F. Carr 8 years ago
parent
commit
b58076b242
6 changed files with 159 additions and 9 deletions
  1. +2
    -0
      .gitignore
  2. +58
    -9
      json_object.c
  3. +1
    -0
      tests/Makefile.am
  4. +73
    -0
      tests/test_utf8.c
  5. +13
    -0
      tests/test_utf8.expected
  6. +12
    -0
      tests/test_utf8.test

+ 2
- 0
.gitignore View File

@@ -26,6 +26,7 @@
/tests/test_charcase /tests/test_charcase
/tests/test_compare /tests/test_compare
/tests/test_double_serializer /tests/test_double_serializer
/tests/test_float
/tests/test_locale /tests/test_locale
/tests/test_null /tests/test_null
/tests/test_parse /tests/test_parse
@@ -36,6 +37,7 @@
/tests/test_util_file /tests/test_util_file
/tests/test_visit /tests/test_visit
/tests/test_json_pointer /tests/test_json_pointer
/tests/test_utf8
/tests/*.vg.out /tests/*.vg.out
/tests/*.log /tests/*.log
/tests/*.trs /tests/*.trs


+ 58
- 9
json_object.c View File

@@ -105,13 +105,30 @@ get_string_component(const struct json_object *jso)


static int json_escape_str(struct printbuf *pb, const char *str, int len, int flags) static int json_escape_str(struct printbuf *pb, const char *str, int len, int flags)
{ {
int pos = 0, start_offset = 0;
int pos = 0, start_offset = 0, utf8_start = 0, utf8_end = 0;
unsigned char c; unsigned char c;
while (len--) while (len--)
{ {
c = str[pos]; c = str[pos];
switch(c)
{
if (utf8_end > pos) {
// Expecting a continuation byte.
if (c >= 0x80 && c <= 0xBf) {
// Found the continuation byte.
goto utf8_loop_end;
} else {
// Invalid byte.
if(utf8_start - start_offset > 0)
printbuf_memappend(pb, str + start_offset, utf8_start - start_offset);
printbuf_memappend(pb, "\xEF\xBF\xBD", 3);
start_offset = pos;
utf8_end = pos; // get out of the UTF-8 state
goto utf8_reset;
}
}

utf8_reset:
switch(c) {

case '\b': case '\b':
case '\n': case '\n':
case '\r': case '\r':
@@ -122,7 +139,6 @@ static int json_escape_str(struct printbuf *pb, const char *str, int len, int fl
case '/': case '/':
if((flags & JSON_C_TO_STRING_NOSLASHESCAPE) && c == '/') if((flags & JSON_C_TO_STRING_NOSLASHESCAPE) && c == '/')
{ {
pos++;
break; break;
} }


@@ -138,7 +154,7 @@ static int json_escape_str(struct printbuf *pb, const char *str, int len, int fl
else if(c == '\\') printbuf_memappend(pb, "\\\\", 2); else if(c == '\\') printbuf_memappend(pb, "\\\\", 2);
else if(c == '/') printbuf_memappend(pb, "\\/", 2); else if(c == '/') printbuf_memappend(pb, "\\/", 2);


start_offset = ++pos;
start_offset = pos + 1;
break; break;
default: default:
if(c < ' ') if(c < ' ')
@@ -150,12 +166,45 @@ static int json_escape_str(struct printbuf *pb, const char *str, int len, int fl
sprintbuf(pb, "\\u00%c%c", sprintbuf(pb, "\\u00%c%c",
json_hex_chars[c >> 4], json_hex_chars[c >> 4],
json_hex_chars[c & 0xf]); json_hex_chars[c & 0xf]);
start_offset = ++pos;
} else
pos++;
start_offset = pos + 1;
} else if (c >= 0x80) {
// Expecting a start byte.
if (c >= 0xC2 && c <= 0xDF) {
// 2-byte start byte.
utf8_start = pos;
utf8_end = pos + 2;
} else if (c >= 0xE0 && c <= 0xEF) {
// 3-byte start byte.
utf8_start = pos;
utf8_end = pos + 3;
} else if (c >= 0xF0 && c <= 0xF4) {
// 4-byte start byte.
utf8_start = pos;
utf8_end = pos + 4;
} else {
// Invalid byte.
if(pos - start_offset > 0)
printbuf_memappend(pb,
str + start_offset,
pos - start_offset);
printbuf_memappend(pb, "\xEF\xBF\xBD", 3);
start_offset = pos + 1;
}
} else {
// Some other valid ASCII character.
}
break;
} }

utf8_loop_end:
pos++;
}
if (utf8_end > pos) {
if(utf8_start - start_offset > 0)
printbuf_memappend(pb, str + start_offset, utf8_start - start_offset);
printbuf_memappend(pb, "\xEF\xBF\xBD", 3);
} }
if (pos - start_offset > 0)
else if (pos - start_offset > 0)
printbuf_memappend(pb, str + start_offset, pos - start_offset); printbuf_memappend(pb, str + start_offset, pos - start_offset);
return 0; return 0;
} }


+ 1
- 0
tests/Makefile.am View File

@@ -25,6 +25,7 @@ TESTS+= test_compare.test
TESTS+= test_set_value.test TESTS+= test_set_value.test
TESTS+= test_visit.test TESTS+= test_visit.test
TESTS+= test_json_pointer.test TESTS+= test_json_pointer.test
TESTS+= test_utf8.test


check_PROGRAMS= check_PROGRAMS=
check_PROGRAMS += $(TESTS:.test=) check_PROGRAMS += $(TESTS:.test=)


+ 73
- 0
tests/test_utf8.c View File

@@ -0,0 +1,73 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "json.h"

/*
 * Regression test for UTF-8 handling when a string object is serialized.
 *
 * Every case builds a string object from a raw byte sequence with
 * json_object_new_string(), serializes it with json_object_to_json_string(),
 * and compares the result byte-for-byte with the expected text.  The expected
 * texts carry the three bytes EF BF BD (the UTF-8 encoding of U+FFFD) wherever
 * the input holds a byte that cannot be part of a well-formed sequence.
 *
 * One line is printed per case ("PASS CASE n" or a "FAIL ON CASE n" message).
 *
 * Returns 0 when every case passes.  Otherwise returns the code of the last
 * failing case: 1 for a length mismatch, 2 for a content mismatch, 3 when the
 * string object could not be created.
 */
int main(void)
{
	/*
	 * Input and expected output live side by side so the two can never get
	 * out of step, and pointers (rather than fixed-width char arrays) mean a
	 * longer literal cannot be silently truncated.
	 */
	static const struct {
		const char *in;
		const char *expected;
	} cases[] = {
		// empty string
		{ "\0", "\"\"\0" },
		// ASCII string
		{ "AbC;dE\0", "\"AbC;dE\"\0" },
		// A single valid UTF-8 sequence
		{ "\xE2\x82\xAC\0", "\"\xE2\x82\xAC\"\0" },
		// Valid UTF-8 in context
		{ "Ab\xE2\x82\xAC;dE\0", "\"Ab\xE2\x82\xAC;dE\"\0" },
		// One illegal byte
		{ "Ab\xFF;dE\0", "\"Ab\xEF\xBF\xBD;dE\"\0" },
		// One start byte with no continuation bytes
		{ "Ab\xE2;dE\0", "\"Ab\xEF\xBF\xBD;dE\"\0" },
		// Two start bytes with no continuation bytes
		{ "Ab\xE2\xE2;dE\0", "\"Ab\xEF\xBF\xBD\xEF\xBF\xBD;dE\"\0" },
		// Three start bytes with no continuation bytes
		{ "Ab\xE2\xE2\xE2;dE\0",
		  "\"Ab\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD;dE\"\0" },
		// Two disjoint invalid sequences
		{ "Ab\xE2\xE2...\xE2;dE\0",
		  "\"Ab\xEF\xBF\xBD\xEF\xBF\xBD...\xEF\xBF\xBD;dE\"\0" },
		// First two bytes are OK but not the third
		{ "Ab\xE2\x82\xFF;dE\0", "\"Ab\xEF\xBF\xBD\xEF\xBF\xBD;dE\"\0" },
		// Like above but with another start byte
		{ "Ab\xE2\x82\xFF\xE2;dE\0",
		  "\"Ab\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD;dE\"\0" },
		// A start byte that "overhangs" the end
		{ "\xE2\0", "\"\xEF\xBF\xBD\"" },
		// Normal ASCII character with invalid byte at end
		{ "A\xFD\0", "\"A\xEF\xBF\xBD\"" },
	};
	/* Derived from the table so adding a case needs no second edit. */
	const size_t num_cases = sizeof(cases) / sizeof(cases[0]);

	int errcode = 0;

	for (size_t i = 0; i < num_cases; i++) {
		const char *in = cases[i].in;
		const char *expected = cases[i].expected;
		const size_t expected_len = strlen(expected);

		json_object *strobj = json_object_new_string(in);
		if (strobj == NULL) {
			/* Nothing to serialize or release for this case. */
			printf("FAIL ON CASE %d: could not create string object\n",
			       (int)i);
			errcode = 3;
			continue;
		}

		/*
		 * The serialized text is only used before json_object_put()
		 * below, i.e. while strobj is still held.
		 */
		const char *actual = json_object_to_json_string(strobj);
		size_t actual_len = strlen(actual);

		if (expected_len != actual_len) {
			printf("FAIL ON CASE %d: expected length %d but got %d\n",
			       (int)i, (int)expected_len, (int)actual_len);
			printf("%s\n", actual);
			errcode = 1;
		} else if (memcmp(expected, actual, actual_len) != 0) {
			/* Lengths already match, so one length bounds both buffers. */
			printf("FAIL ON CASE %d: expected '%s' but got '%s'\n",
			       (int)i, expected, actual);
			errcode = 2;
		} else {
			printf("PASS CASE %d\n", (int)i);
		}

		/* Release the object on every path, pass or fail. */
		json_object_put(strobj);
	}

	return errcode;
}

+ 13
- 0
tests/test_utf8.expected View File

@@ -0,0 +1,13 @@
PASS CASE 0
PASS CASE 1
PASS CASE 2
PASS CASE 3
PASS CASE 4
PASS CASE 5
PASS CASE 6
PASS CASE 7
PASS CASE 8
PASS CASE 9
PASS CASE 10
PASS CASE 11
PASS CASE 12

+ 12
- 0
tests/test_utf8.test View File

@@ -0,0 +1,12 @@
#!/bin/sh

# Work out where the shared test helpers live.  An srcdir value supplied by
# the caller wins; otherwise fall back to the directory part of this script's
# own path, or to the current directory when the path has no usable
# directory part.
if [ -z "$srcdir" ]; then
	srcdir="${0%/*}"
	if [ "$srcdir" = "$0" ] || [ -z "$srcdir" ]; then
		srcdir=.
	fi
fi

# Pull in the common test definitions (presumably where run_output_test
# comes from -- it is not defined in this script).
. "$srcdir/test-defs.sh"

# Run the test_utf8 case and hand its exit status back to the caller.
run_output_test test_utf8
exit $?

Loading…
Cancel
Save