* Add handling of surrogate pairs

git-svn-id: http://svn.metaparadigm.com/svn/json-c/trunk@53 327403b1-1117-474d-bef2-5cb71233fd97
16 years ago · 126ad95fc4
--- a/+ 2
+++ b/+ 2
@@ -1,3 +1,5 @@
  * Add handling of surrogate pairs (json_tokener.c, test4.c, Makefile.am)
    Brent Miller, bdmiller at yahoo dash inc dot com
  * Correction to comment describing printbuf_memappend in printbuf.h
    Brent Miller, bdmiller at yahoo dash inc dot com

--- a/Makefile.am
+++ b/Makefile.am
@@ -31,7 +31,7 @@ libjson_la_SOURCES = \
 	linkhash.c \
 	printbuf.c

 check_PROGRAMS = test1 test2 test3
 check_PROGRAMS = test1 test2 test3 test4

 test1_SOURCES = test1.c
 test1_LDADD = $(lib_LTLIBRARIES)
@@ -41,3 +41,6 @@ test2_LDADD = $(lib_LTLIBRARIES)

 test3_SOURCES = test3.c
 test3_LDADD = $(lib_LTLIBRARIES)

 test4_SOURCES = test4.c
 test4_LDADD = $(lib_LTLIBRARIES)
--- a/json_tokener.c
+++ b/json_tokener.c
@@ -58,6 +58,12 @@ const char* json_tokener_errors[] = {
  "expected comment",
 };

 /* Stuff for decoding unicode sequences */
 #define IS_HIGH_SURROGATE(uc) (((uc) & 0xFC00) == 0xD800)
 #define IS_LOW_SURROGATE(uc)  (((uc) & 0xFC00) == 0xDC00)
 #define DECODE_SURROGATE_PAIR(hi,lo) ((((hi) & 0x3FF) << 10) + ((lo) & 0x3FF) + 0x10000)
 static unsigned char utf8_replacement_char[3] = { 0xEF, 0xBF, 0xBD };


 struct json_tokener* json_tokener_new(void)
 {
@@ -176,6 +182,7 @@ char* strndup(const char* str, size_t n)
 #define ADVANCE_CHAR(str, tok) \
  ( ++(str), ((tok)->char_offset)++, c)


 /* End optimization macro defs */


@@ -398,40 +405,97 @@ struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
      break;

    case json_tokener_state_escape_unicode:
            /* Note that the following code is inefficient for handling large
       * chunks of extended chars, calling printbuf_memappend() once
       * for each multi-byte character of input.
       * This is a good area for future optimization.
       */
 	{
 	  /* Advance until we change state */
          unsigned int got_hi_surrogate = 0;

 	  /* Handle a 4-byte sequence, or two sequences if a surrogate pair */
 	  while(1) {
 	    if(strchr(json_hex_chars, c)) {
 	      tok->ucs_char += ((unsigned int)hexdigit(c) << ((3-tok->st_pos++)*4));
 	      if(tok->st_pos == 4) {
 		unsigned char utf_out[3];
 		unsigned char unescaped_utf[4];

                if (got_hi_surrogate) {
 		  if (IS_LOW_SURROGATE(tok->ucs_char)) {
                    /* Recalculate the ucs_char, then fall thru to process normally */
                    tok->ucs_char = DECODE_SURROGATE_PAIR(got_hi_surrogate, tok->ucs_char);
                  } else {
                    /* Hi surrogate was not followed by a low surrogate */
                    /* Replace the hi and process the rest normally */
 		    printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);
                  }
                  got_hi_surrogate = 0;
                }

 		if (tok->ucs_char < 0x80) {
 		  utf_out[0] = tok->ucs_char;
 		  printbuf_memappend_fast(tok->pb, (char*)utf_out, 1);
 		  unescaped_utf[0] = tok->ucs_char;
 		  printbuf_memappend_fast(tok->pb, (char*)unescaped_utf, 1);
 		} else if (tok->ucs_char < 0x800) {
 		  utf_out[0] = 0xc0 | (tok->ucs_char >> 6);
 		  utf_out[1] = 0x80 | (tok->ucs_char & 0x3f);
 		  printbuf_memappend_fast(tok->pb, (char*)utf_out, 2);
 		  unescaped_utf[0] = 0xc0 | (tok->ucs_char >> 6);
 		  unescaped_utf[1] = 0x80 | (tok->ucs_char & 0x3f);
 		  printbuf_memappend_fast(tok->pb, (char*)unescaped_utf, 2);
 		} else if (IS_HIGH_SURROGATE(tok->ucs_char)) {
                  /* Got a high surrogate.  Remember it and look for the
                   * the beginning of another sequence, which should be the
                   * low surrogate.
                   */
                  got_hi_surrogate = tok->ucs_char;
                  /* Not at end, and the next two chars should be "\u" */
                  if ((tok->char_offset+1 != len) &&
                      (tok->char_offset+2 != len) &&
                      (str[1] == '\\') &&
                      (str[2] == 'u'))
                  {
 	            ADVANCE_CHAR(str, tok);
 	            ADVANCE_CHAR(str, tok);

                    /* Advance to the first char of the next sequence and
                     * continue processing with the next sequence.
                     */
 	            if (!ADVANCE_CHAR(str, tok) || !POP_CHAR(c, tok)) {
 	              printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);
 	              goto out;
                    }
 	            tok->ucs_char = 0;
                    tok->st_pos = 0;
                    continue; /* other json_tokener_state_escape_unicode */
                  } else {
                    /* Got a high surrogate without another sequence following
                     * it.  Put a replacement char in for the hi surrogate
                     * and pretend we finished.
                     */
 		    printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);
                  }
 		} else if (IS_LOW_SURROGATE(tok->ucs_char)) {
                  /* Got a low surrogate not preceded by a high */
 		  printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);
                } else if (tok->ucs_char < 0x10000) {
 		  unescaped_utf[0] = 0xe0 | (tok->ucs_char >> 12);
 		  unescaped_utf[1] = 0x80 | ((tok->ucs_char >> 6) & 0x3f);
 		  unescaped_utf[2] = 0x80 | (tok->ucs_char & 0x3f);
 		  printbuf_memappend_fast(tok->pb, (char*)unescaped_utf, 3);
 		} else if (tok->ucs_char < 0x110000) {
 		  unescaped_utf[0] = 0xf0 | ((tok->ucs_char >> 18) & 0x07);
 		  unescaped_utf[1] = 0x80 | ((tok->ucs_char >> 12) & 0x3f);
 		  unescaped_utf[2] = 0x80 | ((tok->ucs_char >> 6) & 0x3f);
 		  unescaped_utf[3] = 0x80 | (tok->ucs_char & 0x3f);
 		  printbuf_memappend_fast(tok->pb, (char*)unescaped_utf, 4);
 		} else {
 		  utf_out[0] = 0xe0 | (tok->ucs_char >> 12);
 		  utf_out[1] = 0x80 | ((tok->ucs_char >> 6) & 0x3f);
 		  utf_out[2] = 0x80 | (tok->ucs_char & 0x3f);
 		  printbuf_memappend_fast(tok->pb, (char*)utf_out, 3);
 		}
                  /* Don't know what we got--insert the replacement char */
 		  printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);
                }
 		state = saved_state;
 		break;
 	      }
 	    } else {
 	      tok->err = json_tokener_error_parse_string;
 	      goto out;
 	      	  }
 	  if (!ADVANCE_CHAR(str, tok) || !POP_CHAR(c, tok))
 	    }
 	  if (!ADVANCE_CHAR(str, tok) || !POP_CHAR(c, tok)) {
            if (got_hi_surrogate) /* Clean up any pending chars */
 	      printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);
 	    goto out;
 	  }
 	}
      }
      break;
--- a/test4.c
+++ b/test4.c
@@ -0,0 +1,44 @@
 /*
 * gcc -o utf8 utf8.c -I/home/y/include -L./.libs -ljson
 */

 #include <stdio.h>
 #include <string.h>
 #include <json/json_object.h>
 #include <json/json_tokener.h>

 void print_hex( const unsigned char* s) {
        const unsigned char *iter = s;
        unsigned char ch;
        while ((ch = *iter++) != 0) {
           if( ',' != ch)
            printf("%x ", ch);
           else
            printf( ",");
        }
        printf("\n");
 }

 int main() {
    const char *input = "\"\\ud840\\udd26,\\ud840\\udd27,\\ud800\\udd26,\\ud800\\udd27\"";
    const char *expected = "\xF0\xA0\x84\xA6,\xF0\xA0\x84\xA7,\xF0\x90\x84\xA6,\xF0\x90\x84\xA7";
    struct json_object *parse_result = json_tokener_parse((char*)input);
    const char *unjson = json_object_get_string(parse_result);

    printf("input: %s\n", input);

    int strings_match = !strcmp( expected, unjson);
    if (strings_match) {
        printf("JSON parse result is correct: %s\n", unjson);
        printf("PASS\n");
        return(0);
    } else {
        printf("JSON parse result doesn't match expected string\n");
        printf("expected string bytes: ");
        print_hex( expected);
        printf("parsed string bytes:   ");
        print_hex( unjson);
        printf("FAIL\n");
        return(1);
    }
 }