From 4268665fc11f176dafc6a29a07f4eb823fbdb8b9 Mon Sep 17 00:00:00 2001 From: Roan Kattouw Date: Mon, 12 Jan 2009 14:11:20 +0000 Subject: [PATCH] API: (bug 16798) Fix for PHP bug 46944 (PHP's JSON formatter encodes certain UTF8 characters incorrectly) by falling back to our own formatter if PHP's is broken. Also fix up our own JSON formatter which also messed up these characters, albeit in a different way. Slightly modified patch by Brad Jorsch. --- RELEASE-NOTES | 3 ++ includes/api/ApiFormatJson.php | 5 ++- includes/api/ApiFormatJson_json.php | 70 +++++++++++++++++------------ 3 files changed, 49 insertions(+), 29 deletions(-) diff --git a/RELEASE-NOTES b/RELEASE-NOTES index 2ae95bfe09..d2e1cf3a0b 100644 --- a/RELEASE-NOTES +++ b/RELEASE-NOTES @@ -39,6 +39,9 @@ it from source control: http://www.mediawiki.org/wiki/Download_from_SVN * Fixing the caching issue by using -{T|xxx}- syntax (only applies on wiki with LanguageConverter class) * Improving the efficiency by using -{A|xxx}- syntax (only applies on wiki with LanguageConverter class) +== API changes in 1.15 == +* (bug 16798) JSON encoding errors for some characters outside the BMP + === Languages updated in 1.15 === MediaWiki supports over 300 languages. Many localisations are updated diff --git a/includes/api/ApiFormatJson.php b/includes/api/ApiFormatJson.php index 72ef220537..7920f738a7 100644 --- a/includes/api/ApiFormatJson.php +++ b/includes/api/ApiFormatJson.php @@ -58,7 +58,10 @@ class ApiFormatJson extends ApiFormatBase { $suffix = ")"; } - if (!function_exists('json_encode') || $this->getIsHtml()) { + // Some versions of PHP have a broken json_encode, see PHP bug + // 46944. Test encoding an affected character (U+20000) to + // avoid this. + if (!function_exists('json_encode') || $this->getIsHtml() || strtolower(json_encode("\xf0\xa0\x80\x80")) != '\ud840\udc00') { $json = new Services_JSON(); $this->printText($prefix . $json->encode($this->getResultData(), $this->getIsHtml()) . $suffix); } else { diff --git a/includes/api/ApiFormatJson_json.php b/includes/api/ApiFormatJson_json.php index 9714ab80e6..e28cbb37e0 100644 --- a/includes/api/ApiFormatJson_json.php +++ b/includes/api/ApiFormatJson_json.php @@ -168,6 +168,17 @@ class Services_JSON return chr(0xC0 | (($bytes >> 6) & 0x1F)) . chr(0x80 | ($bytes & 0x3F)); + case (0xFC00 & $bytes) == 0xD800 && strlen($utf16) >= 4 && (0xFC & ord($utf16{2})) == 0xDC: + // return a 4-byte UTF-8 character + $char = ((($bytes & 0x03FF) << 10) + | ((ord($utf16{2}) & 0x03) << 8) + | ord($utf16{3})); + $char += 0x10000; + return chr(0xF0 | (($char >> 18) & 0x07)) + . chr(0x80 | (($char >> 12) & 0x3F)) + . chr(0x80 | (($char >> 6) & 0x3F)) + . chr(0x80 | ($char & 0x3F)); + case (0xFFFF & $bytes) == $bytes: // return a 3-byte UTF-8 character // see: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 @@ -218,6 +229,20 @@ class Services_JSON | (0x0F & (ord($utf8{1}) >> 2))) . chr((0xC0 & (ord($utf8{1}) << 6)) | (0x7F & ord($utf8{2}))); + + case 4: + // return a UTF-16 surrogate pair from a 4-byte UTF-8 char + if(ord($utf8{0}) > 0xF4) return ''; # invalid + $char = ((0x1C0000 & (ord($utf8{0}) << 18)) + | (0x03F000 & (ord($utf8{1}) << 12)) + | (0x000FC0 & (ord($utf8{2}) << 6)) + | (0x00003F & ord($utf8{3}))); + if($char > 0x10FFFF) return ''; # invalid + $char -= 0x10000; + return chr(0xD8 | (($char >> 18) & 0x03)) + . chr(($char >> 10) & 0xFF) + . chr(0xDC | (($char >> 8) & 0x03)) + . chr($char & 0xFF); } // ignoring UTF-32 for now, sorry @@ -346,40 +371,19 @@ class Services_JSON case (($ord_var_c & 0xF8) == 0xF0): // characters U-00010000 - U-001FFFFF, mask 11110XXX // see http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 + // These will always return a surrogate pair $char = pack('C*', $ord_var_c, ord($var{$c + 1}), ord($var{$c + 2}), ord($var{$c + 3})); $c += 3; $utf16 = $this->utf82utf16($char); - $ascii .= sprintf('\u%04s', bin2hex($utf16)); - break; - - case (($ord_var_c & 0xFC) == 0xF8): - // characters U-00200000 - U-03FFFFFF, mask 111110XX - // see http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 - $char = pack('C*', $ord_var_c, - ord($var{$c + 1}), - ord($var{$c + 2}), - ord($var{$c + 3}), - ord($var{$c + 4})); - $c += 4; - $utf16 = $this->utf82utf16($char); - $ascii .= sprintf('\u%04s', bin2hex($utf16)); - break; - - case (($ord_var_c & 0xFE) == 0xFC): - // characters U-04000000 - U-7FFFFFFF, mask 1111110X - // see http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 - $char = pack('C*', $ord_var_c, - ord($var{$c + 1}), - ord($var{$c + 2}), - ord($var{$c + 3}), - ord($var{$c + 4}), - ord($var{$c + 5})); - $c += 5; - $utf16 = $this->utf82utf16($char); - $ascii .= sprintf('\u%04s', bin2hex($utf16)); + if($utf16 == '') { + $ascii .= '\ufffd'; + } else { + $utf16 = str_split($utf16, 2); + $ascii .= sprintf('\u%04s\u%04s', bin2hex($utf16[0]), bin2hex($utf16[1])); + } break; } } @@ -591,6 +595,16 @@ class Services_JSON } break; + case preg_match('/\\\uD[89AB][0-9A-F]{2}\\\uD[C-F][0-9A-F]{2}/i', substr($chrs, $c, 12)): + // escaped unicode surrogate pair + $utf16 = chr(hexdec(substr($chrs, ($c + 2), 2))) + . chr(hexdec(substr($chrs, ($c + 4), 2))) + . chr(hexdec(substr($chrs, ($c + 8), 2))) + . chr(hexdec(substr($chrs, ($c + 10), 2))); + $utf8 .= $this->utf162utf8($utf16); + $c += 11; + break; + case preg_match('/\\\u[0-9A-F]{4}/i', substr($chrs, $c, 6)): // single, escaped unicode character $utf16 = chr(hexdec(substr($chrs, ($c + 2), 2))) -- 2.20.1