From 727e4d1aaba4ecdd76ff1086fc4cb005fc0f93bf Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Mon, 15 Nov 2004 00:59:40 +0000 Subject: [PATCH] Fix composition bug: completed hangul syllable should not be merged with another following final jamo --- includes/normal/CleanUpTest.php | 45 +++++++++++++++++++++++++++++++++ includes/normal/UtfNormal.php | 11 +++++++- 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/includes/normal/CleanUpTest.php b/includes/normal/CleanUpTest.php index e9156abd80..219cc575e0 100644 --- a/includes/normal/CleanUpTest.php +++ b/includes/normal/CleanUpTest.php @@ -1,4 +1,36 @@ +# http://www.mediawiki.org/ +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# http://www.gnu.org/copyleft/gpl.html + +/** + * Additional tests for UtfNormal::cleanUp() function, including + * regression checks for known problems. + * + * Requires PHPUnit. + * + * @package UtfNormal + * @access private + */ + +if( php_sapi_name() != 'cli' ) { + die( "Run me from the command line please.\n" ); +} + /** */ if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) { dl( 'php_utfnormal.so' ); @@ -338,6 +370,15 @@ class CleanUpTest extends PHPUnit_TestCase { bin2hex( $expect ), bin2hex( UtfNormal::cleanUp( $text ) ) ); } + + function testHangulRegression() { + $text = "\xed\x9c\xaf" . 
# Hangul char + "\xe1\x87\x81"; # followed by another final jamo + $expect = $text; # Should *not* change. + $this->assertEquals( + bin2hex( $expect ), + bin2hex( UtfNormal::cleanUp( $text ) ) ); + } } @@ -345,4 +386,8 @@ $suite =& new PHPUnit_TestSuite( 'CleanUpTest' ); $result = PHPUnit::run( $suite ); echo $result->toString(); +if( !$result->wasSuccessful() ) { + exit( -1 ); +} +exit( 0 ); ?> \ No newline at end of file diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php index 62461d626c..55f420eddd 100644 --- a/includes/normal/UtfNormal.php +++ b/includes/normal/UtfNormal.php @@ -652,6 +652,7 @@ class UtfNormal { $len = strlen( $string ); $out = ''; $lastClass = -1; + $lastHangul = 0; $startChar = ''; $combining = ''; $x1 = ord(substr(UTF8_HANGUL_VBASE,0,1)); @@ -692,6 +693,7 @@ class UtfNormal { $combining .= $c; } $lastClass = $class; + $lastHangul = 0; continue; } } @@ -699,6 +701,7 @@ class UtfNormal { if( $lastClass == 0 ) { if( isset( $utfCanonicalComp[$pair] ) ) { $startChar = $utfCanonicalComp[$pair]; + $lastHangul = 0; continue; } if( $n >= $x1 && $n <= $x2 ) { @@ -726,11 +729,13 @@ class UtfNormal { $startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) . chr( $hangulPoint >> 6 & 0x3f | 0x80 ) . chr( $hangulPoint & 0x3f | 0x80 ); + $lastHangul = 0; continue; } elseif( $c >= UTF8_HANGUL_TBASE && $c <= UTF8_HANGUL_TEND && $startChar >= UTF8_HANGUL_FIRST && - $startChar <= UTF8_HANGUL_LAST ) { + $startChar <= UTF8_HANGUL_LAST && + !$lastHangul ) { # $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE; $tIndex = ord( $c{2} ) - 0xa7; if( $tIndex < 0 ) $tIndex = ord( $c{2} ) - 0x80 + (0x11c0 - 0x11a7); @@ -749,6 +754,9 @@ class UtfNormal { $startChar{1} = chr( $mid ); } $startChar{2} = chr( $tail ); + + # If there's another jamo char after this, *don't* try to merge it. + $lastHangul = 1; continue; } } @@ -758,6 +766,7 @@ class UtfNormal { $startChar = $c; $combining = ''; $lastClass = 0; + $lastHangul = 0; } $out .= $startChar . 
$combining; return $out; -- 2.20.1