From 3fca385791f1bd1390d0c66e820105a57cf977d2 Mon Sep 17 00:00:00 2001 From: Zheng Zhu Date: Sun, 5 Dec 2004 02:17:21 +0000 Subject: [PATCH] Treat each Chinese character as a single word when diffing, and glue them back together afterward. This gives much more readable diff for zh. --- includes/DifferenceEngine.php | 13 ++++++++----- languages/Language.php | 11 ++++++++++- languages/LanguageLatin1.php | 8 ++++++++ languages/LanguageZh.php | 15 ++++++++++++++- 4 files changed, 40 insertions(+), 7 deletions(-) diff --git a/includes/DifferenceEngine.php b/includes/DifferenceEngine.php index a4793f7703..4c6017ef43 100644 --- a/includes/DifferenceEngine.php +++ b/includes/DifferenceEngine.php @@ -215,7 +215,7 @@ class DifferenceEngine { } function getDiff( $otext, $ntext, $otitle, $ntitle ) { - global $wgUseExternalDiffEngine; + global $wgUseExternalDiffEngine, $wgContLang; $out = " @@ -223,7 +223,9 @@ class DifferenceEngine { "; - + $otext = $wgContLang->segmentForDiff($otext); + $ntext = $wgContLang->segmentForDiff($ntext); + $difftext=''; if ( $wgUseExternalDiffEngine ) { # For historical reasons, external diff engine expects # input text to be HTML-escaped already @@ -232,15 +234,16 @@ class DifferenceEngine { if( !function_exists( 'wikidiff_do_diff' ) ) { dl('php_wikidiff.so'); } - $out .= wikidiff_do_diff( $otext, $ntext, 2 ); + $difftext = wikidiff_do_diff( $otext, $ntext, 2 ); } else { $ota = explode( "\n", str_replace( "\r\n", "\n", $otext ) ); $nta = explode( "\n", str_replace( "\r\n", "\n", $ntext ) ); $diffs =& new Diff( $ota, $nta ); $formatter =& new TableDiffFormatter(); - $out .= $formatter->format( $diffs ); + $difftext = $formatter->format( $diffs ); } - $out .= "
{$ntitle}
\n"; + $difftext = $wgContLang->unsegmentForDiff($difftext); + $out .= $difftext."\n"; return $out; } diff --git a/languages/Language.php b/languages/Language.php index 55e37eae69..98abf3cf39 100644 --- a/languages/Language.php +++ b/languages/Language.php @@ -2038,7 +2038,16 @@ class Language { return $word; } - + # languages like Chinese need to be segmented in order for the diff + # to be of any use + function segmentForDiff( $text ) { + return $text; + } + # and unsegment to show the result + function unsegmentForDiff( $text ) { + return $text; + } + # convert text to different variants of a language. the automatic # conversion is done in autoConvert(). here we parse the text # marked with -{}-, which specifies special conversions of the diff --git a/languages/LanguageLatin1.php b/languages/LanguageLatin1.php index e031007e4d..9a54330bd3 100644 --- a/languages/LanguageLatin1.php +++ b/languages/LanguageLatin1.php @@ -263,6 +263,14 @@ class LanguageLatin1 { return $this->lang->getPreferredVariant(); } + function segmentForDiff( $text ) { + return $text; + } + + function unsegmentForDiff( $text ) { + return $text; + } + function convert( $text, $isTitle=false ) { return utf8_decode( $this->lang->convert( utf8_encode( $text ), $isTitle ) ); } diff --git a/languages/LanguageZh.php b/languages/LanguageZh.php index e24af2649d..38cdcb7e55 100644 --- a/languages/LanguageZh.php +++ b/languages/LanguageZh.php @@ -55,7 +55,20 @@ class LanguageZh extends LanguageZh_cn { return $this->mZhLanguageCode; } - + # this should give much better diff info + function segmentForDiff( $text ) { + return preg_replace( + "/([\\xc0-\\xff][\\x80-\\xbf]*)/e", + "' ' .\"$1\"", $text); + } + + function unsegmentForDiff( $text ) { + return preg_replace( + "/ ([\\xc0-\\xff][\\x80-\\xbf]*)/e", + "\"$1\"", $text); + } + + function autoConvert($text, $toVariant=false) { if(!$toVariant) -- 2.20.1