Treat each Chinese character as a single word when diffing, and glue them back togeth...
author Zheng Zhu <zhengzhu@users.mediawiki.org>
Sun, 5 Dec 2004 02:17:21 +0000 (02:17 +0000)
committer Zheng Zhu <zhengzhu@users.mediawiki.org>
Sun, 5 Dec 2004 02:17:21 +0000 (02:17 +0000)
includes/DifferenceEngine.php
languages/Language.php
languages/LanguageLatin1.php
languages/LanguageZh.php

index a4793f7..4c6017e 100644 (file)
@@ -215,7 +215,7 @@ class DifferenceEngine {
        }
        
        function getDiff( $otext, $ntext, $otitle, $ntitle ) {
-               global $wgUseExternalDiffEngine;
+               global $wgUseExternalDiffEngine, $wgContLang;
                $out = "
                        <table border='0' width='98%' cellpadding='0' cellspacing='4' class='diff'>
                        <tr>
@@ -223,7 +223,9 @@ class DifferenceEngine {
                                <td colspan='2' width='50%' align='center' class='diff-ntitle'>{$ntitle}</td>
                        </tr>
                ";
-
+               $otext = $wgContLang->segmentForDiff($otext);
+               $ntext = $wgContLang->segmentForDiff($ntext);
+               $difftext='';
                if ( $wgUseExternalDiffEngine ) {
                        # For historical reasons, external diff engine expects
                        # input text to be HTML-escaped already
@@ -232,15 +234,16 @@ class DifferenceEngine {
                        if( !function_exists( 'wikidiff_do_diff' ) ) {
                                dl('php_wikidiff.so');
                        }
-                       $out .= wikidiff_do_diff( $otext, $ntext, 2 );
+                       $difftext = wikidiff_do_diff( $otext, $ntext, 2 );
                } else {
                        $ota = explode( "\n", str_replace( "\r\n", "\n", $otext ) );
                        $nta = explode( "\n", str_replace( "\r\n", "\n", $ntext ) );
                        $diffs =& new Diff( $ota, $nta );
                        $formatter =& new TableDiffFormatter();
-                       $out .= $formatter->format( $diffs );
+                       $difftext = $formatter->format( $diffs );
                }
-               $out .= "</table>\n";
+               $difftext = $wgContLang->unsegmentForDiff($difftext);
+               $out .= $difftext."</table>\n";
                return $out;
        }
 
index 55e37ea..98abf3c 100644 (file)
@@ -2038,7 +2038,16 @@ class Language {
                return $word;
        }
 
-       
+       # Hook for languages (e.g. Chinese) whose text must be segmented before
+       # diffing to get a useful diff; this base version returns $text unchanged.
+       function segmentForDiff( $text ) {
+               return $text;
+       }
+       # Counterpart hook that undoes the segmentation for display; this base version returns $text unchanged.
+       function unsegmentForDiff( $text ) {
+               return $text;
+       }
+
        # convert text to different variants of a language. the automatic
        # conversion is done in autoConvert(). here we parse the text 
        # marked with -{}-, which specifies special conversions of the 
index e031007..9a54330 100644 (file)
@@ -263,6 +263,14 @@ class LanguageLatin1 {
                return $this->lang->getPreferredVariant();
        }
 
+       function segmentForDiff( $text ) {
+               return $text; # returned as-is: no segmentation is applied here
+       }
+
+       function unsegmentForDiff( $text ) {
+               return $text; # returned as-is: there is no segmentation to undo here
+       }
+
        function convert( $text, $isTitle=false ) {
                return utf8_decode( $this->lang->convert( utf8_encode( $text ), $isTitle ) );
        }
index e24af26..38cdcb7 100644 (file)
@@ -55,7 +55,20 @@ class LanguageZh extends LanguageZh_cn {
                return $this->mZhLanguageCode;
        }
        
-       
+       # Put a space before every multi-byte sequence (lead byte 0xC0-0xFF plus 0x80-0xBF bytes) so the diff sees each character as a word.
+       function segmentForDiff( $text ) {
+               return preg_replace(
+                       "/([\\xc0-\\xff][\\x80-\\xbf]*)/", # byte-wise match; a plain backreference needs no /e (eval) modifier
+                       ' $1', $text);
+       }
+
+       function unsegmentForDiff( $text ) { # reverse of segmentForDiff: drop the single space in front of each multi-byte sequence
+               return preg_replace(
+                       "/ ([\\xc0-\\xff][\\x80-\\xbf]*)/", # byte-wise match; a plain backreference needs no /e (eval) modifier
+                       '$1', $text);
+       }
+
+
        
        function autoConvert($text, $toVariant=false) {
                if(!$toVariant)