Allow zh: users to customize the Traditional/Simplified conversion tables. From...
authorZheng Zhu <zhengzhu@users.mediawiki.org>
Wed, 29 Dec 2004 01:07:43 +0000 (01:07 +0000)
committerZheng Zhu <zhengzhu@users.mediawiki.org>
Wed, 29 Dec 2004 01:07:43 +0000 (01:07 +0000)
includes/Article.php
includes/MessageCache.php
includes/ZhClient.php
languages/Language.php
languages/LanguageZh.php

index ff0be1d..2c5dc57 100644 (file)
@@ -45,6 +45,14 @@ class Article {
                $this->clear();
        }
 
+       /**
+        * Get the title object of this article.
+        *
+        * @return mixed the value currently held in $this->mTitle
+        * @public
+        */
+       function getTitle() {
+               return $this->mTitle;   
+       }
+
        /**
          * Clear the object
          * @private
index b464ae0..a099a04 100755 (executable)
@@ -215,7 +215,7 @@ class MessageCache
                $this->mMemc->delete( $lockKey );
        }
 
-       function get( $key, $useDB, $forcontent=true ) {
+       function get( $key, $useDB, $forcontent=true, $isfullkey = false ) {
                global $wgContLanguageCode;
                if( $forcontent ) {
                        global $wgContLang;
@@ -238,7 +238,7 @@ class MessageCache
                $message = false;
                if( !$this->mDisable && $useDB ) {
                        $title = $lang->ucfirst( $key );
-                       if( $langcode != $wgContLanguageCode ) {
+                       if(!$isfullkey && ($langcode != $wgContLanguageCode) ) {
                                $title .= '/' . $langcode;
                        }
 
index fb0d826..4f1524f 100644 (file)
@@ -144,137 +144,4 @@ class ZhClient {
        }
 }
 
-
-class ZhClientFake {
-       function ZhClientFake() {
-               global $wgMemc, $wgDBname;
-               $this->mZh2TW = $wgMemc->get($key1 = "$wgDBname:zhConvert:tw");
-               $this->mZh2CN = $wgMemc->get($key2 = "$wgDBname:zhConvert:cn");
-               $this->mZh2SG = $wgMemc->get($key3 = "$wgDBname:zhConvert:sg");
-               $this->mZh2HK = $wgMemc->get($key4 = "$wgDBname:zhConvert:hk");
-               if(empty($this->mZh2TW) || empty($this->mZh2CN) || empty($this->mZh2SG) || empty($this->mZh2HK)) {
-                       require("includes/ZhConversion.php");
-                       $this->mZh2TW = $zh2TW;
-                       $this->mZh2CN = $zh2CN;
-                       $this->mZh2HK = $zh2HK;
-                       $this->mZh2SG = $zh2SG;
-                       $wgMemc->set($key1, $this->mZh2TW);
-                       $wgMemc->set($key2, $this->mZh2CN);
-                       $wgMemc->set($key3, $this->mZh2SG);
-                       $wgMemc->set($key4, $this->mZh2HK);
-               }
-       }
-
-       function isconnected() {
-               return true;
-       }
-
-       /**
-        * Convert to zh-tw
-        *
-        * @access private
-        */
-       function zh2tw($text) {
-               return strtr($text, $this->mZh2TW);
-       }
-
-       /**
-        * Convert to zh-cn
-        *
-        * @access private
-        */
-       function zh2cn($text) {
-               return strtr($text, $this->mZh2CN);
-       }
-
-       /**
-        * Convert to zh-sg
-        *
-        * @access private
-        */
-       function zh2sg($text) {
-               return strtr(strtr($text, $this->mZh2CN), $this->mZh2SG);
-       }
-
-       /**
-        * Convert to zh-hk
-        *
-        * @access private
-        */
-       function zh2hk($text) {
-               return strtr(strtr($text, $this->mZh2TW), $this->mZh2HK);
-       }
-
-       /**
-        * Convert the input to a different language variant
-        *
-        * @param string $text input text
-        * @param string $tolang language variant
-        * @return string the converted text
-        * @access public
-        */
-       function convert($text, $tolang) {
-               $t = '';
-               switch($tolang) {
-        case 'zh-cn':
-                       $t = $this->zh2cn($text);
-                       break;
-               case 'zh-tw':
-                       $t = $this->zh2tw($text);
-                       break;
-               case 'zh-sg':
-                       $t = $this->zh2sg($text);
-                       break;
-               case 'zh-hk':
-                       $t = $this->zh2hk($text);
-                       break;
-               default:
-                       $t = $text;
-               }
-               return $t;
-       }
-
-       function convertToAllVariants($text) {
-               $ret = array();
-               $ret['zh-cn'] = $this->zh2cn($text);
-               $ret['zh-tw'] = $this->zh2tw($text);
-               $ret['zh-sg'] = $this->zh2sg($text);
-               $ret['zh-hk'] = $this->zh2hk($text);
-               return $ret;
-       }
-
-       /**
-        * Perform "fake" word segmentation, i.e. treating each character as a word
-        *
-        * @param string $text input text
-        * @return string segmented text
-        * @access public
-        */
-       function segment($text) {
-               /* adapted from LanguageZh_cn::stripForSearch()
-                       here we will first separate the single characters,
-                       and let the caller conver it to hex
-        */
-               if( function_exists( 'mb_strtolower' ) ) {
-                       return preg_replace(
-                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-                               "' ' .\"$1\"",
-                               mb_strtolower( $text ) );
-               } else {
-                       global $wikiLowerChars;
-                       return preg_replace(
-                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-                               "' ' . strtr( \"\$1\", \$wikiLowerChars )",
-                               $text );
-               }
-       }
-
-       /**
-        * Close the fake connection
-        *
-        * @access public
-        */
-       function close() {      }
-}
-
 ?>
\ No newline at end of file
index 51383ab..0101538 100644 (file)
@@ -1751,6 +1751,9 @@ ta[\'ca-nstab-category\'] = new Array(\'c\',\'View the category page\');
 'variantname-zh-sg' => 'sg',
 'variantname-zh' => 'zh',
 
+# Chinese conversion table
+'zhconversiontable' => '-{}-',
+
 # labels for User: and Title: on Special:Log pages
 'specialloguserlabel' => 'User: ',
 'speciallogtitlelabel' => 'Title: ',
index bac6d1e..a48e62e 100644 (file)
@@ -1,10 +1,32 @@
 <?php
-require_once( "includes/ZhClient.php" );
 require_once( "LanguageZh_cn.php");
 require_once( "LanguageZh_tw.php");
 require_once( "LanguageZh_sg.php");
 require_once( "LanguageZh_hk.php");
 
+/*
+   Hook to refresh the cache of conversion tables when
+   MediaWiki:Zhconversiontable/<variant> is saved. Every path returns
+   true (falling off the end used to hand NULL back to the hook runner).
+*/
+function zhOnArticleSaveComplete($article, $user, $text, $summary, $isminor, $iswatch, $section) {
+       global $wgContLang; // should be a LanguageZh
+       $titleobj = $article->getTitle();
+       if($titleobj->getNamespace() != NS_MEDIAWIKI)
+               return true;
+       // get_class() is lower-cased by PHP 4 but case-preserving in PHP 5
+       if(strtolower(get_class($wgContLang)) != 'languagezh')
+               return true;
+       // the DB key must look like "Zhconversiontable/<variant>"
+       $t = explode('/', $titleobj->getDBkey(), 2);
+       if(count($t) == 2 && $t[0] == 'Zhconversiontable'
+               && in_array($t[1], array('zh-cn', 'zh-tw', 'zh-sg', 'zh-hk')))
+               $wgContLang->reloadTables();
+       return true;
+}
+
+$wgHooks['ArticleSaveComplete'][] = 'zhOnArticleSaveComplete';
+
 /* class that handles both Traditional and Simplified Chinese
    right now it only distinguish zh_cn and zh_tw (actuall, zh_cn and
    non-zh_cn), will add support for zh_sg, zh_hk, etc, later.
@@ -12,21 +34,84 @@ require_once( "LanguageZh_hk.php");
 class LanguageZh extends LanguageZh_cn {
        
        var $mZhLanguageCode=false;
-       var $mZhClient=false;   
+       var $mTables=false; //the mapping tables
+       var $mTablesLoaded = false;
+       var $mCacheKey;
+       // constructor: only derives the cache key (database name plus
+       // ":zhtables") under which the conversion tables are kept in
+       // $wgMemc; the tables themselves are loaded on demand by loadTables()
        function LanguageZh() {
-               global $wgUseZhdaemon, $wgZhdaemonHost, $wgZhdaemonPort;
-               global $wgDisableLangConversion, $wgUser;
+               global $wgDBname;
+               $this->mCacheKey = $wgDBname . ":zhtables";
+       }
+
+       // throw away the cached conversion tables and rebuild them at once;
+       // called by zhOnArticleSaveComplete() after a table page is saved
+       function reloadTables() {
+               global $wgMemc;
+               // remove the shared copy first so loadTables() cannot read it back
+               $wgMemc->delete($this->mCacheKey);
+               // clear the flag, otherwise loadTables() returns immediately
+               $this->mTablesLoaded=false;
+               $this->loadTables();
+       }
 
-               if($wgUseZhdaemon) {
-                       $this->mZhClient=new ZhClient($wgZhdaemonHost, $wgZhdaemonPort);
-                       if(!$this->mZhClient->isconnected())
-                               $this->mZhClient = false;
+       // load conversion tables either from the cache or the disk
+       //
+       // On a cache miss the built-in tables from includes/ZhConversion.php
+       // are overlaid with the user-editable zhconversiontable/<variant>
+       // messages and the result is stored in $wgMemc for 43200 seconds.
+       // Does nothing if the tables were already loaded on this object.
+       function loadTables() {
+               global $wgMemc;
+               if( $this->mTablesLoaded )
+                       return;
+               $this->mTablesLoaded = true;
+               $this->mTables = $wgMemc->get( $this->mCacheKey );
+               if( empty( $this->mTables ) ) {
+                       global $wgMessageCache;
+                       require( "includes/ZhConversion.php" );
+                       $this->mTables = array();
+                       $this->mTables['zh-cn'] = $zh2CN;
+                       $this->mTables['zh-tw'] = $zh2TW;
+                       $this->mTables['zh-sg'] = $zh2SG;
+                       $this->mTables['zh-hk'] = $zh2HK;
+                       if( is_object( $wgMessageCache ) ){
+                               foreach( array_keys( $this->mTables ) as $variant ) {
+                                       $custom = $this->parseCachedTable( $wgMessageCache->get( 'zhconversiontable/' . $variant, true, true, true ) );
+                                       // array union rather than array_merge(): array_merge()
+                                       // renumbers numeric keys, which would corrupt entries
+                                       // whose source text is a digit string. The custom table
+                                       // is the left operand so that its entries override the
+                                       // built-in ones.
+                                       $this->mTables[$variant] = $custom + $this->mTables[$variant];
+                               }
+                       }
+                       $wgMemc->set($this->mCacheKey, $this->mTables, 43200);
                }
-               // fallback to fake client
-               if($this->mZhClient == false)
-                       $this->mZhClient=new ZhClientFake();
        }
        
+       /*
+               parse a conversion table stored as message text
+
+               the table should be in the following format:
+
+                       -{
+                               word => word ;
+                               word => word ;
+                               ...
+                       }-
+
+               Only the text between the first -{ and the following }- is
+               read. '*' and '#' characters are removed before parsing, and
+               entries that are not of the form "a => b" or that have an
+               empty left-hand side are skipped.
+
+               returns an array mapping source text to replacement text;
+               an empty array if $txt contains no -{
+       */
+       function parseCachedTable($txt) {
+               /* $txt should be enclosed by -{ and }- */
+               $a = explode( '-{', $txt);
+               if( count($a) < 2)
+                       return array();
+               array_shift($a);
+               $b = explode( '}-', $a[0]);
+
+               $stripped = str_replace(array('*','#'), '', $b[0]);
+               $table = explode( ';', $stripped );
+               $ret = array();
+               foreach( $table as $t ) {
+                       $m = explode( '=>', $t );
+                       if( count( $m ) != 2)
+                               continue;
+                       $from = trim($m[0]);
+                       // an empty key would make strtr() fail for the whole
+                       // table, so such an entry must never get in
+                       if( $from === '' )
+                               continue;
+                       $ret[$from] = trim($m[1]);
+               }
+               return $ret;
+       }
+
        /* 
                get preferred language variants.
        */
@@ -74,24 +159,37 @@ class LanguageZh extends LanguageZh_cn {
        }
 
+       // convert $text to the given variant, or to the variant returned by
+       // getPreferredVariant() when $toVariant is false. Any variant other
+       // than zh-cn, zh-tw, zh-sg and zh-hk gives the text back unchanged.
        function autoConvert($text, $toVariant=false) {
+               $fname="LanguageZh::autoConvert";
+               wfProfileIn( $fname );
+
+               if(!$this->mTablesLoaded)
+                       $this->loadTables();
+
                if(!$toVariant) 
                        $toVariant = $this->getPreferredVariant();
-               if($toVariant == 'zh')
-                       return $text;
-               $fname="zhautoConvert";
-               wfProfileIn( $fname );
-               $t = $this->mZhClient->convert($text, $toVariant);
+               $ret = '';
+               // zh-sg is produced by applying the zh-cn table and then the
+               // zh-sg table; zh-hk by the zh-tw table and then the zh-hk table
+               switch( $toVariant ) {
+                       case 'zh-cn': $ret = strtr($text, $this->mTables['zh-cn']);break;
+                       case 'zh-tw': $ret = strtr($text, $this->mTables['zh-tw']);break;
+                       case 'zh-sg': $ret = strtr(strtr($text, $this->mTables['zh-cn']), $this->mTables['zh-sg']);break;
+                       case 'zh-hk': $ret = strtr(strtr($text, $this->mTables['zh-tw']), $this->mTables['zh-hk']);break;
+                       default: $ret = $text;
+               }
                wfProfileOut( $fname );
-               return $t;
+               return $ret;
        }
     
+       // convert $text to all four variants at once; returns an array
+       // keyed by 'zh-cn', 'zh-tw', 'zh-sg' and 'zh-hk'
        function autoConvertToAllVariants($text) {
-               $fname="zhautoConvertToAll";
+               $fname="LanguageZh::autoConvertToAllVariants";
                wfProfileIn( $fname );
-               $ret = $this->mZhClient->convertToAllVariants($text);
-               if($ret == false) {//fall back...
-                       $ret = ZhClientFake::autoConvertToAllVariants($text);
-               }
+               if( !$this->mTablesLoaded )
+                       $this->loadTables();
+
+               $ret = array();
+               $ret['zh-cn'] = strtr($text, $this->mTables['zh-cn']);
+               $ret['zh-tw'] = strtr($text, $this->mTables['zh-tw']);
+               // zh-sg and zh-hk are layered on the zh-cn and zh-tw results
+               $ret['zh-sg'] = strtr(strtr($text, $this->mTables['zh-cn']), $this->mTables['zh-sg']);
+               $ret['zh-hk'] = strtr(strtr($text, $this->mTables['zh-tw']), $this->mTables['zh-hk']);
                wfProfileOut( $fname );
                return $ret;
        }
@@ -201,16 +299,23 @@ class LanguageZh extends LanguageZh_cn {
                return false;
        }
 
-       // word segmentation through ZhClient
+       // word segmentation
        function stripForSearch( $string ) {
-               $fname="zhsegment";
+               $fname="LanguageZh::stripForSearch";
                wfProfileIn( $fname );
+
+               // eventually this should be a word segmentation
+               // for now just treat each character as a word
+               $t = preg_replace(
+                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
+                               "' ' .\"$1\"", $string);
+
         //always convert to zh-cn before indexing. it should be
                //better to use zh-cn for search, since conversion from 
                //Traditional to Simplified is less ambiguous than the
                //other way around
-               $t = $this->mZhClient->segment($string);
-        $t = $this->autoConvert($t, 'zh-cn');
+
+               $t = $this->autoConvert($t, 'zh-cn');
                $t = LanguageUtf8::stripForSearch( $t );
                wfProfileOut( $fname );
                return $t;