From 12c4d6f9776a392bfcfe720570bdba086735b4d4 Mon Sep 17 00:00:00 2001 From: Zheng Zhu Date: Wed, 29 Dec 2004 01:07:43 +0000 Subject: [PATCH] Allow zh: users to customize the Traditional/Simplified conversion tables. From REL1_4. --- includes/Article.php | 8 ++ includes/MessageCache.php | 4 +- includes/ZhClient.php | 133 -------------------------------- languages/Language.php | 3 + languages/LanguageZh.php | 157 +++++++++++++++++++++++++++++++------- 5 files changed, 144 insertions(+), 161 deletions(-) diff --git a/includes/Article.php b/includes/Article.php index ff0be1d848..2c5dc57fc3 100644 --- a/includes/Article.php +++ b/includes/Article.php @@ -45,6 +45,14 @@ class Article { $this->clear(); } + /** + * get the title object of the article + * @public + */ + function getTitle() { + return $this->mTitle; + } + /** * Clear the object * @private diff --git a/includes/MessageCache.php b/includes/MessageCache.php index b464ae049a..a099a0429f 100755 --- a/includes/MessageCache.php +++ b/includes/MessageCache.php @@ -215,7 +215,7 @@ class MessageCache $this->mMemc->delete( $lockKey ); } - function get( $key, $useDB, $forcontent=true ) { + function get( $key, $useDB, $forcontent=true, $isfullkey = false ) { global $wgContLanguageCode; if( $forcontent ) { global $wgContLang; @@ -238,7 +238,7 @@ class MessageCache $message = false; if( !$this->mDisable && $useDB ) { $title = $lang->ucfirst( $key ); - if( $langcode != $wgContLanguageCode ) { + if(!$isfullkey && ($langcode != $wgContLanguageCode) ) { $title .= '/' . 
$langcode; } diff --git a/includes/ZhClient.php b/includes/ZhClient.php index fb0d826123..4f1524f621 100644 --- a/includes/ZhClient.php +++ b/includes/ZhClient.php @@ -144,137 +144,4 @@ class ZhClient { } } - -class ZhClientFake { - function ZhClientFake() { - global $wgMemc, $wgDBname; - $this->mZh2TW = $wgMemc->get($key1 = "$wgDBname:zhConvert:tw"); - $this->mZh2CN = $wgMemc->get($key2 = "$wgDBname:zhConvert:cn"); - $this->mZh2SG = $wgMemc->get($key3 = "$wgDBname:zhConvert:sg"); - $this->mZh2HK = $wgMemc->get($key4 = "$wgDBname:zhConvert:hk"); - if(empty($this->mZh2TW) || empty($this->mZh2CN) || empty($this->mZh2SG) || empty($this->mZh2HK)) { - require("includes/ZhConversion.php"); - $this->mZh2TW = $zh2TW; - $this->mZh2CN = $zh2CN; - $this->mZh2HK = $zh2HK; - $this->mZh2SG = $zh2SG; - $wgMemc->set($key1, $this->mZh2TW); - $wgMemc->set($key2, $this->mZh2CN); - $wgMemc->set($key3, $this->mZh2SG); - $wgMemc->set($key4, $this->mZh2HK); - } - } - - function isconnected() { - return true; - } - - /** - * Convert to zh-tw - * - * @access private - */ - function zh2tw($text) { - return strtr($text, $this->mZh2TW); - } - - /** - * Convert to zh-cn - * - * @access private - */ - function zh2cn($text) { - return strtr($text, $this->mZh2CN); - } - - /** - * Convert to zh-sg - * - * @access private - */ - function zh2sg($text) { - return strtr(strtr($text, $this->mZh2CN), $this->mZh2SG); - } - - /** - * Convert to zh-hk - * - * @access private - */ - function zh2hk($text) { - return strtr(strtr($text, $this->mZh2TW), $this->mZh2HK); - } - - /** - * Convert the input to a different language variant - * - * @param string $text input text - * @param string $tolang language variant - * @return string the converted text - * @access public - */ - function convert($text, $tolang) { - $t = ''; - switch($tolang) { - case 'zh-cn': - $t = $this->zh2cn($text); - break; - case 'zh-tw': - $t = $this->zh2tw($text); - break; - case 'zh-sg': - $t = $this->zh2sg($text); - break; - case 
'zh-hk': - $t = $this->zh2hk($text); - break; - default: - $t = $text; - } - return $t; - } - - function convertToAllVariants($text) { - $ret = array(); - $ret['zh-cn'] = $this->zh2cn($text); - $ret['zh-tw'] = $this->zh2tw($text); - $ret['zh-sg'] = $this->zh2sg($text); - $ret['zh-hk'] = $this->zh2hk($text); - return $ret; - } - - /** - * Perform "fake" word segmentation, i.e. treating each character as a word - * - * @param string $text input text - * @return string segmented text - * @access public - */ - function segment($text) { - /* adapted from LanguageZh_cn::stripForSearch() - here we will first separate the single characters, - and let the caller conver it to hex - */ - if( function_exists( 'mb_strtolower' ) ) { - return preg_replace( - "/([\\xc0-\\xff][\\x80-\\xbf]*)/e", - "' ' .\"$1\"", - mb_strtolower( $text ) ); - } else { - global $wikiLowerChars; - return preg_replace( - "/([\\xc0-\\xff][\\x80-\\xbf]*)/e", - "' ' . strtr( \"\$1\", \$wikiLowerChars )", - $text ); - } - } - - /** - * Close the fake connection - * - * @access public - */ - function close() { } -} - ?> \ No newline at end of file diff --git a/languages/Language.php b/languages/Language.php index 51383ab8eb..0101538285 100644 --- a/languages/Language.php +++ b/languages/Language.php @@ -1751,6 +1751,9 @@ ta[\'ca-nstab-category\'] = new Array(\'c\',\'View the category page\'); 'variantname-zh-sg' => 'sg', 'variantname-zh' => 'zh', +# Chinese conversion table +'zhconversiontable' => '-{}-', + # labels for User: and Title: on Special:Log pages 'specialloguserlabel' => 'User: ', 'speciallogtitlelabel' => 'Title: ', diff --git a/languages/LanguageZh.php b/languages/LanguageZh.php index bac6d1eaba..a48e62e42f 100644 --- a/languages/LanguageZh.php +++ b/languages/LanguageZh.php @@ -1,10 +1,32 @@ getTitle(); + if($titleobj->getNamespace() == NS_MEDIAWIKI) { + global $wgContLang; // should be an LanguageZh. 
+ if(get_class($wgContLang) != 'languagezh') + return true; + + $title = $titleobj->getDBkey(); + $t = explode('/', $title, 2); + if( $t[0] == 'Zhconversiontable' ) { + if(!in_array($t[1], array('zh-cn', 'zh-tw', 'zh-sg', 'zh-hk'))) + return true; + $wgContLang->reloadTables(); + } + } +} + +$wgHooks['ArticleSaveComplete'][] = 'zhOnArticleSaveComplete'; + /* class that handles both Traditional and Simplified Chinese right now it only distinguish zh_cn and zh_tw (actuall, zh_cn and non-zh_cn), will add support for zh_sg, zh_hk, etc, later. @@ -12,21 +34,84 @@ require_once( "LanguageZh_hk.php"); class LanguageZh extends LanguageZh_cn { var $mZhLanguageCode=false; - var $mZhClient=false; + var $mTables=false; //the mapping tables + var $mTablesLoaded = false; + var $mCacheKey; function LanguageZh() { - global $wgUseZhdaemon, $wgZhdaemonHost, $wgZhdaemonPort; - global $wgDisableLangConversion, $wgUser; + global $wgDBname; + $this->mCacheKey = $wgDBname . ":zhtables"; + } + + function reloadTables() { + global $wgMemc; + $wgMemc->delete($this->mCacheKey); + $this->mTablesLoaded=false; + $this->loadTables(); + } - if($wgUseZhdaemon) { - $this->mZhClient=new ZhClient($wgZhdaemonHost, $wgZhdaemonPort); - if(!$this->mZhClient->isconnected()) - $this->mZhClient = false; + // load conversion tables either from the cache or the disk + function loadTables() { + global $wgMemc; + if( $this->mTablesLoaded ) + return; + $this->mTablesLoaded = true; + $this->mTables = $wgMemc->get( $this->mCacheKey ); + if( empty( $this->mTables ) ) { + global $wgMessageCache; + require( "includes/ZhConversion.php" ); + $this->mTables = array(); + $this->mTables['zh-cn'] = $zh2CN; + $this->mTables['zh-tw'] = $zh2TW; + $this->mTables['zh-sg'] = $zh2SG; + $this->mTables['zh-hk'] = $zh2HK; + if( is_object( $wgMessageCache ) ){ + $cached = $this->parseCachedTable( $wgMessageCache->get( 'zhconversiontable/zh-cn', true, true, true ) ); + $this->mTables['zh-cn'] = array_merge($this->mTables['zh-cn'], 
$cached); + + $cached = $this->parseCachedTable( $wgMessageCache->get( 'zhconversiontable/zh-tw', true, true, true ) ); + $this->mTables['zh-tw'] = array_merge($this->mTables['zh-tw'], $cached); + + $cached = $this->parseCachedTable( $wgMessageCache->get( 'zhconversiontable/zh-sg', true, true, true ) ); + $this->mTables['zh-sg'] = array_merge($this->mTables['zh-sg'], $cached); + + $cached = $this->parseCachedTable( $wgMessageCache->get( 'zhconversiontable/zh-hk', true, true, true ) ); + $this->mTables['zh-hk'] = array_merge($this->mTables['zh-hk'], $cached); + } + $wgMemc->set($this->mCacheKey, $this->mTables, 43200); } - // fallback to fake client - if($this->mZhClient == false) - $this->mZhClient=new ZhClientFake(); } + /* + parse the conversion table stored in the cache + + the table should be in the following format: + + -{ + word => word ; + word => word ; + ... + }- + */ + function parseCachedTable($txt) { + /* $txt should be enclosed by -{ and }- */ + $a = explode( '-{', $txt); + if( count($a) < 2) + return array(); + array_shift($a); + $b = explode( '}-', $a[0]); + + $stripped = str_replace(array('*','#'), '', $b[0]); + $table = explode( ';', $stripped ); + $ret = array(); + foreach( $table as $t ) { + $m = explode( '=>', $t ); + if( count( $m ) != 2) + continue; + $ret[trim($m[0])] = trim($m[1]); + } + return $ret; + } + /* get preferred language variants. 
*/ @@ -74,24 +159,37 @@ class LanguageZh extends LanguageZh_cn { } function autoConvert($text, $toVariant=false) { + $fname="LanguageZh::autoConvert"; + wfProfileIn( $fname ); + + if(!$this->mTablesLoaded) + $this->loadTables(); + if(!$toVariant) $toVariant = $this->getPreferredVariant(); - if($toVariant == 'zh') - return $text; - $fname="zhautoConvert"; - wfProfileIn( $fname ); - $t = $this->mZhClient->convert($text, $toVariant); + $ret = ''; + switch( $toVariant ) { + case 'zh-cn': $ret = strtr($text, $this->mTables['zh-cn']);break; + case 'zh-tw': $ret = strtr($text, $this->mTables['zh-tw']);break; + case 'zh-sg': $ret = strtr(strtr($text, $this->mTables['zh-cn']), $this->mTables['zh-sg']);break; + case 'zh-hk': $ret = strtr(strtr($text, $this->mTables['zh-tw']), $this->mTables['zh-hk']);break; + default: $ret = $text; + } wfProfileOut( $fname ); - return $t; + return $ret; } function autoConvertToAllVariants($text) { - $fname="zhautoConvertToAll"; + $fname="LanguageZh::autoConvertToAllVariants"; wfProfileIn( $fname ); - $ret = $this->mZhClient->convertToAllVariants($text); - if($ret == false) {//fall back... 
- $ret = ZhClientFake::autoConvertToAllVariants($text); - } + if( !$this->mTablesLoaded ) + $this->loadTables(); + + $ret = array(); + $ret['zh-cn'] = strtr($text, $this->mTables['zh-cn']); + $ret['zh-tw'] = strtr($text, $this->mTables['zh-tw']); + $ret['zh-sg'] = strtr(strtr($text, $this->mTables['zh-cn']), $this->mTables['zh-sg']); + $ret['zh-hk'] = strtr(strtr($text, $this->mTables['zh-tw']), $this->mTables['zh-hk']); wfProfileOut( $fname ); return $ret; } @@ -201,16 +299,23 @@ class LanguageZh extends LanguageZh_cn { return false; } - // word segmentation through ZhClient + // word segmentation function stripForSearch( $string ) { - $fname="zhsegment"; + $fname="LanguageZh::stripForSearch"; wfProfileIn( $fname ); + + // eventually this should be a word segmentation + // for now just treat each character as a word + $t = preg_replace( + "/([\\xc0-\\xff][\\x80-\\xbf]*)/e", + "' ' .\"$1\"", $string); + //always convert to zh-cn before indexing. it should be //better to use zh-cn for search, since conversion from //Traditional to Simplified is less ambiguous than the //other way around - $t = $this->mZhClient->segment($string); - $t = $this->autoConvert($t, 'zh-cn'); + + $t = $this->autoConvert($t, 'zh-cn'); $t = LanguageUtf8::stripForSearch( $t ); wfProfileOut( $fname ); return $t; -- 2.20.1