* Client for querying zhdaemon
*
* @package MediaWiki
- * @version $Id$
*/
class ZhClient {
* @access private
*/
function connect() {
+ wfSuppressWarnings();
$this->mFP = fsockopen($this->mHost, $this->mPort, $errno, $errstr, 30);
+ wfRestoreWarnings();
if(!$this->mFP) {
return false;
}
$bytesread += strlen($str);
$data .= $str;
}
+ //data should be of length $len. otherwise something is wrong
+ if(strlen($data) != $len)
+ return false;
return $data;
}
return $result;
}
+ /**
+ * Convert the input to all possible variants
+ *
+ * Sends a "CONV ALL" request and splits the reply, which is parsed as
+ * "<code> <length>;<code> <length>;...|<data>": each entry before the
+ * first '|' names a variant and how many bytes of <data>, taken in
+ * order, belong to it.
+ *
+ * @param string $text input text
+ * @return mixed array of lowercased langcode => converted_string, or
+ *         false if the query fails or the reply is malformed
+ * @access public
+ */
+ function convertToAllVariants($text) {
+ 	$len = strlen($text);
+ 	$q = "CONV ALL $len\n$text";
+ 	$result = $this->query($q);
+ 	if(!$result)
+ 		return false;
+ 	// without the '|' separator there is no data part to slice
+ 	if(strpos($result, '|') === false)
+ 		return false;
+ 	list($infoline, $data) = explode('|', $result, 2);
+ 	$ret = array();
+ 	$offset = 0;
+ 	foreach(explode(';', $infoline) as $variant) {
+ 		// tolerate empty entries, e.g. from a trailing ';'
+ 		if($variant === '')
+ 			continue;
+ 		$parts = explode(' ', $variant);
+ 		// an entry without a length cannot be sliced reliably
+ 		if(count($parts) < 2)
+ 			return false;
+ 		$code = strtolower($parts[0]);
+ 		$size = intval($parts[1]);
+ 		$ret[$code] = substr($data, $offset, $size);
+ 		$offset += $size;
+ 	}
+ 	return $ret;
+ }
/**
* Perform word segmentation
*
$len = strlen($text);
$q = "SEG $len\n$text";
$result = $this->query($q);
- if(!$result)
- $result = $text;
+ if(!$result) {// fallback to character based segmentation
+ $result = ZhClientFake::segment($text);
+ }
return $result;
}
class ZhClientFake {
-
function ZhClientFake() {
global $wgMemc, $wgDBname;
- $this->zh2TW = $wgMemc->get($key1 = "$wgDBname:zhConvert:tw");
- $this->zh2CN = $wgMemc->get($key2 = "$wgDBname:zhConvert:cn");
- $this->zh2SG = $wgMemc->get($key3 = "$wgDBname:zhConvert:sg");
- $this->zh2HK = $wgMemc->get($key4 = "$wgDBname:zhConvert:hk");
- if(empty($this->zh2TW) || empty($this->zh2CN) || empty($this->zh2SG) || empty($this->zh2HK)) {
- require_once("includes/ZhConversion.php");
- $this->zh2TW = $zh2TW;
- $this->zh2CN = $zh2CN;
- $this->zh2HK = $zh2HK;
- $this->zh2SG = $zh2SG;
- $wgMemc->set($key1, $this->zh2TW);
- $wgMemc->set($key2, $this->zh2CN);
- $wgMemc->set($key3, $this->zh2SG);
- $wgMemc->set($key4, $this->zh2HK);
+ // $wgMemc keys of the four conversion tables, prefixed with the DB name
+ $key1 = "$wgDBname:zhConvert:tw";
+ $key2 = "$wgDBname:zhConvert:cn";
+ $key3 = "$wgDBname:zhConvert:sg";
+ $key4 = "$wgDBname:zhConvert:hk";
+ $this->mZh2TW = $wgMemc->get($key1);
+ $this->mZh2CN = $wgMemc->get($key2);
+ $this->mZh2SG = $wgMemc->get($key3);
+ $this->mZh2HK = $wgMemc->get($key4);
+ $tableMissing = empty($this->mZh2TW) || empty($this->mZh2CN)
+ || empty($this->mZh2SG) || empty($this->mZh2HK);
+ if($tableMissing) {
+ // at least one table was not returned by $wgMemc: take all four from
+ // the included file (presumably it defines $zh2TW etc.) and store them back
+ require("includes/ZhConversion.php");
+ $this->mZh2TW = $zh2TW;
+ $this->mZh2CN = $zh2CN;
+ $this->mZh2HK = $zh2HK;
+ $this->mZh2SG = $zh2SG;
+ $wgMemc->set($key1, $this->mZh2TW);
+ $wgMemc->set($key2, $this->mZh2CN);
+ $wgMemc->set($key3, $this->mZh2SG);
+ $wgMemc->set($key4, $this->mZh2HK);
}
}
* @access private
*/
function zh2tw($text) {
- return strtr($text, $this->zh2TW);
+ // substitute using the TW conversion table
+ $table = $this->mZh2TW;
+ return strtr($text, $table);
}
/**
* @access private
*/
function zh2cn($text) {
- return strtr($text, $this->zh2CN);
+ // substitute using the CN conversion table
+ $table = $this->mZh2CN;
+ return strtr($text, $table);
}
/**
* @access private
*/
function zh2sg($text) {
- return strtr(strtr($text, $this->zh2CN), $this->zh2SG);
+ // two passes: the CN table first, then the SG table on that result
+ $viaCN = strtr($text, $this->mZh2CN);
+ return strtr($viaCN, $this->mZh2SG);
}
/**
* @access private
*/
function zh2hk($text) {
- return strtr(strtr($text, $this->zh2TW), $this->zh2HK);
+ // two passes: the TW table first, then the HK table on that result
+ $viaTW = strtr($text, $this->mZh2TW);
+ return strtr($viaTW, $this->mZh2HK);
}
/**
return $t;
}
+ /**
+ * Convert the input to every variant this class has a converter for
+ *
+ * @param string $text input text
+ * @return array converted strings keyed by 'zh-cn', 'zh-tw', 'zh-sg', 'zh-hk'
+ * @access public
+ */
+ function convertToAllVariants($text) {
+ 	return array(
+ 		'zh-cn' => $this->zh2cn($text),
+ 		'zh-tw' => $this->zh2tw($text),
+ 		'zh-sg' => $this->zh2sg($text),
+ 		'zh-hk' => $this->zh2hk($text)
+ 	);
+ }
+
/**
* Perform "fake" word segmentation, i.e. treating each character as a word
*
* @access public
*/
function segment($text) {
- /* copied from LanguageZh_cn.stripForSearch() */
+ /* adapted from LanguageZh_cn::stripForSearch()
+ here we first separate the individual characters: every byte
+ sequence matching [\xc0-\xff][\x80-\xbf]* gets a space put in
+ front of it, and the caller is left to convert it to hex.
+ The whole text is lowercased with mb_strtolower() when that
+ function exists; otherwise only the matched sequences are passed
+ through strtr() with $wikiLowerChars.
+ NOTE(review): both patterns use the /e modifier, which evaluates
+ the replacement as PHP code; it is deprecated since PHP 5.5 and
+ removed in PHP 7, where preg_replace_callback() is needed instead.
+ */
if( function_exists( 'mb_strtolower' ) ) {
return preg_replace(
"/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
- "' U8' . bin2hex( \"$1\" )",
- mb_strtolower( $string ) );
+ "' ' .\"$1\"",
+ mb_strtolower( $text ) );
} else {
global $wikiLowerChars;
return preg_replace(
"/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
- "' U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",
- $string );
+ "' ' . strtr( \"\$1\", \$wikiLowerChars )",
+ $text );
}
}