Merged my changes from REL1_4
[lhc/web/wiklou.git] / includes / ZhClient.php
index 8a6d8e2..fb0d826 100644 (file)
@@ -3,7 +3,6 @@
  * Client for querying zhdaemon
  *
  * @package MediaWiki
- * @version $Id$
  */
 
 class ZhClient {
@@ -35,7 +34,9 @@ class ZhClient {
         * @access private
         */
        function connect() {
+               wfSuppressWarnings();
                $this->mFP = fsockopen($this->mHost, $this->mPort, $errno, $errstr, 30);
+               wfRestoreWarnings();
                if(!$this->mFP) {
                        return false;
                }
@@ -68,6 +69,9 @@ class ZhClient {
                        $bytesread += strlen($str);
                        $data .= $str;
                }
+               //data should be of length $len. otherwise something is wrong
+               if(strlen($data) != $len)
+                       return false;
                return $data;
        }
 
@@ -88,6 +92,31 @@ class ZhClient {
                return $result;
        }
 
+       /**
+        * Convert the input to all possible variants
+        *
+        * The reply is parsed as "CODE LEN;CODE LEN;...|DATA": DATA holds the
+        * converted strings back to back, LEN bytes for each listed CODE.
+        *
+        * @param string $text input text
+        * @return mixed array langcode => converted_string, or false on failure
+        * @access public
+        */
+       function convertToAllVariants($text) {
+               $len = strlen($text);
+               $result = $this->query("CONV ALL $len\n$text");
+               if(!$result || strpos($result, '|') === false)
+                       return false; // no reply, or reply lacks the info/data separator
+               list($infoline, $data) = explode('|', $result, 2);
+               $ret = array();
+               $offset = 0; // byte offset of the current variant inside $data
+               foreach(explode(';', $infoline) as $variant) {
+                       list($code, $variantLen) = explode(' ', $variant);
+                       $ret[strtolower($code)] = substr($data, $offset, $variantLen);
+                       $offset += $variantLen;
+               }
+               return $ret;
+       }
        /**
         * Perform word segmentation
         *
@@ -99,8 +128,9 @@ class ZhClient {
                $len = strlen($text);
                $q = "SEG $len\n$text";
                $result = $this->query($q);
-               if(!$result)
-                       $result = $text;
+               if(!$result) {// fallback to character based segmentation
+                       $result = ZhClientFake::segment($text);
+               }
                return $result;
        }
 
@@ -116,23 +146,22 @@ class ZhClient {
 
 
 class ZhClientFake {
-
        function ZhClientFake() {
                global $wgMemc, $wgDBname;
-               $this->zh2TW = $wgMemc->get($key1 = "$wgDBname:zhConvert:tw");
-               $this->zh2CN = $wgMemc->get($key2 = "$wgDBname:zhConvert:cn");
-               $this->zh2SG = $wgMemc->get($key3 = "$wgDBname:zhConvert:sg");
-               $this->zh2HK = $wgMemc->get($key4 = "$wgDBname:zhConvert:hk");
-               if(empty($this->zh2TW) || empty($this->zh2CN) || empty($this->zh2SG) || empty($this->zh2HK)) {
-                       require_once("includes/ZhConversion.php");
-                       $this->zh2TW = $zh2TW;
-                       $this->zh2CN = $zh2CN;
-                       $this->zh2HK = $zh2HK;
-                       $this->zh2SG = $zh2SG;
-                       $wgMemc->set($key1, $this->zh2TW);
-                       $wgMemc->set($key2, $this->zh2CN);
-                       $wgMemc->set($key3, $this->zh2SG);
-                       $wgMemc->set($key4, $this->zh2HK);
+               $this->mZh2TW = $wgMemc->get($key1 = "$wgDBname:zhConvert:tw"); // try $wgMemc first; keys are per-database
+               $this->mZh2CN = $wgMemc->get($key2 = "$wgDBname:zhConvert:cn");
+               $this->mZh2SG = $wgMemc->get($key3 = "$wgDBname:zhConvert:sg");
+               $this->mZh2HK = $wgMemc->get($key4 = "$wgDBname:zhConvert:hk");
+               if(empty($this->mZh2TW) || empty($this->mZh2CN) || empty($this->mZh2SG) || empty($this->mZh2HK)) { // any table missing or empty: reload all four
+                       require("includes/ZhConversion.php"); // NOTE(review): presumably defines $zh2TW, $zh2CN, $zh2HK and $zh2SG in this scope - confirm
+                       $this->mZh2TW = $zh2TW;
+                       $this->mZh2CN = $zh2CN;
+                       $this->mZh2HK = $zh2HK;
+                       $this->mZh2SG = $zh2SG;
+                       $wgMemc->set($key1, $this->mZh2TW); // store the freshly loaded tables back under the same keys
+                       $wgMemc->set($key2, $this->mZh2CN);
+                       $wgMemc->set($key3, $this->mZh2SG);
+                       $wgMemc->set($key4, $this->mZh2HK);
                }
        }
 
@@ -146,7 +175,7 @@ class ZhClientFake {
         * @access private
         */
        function zh2tw($text) {
-               return strtr($text, $this->zh2TW);
+               return strtr($text, $this->mZh2TW);
        }
 
        /**
@@ -155,7 +184,7 @@ class ZhClientFake {
         * @access private
         */
        function zh2cn($text) {
-               return strtr($text, $this->zh2CN);
+               return strtr($text, $this->mZh2CN);
        }
 
        /**
@@ -164,7 +193,7 @@ class ZhClientFake {
         * @access private
         */
        function zh2sg($text) {
-               return strtr(strtr($text, $this->zh2CN), $this->zh2SG);
+               return strtr(strtr($text, $this->mZh2CN), $this->mZh2SG);
        }
 
        /**
@@ -173,7 +202,7 @@ class ZhClientFake {
         * @access private
         */
        function zh2hk($text) {
-               return strtr(strtr($text, $this->zh2TW), $this->zh2HK);
+               return strtr(strtr($text, $this->mZh2TW), $this->mZh2HK);
        }
 
        /**
@@ -205,6 +234,15 @@ class ZhClientFake {
                return $t;
        }
 
+       function convertToAllVariants($text) { // returns langcode => converted text
+               return array(
+                       'zh-cn' => $this->zh2cn($text),
+                       'zh-tw' => $this->zh2tw($text),
+                       'zh-sg' => $this->zh2sg($text),
+                       'zh-hk' => $this->zh2hk($text)
+               );
+       }
+
        /**
         * Perform "fake" word segmentation, i.e. treating each character as a word
         *
@@ -213,18 +251,21 @@ class ZhClientFake {
         * @access public
         */
        function segment($text) {
-               /* copied from LanguageZh_cn.stripForSearch() */
+               /* Adapted from LanguageZh_cn::stripForSearch(): put a space in front of
+                       every non-ASCII character, lowercasing as it goes; the caller
+                       converts the result to hex. NOTE(review): relies on the preg /e
+                       (eval) modifier, which PHP 5.5 deprecated and PHP 7.0 removed. */
                if( function_exists( 'mb_strtolower' ) ) {
                        return preg_replace(
                                "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-                               "' U8' . bin2hex( \"$1\" )",
-                               mb_strtolower( $string ) );
+                               "' ' .\"$1\"",
+                               mb_strtolower( $text ) );
                } else {
                        global $wikiLowerChars;
                        return preg_replace(
                                "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-                               "' U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",
-                               $string );
+                               "' ' . strtr( \"\$1\", \$wikiLowerChars )",
+                               $text );
                }
        }