Merged my changes from REL1_4

[lhc/web/wiklou.git] / includes / ZhClient.php
diff --git a/includes/ZhClient.php b/includes/ZhClient.php

index 8a6d8e2..fb0d826 100644 (file)
--- a/includes/ZhClient.php
+++ b/includes/ZhClient.php
@@ -3,7 +3,6 @@
   * Client for querying zhdaemon
   *
   * @package MediaWiki
- * @version $Id$
   */
  
  class ZhClient {
@@ -35,7 +34,9 @@ class ZhClient {
          * @access private
          */
         function connect() {
+               wfSuppressWarnings();
                 $this->mFP = fsockopen($this->mHost, $this->mPort, $errno, $errstr, 30);
+               wfRestoreWarnings();
                 if(!$this->mFP) {
                         return false;
                 }
@@ -68,6 +69,9 @@ class ZhClient {
                         $bytesread += strlen($str);
                         $data .= $str;
                 }
+               // data should be of length $len; otherwise something is wrong
+               if(strlen($data) != $len)
+                       return false;
                 return $data;
         }
  
@@ -88,6 +92,31 @@ class ZhClient {
                 return $result;
         }
  
+       /**
+        * Convert the input to all possible variants 
+        *
+        * @param string $text input text
+        * @return array langcode => converted_string
+        * @access public
+        */     
+       function convertToAllVariants($text) {
+               $len = strlen($text);
+               $q = "CONV ALL $len\n$text";
+               $result = $this->query($q);
+               if(!$result)
+                       return false;
+               list($infoline, $data) = explode('|', $result, 2);
+               $info = explode(";", $infoline);
+               $ret = array();
+               $i=0;
+               foreach($info as $variant) {
+                       list($code, $len) = explode(' ', $variant);
+                       $ret[strtolower($code)] = substr($data, $i, $len);
+                       $r = $ret[strtolower($code)];
+                       $i+=$len;
+               }
+               return $ret;
+    }
         /**
          * Perform word segmentation
          *
@@ -99,8 +128,9 @@ class ZhClient {
                 $len = strlen($text);
                 $q = "SEG $len\n$text";
                 $result = $this->query($q);
-               if(!$result)
-                       $result = $text;
+               if(!$result) {// fallback to character based segmentation
+                       $result = ZhClientFake::segment($text);
+               }
                 return $result;
         }
  
@@ -116,23 +146,22 @@ class ZhClient {
  
  
  class ZhClientFake {
-
         function ZhClientFake() {
                 global $wgMemc, $wgDBname;
-               $this->zh2TW = $wgMemc->get($key1 = "$wgDBname:zhConvert:tw");
-               $this->zh2CN = $wgMemc->get($key2 = "$wgDBname:zhConvert:cn");
-               $this->zh2SG = $wgMemc->get($key3 = "$wgDBname:zhConvert:sg");
-               $this->zh2HK = $wgMemc->get($key4 = "$wgDBname:zhConvert:hk");
-               if(empty($this->zh2TW) || empty($this->zh2CN) || empty($this->zh2SG) || empty($this->zh2HK)) {
-                       require_once("includes/ZhConversion.php");
-                       $this->zh2TW = $zh2TW;
-                       $this->zh2CN = $zh2CN;
-                       $this->zh2HK = $zh2HK;
-                       $this->zh2SG = $zh2SG;
-                       $wgMemc->set($key1, $this->zh2TW);
-                       $wgMemc->set($key2, $this->zh2CN);
-                       $wgMemc->set($key3, $this->zh2SG);
-                       $wgMemc->set($key4, $this->zh2HK);
+               $this->mZh2TW = $wgMemc->get($key1 = "$wgDBname:zhConvert:tw");
+               $this->mZh2CN = $wgMemc->get($key2 = "$wgDBname:zhConvert:cn");
+               $this->mZh2SG = $wgMemc->get($key3 = "$wgDBname:zhConvert:sg");
+               $this->mZh2HK = $wgMemc->get($key4 = "$wgDBname:zhConvert:hk");
+               if(empty($this->mZh2TW) || empty($this->mZh2CN) || empty($this->mZh2SG) || empty($this->mZh2HK)) {
+                       require("includes/ZhConversion.php");
+                       $this->mZh2TW = $zh2TW;
+                       $this->mZh2CN = $zh2CN;
+                       $this->mZh2HK = $zh2HK;
+                       $this->mZh2SG = $zh2SG;
+                       $wgMemc->set($key1, $this->mZh2TW);
+                       $wgMemc->set($key2, $this->mZh2CN);
+                       $wgMemc->set($key3, $this->mZh2SG);
+                       $wgMemc->set($key4, $this->mZh2HK);
                 }
         }
  
@@ -146,7 +175,7 @@ class ZhClientFake {
          * @access private
          */
         function zh2tw($text) {
-               return strtr($text, $this->zh2TW);
+               return strtr($text, $this->mZh2TW);
         }
  
         /**
@@ -155,7 +184,7 @@ class ZhClientFake {
          * @access private
          */
         function zh2cn($text) {
-               return strtr($text, $this->zh2CN);
+               return strtr($text, $this->mZh2CN);
         }
  
         /**
@@ -164,7 +193,7 @@ class ZhClientFake {
          * @access private
          */
         function zh2sg($text) {
-               return strtr(strtr($text, $this->zh2CN), $this->zh2SG);
+               return strtr(strtr($text, $this->mZh2CN), $this->mZh2SG);
         }
  
         /**
@@ -173,7 +202,7 @@ class ZhClientFake {
          * @access private
          */
         function zh2hk($text) {
-               return strtr(strtr($text, $this->zh2TW), $this->zh2HK);
+               return strtr(strtr($text, $this->mZh2TW), $this->mZh2HK);
         }
  
         /**
@@ -205,6 +234,15 @@ class ZhClientFake {
                 return $t;
         }
  
+       function convertToAllVariants($text) {
+               $ret = array();
+               $ret['zh-cn'] = $this->zh2cn($text);
+               $ret['zh-tw'] = $this->zh2tw($text);
+               $ret['zh-sg'] = $this->zh2sg($text);
+               $ret['zh-hk'] = $this->zh2hk($text);
+               return $ret;
+       }
+
         /**
          * Perform "fake" word segmentation, i.e. treating each character as a word
          *
@@ -213,18 +251,21 @@ class ZhClientFake {
          * @access public
          */
         function segment($text) {
-               /* copied from LanguageZh_cn.stripForSearch() */
+               /* adapted from LanguageZh_cn::stripForSearch()
+                       here we will first separate the single characters,
+                       and let the caller convert it to hex
+        */
                 if( function_exists( 'mb_strtolower' ) ) {
                         return preg_replace(
                                 "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-                               "' U8' . bin2hex( \"$1\" )",
-                               mb_strtolower( $string ) );
+                               "' ' .\"$1\"",
+                               mb_strtolower( $text ) );
                 } else {
                         global $wikiLowerChars;
                         return preg_replace(
                                 "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-                               "' U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",
-                               $string );
+                               "' ' . strtr( \"\$1\", \$wikiLowerChars )",
+                               $text );
                 }
         }