Follow-up r61856
authorMark A. Hershberger <mah@users.mediawiki.org>
Wed, 10 Mar 2010 21:54:23 +0000 (21:54 +0000)
committerMark A. Hershberger <mah@users.mediawiki.org>
Wed, 10 Mar 2010 21:54:23 +0000 (21:54 +0000)
* Rename wordSegmentation() to segmentByWord().
* Consolidate search index locking and iteration to Maintenance.php
* Add maintenance/updateDoubleWidthSearch.php to take care of new
  format for normalized double-width roman characters.
* Add error checking to updateSearchIndex.php for creating $posFile.
* Add note to UPGRADE about running updateDoubleWidthSearch.php.

UPGRADE
languages/Language.php
languages/classes/LanguageJa.php
languages/classes/LanguageYue.php
languages/classes/LanguageZh_hans.php
maintenance/Maintenance.php
maintenance/updateDoubleWidthSearch.php [new file with mode: 0644]
maintenance/updateSearchIndex.php

diff --git a/UPGRADE b/UPGRADE
index 44acc5c..cac08cd 100644 (file)
--- a/UPGRADE
+++ b/UPGRADE
@@ -53,11 +53,19 @@ deleted file archives, and any custom skins.
 You will need to have $wgDBadminuser and $wgDBadminpass set in your
 LocalSettings.php, see there for more info.
 
-From the command line, browse to the "maintenance" directory and run the 
+From the command line, browse to the "maintenance" directory and run the
 update.php script to check and update the schema. This will insert missing
 tables, update existing tables, and move data around as needed. In most cases,
 this is successful and nothing further needs to be done.
 
+If you have a Chinese or Japanese wiki ($wgLanguageCode is set to one
+of "zh", "ja", or "yue") and you are using MySQL fulltext search, you
+will probably want to update the search index.
+
+In the "maintenance" directory, run the updateDoubleWidthSearch.php
+script.  This will update the searchindex table for those pages that
+contain double-width (full-width) Latin characters.
+
 === Check configuration settings ===
 
 The names of configuration variables, and their default values and purposes,
@@ -67,6 +75,7 @@ notes to check for configuration changes which would alter the expected
 behaviour of MediaWiki.
 
 === Check installed extensions ===
+
 In MediaWiki 1.14 some extensions are migrated into the core. Please see the
 HISTORY section "Migrated extensions" and disable these extensions in your
 LocalSettings.php
index bd44f5c..39ee1ce 100644 (file)
@@ -1695,7 +1695,7 @@ class Language {
         * @param $string String
         * @return String
         */
-       function wordSegmentation( $string ) {
+       function segmentByWord( $string ) {
                return $string;
        }
 
index 4a24260..e56d230 100644 (file)
@@ -6,7 +6,7 @@
  * @ingroup Language
  */
 class LanguageJa extends Language {
-       function wordSegmentation( $string ) {
+       function segmentByWord( $string ) {
                // Strip known punctuation ?
                // $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
 
index 6581d78..f988548 100644 (file)
@@ -12,7 +12,7 @@ class LanguageYue extends Language {
         * for now just treat each character as a word.
         * @todo Fixme: only do this for Han characters...
         */
-       function wordSegmentation( $string ) {
+       function segmentByWord( $string ) {
                $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
                $s = self::insertSpace( $string, $reg );
                return $s;
index 20d3415..4b20e62 100644 (file)
@@ -13,7 +13,7 @@ class LanguageZh_hans extends Language {
         * for now just treat each character as a word.
         * @todo Fixme: only do this for Han characters...
         */
-       function wordSegmentation( $string ) {
+       function segmentByWord( $string ) {
                $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
                $s = self::insertSpace( $string, $reg );
                return $s;
@@ -25,7 +25,7 @@ class LanguageZh_hans extends Language {
                // Double-width roman characters
                $s = self::convertDoubleWidth( $string );
                $s = trim( $s );
-               $s = self::wordSegmentation( $s );
+               $s = self::segmentByWord( $s );
                $s = parent::normalizeForSearch( $s );
 
                wfProfileOut( __METHOD__ );
index 1656a61..45867b1 100644 (file)
@@ -844,4 +844,91 @@ abstract class Maintenance {
                }
                return self::$mCoreScripts;
        }
+
+       /**
+        * Lock the search index: 'searchindex' is locked for writing and
+        * 'page', 'revision', 'text', 'interwiki' and 'l10n_cache' are locked
+        * for reading.
+        * @param &$db Database object
+        */
+       private function lockSearchindex( &$db ) {
+               $write = array( 'searchindex' );
+               $read = array( 'page', 'revision', 'text', 'interwiki', 'l10n_cache' );
+               # __METHOD__ already expands to "Class::method", so it must not be
+               # prefixed with __CLASS__ again.
+               $db->lockTables( $read, $write, __METHOD__ );
+       }
+
+       /**
+        * Unlock the tables by calling unlockTables() on the given handle
+        * @param &$db Database object
+        */
+       private function unlockSearchindex( &$db ) {
+               # __METHOD__ already expands to "Class::method", so it must not be
+               # prefixed with __CLASS__ again.
+               $db->unlockTables( __METHOD__ );
+       }
+
+       /**
+        * Unlock and lock again: releases the table locks via
+        * unlockSearchindex() and immediately re-acquires them via
+        * lockSearchindex().
+        * Since the lock is low-priority, queued reads will be able to complete
+        * (NOTE(review): the lock priority is decided by lockTables(), which is
+        * not visible here - confirm).
+        * @param &$db Database object
+        */
+       private function relockSearchindex( &$db ) {
+               $this->unlockSearchindex( $db );
+               $this->lockSearchindex( $db );
+       }
+
+       /**
+        * Run a callback over every row of a result set, optionally holding
+        * the search index lock while doing so.
+        *
+        * When $maxLockTime is non-zero the tables are locked before the first
+        * row, released and re-acquired whenever more than $maxLockTime seconds
+        * have passed since the lock was last taken, and unlocked after the
+        * last row. When it is zero (or otherwise false) no locking is done.
+        *
+        * @param $maxLockTime integer the maximum time (in seconds) to keep the
+        *   search index locked; a false value disables locking.
+        * @param $callback callback invoked as callback( $dbw, $row ) for each row.
+        * @param $dbw Database handle used for locking and passed to the callback.
+        * @param $results the rows to iterate over (anything usable in foreach).
+        */
+       public function updateSearchIndex( $maxLockTime, $callback, $dbw, $results ) {
+               $lockedSince = time();
+
+               # Take the initial lock
+               if ( $maxLockTime ) {
+                       $this->output( "   --- Waiting for lock ---" );
+                       $this->lockSearchindex( $dbw );
+                       $lockedSince = time();
+                       $this->output( "\n" );
+               }
+
+               # Hand every row to the callback
+               foreach ( $results as $resultRow ) {
+                       if ( $maxLockTime ) {
+                               # Lock held for too long: drop and retake it so
+                               # that waiting reads get a chance to run
+                               if ( time() > $lockedSince + $maxLockTime ) {
+                                       $this->output( "    --- Relocking ---" );
+                                       $this->relockSearchindex( $dbw );
+                                       $lockedSince = time();
+                                       $this->output( "\n" );
+                               }
+                       }
+                       call_user_func( $callback, $dbw, $resultRow );
+               }
+
+               # Release the lock
+               if ( $maxLockTime ) {
+                       $this->output( "    --- Unlocking --" );
+                       $this->unlockSearchindex( $dbw );
+                       $this->output( "\n" );
+               }
+
+       }
+
+       /**
+        * Rebuild the searchindex entry of a single page from the revision
+        * that Revision::loadFromPageId() returns for it.
+        * @param $dbw Database a database write handle
+        * @param $pageId the page ID to update.
+        * @return the prefixed DB key of the page title, or null when no
+        *   revision could be loaded for $pageId (nothing is updated then).
+        */
+       public function updateSearchIndexForPage( $dbw, $pageId ) {
+               $revision = Revision::loadFromPageId( $dbw, $pageId );
+               if( !$revision ) {
+                       # No revision for this page ID: nothing to index
+                       return null;
+               }
+
+               $pageTitle = $revision->getTitle();
+               $title = $pageTitle->getPrefixedDBkey();
+               $this->output( "$title..." );
+
+               # Feed the title text and revision text to the search updater
+               $update = new SearchUpdate( $pageId, $pageTitle->getText(), $revision->getText() );
+               $update->doUpdate();
+               $this->output( "\n" );
+
+               return $title;
+       }
+
 }
diff --git a/maintenance/updateDoubleWidthSearch.php b/maintenance/updateDoubleWidthSearch.php
new file mode 100644 (file)
index 0000000..09eaf75
--- /dev/null
@@ -0,0 +1,72 @@
+<?php
+/**
+ * Script to re-index pages whose search index entries contain double-width (full-width) Latin characters
+ *
+ * Usage: php updateDoubleWidthSearch.php
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @ingroup Maintenance
+ */
+
+require_once( dirname(__FILE__) . '/Maintenance.php' );
+
+/**
+ * Maintenance script that looks up searchindex rows whose si_text or
+ * si_title matches a fixed pattern and runs a search index update for
+ * each of the pages found.
+ */
+class UpdateDoubleWidthSearch extends Maintenance {
+
+       /**
+        * Set the description and register the command line options.
+        */
+       public function __construct() {
+               parent::__construct();
+               $this->mDescription = "Script to normalize double-byte latin UTF-8 characters";
+               # NOTE(review): 'q' is registered here but never read by this
+               # script, and it is declared with the same trailing arguments
+               # as 'l' - confirm whether it is meant to take a value.
+               $this->addOption( 'q', 'quiet', false, true );
+               $this->addOption( 'l', 'How long the searchindex and revision tables will be locked for', false, true );
+       }
+
+       /**
+        * @return the Maintenance::DB_ADMIN constant
+        */
+       public function getDbType() {
+               return Maintenance::DB_ADMIN;
+       }
+
+       /**
+        * Entry point: find the affected searchindex rows and update each one
+        * through Maintenance::updateSearchIndex(). The 'l' option (default 20)
+        * is passed on as the maximum lock time.
+        * Exits with status 1 when the database type is not 'mysql'.
+        */
+       public function execute() {
+               $maxLockTime = $this->getOption( 'l', 20 );
+
+               $dbw = wfGetDB( DB_MASTER );
+               if( $dbw->getType() !== 'mysql' ) {
+                       # Terminate the line so the shell prompt does not end up
+                       # glued to the message.
+                       $this->output( "This change is only needed on MySQL, quitting...\n" );
+                       exit(1);
+               }
+
+               $res = $this->findRows( $dbw );
+               $this->updateSearchIndex( $maxLockTime, array( $this, 'searchIndexUpdateCallback' ), $dbw, $res );
+
+               $this->output( "Done\n" );
+       }
+
+       /**
+        * Per-row callback for updateSearchIndex(): update the page named by
+        * the row's si_page field.
+        * @param $dbw Database a database write handle
+        * @param $row a result row of findRows()
+        * @return whatever updateSearchIndexForPage() returns
+        */
+       public function searchIndexUpdateCallback( $dbw, $row ) {
+               return $this->updateSearchIndexForPage( $dbw, $row->si_page );
+       }
+
+       /**
+        * Select the si_page of every searchindex row whose si_text or
+        * si_title matches the pattern below.
+        * @param $dbw Database handle the query is run on
+        * @return the result of $dbw->query()
+        */
+       private function findRows( $dbw ) {
+               $searchindex = $dbw->tableName( 'searchindex' );
+               # Matches the whole words "u8efbd81" through "u8efbd9a";
+               # [[:<:]] and [[:>:]] are MySQL's word boundary markers.
+               # NOTE(review): presumably these are the old index tokens for
+               # full-width Latin letters - confirm against the search code.
+               $regexp = '[[:<:]]u8efbd([89][1-9a]|8[b-f]|90)[[:>:]]';
+               $sql = "SELECT si_page FROM $searchindex
+                 WHERE ( si_text RLIKE '$regexp' )
+                    OR ( si_title RLIKE '$regexp' )";
+               return $dbw->query( $sql, __METHOD__ );
+       }
+}
+
+# Name the class to run, then include the file that DO_MAINTENANCE points to
+# (presumably the bootstrap that instantiates and runs $maintClass - confirm).
+$maintClass = "UpdateDoubleWidthSearch";
+require_once( DO_MAINTENANCE );
index 152ce1b..7d4656b 100644 (file)
@@ -63,9 +63,18 @@ class UpdateSearchIndex extends Maintenance {
                $lockTime = $this->getOption( 'l', 20 );
                
                $this->doUpdateSearchIndex( $start, $end, $lockTime );
-               $file = fopen( $posFile, 'w' );
-               fwrite( $file, $end );
-               fclose( $file );
+               if( is_writable( dirname( realpath( $posFile ) ) ) ) {
+                       $file = fopen( $posFile, 'w' );
+                       if( $file !== false ) {
+                               fwrite( $file, $end );
+                               fclose( $file );
+                       } else {
+                       echo posix_get_last_error();
+                               $this->output( "*** Couldn't write to the $posFile!" );
+                       }
+               } else {
+                       $this->output( "*** Couldn't write to the $posFile!" );
+               }
        }
        
        private function doUpdateSearchIndex( $start, $end, $maxLockTime ) {
@@ -89,83 +98,22 @@ class UpdateSearchIndex extends Maintenance {
                  ";
                $res = $dbw->query( $sql, __METHOD__ );
 
+               $this->updateSearchIndex($maxLockTime, array($this, 'searchIndexUpdateCallback'), $dbw, $res);
 
-               # Lock searchindex
-               if ( $maxLockTime ) {
-                       $this->output( "   --- Waiting for lock ---" );
-                       $this->lockSearchindex( $dbw );
-                       $lockTime = time();
-                       $this->output( "\n" );
-               }
-
-               # Loop through the results and do a search update
-               foreach ( $res as $row ) {
-                       # Allow reads to be processed
-                       if ( $maxLockTime && time() > $lockTime + $maxLockTime ) {
-                               $this->output( "    --- Relocking ---" );
-                               $this->relockSearchindex( $dbw );
-                               $lockTime = time();
-                               $this->output( "\n" );
-                       }
-                       if ( $row->rc_type == RC_LOG ) {
-                               continue;
-                       } elseif ( $row->rc_type == RC_MOVE || $row->rc_type == RC_MOVE_OVER_REDIRECT ) {
-                               # Rename searchindex entry
-                               $titleObj = Title::makeTitle( $row->rc_moved_to_ns, $row->rc_moved_to_title );
-                               $title = $titleObj->getPrefixedDBkey();
-                               $this->output( "$title..." );
-                               $u = new SearchUpdate( $row->rc_cur_id, $title, false );
-                               $this->output( "\n" );
-                       } else {
-                               // Get current revision
-                               $rev = Revision::loadFromPageId( $dbw, $row->rc_cur_id );
-                               if( $rev ) {
-                                       $titleObj = $rev->getTitle();
-                                       $title = $titleObj->getPrefixedDBkey();
-                                       $this->output( $title );
-                                       # Update searchindex
-                                       $u = new SearchUpdate( $row->rc_cur_id, $titleObj->getText(), $rev->getText() );
-                                       $u->doUpdate();
-                                       $this->output( "\n" );
-                               }
-                       }
-               }
-
-               # Unlock searchindex
-               if ( $maxLockTime ) {
-                       $this->output( "    --- Unlocking --" );
-                       $this->unlockSearchindex( $dbw );
-                       $this->output( "\n" );
-               }
                $this->output( "Done\n" );
        }
 
-       /**
-        * Lock the search index
-        * @param &$db Database object
-        */
-       private function lockSearchindex( &$db ) {
-               $write = array( 'searchindex' );
-               $read = array( 'page', 'revision', 'text', 'interwiki' );
-               $db->lockTables( $read, $write, 'updateSearchIndex.php ' . __METHOD__ );
-       }
-
-       /**
-        * Unlock the tables
-        * @param &$db Database object
-        */
-       private function unlockSearchindex( &$db ) {
-               $db->unlockTables( 'updateSearchIndex.php ' . __METHOD__ );
-       }
-       
-       /**
-        * Unlock and lock again
-        * Since the lock is low-priority, queued reads will be able to complete
-        * @param &$db Database object
-        */
-       private function relockSearchindex( &$db ) {
-               $this->unlockSearchindex( $db );
-               $this->lockSearchindex( $db );
+       /**
+        * Per-row callback passed to updateSearchIndex(): decide from the
+        * row's rc_type what to do with it.
+        *  - RC_MOVE / RC_MOVE_OVER_REDIRECT: build the new title from
+        *    rc_moved_to_ns and rc_moved_to_title and print it
+        *  - RC_LOG: skipped
+        *  - anything else: update the page given by rc_cur_id
+        * @param $dbw Database a database write handle
+        * @param $row the result row to process
+        */
+       public function searchIndexUpdateCallback($dbw, $row) {
+               if ( $row->rc_type == RC_MOVE || $row->rc_type == RC_MOVE_OVER_REDIRECT ) {
+                       # Rename searchindex entry
+                       $titleObj = Title::makeTitle( $row->rc_moved_to_ns, $row->rc_moved_to_title );
+                       $title = $titleObj->getPrefixedDBkey();
+                       $this->output( "$title..." );
+                       # NOTE(review): the SearchUpdate is only constructed here,
+                       # doUpdate() is never called on it - confirm this is intended.
+                       $u = new SearchUpdate( $row->rc_cur_id, $title, false );
+                       $this->output( "\n" );
+               # Loose comparison, like the RC_MOVE checks above: with a strict
+               # !== a string rc_type (as database drivers commonly return)
+               # would never equal an integer RC_LOG, so log rows would no
+               # longer be skipped.
+               } elseif ( $row->rc_type != RC_LOG ) {
+                       $this->updateSearchIndexForPage( $dbw, $row->rc_cur_id );
+               }
        }
 }