* Rename wordSegmentation() to segmentByWord().
* Consolidate search index locking and iteration to Maintenance.php
* Add maintenance/updateDoubleWidthSearch.php to take care of the new
format for normalized double-width roman characters.
* Add error checking to updateSearchIndex.php for creating $posFile.
* Add note to UPGRADE about running updateDoubleWidthSearch.php.
You will need to have $wgDBadminuser and $wgDBadminpass set in your
LocalSettings.php, see there for more info.
-From the command line, browse to the "maintenance" directory and run the
+From the command line, browse to the "maintenance" directory and run the
update.php script to check and update the schema. This will insert missing
tables, update existing tables, and move data around as needed. In most cases,
this is successful and nothing further needs to be done.
+If you have a Chinese or Japanese wiki ($wgLanguageCode is set to one
+of "zh", "ja", or "yue") and you are using MySQL fulltext search, you
+will probably want to update the search index.
+
+In the "maintenance" directory, run the updateDoubleWidthSearch.php
+script. This will update the searchindex table for those pages that
+contain double-width (full-width) Latin characters.
+
=== Check configuration settings ===
The names of configuration variables, and their default values and purposes,
behaviour of MediaWiki.
=== Check installed extensions ===
+
In MediaWiki 1.14 some extensions are migrated into the core. Please see the
HISTORY section "Migrated extensions" and disable these extensions in your
LocalSettings.php
* @param $string String
* @return String
*/
- function wordSegmentation( $string ) {
+ function segmentByWord( $string ) {
return $string;
}
* @ingroup Language
*/
class LanguageJa extends Language {
- function wordSegmentation( $string ) {
+ function segmentByWord( $string ) {
// Strip known punctuation ?
// $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
* for now just treat each character as a word.
* @todo Fixme: only do this for Han characters...
*/
- function wordSegmentation( $string ) {
+ function segmentByWord( $string ) {
$reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
$s = self::insertSpace( $string, $reg );
return $s;
* for now just treat each character as a word.
* @todo Fixme: only do this for Han characters...
*/
- function wordSegmentation( $string ) {
+ function segmentByWord( $string ) {
$reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
$s = self::insertSpace( $string, $reg );
return $s;
// Double-width roman characters
$s = self::convertDoubleWidth( $string );
$s = trim( $s );
- $s = self::wordSegmentation( $s );
+ $s = self::segmentByWord( $s );
$s = parent::normalizeForSearch( $s );
wfProfileOut( __METHOD__ );
}
return self::$mCoreScripts;
}
+
+	/**
+	 * Lock the tables used during a search index update.
+	 *
+	 * Passes 'searchindex' as the write-lock list and the tables in $read
+	 * as the read-lock list to the database object's lockTables() method.
+	 *
+	 * @param &$db Database object
+	 */
+	private function lockSearchindex( &$db ) {
+		$write = array( 'searchindex' );
+		$read = array( 'page', 'revision', 'text', 'interwiki', 'l10n_cache' );
+		# __METHOD__ already expands to "Class::method"; prefixing it with
+		# __CLASS__ would repeat the class name in the third argument.
+		$db->lockTables( $read, $write, __METHOD__ );
+	}
+
+	/**
+	 * Unlock the tables by calling the database object's unlockTables() method.
+	 *
+	 * @param &$db Database object
+	 */
+	private function unlockSearchindex( &$db ) {
+		# __METHOD__ already expands to "Class::method"; prefixing it with
+		# __CLASS__ would repeat the class name in the argument.
+		$db->unlockTables( __METHOD__ );
+	}
+
+ /**
+ * Unlock and lock again
+ * Since the lock is low-priority, queued reads will be able to complete
+ * @param &$db Database object
+ */
+ private function relockSearchindex( &$db ) {
+ # Order matters: release the lock first, then take it again
+ $this->unlockSearchindex( $db );
+ $this->lockSearchindex( $db );
+ }
+
+	/**
+	 * Run a callback over every row of a result set, optionally holding the
+	 * search index lock while doing so.
+	 *
+	 * When $maxLockTime is truthy, the tables are locked before the first row,
+	 * released and re-acquired whenever more than $maxLockTime seconds have
+	 * passed since the lock was last taken, and unlocked after the last row.
+	 * When it is falsy (e.g. 0), no locking is performed at all.
+	 *
+	 * @param $maxLockTime integer the maximum time to keep the search index locked.
+	 * @param $callback callback invoked as $callback( $dbw, $row ) for each row.
+	 * @param $dbw Database the handle used for locking; also handed to the callback.
+	 * @param $results the rows to process (anything usable with foreach).
+	 */
+	public function updateSearchIndex( $maxLockTime, $callback, $dbw, $results ) {
+		$useLock = (bool)$maxLockTime;
+		$lockedSince = time();
+
+		if ( $useLock ) {
+			# Take the initial lock and remember when we obtained it
+			$this->output( " --- Waiting for lock ---" );
+			$this->lockSearchindex( $dbw );
+			$lockedSince = time();
+			$this->output( "\n" );
+		}
+
+		foreach ( $results as $row ) {
+			$lockExpired = $useLock && time() > $lockedSince + $maxLockTime;
+			if ( $lockExpired ) {
+				# Held the lock long enough: drop it and take it again
+				$this->output( " --- Relocking ---" );
+				$this->relockSearchindex( $dbw );
+				$lockedSince = time();
+				$this->output( "\n" );
+			}
+			# Do the actual per-row search update
+			call_user_func( $callback, $dbw, $row );
+		}
+
+		if ( $useLock ) {
+			# All rows handled, release the lock
+			$this->output( " --- Unlocking --" );
+			$this->unlockSearchindex( $dbw );
+			$this->output( "\n" );
+		}
+	}
+
+	/**
+	 * Rebuild the searchindex entry of a single page from its current revision.
+	 *
+	 * @param $dbw Database a database write handle
+	 * @param $pageId the page ID to update.
+	 * @return the prefixed DB key of the page's title, or null when no
+	 *   revision could be loaded for $pageId (nothing is updated in that case).
+	 */
+	public function updateSearchIndexForPage( $dbw, $pageId ) {
+		// Get current revision
+		$rev = Revision::loadFromPageId( $dbw, $pageId );
+		if( !$rev ) {
+			return null;
+		}
+
+		$pageTitle = $rev->getTitle();
+		$prefixedKey = $pageTitle->getPrefixedDBkey();
+		$this->output( "$prefixedKey..." );
+
+		# Update searchindex
+		$update = new SearchUpdate( $pageId, $pageTitle->getText(), $rev->getText() );
+		$update->doUpdate();
+		$this->output( "\n" );
+
+		return $prefixedKey;
+	}
+
}
--- /dev/null
+<?php
+/**
+ * Script to update the search index for pages containing double-width (full-width) Latin characters
+ *
+ * Usage: php updateDoubleWidthSearch.php
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @ingroup Maintenance
+ */
+
+require_once( dirname(__FILE__) . '/Maintenance.php' );
+
+/**
+ * Maintenance script that finds searchindex rows matching a double-width
+ * character pattern and re-runs the search index update for those pages.
+ */
+class UpdateDoubleWidthSearch extends Maintenance {
+
+	/**
+	 * Set the script description and register its command line options.
+	 */
+	public function __construct() {
+		parent::__construct();
+		$this->mDescription = "Script to normalize double-byte latin UTF-8 characters";
+		# NOTE(review): 'q' is registered with the same trailing arguments as
+		# 'l', which takes a value, and is not read anywhere in this class;
+		# confirm that is intended for a quiet flag.
+		$this->addOption( 'q', 'quiet', false, true );
+		$this->addOption( 'l', 'How long the searchindex and revision tables will be locked for', false, true );
+	}
+
+	/**
+	 * @return the Maintenance::DB_ADMIN constant
+	 */
+	public function getDbType() {
+		return Maintenance::DB_ADMIN;
+	}
+
+	/**
+	 * Find the affected searchindex rows and update the search index for
+	 * each of them, relocking according to the 'l' option (default 20).
+	 * Quits with exit status 1 when the master database is not MySQL.
+	 */
+	public function execute() {
+		$maxLockTime = $this->getOption( 'l', 20 );
+
+		$dbw = wfGetDB( DB_MASTER );
+		if( $dbw->getType() !== 'mysql' ) {
+			# End the message with a newline, like the other messages here
+			$this->output( "This change is only needed on MySQL, quitting...\n" );
+			exit(1);
+		}
+
+		$res = $this->findRows($dbw);
+		$this->updateSearchIndex($maxLockTime, array($this, 'searchIndexUpdateCallback'), $dbw, $res);
+
+		$this->output( "Done\n" );
+	}
+
+	/**
+	 * Per-row callback handed to updateSearchIndex().
+	 *
+	 * @param $dbw Database a database write handle
+	 * @param $row object a row returned by findRows(), providing si_page
+	 * @return whatever updateSearchIndexForPage() returns for that page
+	 */
+	public function searchIndexUpdateCallback($dbw, $row) {
+		return $this->updateSearchIndexForPage( $dbw, $row->si_page );
+	}
+
+	/**
+	 * Select si_page for every searchindex row whose si_text or si_title
+	 * matches the pattern below.
+	 *
+	 * The pattern looks for a whole word "u8efbd" followed by a hex byte in
+	 * the range 81-9a.
+	 * NOTE(review): presumably the index's hex-encoded UTF-8 form of the
+	 * full-width lowercase letters U+FF41-U+FF5A - confirm.
+	 *
+	 * @param $dbw Database a database handle
+	 * @return the result of $dbw->query()
+	 */
+	private function findRows($dbw) {
+		$searchindex = $dbw->tableName( 'searchindex' );
+		$regexp = '[[:<:]]u8efbd([89][1-9a]|8[b-f]|90)[[:>:]]';
+		$sql = "SELECT si_page FROM $searchindex
+ WHERE ( si_text RLIKE '$regexp' )
+ OR ( si_title RLIKE '$regexp' )";
+		return $dbw->query( $sql, __METHOD__ );
+	}
+}
+
+// Name the class defined above, then include the file named by DO_MAINTENANCE.
+// NOTE(review): presumably that file instantiates $maintClass and calls execute() - confirm.
+$maintClass = "UpdateDoubleWidthSearch";
+require_once( DO_MAINTENANCE );
$lockTime = $this->getOption( 'l', 20 );
$this->doUpdateSearchIndex( $start, $end, $lockTime );
- $file = fopen( $posFile, 'w' );
- fwrite( $file, $end );
- fclose( $file );
+ if( is_writable( dirname( realpath( $posFile ) ) ) ) {
+ $file = fopen( $posFile, 'w' );
+ if( $file !== false ) {
+ fwrite( $file, $end );
+ fclose( $file );
+ } else {
+ echo posix_get_last_error();
+ $this->output( "*** Couldn't write to the $posFile!" );
+ }
+ } else {
+ $this->output( "*** Couldn't write to the $posFile!" );
+ }
}
private function doUpdateSearchIndex( $start, $end, $maxLockTime ) {
";
$res = $dbw->query( $sql, __METHOD__ );
+ $this->updateSearchIndex($maxLockTime, array($this, 'searchIndexUpdateCallback'), $dbw, $res);
- # Lock searchindex
- if ( $maxLockTime ) {
- $this->output( " --- Waiting for lock ---" );
- $this->lockSearchindex( $dbw );
- $lockTime = time();
- $this->output( "\n" );
- }
-
- # Loop through the results and do a search update
- foreach ( $res as $row ) {
- # Allow reads to be processed
- if ( $maxLockTime && time() > $lockTime + $maxLockTime ) {
- $this->output( " --- Relocking ---" );
- $this->relockSearchindex( $dbw );
- $lockTime = time();
- $this->output( "\n" );
- }
- if ( $row->rc_type == RC_LOG ) {
- continue;
- } elseif ( $row->rc_type == RC_MOVE || $row->rc_type == RC_MOVE_OVER_REDIRECT ) {
- # Rename searchindex entry
- $titleObj = Title::makeTitle( $row->rc_moved_to_ns, $row->rc_moved_to_title );
- $title = $titleObj->getPrefixedDBkey();
- $this->output( "$title..." );
- $u = new SearchUpdate( $row->rc_cur_id, $title, false );
- $this->output( "\n" );
- } else {
- // Get current revision
- $rev = Revision::loadFromPageId( $dbw, $row->rc_cur_id );
- if( $rev ) {
- $titleObj = $rev->getTitle();
- $title = $titleObj->getPrefixedDBkey();
- $this->output( $title );
- # Update searchindex
- $u = new SearchUpdate( $row->rc_cur_id, $titleObj->getText(), $rev->getText() );
- $u->doUpdate();
- $this->output( "\n" );
- }
- }
- }
-
- # Unlock searchindex
- if ( $maxLockTime ) {
- $this->output( " --- Unlocking --" );
- $this->unlockSearchindex( $dbw );
- $this->output( "\n" );
- }
$this->output( "Done\n" );
}
- /**
- * Lock the search index
- * @param &$db Database object
- */
- private function lockSearchindex( &$db ) {
- $write = array( 'searchindex' );
- $read = array( 'page', 'revision', 'text', 'interwiki' );
- $db->lockTables( $read, $write, 'updateSearchIndex.php ' . __METHOD__ );
- }
-
- /**
- * Unlock the tables
- * @param &$db Database object
- */
- private function unlockSearchindex( &$db ) {
- $db->unlockTables( 'updateSearchIndex.php ' . __METHOD__ );
- }
-
- /**
- * Unlock and lock again
- * Since the lock is low-priority, queued reads will be able to complete
- * @param &$db Database object
- */
- private function relockSearchindex( &$db ) {
- $this->unlockSearchindex( $db );
- $this->lockSearchindex( $db );
+	/**
+	 * Per-row callback handed to updateSearchIndex().
+	 *
+	 * - Move rows (RC_MOVE / RC_MOVE_OVER_REDIRECT): builds a SearchUpdate
+	 *   for the destination title.
+	 * - Log rows (RC_LOG): ignored.
+	 * - Anything else: updates the search index for the page in rc_cur_id.
+	 *
+	 * @param $dbw Database a database write handle
+	 * @param $row object a result row providing rc_type, rc_cur_id,
+	 *   rc_moved_to_ns and rc_moved_to_title
+	 */
+	public function searchIndexUpdateCallback($dbw, $row) {
+		if ( $row->rc_type == RC_MOVE || $row->rc_type == RC_MOVE_OVER_REDIRECT ) {
+			# Rename searchindex entry
+			$titleObj = Title::makeTitle( $row->rc_moved_to_ns, $row->rc_moved_to_title );
+			$title = $titleObj->getPrefixedDBkey();
+			$this->output( "$title..." );
+			# NOTE(review): the SearchUpdate is built but doUpdate() is never
+			# called on it here - confirm that is intended.
+			$u = new SearchUpdate( $row->rc_cur_id, $title, false );
+			$this->output( "\n" );
+		} elseif ( $row->rc_type != RC_LOG ) {
+			# Loose comparison on purpose, matching the checks above: database
+			# drivers commonly return column values as strings, and a strict
+			# !== against an integer constant would then always be true, so
+			# log rows would no longer be skipped.
+			$this->updateSearchIndexForPage( $dbw, $row->rc_cur_id );
+		}
 	}
}