From 54badce2d8d5e62e54ae3fbada2fa2570dd69aa2 Mon Sep 17 00:00:00 2001 From: "Mark A. Hershberger" Date: Wed, 10 Mar 2010 21:54:23 +0000 Subject: [PATCH] Follow-up r61856 * Rename wordSegmentation() to segmentByWord(). * Consolidate search index locking and iteration to Maintenance.php * Add maintenance/updateDoubleWidthSearch.php to take care of new format for normalized double-width roman characters. * Add error checking to updateSearchIndex.php for creating $posFile. * Add note to UPGRADE about running updateDoubleWidthSearch.php. --- UPGRADE | 11 ++- languages/Language.php | 2 +- languages/classes/LanguageJa.php | 2 +- languages/classes/LanguageYue.php | 2 +- languages/classes/LanguageZh_hans.php | 4 +- maintenance/Maintenance.php | 87 +++++++++++++++++++++ maintenance/updateDoubleWidthSearch.php | 72 +++++++++++++++++ maintenance/updateSearchIndex.php | 100 ++++++------------------ 8 files changed, 198 insertions(+), 82 deletions(-) create mode 100644 maintenance/updateDoubleWidthSearch.php diff --git a/UPGRADE b/UPGRADE index 44acc5c253..cac08cd0f4 100644 --- a/UPGRADE +++ b/UPGRADE @@ -53,11 +53,19 @@ deleted file archives, and any custom skins. You will need to have $wgDBadminuser and $wgDBadminpass set in your LocalSettings.php, see there for more info. -From the command line, browse to the "maintenance" directory and run the +From the command line, browse to the "maintenance" directory and run the update.php script to check and update the schema. This will insert missing tables, update existing tables, and move data around as needed. In most cases, this is successful and nothing further needs to be done. +If you have a Chinese or Japanese wiki ($wgLanguageCode is set to one +of "zh", "ja", or "yue") and you are using MySQL fulltext search, you +will probably want to update the search index. + +In the "maintenance" directory, run the updateDoubleWidthSearch.php +script. 
This will update the searchindex table for those pages that +contain double-width (full-width) Latin characters. + === Check configuration settings === The names of configuration variables, and their default values and purposes, @@ -67,6 +75,7 @@ notes to check for configuration changes which would alter the expected behaviour of MediaWiki. === Check installed extensions === + In MediaWiki 1.14 some extensions are migrated into the core. Please see the HISTORY section "Migrated extensions" and disable these extensions in your LocalSettings.php diff --git a/languages/Language.php b/languages/Language.php index bd44f5c8f1..39ee1ce2bc 100644 --- a/languages/Language.php +++ b/languages/Language.php @@ -1695,7 +1695,7 @@ class Language { * @param $string String * @return String */ - function wordSegmentation( $string ) { + function segmentByWord( $string ) { return $string; } diff --git a/languages/classes/LanguageJa.php b/languages/classes/LanguageJa.php index 4a24260b00..e56d230703 100644 --- a/languages/classes/LanguageJa.php +++ b/languages/classes/LanguageJa.php @@ -6,7 +6,7 @@ * @ingroup Language */ class LanguageJa extends Language { - function wordSegmentation( $string ) { + function segmentByWord( $string ) { // Strip known punctuation ? // $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f diff --git a/languages/classes/LanguageYue.php b/languages/classes/LanguageYue.php index 6581d788e4..f988548e98 100644 --- a/languages/classes/LanguageYue.php +++ b/languages/classes/LanguageYue.php @@ -12,7 +12,7 @@ class LanguageYue extends Language { * for now just treat each character as a word. * @todo Fixme: only do this for Han characters... 
*/ - function wordSegmentation( $string ) { + function segmentByWord( $string ) { $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/"; $s = self::insertSpace( $string, $reg ); return $s; diff --git a/languages/classes/LanguageZh_hans.php b/languages/classes/LanguageZh_hans.php index 20d34155b6..4b20e62a5d 100644 --- a/languages/classes/LanguageZh_hans.php +++ b/languages/classes/LanguageZh_hans.php @@ -13,7 +13,7 @@ class LanguageZh_hans extends Language { * for now just treat each character as a word. * @todo Fixme: only do this for Han characters... */ - function wordSegmentation( $string ) { + function segmentByWord( $string ) { $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/"; $s = self::insertSpace( $string, $reg ); return $s; @@ -25,7 +25,7 @@ class LanguageZh_hans extends Language { // Double-width roman characters $s = self::convertDoubleWidth( $string ); $s = trim( $s ); - $s = self::wordSegmentation( $s ); + $s = self::segmentByWord( $s ); $s = parent::normalizeForSearch( $s ); wfProfileOut( __METHOD__ ); diff --git a/maintenance/Maintenance.php b/maintenance/Maintenance.php index 1656a6146b..45867b1c57 100644 --- a/maintenance/Maintenance.php +++ b/maintenance/Maintenance.php @@ -844,4 +844,91 @@ abstract class Maintenance { } return self::$mCoreScripts; } + + /** + * Lock the search index + * @param &$db Database object + */ + private function lockSearchindex( &$db ) { + $write = array( 'searchindex' ); + $read = array( 'page', 'revision', 'text', 'interwiki', 'l10n_cache' ); + $db->lockTables( $read, $write, __METHOD__ ); + } + + /** + * Unlock the tables + * @param &$db Database object + */ + private function unlockSearchindex( &$db ) { + $db->unlockTables( 
__METHOD__ ); + } + + /** + * Unlock and lock again + * Since the lock is low-priority, queued reads will be able to complete + * @param &$db Database object + */ + private function relockSearchindex( &$db ) { + $this->unlockSearchindex( $db ); + $this->lockSearchindex( $db ); + } + + /** + * Perform a search index update with locking + * @param $maxLockTime integer the maximum time to keep the search index locked. + * @param $callback callback the function called as callback( $dbw, $row ) for each row of $results to update the search index. + */ + public function updateSearchIndex( $maxLockTime, $callback, $dbw, $results ) { + $lockTime = time(); + + # Lock searchindex + if ( $maxLockTime ) { + $this->output( " --- Waiting for lock ---" ); + $this->lockSearchindex( $dbw ); + $lockTime = time(); + $this->output( "\n" ); + } + + # Loop through the results and do a search update + foreach ( $results as $row ) { + # Allow reads to be processed + if ( $maxLockTime && time() > $lockTime + $maxLockTime ) { + $this->output( " --- Relocking ---" ); + $this->relockSearchindex( $dbw ); + $lockTime = time(); + $this->output( "\n" ); + } + call_user_func( $callback, $dbw, $row ); + } + + # Unlock searchindex + if ( $maxLockTime ) { + $this->output( " --- Unlocking --" ); + $this->unlockSearchindex( $dbw ); + $this->output( "\n" ); + } + + } + + /** + * Update the searchindex table for a given pageid + * @param $dbw Database a database write handle + * @param $pageId the page ID to update. + */ + public function updateSearchIndexForPage( $dbw, $pageId ) { + // Get current revision + $rev = Revision::loadFromPageId( $dbw, $pageId ); + $title = null; + if( $rev ) { + $titleObj = $rev->getTitle(); + $title = $titleObj->getPrefixedDBkey(); + $this->output( "$title..." 
); + # Update searchindex + $u = new SearchUpdate( $pageId, $titleObj->getText(), $rev->getText() ); + $u->doUpdate(); + $this->output( "\n" ); + } + return $title; + } + } diff --git a/maintenance/updateDoubleWidthSearch.php b/maintenance/updateDoubleWidthSearch.php new file mode 100644 index 0000000000..09eaf7595c --- /dev/null +++ b/maintenance/updateDoubleWidthSearch.php @@ -0,0 +1,72 @@ +mDescription = "Script to normalize double-byte latin UTF-8 characters"; + $this->addOption( 'q', 'quiet', false, true ); + $this->addOption( 'l', 'How long the searchindex and revision tables will be locked for', false, true ); + } + + public function getDbType() { + return Maintenance::DB_ADMIN; + } + + public function execute() { + $quiet = $this->hasOption( 'q' ); + $maxLockTime = $this->getOption( 'l', 20 ); + $lockTime = time(); + + $dbw = wfGetDB( DB_MASTER ); + if( $dbw->getType() !== 'mysql' ) { + $this->output( "This change is only needed on MySQL, quitting..." ); + exit(1); + } + + $res = $this->findRows($dbw); + $this->updateSearchIndex($maxLockTime, array($this, 'searchIndexUpdateCallback'), $dbw, $res); + + $this->output( "Done\n" ); + } + + public function searchIndexUpdateCallback($dbw, $row) { + return $this->updateSearchIndexForPage( $dbw, $row->si_page ); + } + + private function findRows($dbw) { + $searchindex = $dbw->tableName( 'searchindex' ); + $regexp = '[[:<:]]u8efbd([89][1-9a]|8[b-f]|90)[[:>:]]'; + $sql = "SELECT si_page FROM $searchindex + WHERE ( si_text RLIKE '$regexp' ) + OR ( si_title RLIKE '$regexp' )"; + return $dbw->query( $sql, __METHOD__ ); + } +} + +$maintClass = "UpdateDoubleWidthSearch"; +require_once( DO_MAINTENANCE ); diff --git a/maintenance/updateSearchIndex.php b/maintenance/updateSearchIndex.php index 152ce1b6cd..7d4656b759 100644 --- a/maintenance/updateSearchIndex.php +++ b/maintenance/updateSearchIndex.php @@ -63,9 +63,18 @@ class UpdateSearchIndex extends Maintenance { $lockTime = $this->getOption( 'l', 20 ); 
$this->doUpdateSearchIndex( $start, $end, $lockTime ); - $file = fopen( $posFile, 'w' ); - fwrite( $file, $end ); - fclose( $file ); + if( is_writable( dirname( $posFile ) ) ) { + $file = fopen( $posFile, 'w' ); + if( $file !== false ) { + fwrite( $file, $end ); + fclose( $file ); + } else { + # fopen() has already raised a PHP warning describing why it failed + $this->output( "*** Couldn't write to the $posFile!\n" ); + } + } else { + $this->output( "*** Couldn't write to the $posFile!\n" ); + } } private function doUpdateSearchIndex( $start, $end, $maxLockTime ) { @@ -89,83 +98,23 @@ class UpdateSearchIndex extends Maintenance { "; $res = $dbw->query( $sql, __METHOD__ ); + $this->updateSearchIndex($maxLockTime, array($this, 'searchIndexUpdateCallback'), $dbw, $res); - # Lock searchindex - if ( $maxLockTime ) { - $this->output( " --- Waiting for lock ---" ); - $this->lockSearchindex( $dbw ); - $lockTime = time(); - $this->output( "\n" ); - } - - # Loop through the results and do a search update - foreach ( $res as $row ) { - # Allow reads to be processed - if ( $maxLockTime && time() > $lockTime + $maxLockTime ) { - $this->output( " --- Relocking ---" ); - $this->relockSearchindex( $dbw ); - $lockTime = time(); - $this->output( "\n" ); - } - if ( $row->rc_type == RC_LOG ) { - continue; - } elseif ( $row->rc_type == RC_MOVE || $row->rc_type == RC_MOVE_OVER_REDIRECT ) { - # Rename searchindex entry - $titleObj = Title::makeTitle( $row->rc_moved_to_ns, $row->rc_moved_to_title ); - $title = $titleObj->getPrefixedDBkey(); - $this->output( "$title..." 
); - $u = new SearchUpdate( $row->rc_cur_id, $title, false ); - $this->output( "\n" ); - } else { - // Get current revision - $rev = Revision::loadFromPageId( $dbw, $row->rc_cur_id ); - if( $rev ) { - $titleObj = $rev->getTitle(); - $title = $titleObj->getPrefixedDBkey(); - $this->output( $title ); - # Update searchindex - $u = new SearchUpdate( $row->rc_cur_id, $titleObj->getText(), $rev->getText() ); - $u->doUpdate(); - $this->output( "\n" ); - } - } - } - - # Unlock searchindex - if ( $maxLockTime ) { - $this->output( " --- Unlocking --" ); - $this->unlockSearchindex( $dbw ); - $this->output( "\n" ); - } $this->output( "Done\n" ); } - /** - * Lock the search index - * @param &$db Database object - */ - private function lockSearchindex( &$db ) { - $write = array( 'searchindex' ); - $read = array( 'page', 'revision', 'text', 'interwiki' ); - $db->lockTables( $read, $write, 'updateSearchIndex.php ' . __METHOD__ ); - } - - /** - * Unlock the tables - * @param &$db Database object - */ - private function unlockSearchindex( &$db ) { - $db->unlockTables( 'updateSearchIndex.php ' . __METHOD__ ); - } - - /** - * Unlock and lock again - * Since the lock is low-priority, queued reads will be able to complete - * @param &$db Database object - */ - private function relockSearchindex( &$db ) { - $this->unlockSearchindex( $db ); - $this->lockSearchindex( $db ); + public function searchIndexUpdateCallback($dbw, $row) { + if ( $row->rc_type == RC_MOVE || $row->rc_type == RC_MOVE_OVER_REDIRECT ) { + # Rename searchindex entry + $titleObj = Title::makeTitle( $row->rc_moved_to_ns, $row->rc_moved_to_title ); + $title = $titleObj->getPrefixedDBkey(); + $this->output( "$title..." ); + $u = new SearchUpdate( $row->rc_cur_id, $title, false ); + $u->doUpdate(); + $this->output( "\n" ); + } elseif ( $row->rc_type != RC_LOG ) { + $this->updateSearchIndexForPage( $dbw, $row->rc_cur_id ); + } } } -- 2.20.1