From: Mark A. Hershberger Date: Tue, 23 Mar 2010 19:50:59 +0000 (+0000) Subject: * Implement normalization of fullwidth latin characters for all Languages, not just... X-Git-Tag: 1.31.0-rc.0~37375 X-Git-Url: https://git.cyclocoop.org/%7B%24admin_url%7Dmembres/fiche.php?a=commitdiff_plain;h=560b72c4abfac9e9e2172974673781f9907b5e3c;p=lhc%2Fweb%2Fwiklou.git * Implement normalization of fullwidth latin characters for all Languages, not just Japanese and Chinese. * Tune Language::convertDoubleWidth() so that it is 8-10x faster. (See http://xrl.us/bg2mon) --- diff --git a/languages/Language.php b/languages/Language.php index 3e3415c119..ef50f73624 100644 --- a/languages/Language.php +++ b/languages/Language.php @@ -1707,7 +1707,7 @@ class Language { * @return String */ function normalizeForSearch( $string ) { - return $string; + return self::convertDoubleWidth($string); } /** @@ -1715,8 +1715,17 @@ class Language { * range: ff00-ff5f ~= 0020-007f */ protected static function convertDoubleWidth( $string ) { - $string = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $string ); - $string = preg_replace( '/\xef\xbd([\x80-\x9a])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $string ); + static $full = null; + static $half = null; + + if( $full === null ) { + $fullWidth = "０１２３４５６７８９ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ"; + $halfWidth = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + $full = str_split( $fullWidth, 3 ); + $half = str_split( $halfWidth ); + } + + $string = str_replace( $full, $half, $string ); return $string; } diff --git a/languages/classes/LanguageJa.php b/languages/classes/LanguageJa.php index e56d230703..c5a992d5a0 100644 --- a/languages/classes/LanguageJa.php +++ b/languages/classes/LanguageJa.php @@ -23,14 +23,6 @@ class LanguageJa extends Language { return $s; } - function normalizeForSearch( $string ) { - // Double-width roman characters - $s = self::convertDoubleWidth( $string ); - - # Do general 
case folding and UTF-8 armoring - return parent::normalizeForSearch( $s ); - } - # Italic is not appropriate for Japanese script # Unfortunately most browsers do not recognise this, and render as italic function emphasize( $text ) { diff --git a/languages/classes/LanguageZh_hans.php b/languages/classes/LanguageZh_hans.php index 4b20e62a5d..69f7e201d7 100644 --- a/languages/classes/LanguageZh_hans.php +++ b/languages/classes/LanguageZh_hans.php @@ -23,10 +23,9 @@ class LanguageZh_hans extends Language { wfProfileIn( __METHOD__ ); // Double-width roman characters - $s = self::convertDoubleWidth( $string ); + $s = parent::normalizeForSearch( $s ); $s = trim( $s ); $s = self::segmentByWord( $s ); - $s = parent::normalizeForSearch( $s ); wfProfileOut( __METHOD__ ); return $s; diff --git a/maintenance/tests/SearchDbTest.php b/maintenance/tests/SearchDbTest.php index 1f22340b5d..190742a827 100644 --- a/maintenance/tests/SearchDbTest.php +++ b/maintenance/tests/SearchDbTest.php @@ -6,23 +6,22 @@ class SearchDbTest extends SearchEngineTest { function setUp() { global $wgDBprefix, $wgDBtype; - - if($wgDBprefix === "parsertest_" || - ($wgDBtype === 'oracle' && $wgDBprefix === 'pt_')) { - $this->markTestSkipped("This test can't (yet?) be run with the parser tests"); - } + $this->db = wfGetDB( DB_MASTER ); + if( !$this->db ) { + $this->markTestIncomplete( "Can't find a database to test with." 
); + } $GLOBALS['wgContLang'] = new Language; - $this->db = $this->buildTestDatabase( - array( 'page', 'revision', 'text', 'searchindex', 'user' ) ); - if( $this->db ) { - $this->insertSearchData(); - } - $searchType = preg_replace("/Database/", "Search", get_class($this->db)); + $this->insertSearchData(); + + $this->insertSearchData(); + $searchType = preg_replace("/Database/", "Search", + get_class($this->db)); $this->search = new $searchType( $this->db ); } function tearDown() { + $this->removeSearchData(); if( !is_null( $this->db ) ) { wfGetLB()->closeConnecton( $this->db ); } diff --git a/maintenance/tests/SearchEngineTest.php b/maintenance/tests/SearchEngineTest.php index 8f6b86b2c8..fa6964f3f4 100644 --- a/maintenance/tests/SearchEngineTest.php +++ b/maintenance/tests/SearchEngineTest.php @@ -6,42 +6,44 @@ require_once( 'MediaWiki_Setup.php' ); * @group Stub */ class SearchEngineTest extends MediaWiki_Setup { - var $db, $search; - private $count = 0; - - function insertSearchData() { - $this->insertPage("Main_Page", "This is a main page", 0); - $this->insertPage('Main_Page', 'This is a talk page to the main page, see [[smithee]]', 1); - $this->insertPage('Smithee', 'A smithee is one who smiths. 
See also [[Alan Smithee]]', 0); - $this->insertPage('Smithee', 'This article sucks.', 1); - $this->insertPage('Unrelated_page', 'Nothing in this page is about the S word.', 0); - $this->insertPage('Another_page', 'This page also is unrelated.', 0); - $this->insertPage('Help', 'Help me!', 4); - $this->insertPage('Thppt', 'Blah blah', 0); - $this->insertPage('Alan_Smithee', 'yum', 0); - $this->insertPage('Pages', 'are food', 0); - $this->insertPage('DblPageOne', 'ABCDEF', 0); - $this->insertPage('DblPageTwo', 'ABCDE', 0); - $this->insertPage('DblPageTwoLow', 'abcde', 0); - } + var $db, $search, $pageList; - function normalize( $text ) { - return strtolower(preg_replace("/[^[:alnum:] ]/", " ", $text)); + function pageExists( $title ) { + return false; } - function insertPage( $pageName, $text, $ns ) { - $this->count++; - $this->db->safeQuery( 'INSERT INTO ! (page_id,page_namespace,page_title,page_latest) VALUES (?,?,?,?)', - $this->db->tableName( 'page' ), $this->count, $ns, $pageName, $this->count ); - $this->db->safeQuery( 'INSERT INTO ! (rev_id,rev_page) VALUES (?, ?)', - $this->db->tableName( 'revision' ), $this->count, $this->count ); - $this->db->safeQuery( 'INSERT INTO ! (old_id,old_text) VALUES (?, ?)', - $this->db->tableName( 'text' ), $this->count, $text ); - $this->db->safeQuery( 'INSERT INTO ! (si_page,si_title,si_text) VALUES (?, ?, ?)', - $this->db->tableName( 'searchindex' ), $this->count, - $this->normalize( $pageName ), $this->normalize( $text ) ); + function insertSearchData() { + if( $this->pageExists( 'Not_Main_Page' ) ) { + return; + } + $this->insertPage("Not_Main_Page", "This is not a main page", 0); + $this->insertPage('Talk:Not_Main_Page', 'This is not a talk page to the main page, see [[smithee]]', 1); + $this->insertPage('Smithee', 'A smithee is one who smiths. 
See also [[Alan Smithee]]', 0); + $this->insertPage('Talk:Smithee', 'This article sucks.', 1); + $this->insertPage('Unrelated_page', 'Nothing in this page is about the S word.', 0); + $this->insertPage('Another_page', 'This page also is unrelated.', 0); + $this->insertPage('Help:Help', 'Help me!', 4); + $this->insertPage('Thppt', 'Blah blah', 0); + $this->insertPage('Alan_Smithee', 'yum', 0); + $this->insertPage('Pages', 'are\'food', 0); + $this->insertPage('HalfOneUp', 'AZ', 0); + $this->insertPage('FullOneUp', 'ＡＺ', 0); + $this->insertPage('HalfTwoLow', 'az', 0); + $this->insertPage('FullTwoLow', 'ａｚ', 0); + $this->insertPage('HalfNumbers', '1234567890', 0); + $this->insertPage('FullNumbers', '１２３４５６７８９０', 0); + $this->insertPage('DomainName', 'example.com', 0); } + function removeSearchData() { + return; + while( count($this->pageList) ) { + list( $title, $id ) = array_pop( $this->pageList ); + $article = new Article( $title, $id ); + $article->doDeleteArticle("Search Test"); + } + } + function fetchIds( $results ) { $matches = array(); while( $row = $results->next() ) { @@ -55,34 +57,98 @@ class SearchEngineTest extends MediaWiki_Setup { return $matches; } - function testTextSearch() { - if( is_null( $this->db ) ) { - $this->markTestIncomplete( "Can't find a database to test with." ); - } - $this->assertEquals( - array( 'Smithee' ), - $this->fetchIds( $this->search->searchText( 'smithee' ) ), - "Plain search failed" ); + // Modified version of WikiRevision::importOldRevision() + function insertPage( $pageName, $text, $ns ) { + $dbw = $this->db; + $title = Title::newFromText( $pageName ); + + $userId = 0; + $userText = 'WikiSysop'; + $comment = 'Search Test'; + + // avoid memory leak...? + $linkCache = LinkCache::singleton(); + $linkCache->clear(); + + $article = new Article( $title ); + $pageId = $article->getId(); + $created = false; + if( $pageId == 0 ) { + # must create the page... 
+ $pageId = $article->insertOn( $dbw ); + $created = true; + } + + # FIXME: Use original rev_id optionally (better for backups) + # Insert the row + $revision = new Revision( array( + 'page' => $pageId, + 'text' => $text, + 'comment' => $comment, + 'user' => $userId, + 'user_text' => $userText, + 'timestamp' => 0, + 'minor_edit' => false, + ) ); + $revId = $revision->insertOn( $dbw ); + $changed = $article->updateIfNewerOn( $dbw, $revision ); + + $GLOBALS['wgTitle'] = $title; + if( $created ) { + Article::onArticleCreate( $title ); + $article->createUpdates( $revision ); + } elseif( $changed ) { + Article::onArticleEdit( $title ); + $article->editUpdates( + $text, $comment, false, 0, $revId ); + } + + $su = new SearchUpdate($article->getId(), $pageName, $text); + $su->doUpdate(); + + $this->pageList[] = array( $title, $article->getId() ); + + return true; + } + + function testFullWidth() { + $this->assertEquals( + array( 'FullOneUp', 'FullTwoLow', 'HalfOneUp', 'HalfTwoLow' ), + $this->fetchIds( $this->search->searchText( 'AZ' ) ), + "Search for normalized from Half-width Upper" ); + $this->assertEquals( + array( 'FullOneUp', 'FullTwoLow', 'HalfOneUp', 'HalfTwoLow' ), + $this->fetchIds( $this->search->searchText( 'az' ) ), + "Search for normalized from Half-width Lower" ); + $this->assertEquals( + array( 'FullOneUp', 'FullTwoLow', 'HalfOneUp', 'HalfTwoLow' ), + $this->fetchIds( $this->search->searchText( 'ＡＺ' ) ), + "Search for normalized from Full-width Upper" ); + $this->assertEquals( + array( 'FullOneUp', 'FullTwoLow', 'HalfOneUp', 'HalfTwoLow' ), + $this->fetchIds( $this->search->searchText( 'ａｚ' ) ), + "Search for normalized from Full-width Lower" ); + } + + function testTextSearch() { + $this->assertEquals( + array( 'Smithee' ), + $this->fetchIds( $this->search->searchText( 'smithee' ) ), + "Plain search failed" ); } function testTextPowerSearch() { - if( is_null( $this->db ) ) { - $this->markTestIncomplete( "Can't find a database to test with." 
); - } $this->search->setNamespaces( array( 0, 1, 4 ) ); $this->assertEquals( array( 'Smithee', - 'Talk:Main Page', + 'Talk:Not Main Page', ), $this->fetchIds( $this->search->searchText( 'smithee' ) ), "Power search failed" ); } function testTitleSearch() { - if( is_null( $this->db ) ) { - $this->markTestIncomplete( "Can't find a database to test with." ); - } $this->assertEquals( array( 'Alan Smithee', @@ -93,9 +159,6 @@ class SearchEngineTest extends MediaWiki_Setup { } function testTextTitlePowerSearch() { - if( is_null( $this->db ) ) { - $this->markTestIncomplete( "Can't find a database to test with." ); - } $this->search->setNamespaces( array( 0, 1, 4 ) ); $this->assertEquals( array( @@ -108,6 +171,3 @@ class SearchEngineTest extends MediaWiki_Setup { } } - - -