* Implement normalization of fullwidth latin characters for all Languages, not just...
author Mark A. Hershberger <mah@users.mediawiki.org>
Tue, 23 Mar 2010 19:50:59 +0000 (19:50 +0000)
committer Mark A. Hershberger <mah@users.mediawiki.org>
Tue, 23 Mar 2010 19:50:59 +0000 (19:50 +0000)
* Tune Language::convertDoubleWidth() so that it is 8-10x faster.  (See http://xrl.us/bg2mon)

languages/Language.php
languages/classes/LanguageJa.php
languages/classes/LanguageZh_hans.php
maintenance/tests/SearchDbTest.php
maintenance/tests/SearchEngineTest.php

index 3e3415c..ef50f73 100644 (file)
@@ -1707,7 +1707,7 @@ class Language {
         * @return String
         */
        function normalizeForSearch( $string ) {
-               return $string;
+               return self::convertDoubleWidth($string);
        }
 
        /**
@@ -1715,8 +1715,17 @@ class Language {
         * range: ff00-ff5f ~= 0020-007f
         */
        protected static function convertDoubleWidth( $string ) {
-               $string = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $string );
-               $string = preg_replace( '/\xef\xbd([\x80-\x9a])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $string );
+               static $full = null;
+               static $half = null;
+
+               if( $full === null ) {
+                       $fullWidth = "０１２３４５６７８９ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ";
+                       $halfWidth = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+                       $full = str_split( $fullWidth, 3 );
+                       $half = str_split( $halfWidth );
+               }
+
+               $string = str_replace( $full, $half, $string );
                return $string;
        }
 
index e56d230..c5a992d 100644 (file)
@@ -23,14 +23,6 @@ class LanguageJa extends Language {
                return $s;
        }
 
-       function normalizeForSearch( $string ) {
-               // Double-width roman characters
-               $s = self::convertDoubleWidth( $string );
-               
-               # Do general case folding and UTF-8 armoring
-               return parent::normalizeForSearch( $s );
-       }
-
        # Italic is not appropriate for Japanese script
        # Unfortunately most browsers do not recognise this, and render <em> as italic
        function emphasize( $text ) {
index 4b20e62..69f7e20 100644 (file)
@@ -23,10 +23,9 @@ class LanguageZh_hans extends Language {
                wfProfileIn( __METHOD__ );
 
                // Double-width roman characters
-               $s = self::convertDoubleWidth( $string );
+               $s = parent::normalizeForSearch( $s );
                $s = trim( $s );
                $s = self::segmentByWord( $s );
-               $s = parent::normalizeForSearch( $s );
 
                wfProfileOut( __METHOD__ );
                return $s;
index 1f22340..190742a 100644 (file)
@@ -6,23 +6,22 @@ class SearchDbTest extends SearchEngineTest {
 
        function setUp() {
                global $wgDBprefix, $wgDBtype;
-
-               if($wgDBprefix === "parsertest_" ||
-                  ($wgDBtype === 'oracle' && $wgDBprefix === 'pt_')) {
-                       $this->markTestSkipped("This test can't (yet?) be run with the parser tests");
-               }
+               $this->db = wfGetDB( DB_MASTER );
+               if( !$this->db  ) {
+                       $this->markTestIncomplete( "Can't find a database to test with." );
+               }
 
                $GLOBALS['wgContLang'] = new Language;
-               $this->db = $this->buildTestDatabase(
-                       array( 'page', 'revision', 'text', 'searchindex', 'user' ) );
-               if( $this->db ) {
-                       $this->insertSearchData();
-               }
-               $searchType = preg_replace("/Database/", "Search", get_class($this->db));
+               $this->insertSearchData();
+
+               $this->insertSearchData();
+               $searchType = preg_replace("/Database/", "Search",
+                                                                  get_class($this->db));
                $this->search = new $searchType( $this->db );
        }
 
        function tearDown() {
+               $this->removeSearchData();
                if( !is_null( $this->db ) ) {
                        wfGetLB()->closeConnecton( $this->db );
                }
index 8f6b86b..fa6964f 100644 (file)
@@ -6,42 +6,44 @@ require_once( 'MediaWiki_Setup.php' );
  * @group Stub
  */
 class SearchEngineTest extends MediaWiki_Setup {
-       var $db, $search;
-       private $count = 0;
-
-       function insertSearchData() {
-               $this->insertPage("Main_Page",      "This is a main page", 0);
-               $this->insertPage('Main_Page',      'This is a talk page to the main page, see [[smithee]]', 1);
-               $this->insertPage('Smithee',            'A smithee is one who smiths. See also [[Alan Smithee]]', 0);
-               $this->insertPage('Smithee',            'This article sucks.', 1);
-               $this->insertPage('Unrelated_page',     'Nothing in this page is about the S word.', 0);
-               $this->insertPage('Another_page',       'This page also is unrelated.', 0);
-               $this->insertPage('Help',                       'Help me!', 4);
-               $this->insertPage('Thppt',                      'Blah blah', 0);
-               $this->insertPage('Alan_Smithee',       'yum', 0);
-               $this->insertPage('Pages',                      'are food', 0);
-               $this->insertPage('DblPageOne',         'ABCDEF', 0);
-               $this->insertPage('DblPageTwo',         'ABCDE', 0);
-               $this->insertPage('DblPageTwoLow',  'abcde', 0);
-       }
+       var $db, $search, $pageList;
 
-       function normalize( $text ) {
-               return strtolower(preg_replace("/[^[:alnum:] ]/", " ", $text));
+       function pageExists( $title ) {
+               return false;
        }
 
-       function insertPage( $pageName, $text, $ns ) {
-               $this->count++;
-               $this->db->safeQuery( 'INSERT INTO ! (page_id,page_namespace,page_title,page_latest) VALUES (?,?,?,?)',
-                       $this->db->tableName( 'page' ), $this->count, $ns, $pageName, $this->count );
-               $this->db->safeQuery( 'INSERT INTO ! (rev_id,rev_page) VALUES (?, ?)',
-                       $this->db->tableName( 'revision' ), $this->count, $this->count );
-               $this->db->safeQuery( 'INSERT INTO ! (old_id,old_text) VALUES (?, ?)',
-                       $this->db->tableName( 'text' ), $this->count, $text );
-               $this->db->safeQuery( 'INSERT INTO ! (si_page,si_title,si_text) VALUES (?, ?, ?)',
-                       $this->db->tableName( 'searchindex' ), $this->count,
-                       $this->normalize( $pageName ), $this->normalize( $text ) );
+       function insertSearchData() {
+           if( $this->pageExists( 'Not_Main_Page' ) ) {
+               return;
+           }
+           $this->insertPage("Not_Main_Page",  "This is not a main page", 0);
+           $this->insertPage('Talk:Not_Main_Page',     'This is not a talk page to the main page, see [[smithee]]', 1);
+           $this->insertPage('Smithee',        'A smithee is one who smiths. See also [[Alan Smithee]]', 0);
+           $this->insertPage('Talk:Smithee',   'This article sucks.', 1);
+           $this->insertPage('Unrelated_page', 'Nothing in this page is about the S word.', 0);
+           $this->insertPage('Another_page',   'This page also is unrelated.', 0);
+           $this->insertPage('Help:Help',              'Help me!', 4);
+           $this->insertPage('Thppt',          'Blah blah', 0);
+           $this->insertPage('Alan_Smithee',   'yum', 0);
+           $this->insertPage('Pages',          'are\'food', 0);
+           $this->insertPage('HalfOneUp',      'AZ', 0);
+           $this->insertPage('FullOneUp',      'ＡＺ', 0);
+           $this->insertPage('HalfTwoLow',     'az', 0);
+           $this->insertPage('FullTwoLow',     'ａｚ', 0);
+           $this->insertPage('HalfNumbers',    '1234567890', 0);
+           $this->insertPage('FullNumbers',    '１２３４５６７８９０', 0);
+           $this->insertPage('DomainName',     'example.com', 0);
        }
 
+       function removeSearchData() {
+            return;
+            while( count($this->pageList) ) {
+                list( $title, $id ) = array_pop( $this->pageList );
+                $article = new Article( $title, $id );
+                $article->doDeleteArticle("Search Test");
+            }
+       }
+
        function fetchIds( $results ) {
                $matches = array();
                while( $row = $results->next() ) {
@@ -55,34 +57,98 @@ class SearchEngineTest extends MediaWiki_Setup {
                return $matches;
        }
 
-       function testTextSearch() {
-               if( is_null( $this->db ) ) {
-                       $this->markTestIncomplete( "Can't find a database to test with." );
-               }
-               $this->assertEquals(
-                       array( 'Smithee' ),
-                       $this->fetchIds( $this->search->searchText( 'smithee' ) ),
-                       "Plain search failed" );
+       // Modified version of WikiRevision::importOldRevision()
+       function insertPage( $pageName, $text, $ns ) {
+            $dbw = $this->db;
+            $title = Title::newFromText( $pageName );
+
+            $userId = 0;
+            $userText = 'WikiSysop';
+            $comment = 'Search Test';
+
+            // avoid memory leak...?
+            $linkCache = LinkCache::singleton();
+            $linkCache->clear();
+
+            $article = new Article( $title );
+            $pageId = $article->getId();
+            $created = false;
+            if( $pageId == 0 ) {
+                # must create the page...
+                $pageId = $article->insertOn( $dbw );
+                $created = true;
+            }
+
+            # FIXME: Use original rev_id optionally (better for backups)
+            # Insert the row
+            $revision = new Revision( array(
+                            'page'       => $pageId,
+                            'text'       => $text,
+                            'comment'    => $comment,
+                            'user'       => $userId,
+                            'user_text'  => $userText,
+                            'timestamp'  => 0,
+                            'minor_edit' => false,
+                       ) );
+            $revId = $revision->insertOn( $dbw );
+            $changed = $article->updateIfNewerOn( $dbw, $revision );
+
+            $GLOBALS['wgTitle'] = $title;
+            if( $created ) {
+                Article::onArticleCreate( $title );
+                $article->createUpdates( $revision );
+            } elseif( $changed ) {
+                Article::onArticleEdit( $title );
+                $article->editUpdates(
+                    $text, $comment, false, 0, $revId );
+            }
+
+            $su = new SearchUpdate($article->getId(), $pageName, $text);
+            $su->doUpdate();
+
+            $this->pageList[] = array( $title, $article->getId() );
+
+            return true;
+        }
+
+       function testFullWidth() {
+            $this->assertEquals(
+                array( 'FullOneUp', 'FullTwoLow', 'HalfOneUp', 'HalfTwoLow' ),
+                $this->fetchIds( $this->search->searchText( 'AZ' ) ),
+                "Search for normalized from Half-width Upper" );
+            $this->assertEquals(
+                array( 'FullOneUp', 'FullTwoLow', 'HalfOneUp', 'HalfTwoLow' ),
+                $this->fetchIds( $this->search->searchText( 'az' ) ),
+                "Search for normalized from Half-width Lower" );
+            $this->assertEquals(
+                array( 'FullOneUp', 'FullTwoLow', 'HalfOneUp', 'HalfTwoLow' ),
+                $this->fetchIds( $this->search->searchText( 'ＡＺ' ) ),
+                "Search for normalized from Full-width Upper" );
+            $this->assertEquals(
+                array( 'FullOneUp', 'FullTwoLow', 'HalfOneUp', 'HalfTwoLow' ),
+                $this->fetchIds( $this->search->searchText( 'ａｚ' ) ),
+                "Search for normalized from Full-width Lower" );
+       }
+
+        function testTextSearch() {
+            $this->assertEquals(
+                array( 'Smithee' ),
+                $this->fetchIds( $this->search->searchText( 'smithee' ) ),
+                "Plain search failed" );
        }
 
        function testTextPowerSearch() {
-               if( is_null( $this->db ) ) {
-                       $this->markTestIncomplete( "Can't find a database to test with." );
-               }
                $this->search->setNamespaces( array( 0, 1, 4 ) );
                $this->assertEquals(
                        array(
                                'Smithee',
-                               'Talk:Main Page',
+                               'Talk:Not Main Page',
                        ),
                        $this->fetchIds( $this->search->searchText( 'smithee' ) ),
                        "Power search failed" );
        }
 
        function testTitleSearch() {
-               if( is_null( $this->db ) ) {
-                       $this->markTestIncomplete( "Can't find a database to test with." );
-               }
                $this->assertEquals(
                        array(
                                'Alan Smithee',
@@ -93,9 +159,6 @@ class SearchEngineTest extends MediaWiki_Setup {
        }
 
        function testTextTitlePowerSearch() {
-               if( is_null( $this->db ) ) {
-                       $this->markTestIncomplete( "Can't find a database to test with." );
-               }
                $this->search->setNamespaces( array( 0, 1, 4 ) );
                $this->assertEquals(
                        array(
@@ -108,6 +171,3 @@ class SearchEngineTest extends MediaWiki_Setup {
        }
 
 }
-
-
-