From 022b7ba1406a4f402ff24c355288cb6ffd6eb82d Mon Sep 17 00:00:00 2001 From: Aryeh Gregor Date: Mon, 26 Jul 2010 19:27:13 +0000 Subject: [PATCH] Reconcept cl_raw_sortkey as cl_sortkey_prefix In response to feedback by Phillipe Verdy on bug 164. Now if a bunch of pages have [[Category:Foo| ]], they'll sort amongst themselves according to page name, instead of in basically random order as it is currently. This also makes storage more elegant and intuitive: instead of giving NULL a magic meaning when there's no custom sortkey specified, we just store an empty string, since there's no prefix. This means {{defaultsort:}} really now means {{defaultsortprefix:}}, which is slightly confusing, and a lot of code is now slightly misleading or poorly named. But it should all work fine. Also, while I was at it, I made updateCollation.php work as a transition script, so you can apply the SQL patch and then run updateCollation.php and things will work. However, with the new schema it's not trivial to reverse this -- you'd have to recover the raw sort keys with some PHP. Conversion goes at about a thousand rows a second for me, and seems to be CPU-bound. Could probably be optimized. I also adjusted the transition script so it will fix rows with collation versions *greater* than the current one, as well as less. Thus if some site wants to use their own collation, they can call it 137 or something, and if they later want to switch back to MediaWiki stock collation 7, it will work. Also fixed a silly bug in updateCollation.php where it would say "1000 done" if it did nothing, and changed $res->numRows() >= self::BATCH_SIZE to == so people don't wonder how it could be bigger (since it can't, I hope). --- includes/CategoryPage.php | 9 +-- includes/DefaultSettings.php | 4 +- includes/LinksUpdate.php | 25 ++++++-- includes/Title.php | 18 ++++++ includes/parser/Parser.php | 6 +- languages/Language.php | 4 +- .../patch-categorylinks-better-collation.sql | 4 +- maintenance/updateCollation.php | 59 ++++++++++++++----- 8 files changed, 96 insertions(+), 33 deletions(-) diff --git a/includes/CategoryPage.php b/includes/CategoryPage.php index c614a279fd..97a60985f7 100644 --- a/includes/CategoryPage.php +++ b/includes/CategoryPage.php @@ -270,7 +270,7 @@ class CategoryViewer { foreach ( array( 'page', 'subcat', 'file' ) as $type ) { $res = $dbr->select( $tables, - array_merge( $fields, array( 'cl_raw_sortkey' ) ), + array_merge( $fields, array( 'cl_sortkey_prefix' ) ), $conds + array( 'cl_type' => $type ) + ( $type == 'page' ? array( $pageCondition ) : array() ), __METHOD__, $opts + ( $type == 'page' ? array( 'LIMIT' => $this->limit + 1 ) : array() ), @@ -286,14 +286,15 @@ class CategoryViewer { } $title = Title::newFromRow( $row ); + $rawSortkey = $row->cl_sortkey_prefix . $title->getCategorySortkey(); if ( $title->getNamespace() == NS_CATEGORY ) { $cat = Category::newFromRow( $row, $title ); - $this->addSubcategoryObject( $cat, $row->cl_raw_sortkey, $row->page_len ); + $this->addSubcategoryObject( $cat, $rawSortkey, $row->page_len ); } elseif ( $this->showGallery && $title->getNamespace() == NS_FILE ) { - $this->addImage( $title, $row->cl_raw_sortkey, $row->page_len, $row->page_is_redirect ); + $this->addImage( $title, $rawSortkey, $row->page_len, $row->page_is_redirect ); } else { - $this->addPage( $title, $row->cl_raw_sortkey, $row->page_len, $row->page_is_redirect ); + $this->addPage( $title, $rawSortkey, $row->page_len, $row->page_is_redirect ); } } } diff --git a/includes/DefaultSettings.php b/includes/DefaultSettings.php index 1e9b6cd177..ba1c7c5d16 100644 --- a/includes/DefaultSettings.php +++ b/includes/DefaultSettings.php @@ -4476,8 +4476,8 @@ $wgExperimentalCategorySort = false; /** * A version indicator for collations that will be stored in cl_collation for * all new rows. Used when the collation algorithm changes: a script checks - * for all rows where cl_collation < $wgCollationVersion and regenerates - * cl_sortkey based on cl_raw_sortkey. + * for all rows where cl_collation != $wgCollationVersion and regenerates + * cl_sortkey based on the page name and cl_sortkey_prefix. */ $wgCollationVersion = 1; diff --git a/includes/LinksUpdate.php b/includes/LinksUpdate.php index 9cb11b9ba9..fd62cff649 100644 --- a/includes/LinksUpdate.php +++ b/includes/LinksUpdate.php @@ -441,14 +441,31 @@ class LinksUpdate { } else { $type = 'page'; } - $convertedSortkey = $wgContLang->convertToSortkey( $sortkey ); - # TODO: Set $sortkey to null if it's redundant + + # TODO: This is kind of wrong, because someone might set a sort + # key prefix that's the same as the default sortkey for the + # title. This should be fixed by refactoring code to replace + # $sortkey in this array by a prefix, but it's basically harmless + # (Title::moveTo() has had the same issue for a long time). + if ( $this->mTitle->getCategorySortkey() == $sortkey ) { + $prefix = ''; + $sortkey = $wgContLang->convertToSortkey( $sortkey ); + } else { + # Treat custom sortkeys as a prefix, so that if multiple + # things are forced to sort as '*' or something, they'll + # sort properly in the category rather than in page_id + # order or such. + $prefix = $sortkey; + $sortkey = $wgContLang->convertToSortkey( + $prefix . $this->mTitle->getCategorySortkey() ); + } + $arr[] = array( 'cl_from' => $this->mId, 'cl_to' => $name, - 'cl_sortkey' => $convertedSortkey, + 'cl_sortkey' => $sortkey, 'cl_timestamp' => $this->mDb->timestamp(), - 'cl_raw_sortkey' => $sortkey, + 'cl_sortkey_prefix' => $prefix, 'cl_collation' => $wgCollationVersion, 'cl_type' => $type, ); diff --git a/includes/Title.php b/includes/Title.php index cb2c554420..aa0ec5f3d3 100644 --- a/includes/Title.php +++ b/includes/Title.php @@ -4137,4 +4137,22 @@ class Title { return $types; } + + /** + * Returns what the default sort key for categories would be, if + * {{defaultsort:}} isn't used. This is the same as getText() for + * categories, and for everything if $wgCategoryPrefixedDefaultSortkey is + * false; otherwise it's the same as getPrefixedText(). + * + * @return string + */ + public function getCategorySortkey() { + global $wgCategoryPrefixedDefaultSortkey; + if ( $this->getNamespace() == NS_CATEGORY + || !$wgCategoryPrefixedDefaultSortkey ) { + return $this->getText(); + } else { + return $this->getPrefixedText(); + } + } } diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php index 1fb6d981b8..4d16393125 100644 --- a/includes/parser/Parser.php +++ b/includes/parser/Parser.php @@ -5056,12 +5056,8 @@ class Parser { global $wgCategoryPrefixedDefaultSortkey; if ( $this->mDefaultSort !== false ) { return $this->mDefaultSort; - } elseif ( $this->mTitle->getNamespace() == NS_CATEGORY || - !$wgCategoryPrefixedDefaultSortkey ) - { - return $this->mTitle->getText(); } else { - return $this->mTitle->getPrefixedText(); + return $this->mTitle->getCategorySortkey(); } } diff --git a/languages/Language.php b/languages/Language.php index 211cf0537b..4b6a72f54f 100644 --- a/languages/Language.php +++ b/languages/Language.php @@ -2939,7 +2939,9 @@ class Language { * Given a string, convert it to a (hopefully short) key that can be used * for efficient sorting. A binary sort according to the sortkeys * corresponds to a logical sort of the corresponding strings. Applying - * this to cl_raw_sortkey produces cl_sortkey. + * this to cl_sortkey_prefix concatenated with the page title (possibly + * with namespace prefix, depending on $wgCategoryPrefixedDefaultSortkey) + * gives you cl_sortkey. * * @param string $string UTF-8 string * @return string Binary sortkey diff --git a/maintenance/archives/patch-categorylinks-better-collation.sql b/maintenance/archives/patch-categorylinks-better-collation.sql index f73dd524b8..5722a33c95 100644 --- a/maintenance/archives/patch-categorylinks-better-collation.sql +++ b/maintenance/archives/patch-categorylinks-better-collation.sql @@ -8,9 +8,9 @@ -- to work for MySQL for now, without table prefixes, possibly other random -- limitations. ALTER TABLE categorylinks - ADD COLUMN cl_raw_sortkey varchar(255) binary NULL default NULL, + ADD COLUMN cl_sortkey_prefix varchar(255) binary NOT NULL default '', ADD COLUMN cl_collation tinyint NOT NULL default 0, - ADD COLUMN cl_type ENUM('page', 'subcat', 'file') NOT NULL, + ADD COLUMN cl_type ENUM('page', 'subcat', 'file') NOT NULL default 'page', ADD INDEX (cl_collation), DROP INDEX cl_sortkey, ADD INDEX cl_sortkey (cl_to, cl_type, cl_sortkey, cl_from); diff --git a/maintenance/updateCollation.php b/maintenance/updateCollation.php index 93c845db16..f842537153 100644 --- a/maintenance/updateCollation.php +++ b/maintenance/updateCollation.php @@ -1,6 +1,6 @@ mDescription = <<addOption( 'force', 'Run on all rows, even if the collation is supposed to be up-to-date.' ); @@ -32,8 +32,8 @@ TEXT; $dbw = wfGetDB( DB_MASTER ); $count = $dbw->estimateRowCount( 'categorylinks', - array( 'cl_from', 'cl_to', 'cl_raw_sortkey' ), - 'cl_collation < ' . $dbw->addQuotes( $wgCollationVersion ), + array( 'cl_from', 'cl_to', 'cl_sortkey_prefix' ), + 'cl_collation != ' . $dbw->addQuotes( $wgCollationVersion ), __METHOD__ ); @@ -42,21 +42,50 @@ TEXT; $count = 0; do { $res = $dbw->select( - 'categorylinks', - array( 'cl_from', 'cl_to', 'cl_raw_sortkey' ), - 'cl_collation < ' . $dbw->addQuotes( $wgCollationVersion ), + array( 'categorylinks', 'page' ), + array( 'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation', + 'cl_sortkey', 'page_namespace', 'page_title' + ), + array( + 'cl_collation != ' . $dbw->addQuotes( $wgCollationVersion ), + 'cl_from = page_id' + ), __METHOD__, array( 'LIMIT' => self::BATCH_SIZE ) ); $dbw->begin(); foreach ( $res as $row ) { - # TODO: Handle the case where cl_raw_sortkey is null. + $title = Title::newFromRow( $row ); + $rawSortkey = $title->getCategorySortkey(); + if ( $row->cl_collation == 0 ) { + # This is an old-style row, so the sortkey needs to be + # converted. + if ( $row->cl_sortkey == $rawSortkey ) { + $prefix = ''; + } else { + # Custom sortkey, use it as a prefix + $prefix = $row->cl_sortkey; + } + } else { + $prefix = $row->cl_sortkey_prefix; + } + # cl_type will be wrong for lots of pages if cl_collation is 0, + # so let's update it while we're here. + if ( $title->getNamespace() == NS_CATEGORY ) { + $type = 'subcat'; + } elseif ( $title->getNamespace() == NS_FILE ) { + $type = 'file'; + } else { + $type = 'page'; + } $dbw->update( 'categorylinks', array( - 'cl_sortkey' => $wgContLang->convertToSortkey( $row->cl_raw_sortkey ), - 'cl_collation' => $wgCollationVersion + 'cl_sortkey' => $wgContLang->convertToSortkey( $prefix . $rawSortkey ), + 'cl_sortkey_prefix' => $prefix, + 'cl_collation' => $wgCollationVersion, + 'cl_type' => $type, ), array( 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ), __METHOD__ @@ -64,9 +93,9 @@ TEXT; } $dbw->commit(); - $count += self::BATCH_SIZE; + $count += $res->numRows(); $this->output( "$count done.\n" ); - } while ( $res->numRows() >= self::BATCH_SIZE ); + } while ( $res->numRows() == self::BATCH_SIZE ); } } -- 2.20.1