From 907695c56a68b45bc7b594c3e039a5817b7e9f41 Mon Sep 17 00:00:00 2001 From: Aryeh Gregor Date: Fri, 23 Jul 2010 19:52:02 +0000 Subject: [PATCH] Initial commit for category collation framework Hidden behind $wgExperimentalCategorySort until it's reasonably complete. If that's false, no behavior should change (but I didn't test carefully, so poke me if there's a bug). See DefaultSettings.php for documentation on setting it to true. Currently you should not do this except if you're working on the feature, since functionality is not close to reasonable yet and will change rapidly. Bug 1211 is already fixed with this commit for me. However, many other things still need to be done, so this is all very much a proof-of-concept. --- includes/CategoryPage.php | 65 +++++++++++++++++++++++++++++++----- includes/DefaultSettings.php | 18 ++++++++++ includes/LinksUpdate.php | 36 ++++++++++++++++---- languages/Language.php | 54 ++++++++++++++++++++++++++++++ 4 files changed, 157 insertions(+), 16 deletions(-) diff --git a/includes/CategoryPage.php b/includes/CategoryPage.php index 56f85faabe..e07ae775a6 100644 --- a/includes/CategoryPage.php +++ b/includes/CategoryPage.php @@ -226,6 +226,8 @@ class CategoryViewer { } function doCategoryQuery() { + global $wgExperimentalCategorySort; + $dbr = wfGetDB( DB_SLAVE, 'category' ); if ( $this->from != '' ) { $pageCondition = 'cl_sortkey >= ' . $dbr->addQuotes( $this->from ); @@ -238,17 +240,23 @@ class CategoryViewer { $this->flip = false; } + $tables = array( 'page', 'categorylinks', 'category' ); + $fields = array( 'page_title', 'page_namespace', 'page_len', + 'page_is_redirect', 'cl_sortkey', 'cat_id', 'cat_title', + 'cat_subcats', 'cat_pages', 'cat_files' ); + $conds = array( $pageCondition, 'cl_to' => $this->title->getDBkey() ); + $opts = array( 'ORDER BY' => $this->flip ? 
'cl_sortkey DESC' : + 'cl_sortkey', 'USE INDEX' => array( 'categorylinks' => 'cl_sortkey' ) ); + $joins = array( 'categorylinks' => array( 'INNER JOIN', 'cl_from = page_id' ), + 'category' => array( 'LEFT JOIN', 'cat_title = page_title AND page_namespace = ' . NS_CATEGORY ) ); + $res = $dbr->select( - array( 'page', 'categorylinks', 'category' ), - array( 'page_title', 'page_namespace', 'page_len', 'page_is_redirect', 'cl_sortkey', - 'cat_id', 'cat_title', 'cat_subcats', 'cat_pages', 'cat_files' ), - array( $pageCondition, 'cl_to' => $this->title->getDBkey() ), + $tables, + $fields, + $conds + ( $wgExperimentalCategorySort ? array( 'cl_type' => 'page' ) : array() ), __METHOD__, - array( 'ORDER BY' => $this->flip ? 'cl_sortkey DESC' : 'cl_sortkey', - 'USE INDEX' => array( 'categorylinks' => 'cl_sortkey' ), - 'LIMIT' => $this->limit + 1 ), - array( 'categorylinks' => array( 'INNER JOIN', 'cl_from = page_id' ), - 'category' => array( 'LEFT JOIN', 'cat_title = page_title AND page_namespace = ' . NS_CATEGORY ) ) + $opts + array( 'LIMIT' => $this->limit + 1 ), + $joins ); $count = 0; @@ -273,6 +281,45 @@ class CategoryViewer { $this->addPage( $title, $x->cl_sortkey, $x->page_len, $x->page_is_redirect ); } } + + if ( $wgExperimentalCategorySort ) { + # Now add all subcategories and files. TODO: rewrite to be sane + # (this is basically a proof-of-concept, e.g., no pagination here). 
+ $subcatsRes = $dbr->select( + $tables, $fields, + $conds + array( 'cl_type' => 'subcat' ), + __METHOD__, $opts, $joins + ); + + foreach ( $subcatsRes as $row ) { + $title = Title::newFromRow( $row ); + + if ( $title->getNamespace() == NS_CATEGORY ) { + $cat = Category::newFromRow( $row, $title ); + $this->addSubcategoryObject( $cat, $row->cl_sortkey, $row->page_len ); + } else { + # Will handle this sanely in final code + throw new MWException( 'Debug: cl_type = subcat but not category' ); + } + } + + $filesRes = $dbr->select( + $tables, $fields, + $conds + array( 'cl_type' => 'file' ), + __METHOD__, $opts, $joins + ); + + foreach ( $filesRes as $row ) { + $title = Title::newFromRow( $row ); + + if ( $this->showGallery && $title->getNamespace() == NS_FILE ) { + $this->addImage( $title, $row->cl_sortkey, $row->page_len, $row->page_is_redirect ); + } else { + # More temporary debugging. NOTE(review): this else-branch is also taken for a genuine NS_FILE title whenever $this->showGallery is false, so the message below can mislead + throw new MWException( 'Debug: cl_type = file but not file' ); + } + } + } } function getCategoryTop() { diff --git a/includes/DefaultSettings.php b/includes/DefaultSettings.php index 893251d4e8..42dc3df957 100644 --- a/includes/DefaultSettings.php +++ b/includes/DefaultSettings.php @@ -4458,6 +4458,24 @@ $wgCategoryPagingLimit = 200; */ $wgCategoryPrefixedDefaultSortkey = true; +/** + * Enable experimental support for non-braindead collation on category pages. + * For this to work, you need to alter your categorylinks table by applying + * maintenance/archives/patch-categorylinks-better-collation.sql, then keep + * up-to-date with changes that are made to that file (they won't be + * automatically applied). You should also set $wgUseDumbLinkUpdate = true and + * run maintenance/refreshLinks.php. + */ +$wgExperimentalCategorySort = false; + +/** + * A version indicator for collations that will be stored in cl_collation for + * all new rows. 
Used when the collation algorithm changes: a script checks + * for all rows where cl_collation < $wgCollationVersion and regenerates + * cl_sortkey based on cl_raw_sortkey. + */ +$wgCollationVersion = 0; + /** @} */ # End categories } /*************************************************************************//** diff --git a/includes/LinksUpdate.php b/includes/LinksUpdate.php index aebf24961d..9cb11b9ba9 100644 --- a/includes/LinksUpdate.php +++ b/includes/LinksUpdate.php @@ -426,18 +426,40 @@ class LinksUpdate { * @private */ function getCategoryInsertions( $existing = array() ) { - global $wgContLang; + global $wgContLang, $wgExperimentalCategorySort, $wgCollationVersion; $diffs = array_diff_assoc( $this->mCategories, $existing ); $arr = array(); foreach ( $diffs as $name => $sortkey ) { $nt = Title::makeTitleSafe( NS_CATEGORY, $name ); $wgContLang->findVariantLink( $name, $nt, true ); - $arr[] = array( - 'cl_from' => $this->mId, - 'cl_to' => $name, - 'cl_sortkey' => $sortkey, - 'cl_timestamp' => $this->mDb->timestamp() - ); + + if ( $wgExperimentalCategorySort ) { + if ( $this->mTitle->getNamespace() == NS_CATEGORY ) { + $type = 'subcat'; + } elseif ( $this->mTitle->getNamespace() == NS_FILE ) { + $type = 'file'; + } else { + $type = 'page'; + } + $convertedSortkey = $wgContLang->convertToSortkey( $sortkey ); + # TODO: Set $sortkey (written below as cl_raw_sortkey) to null if it's redundant + $arr[] = array( + 'cl_from' => $this->mId, + 'cl_to' => $name, + 'cl_sortkey' => $convertedSortkey, + 'cl_timestamp' => $this->mDb->timestamp(), + 'cl_raw_sortkey' => $sortkey, + 'cl_collation' => $wgCollationVersion, + 'cl_type' => $type, + ); + } else { + $arr[] = array( + 'cl_from' => $this->mId, + 'cl_to' => $name, + 'cl_sortkey' => $sortkey, + 'cl_timestamp' => $this->mDb->timestamp() + ); + } } return $arr; } diff --git a/languages/Language.php b/languages/Language.php index 41619f7bd7..95d1426e17 100644 --- a/languages/Language.php +++ b/languages/Language.php @@ -2934,4 +2934,58 @@ class Language { 
function getConvRuleTitle() { return $this->mConverter->getConvRuleTitle(); } + + /** + * Given a string, convert it to a (hopefully short) key that can be used + * for efficient sorting. A binary sort according to the sortkeys + * corresponds to a logical sort of the corresponding strings. Applying + * this to cl_raw_sortkey produces cl_sortkey. + * + * @param string $string UTF-8 string + * @return string Binary sortkey + */ + public function convertToSortkey( $string ) { + # Stub function for now: the input is returned unchanged + return $string; + } + + /** + * Does it make sense for lists to be split up into sections based on their + * first letter? Logogram-based scripts probably want to return false. + * + * TODO: Use this in CategoryPage.php. + * + * @return boolean + */ + public function usesFirstLettersInLists() { + return true; + } + + /** + * Given a string, return the logical "first letter" to be used for + * grouping on category pages and so on. This has to be coordinated + * carefully with convertToSortkey(), or else the sorted list might jump + * back and forth between the same "initial letters" or show other pathological + * behavior. For instance, if you just return the first character, but "a" + * sorts the same as "A" based on convertToSortkey(), then you might get a + * list like + * + * == A == + * * [[Aardvark]] + * + * == a == + * * [[antelope]] + * + * == A == + * * [[Ape]] + * + * etc., assuming for the sake of argument that $wgCapitalLinks is false. + * Obviously, this is ignored if usesFirstLettersInLists() is false. + * + * @param string $string UTF-8 string + * @return string UTF-8 string corresponding to the first letter of input (the first character as extracted by mb_substr(), called without an explicit encoding argument) + */ + public function firstLetterForLists( $string ) { + return mb_substr( $string, 0, 1 ); + } } -- 2.20.1