From: Conrad Irwin Date: Sun, 28 Mar 2010 03:10:10 +0000 (+0000) Subject: Re-normalize titles after html entity decoding when necessary (bug 14952) X-Git-Tag: 1.31.0-rc.0~37328 X-Git-Url: http://git.cyclocoop.org/%24action?a=commitdiff_plain;h=ca5a3684c8d13ef9d7cab2ab612d8e1942dbedc1;p=lhc%2Fweb%2Fwiklou.git Re-normalize titles after html entity decoding when necessary (bug 14952) --- diff --git a/RELEASE-NOTES b/RELEASE-NOTES index 04c9f5c37d..c64ab37c48 100644 --- a/RELEASE-NOTES +++ b/RELEASE-NOTES @@ -62,6 +62,8 @@ it from source control: http://www.mediawiki.org/wiki/Download_from_SVN themselves unless they are given the 'unblockself' permission. * (bug 22876) Avoid possible PHP Notice if $wgDefaultUserOptions is not correctly set +* (bug 14952) Page titles are renormalized after html entities are removed so that + links with non-NFC character references work correctly. == API changes in 1.17 == * (bug 22738) Allow filtering by action type on query=logevent diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index 45b2cf6c13..01cd5b97ed 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -1176,6 +1176,30 @@ class Sanitizer { $text ); } + /** + * Decode any character references, numeric or named entities, + * in the text and normalize the resulting string. (bug 14952) + * + * This is useful for page titles, not for text to be displayed; + * MediaWiki allows HTML entities to escape normalization as a feature. 
+ * + * @param $text String (already normalized, containing entities) + * @return String (still normalized, without entities) + */ + public static function decodeCharReferencesAndNormalize( $text ) { + global $wgContLang; + $text = preg_replace_callback( + MW_CHAR_REFS_REGEX, + array( 'Sanitizer', 'decodeCharReferencesCallback' ), + $text, /* limit */ -1, $count ); + + if ( $count ) { + return $wgContLang->normalize( $text ); + } else { + return $text; + } + } + /** * @param $matches String * @return String diff --git a/includes/Title.php b/includes/Title.php index ea5373cc12..d3bd6ec63a 100644 --- a/includes/Title.php +++ b/includes/Title.php @@ -127,9 +127,9 @@ class Title { } /** - * Convert things like &eacute; &#257; or &#x3017; into real text... + * Convert things like &eacute; &#257; or &#x3017; into normalized (bug 14952) text */ - $filteredText = Sanitizer::decodeCharReferences( $text ); + $filteredText = Sanitizer::decodeCharReferencesAndNormalize( $text ); $t = new Title(); $t->mDbkeyform = str_replace( ' ', '_', $filteredText ); diff --git a/maintenance/parserTests.txt b/maintenance/parserTests.txt index aed2f053bd..e34d12699f 100644 --- a/maintenance/parserTests.txt +++ b/maintenance/parserTests.txt @@ -4114,6 +4114,31 @@ Character reference normalization in link text (bug 1938)

!!end +!! article +אַ +!! text +Test for unicode normalization + +The page's name is U+05d0 U+05b7, with non-canonical form U+FB2E +!! endarticle + +!! test +(bug 19451) Links should refer to the normalized form. +!! input +[[אַ]] +[[אַ]] +[[אÖ·]] +[[אַ]] +[[אַ]] +!! result +

+אַ +אÖ· +אַ +אַ +

+!! end + !! test Empty attribute crash test (bug 2067) !! input