From ca5a3684c8d13ef9d7cab2ab612d8e1942dbedc1 Mon Sep 17 00:00:00 2001
From: Conrad Irwin
Date: Sun, 28 Mar 2010 03:10:10 +0000
Subject: [PATCH] Re-normalize titles after html entity decoding when necessary
(bug 14952)
---
RELEASE-NOTES | 2 ++
includes/Sanitizer.php | 24 ++++++++++++++++++++++++
includes/Title.php | 4 ++--
maintenance/parserTests.txt | 25 +++++++++++++++++++++++++
4 files changed, 53 insertions(+), 2 deletions(-)
diff --git a/RELEASE-NOTES b/RELEASE-NOTES
index 04c9f5c37d..c64ab37c48 100644
--- a/RELEASE-NOTES
+++ b/RELEASE-NOTES
@@ -62,6 +62,8 @@ it from source control: http://www.mediawiki.org/wiki/Download_from_SVN
themselves unless they are given the 'unblockself' permission.
* (bug 22876) Avoid possible PHP Notice if $wgDefaultUserOptions is not
correctly set
+* (bug 14952) Page titles are renormalized after html entities are removed so that
+ links with non-NFC character references work correctly.
== API changes in 1.17 ==
* (bug 22738) Allow filtering by action type on query=logevent
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
index 45b2cf6c13..01cd5b97ed 100644
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -1176,6 +1176,30 @@ class Sanitizer {
$text );
}
+ /**
+ * Decode any character references, numeric or named entities,
+ * in the text and normalize the resulting string. (bug 14952)
+ *
+ * This is useful for page titles, but not for text to be displayed:
+ * MediaWiki allows HTML entities to escape normalization as a feature.
+ *
+ * @param $text String (already normalized, containing entities)
+ * @return String (still normalized, without entities)
+ */
+ public static function decodeCharReferencesAndNormalize( $text ) {
+ global $wgContLang;
+ $text = preg_replace_callback(
+ MW_CHAR_REFS_REGEX,
+ array( 'Sanitizer', 'decodeCharReferencesCallback' ),
+ $text, /* limit */ -1, $count );
+
+ if ( $count ) {
+ return $wgContLang->normalize( $text );
+ } else {
+ return $text;
+ }
+ }
+
/**
* @param $matches String
* @return String
diff --git a/includes/Title.php b/includes/Title.php
index ea5373cc12..d3bd6ec63a 100644
--- a/includes/Title.php
+++ b/includes/Title.php
@@ -127,9 +127,9 @@ class Title {
}
/**
- * Convert things like &eacute; &#257; or &#x3017; into real text...
+ * Convert things like &eacute; &#257; or &#x3017; into normalized (bug 14952) text
*/
- $filteredText = Sanitizer::decodeCharReferences( $text );
+ $filteredText = Sanitizer::decodeCharReferencesAndNormalize( $text );
$t = new Title();
$t->mDbkeyform = str_replace( ' ', '_', $filteredText );
diff --git a/maintenance/parserTests.txt b/maintenance/parserTests.txt
index aed2f053bd..e34d12699f 100644
--- a/maintenance/parserTests.txt
+++ b/maintenance/parserTests.txt
@@ -4114,6 +4114,31 @@ Character reference normalization in link text (bug 1938)
!!end
+!! article
+אַ
+!! text
+Test for unicode normalization
+
+The page's name is U+05d0 U+05b7, with non-canonical form U+FB2E
+!! endarticle
+
+!! test
+(bug 19451) Links should refer to the normalized form.
+!! input
+[[&#xFB2E;]]
+[[&#x5d0;&#x5b7;]]
+[[&#x5d0;ַ]]
+[[א&#x5b7;]]
+[[אַ]]
+!! result
+אַ
+אַ
+אÖ·
+×ַ
+×Ö·
+
+!! end
+
!! test
Empty attribute crash test (bug 2067)
!! input
--
2.20.1