From 129067c907ea65f621ab64cdfff59fd2b28091e1 Mon Sep 17 00:00:00 2001
From: Max Semenik
Date: Thu, 2 Nov 2017 19:35:11 -0700
Subject: [PATCH] Remove nbsp and similar characters from section IDs
Bug: T90902
Change-Id: I71bdb7dd43c3e532287290e3c691d9739da45475
---
RELEASE-NOTES-1.31 | 1 +
includes/parser/Parser.php | 24 ++++++++++++++++++++++++
tests/parser/parserTests.txt | 14 ++++++++++++++
3 files changed, 39 insertions(+)
diff --git a/RELEASE-NOTES-1.31 b/RELEASE-NOTES-1.31
index 4bfcfcb5de..3688163f23 100644
--- a/RELEASE-NOTES-1.31
+++ b/RELEASE-NOTES-1.31
@@ -41,6 +41,7 @@ production.
* …
=== Bug fixes in 1.31 ===
+* (T90902) Non-breaking space in header ID breaks anchor
* …
=== Action API changes in 1.31 ===
diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php
index f2e47dc36a..3548da9581 100644
--- a/includes/parser/Parser.php
+++ b/includes/parser/Parser.php
@@ -4206,6 +4206,9 @@ class Parser {
# Decode HTML entities
$safeHeadline = Sanitizer::decodeCharReferences( $safeHeadline );
+
+ $safeHeadline = $this->normalizeSectionName( $safeHeadline );
+
$fallbackHeadline = Sanitizer::escapeIdForAttribute( $safeHeadline, Sanitizer::ID_FALLBACK );
$linkAnchor = Sanitizer::escapeIdForLink( $safeHeadline );
$safeHeadline = Sanitizer::escapeIdForAttribute( $safeHeadline, Sanitizer::ID_PRIMARY );
@@ -5767,6 +5770,8 @@ class Parser {
$text = $this->stripSectionName( $text );
$text = Sanitizer::normalizeSectionNameWhitespace( $text );
$text = Sanitizer::decodeCharReferences( $text );
+ $text = $this->normalizeSectionName( $text );
+
return '#' . Sanitizer::escapeIdForLink( $text );
}
@@ -5786,6 +5791,7 @@ class Parser {
$text = $this->stripSectionName( $text );
$text = Sanitizer::normalizeSectionNameWhitespace( $text );
$text = Sanitizer::decodeCharReferences( $text );
+ $text = $this->normalizeSectionName( $text );
if ( isset( $wgFragmentMode[1] ) && $wgFragmentMode[1] === 'legacy' ) {
// ForAttribute() and ForLink() are the same for legacy encoding
@@ -5797,6 +5803,24 @@ class Parser {
return "#$id";
}
+ /**
+ * Apply the same normalization as code making links to this section would,
+ * by passing the name through the title parser as a fragment (T90902).
+ * @param string $text Section name to normalize
+ * @return string The parsed 'fragment' part, or $text unchanged if parsing throws
+ */
+ private function normalizeSectionName( $text ) {
+ # T90902: ensure the same normalization is applied for IDs as to links
+ $titleParser = MediaWikiServices::getInstance()->getTitleParser();
+ try {
+ # Prefix with '#'; the normalized name is read back from $parts['fragment']
+ $parts = $titleParser->splitTitleString( "#$text" );
+ } catch ( MalformedTitleException $ex ) {
+ return $text;
+ }
+ return $parts['fragment'];
+ }
+
/**
* Strips a text string of wikitext for use in a section anchor
*
diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt
index 3c861ea10b..1204dbd715 100644
--- a/tests/parser/parserTests.txt
+++ b/tests/parser/parserTests.txt
@@ -29536,3 +29536,17 @@ wgFragmentMode=[ 'html5' ]
#å¤é
#å¤é
!! end
+
+!! test
+T90902: Normalize weird characters in section IDs
+!! config
+wgFragmentMode=[ 'html5', 'legacy' ]
+!! wikitext
+== Foo bar ==
+[[#Foo bar]]
+
+!! html/php
+Foo bar[edit]
+#Foo bar
+
+!! end
--
2.20.1