From 129067c907ea65f621ab64cdfff59fd2b28091e1 Mon Sep 17 00:00:00 2001
From: Max Semenik <maxsem.wiki@gmail.com>
Date: Thu, 2 Nov 2017 19:35:11 -0700
Subject: [PATCH] Remove nbsp and similar characters from section IDs

Bug: T90902
Change-Id: I71bdb7dd43c3e532287290e3c691d9739da45475
---
 RELEASE-NOTES-1.31           |  1 +
 includes/parser/Parser.php   | 24 ++++++++++++++++++++++++
 tests/parser/parserTests.txt | 14 ++++++++++++++
 3 files changed, 39 insertions(+)
diff --git a/RELEASE-NOTES-1.31 b/RELEASE-NOTES-1.31
index 4bfcfcb5de..3688163f23 100644
--- a/RELEASE-NOTES-1.31
+++ b/RELEASE-NOTES-1.31
@@ -41,6 +41,7 @@ production.
 * â¦
 
 === Bug fixes in 1.31 ===
+* (T90902) Non-breaking space in header ID breaks anchor
 * â¦
 
 === Action API changes in 1.31 ===
diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php
index f2e47dc36a..3548da9581 100644
--- a/includes/parser/Parser.php
+++ b/includes/parser/Parser.php
@@ -4206,6 +4206,9 @@ class Parser {
 
 			# Decode HTML entities
 			$safeHeadline = Sanitizer::decodeCharReferences( $safeHeadline );
+
+			$safeHeadline = $this->normalizeSectionName( $safeHeadline );
+
 			$fallbackHeadline = Sanitizer::escapeIdForAttribute( $safeHeadline, Sanitizer::ID_FALLBACK );
 			$linkAnchor = Sanitizer::escapeIdForLink( $safeHeadline );
 			$safeHeadline = Sanitizer::escapeIdForAttribute( $safeHeadline, Sanitizer::ID_PRIMARY );
@@ -5767,6 +5770,8 @@ class Parser {
 		$text = $this->stripSectionName( $text );
 		$text = Sanitizer::normalizeSectionNameWhitespace( $text );
 		$text = Sanitizer::decodeCharReferences( $text );
+		$text = $this->normalizeSectionName( $text );
+
 		return '#' . Sanitizer::escapeIdForLink( $text );
 	}
 
@@ -5786,6 +5791,7 @@ class Parser {
 		$text = $this->stripSectionName( $text );
 		$text = Sanitizer::normalizeSectionNameWhitespace( $text );
 		$text = Sanitizer::decodeCharReferences( $text );
+		$text = $this->normalizeSectionName( $text );
 
 		if ( isset( $wgFragmentMode[1] ) && $wgFragmentMode[1] === 'legacy' ) {
 			// ForAttribute() and ForLink() are the same for legacy encoding
@@ -5797,6 +5803,24 @@ class Parser {
 		return "#$id";
 	}
 
+	/**
+	 * Normalize a section name the way link fragments are normalized (T90902)
+	 *
+	 * @param string $text
+	 * @return string
+	 */
+	private function normalizeSectionName( $text ) {
+		# Parse the text as a fragment-only title so IDs match link targets
+		$codec = MediaWikiServices::getInstance()->getTitleParser();
+		try {
+			$split = $codec->splitTitleString( "#$text" );
+			return $split['fragment'];
+		} catch ( MalformedTitleException $e ) {
+			# Not parseable as a title: hand the text back unchanged
+			return $text;
+		}
+	}
+
 	/**
 	 * Strips a text string of wikitext for use in a section anchor
 	 *
diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt
index 3c861ea10b..1204dbd715 100644
--- a/tests/parser/parserTests.txt
+++ b/tests/parser/parserTests.txt
@@ -29536,3 +29536,17 @@ wgFragmentMode=[ 'html5' ]
 </p><p><a href="#啤酒">#啤酒</a> <a href="#啤酒">#啤酒</a>
 </p>
 !! end
+
+!! test
+T90902: Normalize weird characters in section IDs
+!! config
+wgFragmentMode=[ 'html5', 'legacy' ]
+!! wikitext
+== Foo&nbsp;bar ==
+[[#Foo&nbsp;bar]]
+
+!! html/php
+<h2><span class="mw-headline" id="Foo_bar">Foo&#160;bar</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/index.php?title=Parser_test&amp;action=edit&amp;section=1" title="Edit section: Foo bar">edit</a><span class="mw-editsection-bracket">]</span></span></h2>
+<p><a href="#Foo_bar">#Foo&#160;bar</a>
+</p>
+!! end
-- 
2.20.1