From f8879bfd2bef6962fe02be812df2cd657ac73e17 Mon Sep 17 00:00:00 2001
From: Aryeh Gregor <simetrical@users.mediawiki.org>
Date: Fri, 29 Jan 2010 21:44:01 +0000
Subject: [PATCH] Refactor $wgEnforceHtmlIds code

Renamed setting to $wgExperimentalHtmlIds, off by default, and updated
the code to enforce the much laxer HTML5 rules.  Still needs testing in
various browsers.
---
 includes/DefaultSettings.php |  9 ++--
 includes/Sanitizer.php       | 81 +++++++++++++++++-------------------
 includes/Title.php           |  4 +-
 includes/parser/Parser.php   | 25 ++++-------
 maintenance/parserTests.inc  |  2 +-
 5 files changed, 54 insertions(+), 67 deletions(-)

diff --git a/includes/DefaultSettings.php b/includes/DefaultSettings.php
index 04152adef6..7d834c8c44 100644
--- a/includes/DefaultSettings.php
+++ b/includes/DefaultSettings.php
@@ -4142,12 +4142,11 @@ $wgEdititis = false;
 $wgUniversalEditButton = true;
 
 /**
- * Allow id's that don't conform to HTML4 backward compatibility requirements.
- * This is purely experimental, has multiple known flaws, and will likely be
- * renamed and reconcepted based on HTML5 in the future, so should not be used
- * except for testing.
+ * Should we allow a broader set of characters in id attributes, per HTML5?  If
+ * not, use only HTML 4-compatible IDs.  This option is for testing -- when the
+ * functionality is ready, it will be on by default with no option.
  */
-$wgEnforceHtmlIds = true;
+$wgExperimentalHtmlIds = false;
 
 /**
  * Search form behavior
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
index adfbd5a5ac..8f45cfd119 100644
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -651,9 +651,7 @@ class Sanitizer {
 			}
 
 			if ( $attribute === 'id' ) {
-				global $wgEnforceHtmlIds;
-				$value = Sanitizer::escapeId( $value,
-					$wgEnforceHtmlIds ? 'noninitial' : 'xml' );
+				$value = Sanitizer::escapeId( $value, 'noninitial' );
 			}
 
 			//RDFa and microdata properties allow URLs, URIs and/or CURIs. check them for sanity
@@ -851,63 +849,62 @@ class Sanitizer {
 	}
 
 	/**
-	 * Given a value escape it so that it can be used in an id attribute and
-	 * return it, this does not validate the value however (see first link)
+	 * Given a value, escape it so that it can be used in an id attribute and
+	 * return it.  This will use HTML5 validation if $wgExperimentalHtmlIds is
+	 * true, allowing anything but ASCII whitespace.  Otherwise it will use
+	 * HTML 4 rules, which means a narrow subset of ASCII, with bad characters
+	 * escaped with lots of dots.
+	 *
+	 * To ensure we don't have to bother escaping anything, we also strip ', ",
+	 * & even if $wgExperimentalHtmlIds is true.  TODO: Is this the best tactic?
 	 *
 	 * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
 	 *                                                          in the id and
 	 *                                                          name attributes
 	 * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
+	 * @see http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#the-id-attribute
+	 *   HTML5 definition of id attribute
 	 *
-	 * @param $id String: id to validate
+	 * @param $id String: id to escape
 	 * @param $options Mixed: string or array of strings (default is array()):
 	 *   'noninitial': This is a non-initial fragment of an id, not a full id,
 	 *       so don't pay attention if the first character isn't valid at the
-	 *       beginning of an id.
-	 *   'xml': Don't restrict the id to be HTML4-compatible.  This option
-	 *       allows any alphabetic character to be used, per the XML standard.
-	 *       Therefore, it also completely changes the type of escaping: instead
-	 *       of weird dot-encoding, runs of invalid characters (mostly
-	 *       whitespace) are just compressed into a single underscore.
+	 *       beginning of an id.  Only matters if $wgExperimentalHtmlIds is
+	 *       false, or if 'legacy' is also given.
+	 *   'legacy': Behave the way the old HTML 4-based ID escaping worked even
+	 *       if $wgExperimentalHtmlIds is true, so we can generate extra
+	 *       anchors and links won't break.
 	 * @return String
 	 */
 	static function escapeId( $id, $options = array() ) {
+		global $wgExperimentalHtmlIds;
 		$options = (array)$options;
 
-		if ( !in_array( 'xml', $options ) ) {
-			# HTML4-style escaping
-			static $replace = array(
-				'%3A' => ':',
-				'%' => '.'
-			);
-
-			$id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
-			$id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
-
-			if ( !preg_match( '/^[a-zA-Z]/', $id )
-			&& !in_array( 'noninitial', $options ) )  {
-				// Initial character must be a letter!
-				$id = "x$id";
+		if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
+			$id = preg_replace( '/[ \t\n\r\f_\'"&]+/', '_', $id );
+			$id = trim( $id, '_' );
+			if ( $id === '' ) {
+				# Must have been empty, or only whitespace, _, ', ", & to start with.
+				return '_';
+			} else {
+				return $id;
 			}
-			return $id;
 		}
 
-		# XML-style escaping.  For the patterns used, see the XML 1.0 standard,
-		# 5th edition, NameStartChar and NameChar: <http://www.w3.org/TR/REC-xml/>
-		$nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}'
-			. '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}'
-			. '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}';
-		$nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}'
-			. '\x{203F}-\x{2040}';
-		# Replace _ as well so we don't get multiple consecutive underscores
-		$id = preg_replace( "/([^$nameChar]|_)+/u", '_', $id );
-		$id = trim( $id, '_' );
-
-		if ( !preg_match( "/^[$nameStartChar]/u", $id )
-		&& !in_array( 'noninitial', $options ) ) {
-			$id = "_$id";
-		}
+		# HTML4-style escaping
+		static $replace = array(
+			'%3A' => ':',
+			'%' => '.'
+		);
+
+		$id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
+		$id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
 
+		if ( !preg_match( '/^[a-zA-Z]/', $id )
+		&& !in_array( 'noninitial', $options ) )  {
+			// Initial character must be a letter!
+			$id = "x$id";
+		}
 		return $id;
 	}
 
diff --git a/includes/Title.php b/includes/Title.php
index 919107e034..0f2b353db2 100644
--- a/includes/Title.php
+++ b/includes/Title.php
@@ -501,13 +501,11 @@ class Title {
 	 * Escape a text fragment, say from a link, for a URL
 	 */
 	static function escapeFragmentForURL( $fragment ) {
-		global $wgEnforceHtmlIds;
 		# Note that we don't urlencode the fragment.  urlencoded Unicode
 		# fragments appear not to work in IE (at least up to 7) or in at least
 		# one version of Opera 9.x.  The W3C validator, for one, doesn't seem
 		# to care if they aren't encoded.
-		return Sanitizer::escapeId( $fragment,
-			$wgEnforceHtmlIds ? 'noninitial' : 'xml' );
+		return Sanitizer::escapeId( $fragment, 'noninitial' );
 	}
 
 #----------------------------------------------------------------------------
diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php
index ee7383dc2f..cd101f723e 100644
--- a/includes/parser/Parser.php
+++ b/includes/parser/Parser.php
@@ -3478,7 +3478,7 @@ class Parser
 	 * @private
 	 */
 	function formatHeadings( $text, $origText, $isMain=true ) {
-		global $wgMaxTocLevel, $wgContLang, $wgEnforceHtmlIds;
+		global $wgMaxTocLevel, $wgContLang, $wgExperimentalHtmlIds;
 
 		$doNumberHeadings = $this->mOptions->getNumberHeadings();
 		$showEditLink = $this->mOptions->getEditSection();
@@ -3654,11 +3654,7 @@ class Parser
 			# Save headline for section edit hint before it's escaped
 			$headlineHint = $safeHeadline;
 
-			if ( $wgEnforceHtmlIds ) {
-				$legacyHeadline = false;
-				$safeHeadline = Sanitizer::escapeId( $safeHeadline,
-					'noninitial' );
-			} else {
+			if ( $wgExperimentalHtmlIds ) {
 				# For reverse compatibility, provide an id that's
 				# HTML4-compatible, like we used to.
 				#
@@ -3670,20 +3666,17 @@ class Parser
 				# to type in section names like "abc_.D7.93.D7.90.D7.A4"
 				# manually, so let's not bother worrying about it.
 				$legacyHeadline = Sanitizer::escapeId( $safeHeadline,
-					'noninitial' );
-				$safeHeadline = Sanitizer::escapeId( $safeHeadline, 'xml' );
+					array( 'noninitial', 'legacy' ) );
+				$safeHeadline = Sanitizer::escapeId( $safeHeadline );
 
 				if ( $legacyHeadline == $safeHeadline ) {
 					# No reason to have both (in fact, we can't)
 					$legacyHeadline = false;
-				} elseif ( $legacyHeadline != Sanitizer::escapeId(
-				$legacyHeadline, 'xml' ) ) {
-					# The legacy id is invalid XML.  We used to allow this, but
-					# there's no reason to do so anymore.  Backward
-					# compatibility will fail slightly in this case, but it's
-					# no big deal.
-					$legacyHeadline = false;
 				}
+			} else {
+				$legacyHeadline = false;
+				$safeHeadline = Sanitizer::escapeId( $safeHeadline,
+					'noninitial' );
 			}
 
 			# HTML names must be case-insensitively unique (bug 10721).  FIXME:
@@ -3711,7 +3704,7 @@ class Parser
 			# Don't number the heading if it is the only one (looks silly)
 			if( $doNumberHeadings && count( $matches[3] ) > 1) {
 				# the two are different if the line contains a link
-				$headline=$numbering . ' ' . $headline;
+				$headline = $numbering . ' ' . $headline;
 			}
 
 			# Create the anchor for linking from the TOC to the section
diff --git a/maintenance/parserTests.inc b/maintenance/parserTests.inc
index f1dedba6f2..882bc2486b 100644
--- a/maintenance/parserTests.inc
+++ b/maintenance/parserTests.inc
@@ -657,7 +657,7 @@ class ParserTest {
 			'wgDefaultExternalStore' => array(),
 			'wgForeignFileRepos' => array(),
 			'wgLinkHolderBatchSize' => $linkHolderBatchSize,
-			'wgEnforceHtmlIds' => true,
+			'wgExperimentalHtmlIds' => false,
 			'wgExternalLinkTarget' => false,
 			'wgAlwaysUseTidy' => false,
 			'wgHtml5' => true,
-- 
2.20.1