From d7371cc6959004aae84d6520b6d0c98352dc39a8 Mon Sep 17 00:00:00 2001 From: Aryeh Gregor Date: Tue, 30 Dec 2008 00:22:34 +0000 Subject: [PATCH] Optionally allow non-HTML4-compatible ids This adds a config option, $wgEnforceHtmlIds, true by default. If this is set to false, all characters that are allowed in XML ids are let through in header ids and manually-specified ids. In particular, this should include all alphabetic and numeric characters. Some remaining issues to work out: * This will cause backward-compatibility issues for some types of links and references: links from non-MediaWiki sources, links from MediaWiki sources running a different version, external links, and references from stylesheets/scripts. These could be partially alleviated by having a second anchor for headers where the two versions differ, but it would remain an issue for manually-specified id's. * Any invalid characters are now, effectively, stripped (replaced with underscores). This might cause problems if some writing systems are invalid in id's for some reason: we'll want to double-check the list of prohibited characters carefully. * Some user agents might not support these links. IE5 appears to, and so do recent versions of Opera and Firefox, but I didn't do extensive testing. * Not tested extensively, there are probably some bugs. I think this would be good to enable on testwiki for the moment to see how it goes. No parser test regressions. No change to RELEASE-NOTES, we can add that when the option is enabled by default (ideally, removed entirely). 
--- includes/DefaultSettings.php | 7 +++++ includes/Sanitizer.php | 60 +++++++++++++++++++++++++++--------- includes/Title.php | 4 ++- includes/parser/Parser.php | 5 +-- maintenance/parserTests.inc | 1 + 5 files changed, 59 insertions(+), 18 deletions(-) diff --git a/includes/DefaultSettings.php b/includes/DefaultSettings.php index cc981032fc..c851f737f7 100644 --- a/includes/DefaultSettings.php +++ b/includes/DefaultSettings.php @@ -3636,3 +3636,10 @@ $wgUniversalEditButton = true; * Disable for wikis that have their interwiki table updated automatically from a central location (like Wikimedia) */ $wgEnableSpecialInterwiki = true; + +/** + * Allow id's that don't conform to HTML4 backward compatibility requirements. + * This is currently for testing; if all goes well, this option will be removed + * and the functionality will be enabled universally. + */ +$wgEnforceHtmlIds = true; diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index f5f09a868e..e207f83da7 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -613,8 +613,11 @@ class Sanitizer { } } - if ( $attribute === 'id' ) - $value = Sanitizer::escapeId( $value ); + if ( $attribute === 'id' ) { + global $wgEnforceHtmlIds; + $value = Sanitizer::escapeId( $value, + $wgEnforceHtmlIds ? array() : 'xml' ); + } // If this attribute was previously set, override it. // Output should only have one attribute of each name. @@ -777,25 +780,52 @@ class Sanitizer { * @param string $id Id to validate * @param mixed $options String or array of strings (default is array()): * 'noninitial': This is a non-initial fragment of an id, not a full id, - * so don't prepend an 'x' if the first character isn't valid at the + * so don't pay attention if the first character isn't valid at the * beginning of an id. + * 'xml': Don't restrict the id to be HTML4-compatible. This option + * allows any alphabetic character to be used, per the XML standard. 
+ * Therefore, it also completely changes the type of escaping: instead + * of weird dot-encoding, runs of invalid characters (mostly + * whitespace) are just compressed into a single underscore. * @return string */ static function escapeId( $id, $options = array() ) { $options = (array)$options; - static $replace = array( - '%3A' => ':', - '%' => '.' - ); - - $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) ); - $id = str_replace( array_keys( $replace ), array_values( $replace ), $id ); - - if( preg_match( '/[^a-zA-Z]/', $id[0] ) - && !in_array( 'noninitial', $options ) ) { - // Initial character must be a letter! - $id = "x$id"; + + if ( !in_array( 'xml', $options ) ) { + # HTML4-style escaping + static $replace = array( + '%3A' => ':', + '%' => '.' + ); + + $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) ); + $id = str_replace( array_keys( $replace ), array_values( $replace ), $id ); + + if ( preg_match( '/[^a-zA-Z]/', $id[0] ) + && !in_array( 'noninitial', $options ) ) { + // Initial character must be a letter! + $id = "x$id"; + } + return $id; + } + + # XML-style escaping. For the patterns used, see the XML 1.0 standard, + # 5th edition, NameStartChar and NameChar: + $nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}' + . '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}' + . '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}'; + $nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}' + . 
'\x{203F}-\x{2040}'; + # Replace _ as well so we don't get multiple consecutive underscores + $id = preg_replace( "/([^$nameChar]|_)+/u", '_', $id ); + $id = trim( $id, '_' ); + + if ( !preg_match( "/^[$nameStartChar]/u", $id ) + && !in_array( 'noninitial', $options ) ) { + $id = "_$id"; } + return $id; } diff --git a/includes/Title.php b/includes/Title.php index 29f8efea89..e0b5f0d5df 100644 --- a/includes/Title.php +++ b/includes/Title.php @@ -451,7 +451,9 @@ class Title { * Escape a text fragment, say from a link, for a URL */ static function escapeFragmentForURL( $fragment ) { - return Sanitizer::escapeId( $fragment ); + global $wgEnforceHtmlIds; + return Sanitizer::escapeId( $fragment, + $wgEnforceHtmlIds ? array() : 'xml' ); } #---------------------------------------------------------------------------- diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php index e4d63af28b..4427a0111f 100644 --- a/includes/parser/Parser.php +++ b/includes/parser/Parser.php @@ -3446,7 +3446,7 @@ class Parser * @private */ function formatHeadings( $text, $isMain=true ) { - global $wgMaxTocLevel, $wgContLang; + global $wgMaxTocLevel, $wgContLang, $wgEnforceHtmlIds; $doNumberHeadings = $this->mOptions->getNumberHeadings(); $showEditLink = $this->mOptions->getEditSection(); @@ -3615,7 +3615,8 @@ class Parser # Save headline for section edit hint before it's escaped $headlineHint = $safeHeadline; - $safeHeadline = Sanitizer::escapeId( $safeHeadline ); + $safeHeadline = Sanitizer::escapeId( $safeHeadline, + $wgEnforceHtmlIds ? 
array() : 'xml' ); # HTML names must be case-insensitively unique (bug 10721) $arrayKey = strtolower( $safeHeadline ); diff --git a/maintenance/parserTests.inc b/maintenance/parserTests.inc index ecb8c203d3..7971e64e8b 100644 --- a/maintenance/parserTests.inc +++ b/maintenance/parserTests.inc @@ -563,6 +563,7 @@ class ParserTest { 'wgDefaultExternalStore' => array(), 'wgForeignFileRepos' => array(), 'wgLinkHolderBatchSize' => $linkHolderBatchSize, + 'wgEnforceHtmlIds' => true, ); $this->savedGlobals = array(); foreach( $settings as $var => $val ) { -- 2.20.1