From f8879bfd2bef6962fe02be812df2cd657ac73e17 Mon Sep 17 00:00:00 2001 From: Aryeh Gregor Date: Fri, 29 Jan 2010 21:44:01 +0000 Subject: [PATCH] Refactor $wgEnforceHtmlIds code Renamed setting to $wgExperimentalHtmlIds, off by default, and updated the code to enforce the much laxer HTML5 rules. Still needs testing in various browsers. --- includes/DefaultSettings.php | 9 ++-- includes/Sanitizer.php | 81 +++++++++++++++++------------------- includes/Title.php | 4 +- includes/parser/Parser.php | 25 ++++------- maintenance/parserTests.inc | 2 +- 5 files changed, 54 insertions(+), 67 deletions(-) diff --git a/includes/DefaultSettings.php b/includes/DefaultSettings.php index 04152adef6..7d834c8c44 100644 --- a/includes/DefaultSettings.php +++ b/includes/DefaultSettings.php @@ -4142,12 +4142,11 @@ $wgEdititis = false; $wgUniversalEditButton = true; /** - * Allow id's that don't conform to HTML4 backward compatibility requirements. - * This is purely experimental, has multiple known flaws, and will likely be - * renamed and reconcepted based on HTML5 in the future, so should not be used - * except for testing. + * Should we allow a broader set of characters in id attributes, per HTML5? If + * not, use only HTML 4-compatible IDs. This option is for testing -- when the + * functionality is ready, it will be on by default with no option. */ -$wgEnforceHtmlIds = true; +$wgExperimentalHtmlIds = false; /** * Search form behavior diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index adfbd5a5ac..8f45cfd119 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -651,9 +651,7 @@ class Sanitizer { } if ( $attribute === 'id' ) { - global $wgEnforceHtmlIds; - $value = Sanitizer::escapeId( $value, - $wgEnforceHtmlIds ? 'noninitial' : 'xml' ); + $value = Sanitizer::escapeId( $value, 'noninitial' ); } //RDFa and microdata properties allow URLs, URIs and/or CURIs. 
check them for sanity @@ -851,63 +849,62 @@ class Sanitizer { } /** - * Given a value escape it so that it can be used in an id attribute and - * return it, this does not validate the value however (see first link) + * Given a value, escape it so that it can be used in an id attribute and + * return it. This will use HTML5 validation if $wgExperimentalHtmlIds is + * true, allowing anything but ASCII whitespace. Otherwise it will use + * HTML 4 rules, which means a narrow subset of ASCII, with bad characters + * escaped with lots of dots. + * + * To ensure we don't have to bother escaping anything, we also strip ', ", + * & even if $wgExperimentalHtmlIds is true. TODO: Is this the best tactic? * * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters * in the id and * name attributes * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute + * @see http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#the-id-attribute + * HTML5 definition of id attribute * - * @param $id String: id to validate + * @param $id String: id to escape * @param $options Mixed: string or array of strings (default is array()): * 'noninitial': This is a non-initial fragment of an id, not a full id, * so don't pay attention if the first character isn't valid at the - * beginning of an id. - * 'xml': Don't restrict the id to be HTML4-compatible. This option - * allows any alphabetic character to be used, per the XML standard. - * Therefore, it also completely changes the type of escaping: instead - * of weird dot-encoding, runs of invalid characters (mostly - * whitespace) are just compressed into a single underscore. + * beginning of an id. Only matters if $wgExperimentalHtmlIds is + * false. + * 'legacy': Behave the way the old HTML 4-based ID escaping worked even + * if $wgExperimentalHtmlIds is used, so we can generate extra + * anchors and links won't break. 
* @return String */ static function escapeId( $id, $options = array() ) { + global $wgExperimentalHtmlIds; $options = (array)$options; - if ( !in_array( 'xml', $options ) ) { - # HTML4-style escaping - static $replace = array( - '%3A' => ':', - '%' => '.' - ); - - $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) ); - $id = str_replace( array_keys( $replace ), array_values( $replace ), $id ); - - if ( !preg_match( '/^[a-zA-Z]/', $id ) - && !in_array( 'noninitial', $options ) ) { - // Initial character must be a letter! - $id = "x$id"; + if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) { + $id = preg_replace( '/[ \t\n\r\f_\'"&]+/', '_', $id ); + $id = trim( $id, '_' ); + if ( $id === '' ) { + # Must have been all whitespace to start with. + return '_'; + } else { + return $id; } - return $id; } - # XML-style escaping. For the patterns used, see the XML 1.0 standard, - # 5th edition, NameStartChar and NameChar: - $nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}' - . '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}' - . '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}'; - $nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}' - . '\x{203F}-\x{2040}'; - # Replace _ as well so we don't get multiple consecutive underscores - $id = preg_replace( "/([^$nameChar]|_)+/u", '_', $id ); - $id = trim( $id, '_' ); - - if ( !preg_match( "/^[$nameStartChar]/u", $id ) - && !in_array( 'noninitial', $options ) ) { - $id = "_$id"; - } + # HTML4-style escaping + static $replace = array( + '%3A' => ':', + '%' => '.' + ); + + $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) ); + $id = str_replace( array_keys( $replace ), array_values( $replace ), $id ); + if ( !preg_match( '/^[a-zA-Z]/', $id ) + && !in_array( 'noninitial', $options ) ) { + // Initial character must be a letter! 
+ $id = "x$id"; + } return $id; } diff --git a/includes/Title.php b/includes/Title.php index 919107e034..0f2b353db2 100644 --- a/includes/Title.php +++ b/includes/Title.php @@ -501,13 +501,11 @@ class Title { * Escape a text fragment, say from a link, for a URL */ static function escapeFragmentForURL( $fragment ) { - global $wgEnforceHtmlIds; # Note that we don't urlencode the fragment. urlencoded Unicode # fragments appear not to work in IE (at least up to 7) or in at least # one version of Opera 9.x. The W3C validator, for one, doesn't seem # to care if they aren't encoded. - return Sanitizer::escapeId( $fragment, - $wgEnforceHtmlIds ? 'noninitial' : 'xml' ); + return Sanitizer::escapeId( $fragment, 'noninitial' ); } #---------------------------------------------------------------------------- diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php index ee7383dc2f..cd101f723e 100644 --- a/includes/parser/Parser.php +++ b/includes/parser/Parser.php @@ -3478,7 +3478,7 @@ class Parser * @private */ function formatHeadings( $text, $origText, $isMain=true ) { - global $wgMaxTocLevel, $wgContLang, $wgEnforceHtmlIds; + global $wgMaxTocLevel, $wgContLang, $wgExperimentalHtmlIds; $doNumberHeadings = $this->mOptions->getNumberHeadings(); $showEditLink = $this->mOptions->getEditSection(); @@ -3654,11 +3654,7 @@ class Parser # Save headline for section edit hint before it's escaped $headlineHint = $safeHeadline; - if ( $wgEnforceHtmlIds ) { - $legacyHeadline = false; - $safeHeadline = Sanitizer::escapeId( $safeHeadline, - 'noninitial' ); - } else { + if ( $wgExperimentalHtmlIds ) { # For reverse compatibility, provide an id that's # HTML4-compatible, like we used to. # @@ -3670,20 +3666,17 @@ class Parser # to type in section names like "abc_.D7.93.D7.90.D7.A4" # manually, so let's not bother worrying about it. 
$legacyHeadline = Sanitizer::escapeId( $safeHeadline, - 'noninitial' ); - $safeHeadline = Sanitizer::escapeId( $safeHeadline, 'xml' ); + array( 'noninitial', 'legacy' ) ); + $safeHeadline = Sanitizer::escapeId( $safeHeadline ); if ( $legacyHeadline == $safeHeadline ) { # No reason to have both (in fact, we can't) $legacyHeadline = false; - } elseif ( $legacyHeadline != Sanitizer::escapeId( - $legacyHeadline, 'xml' ) ) { - # The legacy id is invalid XML. We used to allow this, but - # there's no reason to do so anymore. Backward - # compatibility will fail slightly in this case, but it's - # no big deal. - $legacyHeadline = false; } + } else { + $legacyHeadline = false; + $safeHeadline = Sanitizer::escapeId( $safeHeadline, + 'noninitial' ); } # HTML names must be case-insensitively unique (bug 10721). FIXME: @@ -3711,7 +3704,7 @@ class Parser # Don't number the heading if it is the only one (looks silly) if( $doNumberHeadings && count( $matches[3] ) > 1) { # the two are different if the line contains a link - $headline=$numbering . ' ' . $headline; + $headline = $numbering . ' ' . $headline; } # Create the anchor for linking from the TOC to the section diff --git a/maintenance/parserTests.inc b/maintenance/parserTests.inc index f1dedba6f2..882bc2486b 100644 --- a/maintenance/parserTests.inc +++ b/maintenance/parserTests.inc @@ -657,7 +657,7 @@ class ParserTest { 'wgDefaultExternalStore' => array(), 'wgForeignFileRepos' => array(), 'wgLinkHolderBatchSize' => $linkHolderBatchSize, - 'wgEnforceHtmlIds' => true, + 'wgExperimentalHtmlIds' => false, 'wgExternalLinkTarget' => false, 'wgAlwaysUseTidy' => false, 'wgHtml5' => true, -- 2.20.1