Refactor $wgEnforceHtmlIds code
author Aryeh Gregor <simetrical@users.mediawiki.org>
Fri, 29 Jan 2010 21:44:01 +0000 (21:44 +0000)
committer Aryeh Gregor <simetrical@users.mediawiki.org>
Fri, 29 Jan 2010 21:44:01 +0000 (21:44 +0000)
Renamed setting to $wgExperimentalHtmlIds, off by default, and updated
the code to enforce the much laxer HTML5 rules.  Still needs testing in
various browsers.

includes/DefaultSettings.php
includes/Sanitizer.php
includes/Title.php
includes/parser/Parser.php
maintenance/parserTests.inc

index 04152ad..7d834c8 100644 (file)
@@ -4142,12 +4142,11 @@ $wgEdititis = false;
 $wgUniversalEditButton = true;
 
 /**
- * Allow id's that don't conform to HTML4 backward compatibility requirements.
- * This is purely experimental, has multiple known flaws, and will likely be
- * renamed and reconcepted based on HTML5 in the future, so should not be used
- * except for testing.
+ * Should we allow a broader set of characters in id attributes, per HTML5?  If
+ * not, use only HTML 4-compatible IDs.  This option is for testing -- when the
+ * functionality is ready, it will be on by default with no option.
  */
-$wgEnforceHtmlIds = true;
+$wgExperimentalHtmlIds = false;
 
 /**
  * Search form behavior
index adfbd5a..8f45cfd 100644 (file)
@@ -651,9 +651,7 @@ class Sanitizer {
                        }
 
                        if ( $attribute === 'id' ) {
-                               global $wgEnforceHtmlIds;
-                               $value = Sanitizer::escapeId( $value,
-                                       $wgEnforceHtmlIds ? 'noninitial' : 'xml' );
+                               $value = Sanitizer::escapeId( $value, 'noninitial' );
                        }
 
                        //RDFa and microdata properties allow URLs, URIs and/or CURIs. check them for sanity
@@ -851,63 +849,62 @@ class Sanitizer {
        }
 
        /**
-        * Given a value escape it so that it can be used in an id attribute and
-        * return it, this does not validate the value however (see first link)
+        * Given a value, escape it so that it can be used in an id attribute and
+        * return it.  This will use HTML5 validation if $wgExperimentalHtmlIds is
+        * true, allowing anything but ASCII whitespace.  Otherwise it will use
+        * HTML 4 rules, which means a narrow subset of ASCII, with bad characters
+        * escaped with lots of dots.
+        *
+        * To ensure we don't have to bother escaping anything, we also replace ', ",
+        * & with _ even if $wgExperimentalHtmlIds is true.  TODO: Is this the best tactic?
         *
         * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
         *                                                          in the id and
         *                                                          name attributes
         * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
+        * @see http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#the-id-attribute
+        *   HTML5 definition of id attribute
         *
-        * @param $id String: id to validate
+        * @param $id String: id to escape
         * @param $options Mixed: string or array of strings (default is array()):
         *   'noninitial': This is a non-initial fragment of an id, not a full id,
         *       so don't pay attention if the first character isn't valid at the
-        *       beginning of an id.
-        *   'xml': Don't restrict the id to be HTML4-compatible.  This option
-        *       allows any alphabetic character to be used, per the XML standard.
-        *       Therefore, it also completely changes the type of escaping: instead
-        *       of weird dot-encoding, runs of invalid characters (mostly
-        *       whitespace) are just compressed into a single underscore.
+        *       beginning of an id.  Only matters if $wgExperimentalHtmlIds is
+        *       false.
+        *   'legacy': Behave the way the old HTML 4-based ID escaping worked even
+        *       if $wgExperimentalHtmlIds is used, so we can generate extra
+        *       anchors and links won't break.
         * @return String
         */
        static function escapeId( $id, $options = array() ) {
+               global $wgExperimentalHtmlIds;
                $options = (array)$options;
 
-               if ( !in_array( 'xml', $options ) ) {
-                       # HTML4-style escaping
-                       static $replace = array(
-                               '%3A' => ':',
-                               '%' => '.'
-                       );
-
-                       $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
-                       $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
-
-                       if ( !preg_match( '/^[a-zA-Z]/', $id )
-                       && !in_array( 'noninitial', $options ) )  {
-                               // Initial character must be a letter!
-                               $id = "x$id";
+               if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
+                       $id = preg_replace( '/[ \t\n\r\f_\'"&]+/', '_', $id );
+                       $id = trim( $id, '_' );
+                       if ( $id === '' ) {
+                               # Nothing left: input was only whitespace, _, ', " or &.
+                               return '_';
+                       } else {
+                               return $id;
                        }
-                       return $id;
                }
 
-               # XML-style escaping.  For the patterns used, see the XML 1.0 standard,
-               # 5th edition, NameStartChar and NameChar: <http://www.w3.org/TR/REC-xml/>
-               $nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}'
-                       . '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}'
-                       . '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}';
-               $nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}'
-                       . '\x{203F}-\x{2040}';
-               # Replace _ as well so we don't get multiple consecutive underscores
-               $id = preg_replace( "/([^$nameChar]|_)+/u", '_', $id );
-               $id = trim( $id, '_' );
-
-               if ( !preg_match( "/^[$nameStartChar]/u", $id )
-               && !in_array( 'noninitial', $options ) ) {
-                       $id = "_$id";
-               }
+               # HTML4-style escaping
+               static $replace = array(
+                       '%3A' => ':',
+                       '%' => '.'
+               );
+
+               $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
+               $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
 
+               if ( !preg_match( '/^[a-zA-Z]/', $id )
+               && !in_array( 'noninitial', $options ) )  {
+                       // Initial character must be a letter!
+                       $id = "x$id";
+               }
                return $id;
        }
 
index 919107e..0f2b353 100644 (file)
@@ -501,13 +501,11 @@ class Title {
         * Escape a text fragment, say from a link, for a URL
         */
        static function escapeFragmentForURL( $fragment ) {
-               global $wgEnforceHtmlIds;
                # Note that we don't urlencode the fragment.  urlencoded Unicode
                # fragments appear not to work in IE (at least up to 7) or in at least
                # one version of Opera 9.x.  The W3C validator, for one, doesn't seem
                # to care if they aren't encoded.
-               return Sanitizer::escapeId( $fragment,
-                       $wgEnforceHtmlIds ? 'noninitial' : 'xml' );
+               return Sanitizer::escapeId( $fragment, 'noninitial' );
        }
 
 #----------------------------------------------------------------------------
index ee7383d..cd101f7 100644 (file)
@@ -3478,7 +3478,7 @@ class Parser
         * @private
         */
        function formatHeadings( $text, $origText, $isMain=true ) {
-               global $wgMaxTocLevel, $wgContLang, $wgEnforceHtmlIds;
+               global $wgMaxTocLevel, $wgContLang, $wgExperimentalHtmlIds;
 
                $doNumberHeadings = $this->mOptions->getNumberHeadings();
                $showEditLink = $this->mOptions->getEditSection();
@@ -3654,11 +3654,7 @@ class Parser
                        # Save headline for section edit hint before it's escaped
                        $headlineHint = $safeHeadline;
 
-                       if ( $wgEnforceHtmlIds ) {
-                               $legacyHeadline = false;
-                               $safeHeadline = Sanitizer::escapeId( $safeHeadline,
-                                       'noninitial' );
-                       } else {
+                       if ( $wgExperimentalHtmlIds ) {
                                # For reverse compatibility, provide an id that's
                                # HTML4-compatible, like we used to.
                                #
@@ -3670,20 +3666,17 @@ class Parser
                                # to type in section names like "abc_.D7.93.D7.90.D7.A4"
                                # manually, so let's not bother worrying about it.
                                $legacyHeadline = Sanitizer::escapeId( $safeHeadline,
-                                       'noninitial' );
-                               $safeHeadline = Sanitizer::escapeId( $safeHeadline, 'xml' );
+                                       array( 'noninitial', 'legacy' ) );
+                               $safeHeadline = Sanitizer::escapeId( $safeHeadline );
 
                                if ( $legacyHeadline == $safeHeadline ) {
                                        # No reason to have both (in fact, we can't)
                                        $legacyHeadline = false;
-                               } elseif ( $legacyHeadline != Sanitizer::escapeId(
-                               $legacyHeadline, 'xml' ) ) {
-                                       # The legacy id is invalid XML.  We used to allow this, but
-                                       # there's no reason to do so anymore.  Backward
-                                       # compatibility will fail slightly in this case, but it's
-                                       # no big deal.
-                                       $legacyHeadline = false;
                                }
+                       } else {
+                               $legacyHeadline = false;
+                               $safeHeadline = Sanitizer::escapeId( $safeHeadline,
+                                       'noninitial' );
                        }
 
                        # HTML names must be case-insensitively unique (bug 10721).  FIXME:
@@ -3711,7 +3704,7 @@ class Parser
                        # Don't number the heading if it is the only one (looks silly)
                        if( $doNumberHeadings && count( $matches[3] ) > 1) {
                                # the two are different if the line contains a link
-                               $headline=$numbering . ' ' . $headline;
+                               $headline = $numbering . ' ' . $headline;
                        }
 
                        # Create the anchor for linking from the TOC to the section
index f1dedba..882bc24 100644 (file)
@@ -657,7 +657,7 @@ class ParserTest {
                        'wgDefaultExternalStore' => array(),
                        'wgForeignFileRepos' => array(),
                        'wgLinkHolderBatchSize' => $linkHolderBatchSize,
-                       'wgEnforceHtmlIds' => true,
+                       'wgExperimentalHtmlIds' => false,
                        'wgExternalLinkTarget' => false,
                        'wgAlwaysUseTidy' => false,
                        'wgHtml5' => true,