* Make tests work better together. Tests are now skipped or marked incomplete.

[lhc/web/wiklou.git] / includes / Sanitizer.php
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php

index 0487762..8533990 100644 (file)
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -620,7 +620,7 @@ class Sanitizer {
          * @todo Check for unique id attribute :P
          */
         static function validateAttributes( $attribs, $whitelist ) {
-               global $wgAllowRdfaAttributes;
+               global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
  
                 $whitelist = array_flip( $whitelist );
                 $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
@@ -651,9 +651,7 @@ class Sanitizer {
                         }
  
                         if ( $attribute === 'id' ) {
-                               global $wgEnforceHtmlIds;
-                               $value = Sanitizer::escapeId( $value,
-                                       $wgEnforceHtmlIds ? 'noninitial' : 'xml' );
+                               $value = Sanitizer::escapeId( $value, 'noninitial' );
                         }
  
                         //RDFa and microdata properties allow URLs, URIs and/or CURIs. check them for sanity
@@ -682,6 +680,29 @@ class Sanitizer {
                         // Output should only have one attribute of each name.
                         $out[$attribute] = $value;
                 }
+
+               if ( $wgAllowMicrodataAttributes ) {
+                       # There are some complicated validity constraints we need to
+                       # enforce here.  First of all, we don't want to allow non-standard
+                       # itemtypes.
+                       $allowedTypes = array(
+                               'http://microformats.org/profile/hcard',
+                               'http://microformats.org/profile/hcalendar#vevent',
+                               'http://n.whatwg.org/work',
+                       );
+                       if ( isset( $out['itemtype'] ) && !in_array( $out['itemtype'],
+                       $allowedTypes ) ) {
+                               # Kill everything
+                               unset( $out['itemscope'] );
+                       }
+                       # itemtype, itemid, itemref don't make sense without itemscope
+                       if ( !array_key_exists( 'itemscope', $out ) ) {
+                               unset( $out['itemtype'] );
+                               unset( $out['itemid'] );
+                               unset( $out['itemref'] );
+                       }
+                       # TODO: Strip itemprop if we aren't descendants of an itemscope.
+               }
                 return $out;
         }
  
@@ -828,63 +849,64 @@ class Sanitizer {
         }
  
         /**
-        * Given a value escape it so that it can be used in an id attribute and
-        * return it, this does not validate the value however (see first link)
+        * Given a value, escape it so that it can be used in an id attribute and
+        * return it.  This will use HTML5 validation if $wgExperimentalHtmlIds is
+        * true, allowing anything but ASCII whitespace.  Otherwise it will use
+        * HTML 4 rules, which means a narrow subset of ASCII, with bad characters
+        * escaped with lots of dots.
+        *
+        * To ensure we don't have to bother escaping anything, we also strip ', ",
+        * & even if $wgExperimentalIds is true.  TODO: Is this the best tactic?
+        * We also strip # because it upsets IE6.
          *
          * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
          *                                                          in the id and
          *                                                          name attributes
          * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
+        * @see http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#the-id-attribute
+        *   HTML5 definition of id attribute
          *
-        * @param $id String: id to validate
+        * @param $id String: id to escape
          * @param $options Mixed: string or array of strings (default is array()):
          *   'noninitial': This is a non-initial fragment of an id, not a full id,
          *       so don't pay attention if the first character isn't valid at the
-        *       beginning of an id.
-        *   'xml': Don't restrict the id to be HTML4-compatible.  This option
-        *       allows any alphabetic character to be used, per the XML standard.
-        *       Therefore, it also completely changes the type of escaping: instead
-        *       of weird dot-encoding, runs of invalid characters (mostly
-        *       whitespace) are just compressed into a single underscore.
+        *       beginning of an id.  Only matters if $wgExperimentalHtmlIds is
+        *       false.
+        *   'legacy': Behave the way the old HTML 4-based ID escaping worked even
+        *       if $wgExperimentalHtmlIds is used, so we can generate extra
+        *       anchors and links won't break.
          * @return String
          */
         static function escapeId( $id, $options = array() ) {
+               global $wgHtml5, $wgExperimentalHtmlIds;
                 $options = (array)$options;
  
-               if ( !in_array( 'xml', $options ) ) {
-                       # HTML4-style escaping
-                       static $replace = array(
-                               '%3A' => ':',
-                               '%' => '.'
-                       );
-
-                       $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
-                       $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
-
-                       if ( !preg_match( '/^[a-zA-Z]/', $id )
-                       && !in_array( 'noninitial', $options ) )  {
-                               // Initial character must be a letter!
-                               $id = "x$id";
+               if ( $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
+                       $id = Sanitizer::decodeCharReferences( $id );
+                       $id = preg_replace( '/[ \t\n\r\f_\'"&#]+/', '_', $id );
+                       $id = trim( $id, '_' );
+                       if ( $id === '' ) {
+                               # Must have been all whitespace to start with.
+                               return '_';
+                       } else {
+                               return $id;
                         }
-                       return $id;
                 }
  
-               # XML-style escaping.  For the patterns used, see the XML 1.0 standard,
-               # 5th edition, NameStartChar and NameChar: <http://www.w3.org/TR/REC-xml/>
-               $nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}'
-                       . '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}'
-                       . '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}';
-               $nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}'
-                       . '\x{203F}-\x{2040}';
-               # Replace _ as well so we don't get multiple consecutive underscores
-               $id = preg_replace( "/([^$nameChar]|_)+/u", '_', $id );
-               $id = trim( $id, '_' );
-
-               if ( !preg_match( "/^[$nameStartChar]/u", $id )
-               && !in_array( 'noninitial', $options ) ) {
-                       $id = "_$id";
-               }
+               # HTML4-style escaping
+               static $replace = array(
+                       '%3A' => ':',
+                       '%' => '.'
+               );
+
+               $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
+               $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
  
+               if ( !preg_match( '/^[a-zA-Z]/', $id )
+               && !in_array( 'noninitial', $options ) )  {
+                       // Initial character must be a letter!
+                       $id = "x$id";
+               }
                 return $id;
         }