includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @package MediaWiki
  24  * @subpackage Parser
  25  */
  26
/**
 * Regular expression to match various types of character references in
 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
 *
 * The pattern is written in extended (/x) mode, so the line breaks and
 * indentation inside it are not significant. Each alternative has its own
 * capture group:
 *  1. the name of a named entity             (&name;)
 *  2. the digits of a decimal reference      (&#123;)
 *  3. the digits of a hex reference, small x (&#xabc;)
 *  4. the digits of a hex reference, big X   (&#Xabc;)
 *  5. an ampersand that starts none of the forms above
 *
 * Groups 3 and 4 accept any ASCII letter, not only a-f, so the captured
 * text is not guaranteed to be valid hexadecimal.
 */
define( 'MW_CHAR_REFS_REGEX',
        '/&([A-Za-z0-9]+);
         |&\#([0-9]+);
         |&\#x([0-9A-Za-z]+);
         |&\#X([0-9A-Za-z]+);
         |(&)/x' );
  37
/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 * Allows some... latitude.
 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
 *
 * $attrib and $space are interpolated into the pattern below. The pattern
 * uses the /s and /x modifiers; the '#' remarks inside the string are regex
 * comments and are part of the pattern text.
 *
 * Capture groups:
 *  1. attribute name
 *  2. the whole "= value" part (absent for a bare attribute)
 *  3. value given in double quotes
 *  4. value given in single quotes
 *  5. unquoted value
 *  6. unquoted value starting with '#' followed by hex digits
 *
 * A pair must start at the beginning of the input or after whitespace, and
 * must be followed by whitespace or the end of the input.
 */
$attrib = '[A-Za-z0-9]';
$space = '[\x09\x0a\x0d\x20]';
define( 'MW_ATTRIBS_REGEX',
        "/(?:^|$space)($attrib+)
          ($space*=$space*
                (?:
                 # The attribute value: quoted or alone
                  \"([^<\"]*)\"
                 | '([^<']*)'
                 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
                 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                                                         # colors are specified like this.
                                                         # We'll be normalizing it.
                )
           )?(?=$space|\$)/sx" );
  58
/**
 * List of all named character entities defined in HTML 4.01
 * http://www.w3.org/TR/html4/sgml/entities.html
 *
 * Maps each entity name (case-sensitive, written without the leading '&'
 * and the trailing ';') to the decimal Unicode code point it stands for.
 * @private
 */
# NOTE(review): the explicit 'global' presumably keeps the table in global
# scope when this file is included from inside a function - confirm.
global $wgHtmlEntities;
$wgHtmlEntities = array(
        'Aacute'   => 193,
        'aacute'   => 225,
        'Acirc'    => 194,
        'acirc'    => 226,
        'acute'    => 180,
        'AElig'    => 198,
        'aelig'    => 230,
        'Agrave'   => 192,
        'agrave'   => 224,
        'alefsym'  => 8501,
        'Alpha'    => 913,
        'alpha'    => 945,
        'amp'      => 38,
        'and'      => 8743,
        'ang'      => 8736,
        'Aring'    => 197,
        'aring'    => 229,
        'asymp'    => 8776,
        'Atilde'   => 195,
        'atilde'   => 227,
        'Auml'     => 196,
        'auml'     => 228,
        'bdquo'    => 8222,
        'Beta'     => 914,
        'beta'     => 946,
        'brvbar'   => 166,
        'bull'     => 8226,
        'cap'      => 8745,
        'Ccedil'   => 199,
        'ccedil'   => 231,
        'cedil'    => 184,
        'cent'     => 162,
        'Chi'      => 935,
        'chi'      => 967,
        'circ'     => 710,
        'clubs'    => 9827,
        'cong'     => 8773,
        'copy'     => 169,
        'crarr'    => 8629,
        'cup'      => 8746,
        'curren'   => 164,
        'dagger'   => 8224,
        'Dagger'   => 8225,
        'darr'     => 8595,
        'dArr'     => 8659,
        'deg'      => 176,
        'Delta'    => 916,
        'delta'    => 948,
        'diams'    => 9830,
        'divide'   => 247,
        'Eacute'   => 201,
        'eacute'   => 233,
        'Ecirc'    => 202,
        'ecirc'    => 234,
        'Egrave'   => 200,
        'egrave'   => 232,
        'empty'    => 8709,
        'emsp'     => 8195,
        'ensp'     => 8194,
        'Epsilon'  => 917,
        'epsilon'  => 949,
        'equiv'    => 8801,
        'Eta'      => 919,
        'eta'      => 951,
        'ETH'      => 208,
        'eth'      => 240,
        'Euml'     => 203,
        'euml'     => 235,
        'euro'     => 8364,
        'exist'    => 8707,
        'fnof'     => 402,
        'forall'   => 8704,
        'frac12'   => 189,
        'frac14'   => 188,
        'frac34'   => 190,
        'frasl'    => 8260,
        'Gamma'    => 915,
        'gamma'    => 947,
        'ge'       => 8805,
        'gt'       => 62,
        'harr'     => 8596,
        'hArr'     => 8660,
        'hearts'   => 9829,
        'hellip'   => 8230,
        'Iacute'   => 205,
        'iacute'   => 237,
        'Icirc'    => 206,
        'icirc'    => 238,
        'iexcl'    => 161,
        'Igrave'   => 204,
        'igrave'   => 236,
        'image'    => 8465,
        'infin'    => 8734,
        'int'      => 8747,
        'Iota'     => 921,
        'iota'     => 953,
        'iquest'   => 191,
        'isin'     => 8712,
        'Iuml'     => 207,
        'iuml'     => 239,
        'Kappa'    => 922,
        'kappa'    => 954,
        'Lambda'   => 923,
        'lambda'   => 955,
        'lang'     => 9001,
        'laquo'    => 171,
        'larr'     => 8592,
        'lArr'     => 8656,
        'lceil'    => 8968,
        'ldquo'    => 8220,
        'le'       => 8804,
        'lfloor'   => 8970,
        'lowast'   => 8727,
        'loz'      => 9674,
        'lrm'      => 8206,
        'lsaquo'   => 8249,
        'lsquo'    => 8216,
        'lt'       => 60,
        'macr'     => 175,
        'mdash'    => 8212,
        'micro'    => 181,
        'middot'   => 183,
        'minus'    => 8722,
        'Mu'       => 924,
        'mu'       => 956,
        'nabla'    => 8711,
        'nbsp'     => 160,
        'ndash'    => 8211,
        'ne'       => 8800,
        'ni'       => 8715,
        'not'      => 172,
        'notin'    => 8713,
        'nsub'     => 8836,
        'Ntilde'   => 209,
        'ntilde'   => 241,
        'Nu'       => 925,
        'nu'       => 957,
        'Oacute'   => 211,
        'oacute'   => 243,
        'Ocirc'    => 212,
        'ocirc'    => 244,
        'OElig'    => 338,
        'oelig'    => 339,
        'Ograve'   => 210,
        'ograve'   => 242,
        'oline'    => 8254,
        'Omega'    => 937,
        'omega'    => 969,
        'Omicron'  => 927,
        'omicron'  => 959,
        'oplus'    => 8853,
        'or'       => 8744,
        'ordf'     => 170,
        'ordm'     => 186,
        'Oslash'   => 216,
        'oslash'   => 248,
        'Otilde'   => 213,
        'otilde'   => 245,
        'otimes'   => 8855,
        'Ouml'     => 214,
        'ouml'     => 246,
        'para'     => 182,
        'part'     => 8706,
        'permil'   => 8240,
        'perp'     => 8869,
        'Phi'      => 934,
        'phi'      => 966,
        'Pi'       => 928,
        'pi'       => 960,
        'piv'      => 982,
        'plusmn'   => 177,
        'pound'    => 163,
        'prime'    => 8242,
        'Prime'    => 8243,
        'prod'     => 8719,
        'prop'     => 8733,
        'Psi'      => 936,
        'psi'      => 968,
        'quot'     => 34,
        'radic'    => 8730,
        'rang'     => 9002,
        'raquo'    => 187,
        'rarr'     => 8594,
        'rArr'     => 8658,
        'rceil'    => 8969,
        'rdquo'    => 8221,
        'real'     => 8476,
        'reg'      => 174,
        'rfloor'   => 8971,
        'Rho'      => 929,
        'rho'      => 961,
        'rlm'      => 8207,
        'rsaquo'   => 8250,
        'rsquo'    => 8217,
        'sbquo'    => 8218,
        'Scaron'   => 352,
        'scaron'   => 353,
        'sdot'     => 8901,
        'sect'     => 167,
        'shy'      => 173,
        'Sigma'    => 931,
        'sigma'    => 963,
        'sigmaf'   => 962,
        'sim'      => 8764,
        'spades'   => 9824,
        'sub'      => 8834,
        'sube'     => 8838,
        'sum'      => 8721,
        'sup'      => 8835,
        'sup1'     => 185,
        'sup2'     => 178,
        'sup3'     => 179,
        'supe'     => 8839,
        'szlig'    => 223,
        'Tau'      => 932,
        'tau'      => 964,
        'there4'   => 8756,
        'Theta'    => 920,
        'theta'    => 952,
        'thetasym' => 977,
        'thinsp'   => 8201,
        'THORN'    => 222,
        'thorn'    => 254,
        'tilde'    => 732,
        'times'    => 215,
        'trade'    => 8482,
        'Uacute'   => 218,
        'uacute'   => 250,
        'uarr'     => 8593,
        'uArr'     => 8657,
        'Ucirc'    => 219,
        'ucirc'    => 251,
        'Ugrave'   => 217,
        'ugrave'   => 249,
        'uml'      => 168,
        'upsih'    => 978,
        'Upsilon'  => 933,
        'upsilon'  => 965,
        'Uuml'     => 220,
        'uuml'     => 252,
        'weierp'   => 8472,
        'Xi'       => 926,
        'xi'       => 958,
        'Yacute'   => 221,
        'yacute'   => 253,
        'yen'      => 165,
        'Yuml'     => 376,
        'yuml'     => 255,
        'Zeta'     => 918,
        'zeta'     => 950,
        'zwj'      => 8205,
        'zwnj'     => 8204 );
 318
 319 /** @package MediaWiki */
 320 class Sanitizer {
 321         /**
 322          * Cleans up HTML, removes dangerous tags and attributes, and
 323          * removes HTML comments
 324          * @private
 325          * @param string $text
 326          * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 327          * @param array $args for the processing callback
 328          * @return string
 329          */
 330         function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
 331                 global $wgUseTidy, $wgUserHtml;
 332                 $fname = 'Parser::removeHTMLtags';
 333                 wfProfileIn( $fname );
 334
 335                 if( $wgUserHtml ) {
 336                         $htmlpairs = array( # Tags that must be closed
 337                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 338                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 339                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 340                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 341                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
 342                         );
 343                         $htmlsingle = array(
 344                                 'br', 'hr', 'li', 'dt', 'dd'
 345                         );
 346                         $htmlsingleonly = array( # Elements that cannot have close tags
 347                                 'br', 'hr'
 348                         );
 349                         $htmlnest = array( # Tags that can be nested--??
 350                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 351                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 352                         );
 353                         $tabletags = array( # Can only appear inside table
 354                                 'td', 'th', 'tr',
 355                         );
 356                         $htmllist = array( # Tags used by list
 357                                 'ul','ol',
 358                         );
 359                         $listtags = array( # Tags that can appear in a list
 360                                 'li',
 361                         );
 362
 363                 } else {
 364                         $htmlpairs = array();
 365                         $htmlsingle = array();
 366                         $htmlnest = array();
 367                         $tabletags = array();
 368                 }
 369
 370                 $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
 371                 $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
 372
 373                 # Remove HTML comments
 374                 $text = Sanitizer::removeHTMLcomments( $text );
 375                 $bits = explode( '<', $text );
 376                 $text = array_shift( $bits );
 377                 if(!$wgUseTidy) {
 378                         $tagstack = array(); $tablestack = array();
 379                         foreach ( $bits as $x ) {
 380                                 $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
 381                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 382                                 $x, $regs );
 383                                 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 384                                 error_reporting( $prev );
 385
 386                                 $badtag = 0 ;
 387                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 388                                         # Check our stack
 389                                         if ( $slash ) {
 390                                                 # Closing a tag...
 391                                                 if( in_array( $t, $htmlsingleonly ) ) {
 392                                                         $badtag = 1;
 393                                                 } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
 394                                                         if ( in_array($ot, $htmlsingleallowed) ) {
 395                                                                 # Pop all elements with an optional close tag
 396                                                                 # and see if we find a match below them
 397                                                                 $optstack = array();
 398                                                                 array_push ($optstack, $ot);
 399                                                                 while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
 400                                                                                                 in_array($ot, $htmlsingleallowed) ) {
 401                                                                         array_push ($optstack, $ot);
 402                                                                 }
 403                                                                 if ( $t != $ot ) {
 404                                                                         # No match. Push the optinal elements back again
 405                                                                         $badtag = 1;
 406                                                                         while ( $ot = @array_pop( $optstack ) ) {
 407                                                                                 array_push( $tagstack, $ot );
 408                                                                         }
 409                                                                 }
 410                                                         } else {
 411                                                                 @array_push( $tagstack, $ot );
 412                                                                 # <li> can be nested in <ul> or <ol>, skip those cases:
 413                                                                 if(!(in_array($ot, $htmllist) && in_array($t, $listtags) )) {
 414                                                                         $badtag = 1;
 415                                                                 }
 416                                                         }
 417                                                 } else {
 418                                                         if ( $t == 'table' ) {
 419                                                                 $tagstack = array_pop( $tablestack );
 420                                                         }
 421                                                 }
 422                                                 $newparams = '';
 423                                         } else {
 424                                                 # Keep track for later
 425                                                 if ( in_array( $t, $tabletags ) &&
 426                                                 ! in_array( 'table', $tagstack ) ) {
 427                                                         $badtag = 1;
 428                                                 } else if ( in_array( $t, $tagstack ) &&
 429                                                 ! in_array ( $t , $htmlnest ) ) {
 430                                                         $badtag = 1 ;
 431                                                 # Is it a self closed htmlpair ? (bug 5487)
 432                                                 } else if( $brace == '/>' &&
 433                                                 in_array($t, $htmlpairs) ) {
 434                                                         $badtag = 1;
 435                                                 } elseif( in_array( $t, $htmlsingleonly ) ) {
 436                                                         # Hack to force empty tag for uncloseable elements
 437                                                         $brace = '/>';
 438                                                 } else if( in_array( $t, $htmlsingle ) ) {
 439                                                         # Hack to not close $htmlsingle tags
 440                                                         $brace = NULL;
 441                                                 } else {
 442                                                         if ( $t == 'table' ) {
 443                                                                 array_push( $tablestack, $tagstack );
 444                                                                 $tagstack = array();
 445                                                         }
 446                                                         array_push( $tagstack, $t );
 447                                                 }
 448
 449                                                 # Replace any variables or template parameters with
 450                                                 # plaintext results.
 451                                                 if( is_callable( $processCallback ) ) {
 452                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 453                                                 }
 454
 455                                                 # Strip non-approved attributes from the tag
 456                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 457                                         }
 458                                         if ( ! $badtag ) {
 459                                                 $rest = str_replace( '>', '&gt;', $rest );
 460                                                 $close = ( $brace == '/>' ) ? ' /' : '';
 461                                                 $text .= "<$slash$t$newparams$close>$rest";
 462                                                 continue;
 463                                         }
 464                                 }
 465                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 466                         }
 467                         # Close off any remaining tags
 468                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 469                                 $text .= "</$t>\n";
 470                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 471                         }
 472                 } else {
 473                         # this might be possible using tidy itself
 474                         foreach ( $bits as $x ) {
 475                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 476                                 $x, $regs );
 477                                 @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 478                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 479                                         if( is_callable( $processCallback ) ) {
 480                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 481                                         }
 482                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 483                                         $rest = str_replace( '>', '&gt;', $rest );
 484                                         $text .= "<$slash$t$newparams$brace$rest";
 485                                 } else {
 486                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 487                                 }
 488                         }
 489                 }
 490                 wfProfileOut( $fname );
 491                 return $text;
 492         }
 493
 494         /**
 495          * Remove '<!--', '-->', and everything between.
 496          * To avoid leaving blank lines, when a comment is both preceded
 497          * and followed by a newline (ignoring spaces), trim leading and
 498          * trailing spaces and one of the newlines.
 499          *
 500          * @private
 501          * @param string $text
 502          * @return string
 503          */
 504         function removeHTMLcomments( $text ) {
 505                 $fname='Parser::removeHTMLcomments';
 506                 wfProfileIn( $fname );
 507                 while (($start = strpos($text, '<!--')) !== false) {
 508                         $end = strpos($text, '-->', $start + 4);
 509                         if ($end === false) {
 510                                 # Unterminated comment; bail out
 511                                 break;
 512                         }
 513
 514                         $end += 3;
 515
 516                         # Trim space and newline if the comment is both
 517                         # preceded and followed by a newline
 518                         $spaceStart = max($start - 1, 0);
 519                         $spaceLen = $end - $spaceStart;
 520                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 521                                 $spaceStart--;
 522                                 $spaceLen++;
 523                         }
 524                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 525                                 $spaceLen++;
 526                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 527                                 # Remove the comment, leading and trailing
 528                                 # spaces, and leave only one newline.
 529                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 530                         }
 531                         else {
 532                                 # Remove just the comment.
 533                                 $text = substr_replace($text, '', $start, $end - $start);
 534                         }
 535                 }
 536                 wfProfileOut( $fname );
 537                 return $text;
 538         }
 539
 540         /**
 541          * Take an array of attribute names and values and normalize or discard
 542          * illegal values for the given element type.
 543          *
 544          * - Discards attributes not on a whitelist for the given element
 545          * - Unsafe style attributes are discarded
 546          *
 547          * @param array $attribs
 548          * @param string $element
 549          * @return array
 550          *
 551          * @todo Check for legal values where the DTD limits things.
 552          * @todo Check for unique id attribute :P
 553          */
 554         function validateTagAttributes( $attribs, $element ) {
 555                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 556                 $out = array();
 557                 foreach( $attribs as $attribute => $value ) {
 558                         if( !isset( $whitelist[$attribute] ) ) {
 559                                 continue;
 560                         }
 561                         # Strip javascript "expression" from stylesheets.
 562                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 563                         if( $attribute == 'style' ) {
 564                                 $value = Sanitizer::checkCss( $value );
 565                                 if( $value === false ) {
 566                                         # haxx0r
 567                                         continue;
 568                                 }
 569                         }
 570
 571                         if ( $attribute === 'id' )
 572                                 $value = Sanitizer::escapeId( $value );
 573
 574                         // If this attribute was previously set, override it.
 575                         // Output should only have one attribute of each name.
 576                         $out[$attribute] = $value;
 577                 }
 578                 return $out;
 579         }
 580
 581         /**
 582          * Pick apart some CSS and check it for forbidden or unsafe structures.
 583          * Returns a sanitized string, or false if it was just too evil.
 584          *
 585          * Currently URL references, 'expression', 'tps' are forbidden.
 586          *
 587          * @param string $value
 588          * @return mixed
 589          */
 590         static function checkCss( $value ) {
 591                 $stripped = Sanitizer::decodeCharReferences( $value );
 592
 593                 // Remove any comments; IE gets token splitting wrong
 594                 $stripped = preg_replace( '!/\\*.*?\\*/!S', ' ', $stripped );
 595                 $value = $stripped;
 596
 597                 // ... and continue checks
 598                 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 599                         'codepointToUtf8(hexdec("$1"))', $stripped );
 600                 $stripped = str_replace( '\\', '', $stripped );
 601                 if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
 602                                 $stripped ) ) {
 603                         # haxx0r
 604                         return false;
 605                 }
 606
 607                 return $value;
 608         }
 609
 610         /**
 611          * Take a tag soup fragment listing an HTML element's attributes
 612          * and normalize it to well-formed XML, discarding unwanted attributes.
 613          * Output is safe for further wikitext processing, with escaping of
 614          * values that could trigger problems.
 615          *
 616          * - Normalizes attribute names to lowercase
 617          * - Discards attributes not on a whitelist for the given element
 618          * - Turns broken or invalid entities into plaintext
 619          * - Double-quotes all attribute values
 620          * - Attributes without values are given the name as attribute
 621          * - Double attributes are discarded
 622          * - Unsafe style attributes are discarded
 623          * - Prepends space if there are attributes.
 624          *
 625          * @param string $text
 626          * @param string $element
 627          * @return string
 628          */
 629         function fixTagAttributes( $text, $element ) {
 630                 if( trim( $text ) == '' ) {
 631                         return '';
 632                 }
 633
 634                 $stripped = Sanitizer::validateTagAttributes(
 635                         Sanitizer::decodeTagAttributes( $text ), $element );
 636
 637                 $attribs = array();
 638                 foreach( $stripped as $attribute => $value ) {
 639                         $encAttribute = htmlspecialchars( $attribute );
 640                         $encValue = Sanitizer::safeEncodeAttribute( $value );
 641
 642                         $attribs[] = "$encAttribute=\"$encValue\"";
 643                 }
 644                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
 645         }
 646
 647         /**
 648          * Encode an attribute value for HTML output.
 649          * @param $text
 650          * @return HTML-encoded text fragment
 651          */
 652         function encodeAttribute( $text ) {
 653                 $encValue = htmlspecialchars( $text );
 654
 655                 // Whitespace is normalized during attribute decoding,
 656                 // so if we've been passed non-spaces we must encode them
 657                 // ahead of time or they won't be preserved.
 658                 $encValue = strtr( $encValue, array(
 659                         "\n" => '&#10;',
 660                         "\r" => '&#13;',
 661                         "\t" => '&#9;',
 662                 ) );
 663
 664                 return $encValue;
 665         }
 666
 /**
  * Encode an attribute value for an HTML tag and additionally armor
  * it so that later wiki parsing passes leave it alone.
  *
  * On top of Sanitizer::encodeAttribute() this encodes the characters
  * and keywords listed below and the colon of every URL protocol
  * matched by wfUrlProtocols().
  *
  * @param string $text Raw attribute value
  * @return string HTML-encoded, armored text fragment
  */
 function safeEncodeAttribute( $text ) {
         # Templates and links may be expanded in later parsing,
         # creating invalid or dangerous output. Suppress this.
         $armor = array(
                 '<'    => '&lt;',   // This should never happen,
                 '>'    => '&gt;',   // we've received invalid input
                 '"'    => '&quot;', // which should have been escaped.
                 '{'    => '&#123;',
                 '['    => '&#91;',
                 "''"   => '&#39;&#39;',
                 'ISBN' => '&#73;SBN',
                 'RFC'  => '&#82;FC',
                 'PMID' => '&#80;MID',
                 '|'    => '&#124;',
                 '__'   => '&#95;_',
         );
         $armored = strtr( Sanitizer::encodeAttribute( $text ), $armor );

         # Stupid hack: encode the colon in anything that looks like
         # a URL protocol.
         $protocolRegex = '/(' . wfUrlProtocols() . ')/';
         $colonArmorer = array( 'Sanitizer', 'armorLinksCallback' );

         return preg_replace_callback( $protocolRegex, $colonArmorer, $armored );
 }
 699
 /**
  * Escape a value so that it can be used in an id attribute.
  *
  * Spaces become underscores, character references are decoded, and
  * the result is URL-encoded with "%3A" turned back into ":" and every
  * remaining "%" turned into ".". The value is escaped only, it is not
  * validated (see first link).
  *
  * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
  *                                                          in the id and
  *                                                          name attributes
  * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
  *
  * @bug 4461
  *
  * @static
  *
  * @param string $id
  * @return string
  */
 function escapeId( $id ) {
         $underscored = strtr( $id, ' ', '_' );
         $decoded = Sanitizer::decodeCharReferences( $underscored );
         $encoded = urlencode( $decoded );

         // The longer key is tried first, so an encoded colon is
         // restored before the bare percent sign rule can touch it.
         return strtr( $encoded, array(
                 '%3A' => ':',
                 '%' => '.'
         ) );
 }
 726
 /**
  * preg_replace_callback() handler that armors a matched link prefix
  * against further processing by encoding each colon in the first
  * capture group as a numeric character reference.
  *
  * @param array $matches Regex match set; index 1 is rewritten
  * @return string
  * @private
  */
 function armorLinksCallback( $matches ) {
         $captured = $matches[1];
         return strtr( $captured, array( ':' => '&#58;' ) );
 }
 736
 /**
  * Parse a partial tag string into an associative array of attribute
  * names and values. Attribute names are forced to lowercase, runs of
  * whitespace in values are collapsed to a single space and trimmed,
  * and character references are decoded.
  *
  * @param string $text Attribute portion of a tag
  * @return array Attribute name => decoded value; empty when nothing matches
  */
 function decodeTagAttributes( $text ) {
         if( trim( $text ) == '' ) {
                 return array();
         }

         $pairs = array();
         $found = preg_match_all( MW_ATTRIBS_REGEX, $text, $pairs, PREG_SET_ORDER );
         if( !$found ) {
                 return array();
         }

         $attribs = array();
         foreach( $pairs as $set ) {
                 $name = strtolower( $set[1] );
                 $raw = Sanitizer::getTagAttributeCallback( $set );

                 // Normalize whitespace
                 $collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

                 // Decode character references; a repeated attribute
                 // name overwrites the earlier value.
                 $attribs[$name] = Sanitizer::decodeCharReferences( $collapsed );
         }
         return $attribs;
 }
 774
 /**
  * Select the attribute value out of one MW_ATTRIBS_REGEX match set.
  *
  * @param array $set A single match set
  * @return string
  * @private
  */
 function getTagAttributeCallback( $set ) {
         # Value groups, checked from the highest index down:
         #   6 - illegal #XXXXXX color with no quotes
         #   5 - no quotes
         #   4 - single-quoted
         #   3 - double-quoted
         foreach( array( 6, 5, 4, 3 ) as $group ) {
                 if( isset( $set[$group] ) ) {
                         return $set[$group];
                 }
         }

         if( isset( $set[2] ) ) {
                 throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
         }

         # In XHTML, attributes must have a value.
         # For 'reduced' form, return explicitly the attribute name here.
         return $set[1];
 }
 804
 /**
  * Normalize whitespace and character references in XML
  * source-encoded text destined for an attribute value.
  *
  * See http://www.w3.org/TR/REC-xml/#AVNormalize for background.
  * What comes back is not the attribute's value but an XML source
  * fragment that can be placed into output.
  *
  * @param string $text
  * @return string
  * @private
  */
 function normalizeAttributeValue( $text ) {
         $normalized = Sanitizer::normalizeCharReferences( $text );

         // A CRLF pair and every single whitespace character are each
         // replaced by one space; runs are not collapsed.
         $spaced = preg_replace( '/\r\n|[\x20\x0d\x0a\x09]/', ' ', $normalized );

         return str_replace( '"', '&quot;', $spaced );
 }
 824
 /**
  * Make sure every entity and character reference in the text is
  * legal for XML, and XHTML specifically. Stray bits are
  * &amp;-escaped so the result is a valid text fragment.
  *
  * Each MW_CHAR_REFS_REGEX match is handed to
  * Sanitizer::normalizeCharReferencesCallback(), so that:
  *
  * a. named char refs must be known in XHTML
  * b. numeric char refs must be legal chars, not invalid or forbidden
  * c. hexadecimal refs are written with &#x, not &#X
  *
  * @param string $text
  * @return string
  * @private
  */
 function normalizeCharReferences( $text ) {
         $handler = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
         return preg_replace_callback( MW_CHAR_REFS_REGEX, $handler, $text );
 }
 /**
  * Normalize a single MW_CHAR_REFS_REGEX match.
  *
  * Capture groups: 1 = entity name, 2 = decimal digits,
  * 3 = digits after "&#x", 4 = digits after "&#X". A match with
  * none of these filled in, or one the helper rejects by returning
  * null, is returned HTML-escaped.
  *
  * @param array $matches
  * @return string
  */
 function normalizeCharReferencesCallback( $matches ) {
         if( $matches[1] != '' ) {
                 $normalized = Sanitizer::normalizeEntity( $matches[1] );
         } elseif( $matches[2] != '' ) {
                 $normalized = Sanitizer::decCharReference( $matches[2] );
         } elseif( $matches[3] != '' ) {
                 $normalized = Sanitizer::hexCharReference( $matches[3] );
         } elseif( $matches[4] != '' ) {
                 $normalized = Sanitizer::hexCharReference( $matches[4] );
         } else {
                 $normalized = null;
         }

         return is_null( $normalized )
                 ? htmlspecialchars( $matches[0] )
                 : $normalized;
 }
 866
 /**
  * Return the named entity reference unchanged when the name is a key
  * of $wgHtmlEntities (the HTML 4.0/XHTML 1.0 DTD entities); otherwise
  * return HTML-escaped pseudo-entity source (eg &amp;foo;).
  *
  * @param string $name Entity name without the surrounding "&" and ";"
  * @return string
  */
 function normalizeEntity( $name ) {
         global $wgHtmlEntities;

         $known = isset( $wgHtmlEntities[$name] );
         $ampersand = $known ? '&' : '&amp;';

         return $ampersand . $name . ';';
 }
 883
 /**
  * Build a normalized decimal character reference.
  *
  * @param string $codepoint Decimal digits of the reference
  * @return string|null "&#N;" for a codepoint accepted by
  *         Sanitizer::validateCodepoint(), otherwise null
  */
 function decCharReference( $codepoint ) {
         $point = intval( $codepoint );

         return Sanitizer::validateCodepoint( $point )
                 ? sprintf( '&#%d;', $point )
                 : null;
 }
 892
 /**
  * Build a normalized hexadecimal character reference.
  *
  * @param string $codepoint Digits that followed "&#x" or "&#X"
  * @return string|null "&#xN;" (lowercase hex) for a codepoint accepted
  *         by Sanitizer::validateCodepoint(); null when the input is not
  *         made up purely of hex digits or the codepoint is rejected
  */
 function hexCharReference( $codepoint ) {
         // MW_CHAR_REFS_REGEX lets any ASCII letter through, and hexdec()
         // silently skips characters that are not hex digits, so "4G1"
         // would otherwise be read as 0x41. Reject such input instead.
         if( !preg_match( '/^[0-9A-Fa-f]+\z/', $codepoint ) ) {
                 return null;
         }

         $point = hexdec( $codepoint );
         if( Sanitizer::validateCodepoint( $point ) ) {
                 return sprintf( '&#x%x;', $point );
         } else {
                 return null;
         }
 }
 901
 /**
  * Tell whether a Unicode codepoint is a valid character in XML.
  *
  * Accepted are tab, line feed, carriage return, and the ranges
  * 0x20-0xd7ff, 0xe000-0xfffd and 0x10000-0x10ffff.
  *
  * @param int $codepoint
  * @return bool
  */
 function validateCodepoint( $codepoint ) {
         // The three permitted control characters
         if( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
                 return true;
         }
         // From space up to just below the surrogate block
         if( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
                 return true;
         }
         // Above the surrogates, excluding 0xfffe and 0xffff
         if( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
                 return true;
         }
         // Everything from 0x10000 to the top of the range
         return $codepoint >= 0x10000 && $codepoint <= 0x10ffff;
 }
 915
 /**
  * Decode every character reference in the text, numeric or named
  * entity, and return the resulting UTF-8 string.
  *
  * Each MW_CHAR_REFS_REGEX match is handed to
  * Sanitizer::decodeCharReferencesCallback().
  *
  * @param string $text
  * @return string
  * @public
  */
 function decodeCharReferences( $text ) {
         $handler = array( 'Sanitizer', 'decodeCharReferencesCallback' );
         return preg_replace_callback( MW_CHAR_REFS_REGEX, $handler, $text );
 }
 930
 /**
  * Decode a single MW_CHAR_REFS_REGEX match.
  *
  * Capture groups: 1 = entity name, 2 = decimal digits,
  * 3 = digits after "&#x", 4 = digits after "&#X".
  *
  * @param array $matches
  * @return string Decoded text, or the matched source unchanged for a
  *         lone ampersand or a "hex" reference containing non-hex digits
  */
 function decodeCharReferencesCallback( $matches ) {
         if( $matches[1] != '' ) {
                 return Sanitizer::decodeEntity( $matches[1] );
         } elseif( $matches[2] != '' ) {
                 return Sanitizer::decodeChar( intval( $matches[2] ) );
         }

         $hex = '';
         if( $matches[3] != '' ) {
                 $hex = $matches[3];
         } elseif( $matches[4] != '' ) {
                 $hex = $matches[4];
         }

         // MW_CHAR_REFS_REGEX lets any ASCII letter through, and hexdec()
         // silently skips characters that are not hex digits, so "4G1"
         // would otherwise decode as 0x41. Only decode real hex strings.
         if( preg_match( '/^[0-9A-Fa-f]+\z/', $hex ) ) {
                 return Sanitizer::decodeChar( hexdec( $hex ) );
         }

         # Either an ampersand by itself or something that only looked
         # like a hex reference; pass the source through untouched.
         return $matches[0];
 }
 948
 /**
  * Return the UTF-8 string for a codepoint that
  * Sanitizer::validateCodepoint() accepts; any other codepoint
  * yields UTF8_REPLACEMENT (U+FFFD REPLACEMENT CHARACTER).
  *
  * @param int $codepoint
  * @return string
  * @private
  */
 function decodeChar( $codepoint ) {
         if( !Sanitizer::validateCodepoint( $codepoint ) ) {
                 return UTF8_REPLACEMENT;
         }
         return codepointToUtf8( $codepoint );
 }
 963
 /**
  * Return the UTF-8 encoding of a named entity when the name is a key
  * of $wgHtmlEntities (the HTML 4.0/XHTML 1.0 DTD entities); otherwise
  * return the pseudo-entity source (eg &foo;).
  *
  * @param string $name Entity name without the surrounding "&" and ";"
  * @return string
  */
 function decodeEntity( $name ) {
         global $wgHtmlEntities;

         if( !isset( $wgHtmlEntities[$name] ) ) {
                 return "&$name;";
         }
         return codepointToUtf8( $wgHtmlEntities[$name] );
 }
 980
 /**
  * Look up the whitelist of acceptable attributes for an element.
  *
  * The full table comes from Sanitizer::setupAttributeWhitelist() and
  * is kept in a static variable after the first call.
  *
  * @param string $element Element name
  * @return array Allowed attribute names; empty for an unlisted element
  */
 function attributeWhitelist( $element ) {
         static $whitelists;

         if( !isset( $whitelists ) ) {
                 $whitelists = Sanitizer::setupAttributeWhitelist();
         }

         if( isset( $whitelists[$element] ) ) {
                 return $whitelists[$element];
         }
         return array();
 }
 997
 /**
  * Build the table of attributes permitted on each whitelisted element.
  *
  * @return array Map of element name => list of allowed attribute names
  */
 function setupAttributeWhitelist() {
         $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
         $block = array_merge( $common, array( 'align' ) );
         $tablealign = array( 'align', 'char', 'charoff', 'valign' );
         $tablecell = array( 'abbr',
                             'axis',
                             'headers',
                             'scope',
                             'rowspan',
                             'colspan',
                             'nowrap', # deprecated
                             'width',  # deprecated
                             'height', # deprecated
                             'bgcolor' # deprecated
                             );

         # Numbers refer to sections in HTML 4.01 standard describing the element.
         # See: http://www.w3.org/TR/html4/
         $whitelist = array (
                 # 7.5.4
                 'div'        => $block,
                 'center'     => $common, # deprecated
                 'span'       => $block, # ??

                 # 7.5.5
                 'h1'         => $block,
                 'h2'         => $block,
                 'h3'         => $block,
                 'h4'         => $block,
                 'h5'         => $block,
                 'h6'         => $block,

                 # 7.5.6
                 # address

                 # 8.2.4
                 # bdo

                 # 9.2.1
                 'em'         => $common,
                 'strong'     => $common,
                 'cite'       => $common,
                 # dfn
                 'code'       => $common,
                 # samp
                 # kbd
                 'var'        => $common,
                 # abbr
                 # acronym

                 # 9.2.2
                 'blockquote' => array_merge( $common, array( 'cite' ) ),
                 # q

                 # 9.2.3
                 'sub'        => $common,
                 'sup'        => $common,

                 # 9.3.1
                 'p'          => $block,

                 # 9.3.2
                 'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

                 # 9.3.4
                 'pre'        => array_merge( $common, array( 'width' ) ),

                 # 9.4
                 'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
                 'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),

                 # 10.2
                 'ul'         => array_merge( $common, array( 'type' ) ),
                 'ol'         => array_merge( $common, array( 'type', 'start' ) ),
                 'li'         => array_merge( $common, array( 'type', 'value' ) ),

                 # 10.3
                 'dl'         => $common,
                 'dd'         => $common,
                 'dt'         => $common,

                 # 11.2.1
                 # ('rules' used to be listed twice here; once is enough)
                 'table'      => array_merge( $common,
                                         array( 'summary', 'width', 'border', 'frame',
                                                 'rules', 'cellspacing', 'cellpadding',
                                                 'align', 'bgcolor',
                                         ) ),

                 # 11.2.2
                 'caption'    => array_merge( $common, array( 'align' ) ),

                 # 11.2.3
                 'thead'      => array_merge( $common, $tablealign ),
                 'tfoot'      => array_merge( $common, $tablealign ),
                 'tbody'      => array_merge( $common, $tablealign ),

                 # 11.2.4
                 'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
                 'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),

                 # 11.2.5
                 'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),

                 # 11.2.6
                 'td'         => array_merge( $common, $tablecell, $tablealign ),
                 'th'         => array_merge( $common, $tablecell, $tablealign ),

                 # 15.2.1
                 'tt'         => $common,
                 'b'          => $common,
                 'i'          => $common,
                 'big'        => $common,
                 'small'      => $common,
                 'strike'     => $common,
                 's'          => $common,
                 'u'          => $common,

                 # 15.2.2
                 'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
                 # basefont

                 # 15.3
                 'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

                 # XHTML Ruby annotation text module, simple ruby only.
                 # http://www.w3c.org/TR/ruby/
                 'ruby'       => $common,
                 # rbc
                 # rtc
                 'rb'         => $common,
                 'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
                 'rp'         => $common,
                 );
         return $whitelist;
 }
1136
/**
 * Take a fragment of (potentially invalid) HTML and return
 * a version with any tags removed, encoded suitably for literal
 * inclusion in an attribute value.
 *
 * @param string $text HTML fragment
 * @return string
 */
function stripAllTags( $text ) {
        # Actual <tags>. The "s" modifier lets "." match line breaks,
        # so a tag whose attributes span several lines is removed too
        # instead of merely being escaped further down.
        $text = preg_replace( '/ < .*? > /xs', '', $text );

        # Normalize &entities and whitespace
        $text = Sanitizer::normalizeAttributeValue( $text );

        # Will be placed into "double-quoted" attributes,
        # make sure remaining bits are safe.
        $text = str_replace(
                array('<', '>', '"'),
                array('&lt;', '&gt;', '&quot;'),
                $text );

        return $text;
}
1161
/**
 * Assemble a private DOCTYPE that declares one entity for every
 * entry of $wgHtmlEntities (HTML's standard entity declarations).
 * PHP 4 seemed to know these if you gave it an HTML doctype, but
 * PHP 5.1 doesn't.
 *
 * Use for passing XHTML fragments to PHP's XML parsing functions
 *
 * @return string
 * @static
 */
function hackDocType() {
        global $wgHtmlEntities;

        $declarations = '';
        foreach( $wgHtmlEntities as $entity => $codepoint ) {
                $declarations .= "<!ENTITY $entity \"&#$codepoint;\">";
        }

        return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
1181
1182 }
1183
1184 ?>